[pixiv] transition to pixiv public api

2015-05-14 19:08:20 +02:00
parent c8a942a77a
commit 68c4306040
2 changed files with 122 additions and 112 deletions
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -8,115 +8,96 @@

 """Extract images and ugoira from http://www.pixiv.net/"""

-from .common import AsynchronousExtractor
+from .common import SequentialExtractor
 from .common import Message
-from .common import safe_request
 import re
-import csv
+import json
 import requests

 info = {
    "category": "pixiv",
    "extractor": "PixivExtractor",
-    "directory": ["{category}", "{artist-id}"],
-    "filename": "{category}_{artist-id}_{illust-id}{num}.{extension}",
+    "directory": ["{category}", "{artist-id}-{artist-nick}"],
+    "filename": "{category}_{artist-id}_{id}{num}.{extension}",
    "pattern": [
        r"(?:https?://)?(?:www\.)?pixiv\.net/member(?:_illust)?\.php\?id=(\d+)",
    ],
 }


-class PixivExtractor(AsynchronousExtractor):
+class PixivExtractor(SequentialExtractor):

    member_url = "http://www.pixiv.net/member_illust.php"
    illust_url = "http://www.pixiv.net/member_illust.php?mode=medium"

-    singl_v1_fmt = ("http://i{thumbnail-url[8]}.pixiv.net/img{directory:>02}"
-                    "/img/{artist-nick}/{illust-id}.{extension}")
-    manga_v1_fmt = ("http://i{thumbnail-url[8]}.pixiv.net/img{directory:>02}"
-                    "/img/{artist-nick}/{illust-id}{big}_p{index}.{extension}")
-
-    singl_v2_fmt = ("http://i{thumbnail-url[8]}.pixiv.net/img-original/img"
-                    "/{url-date}/{illust-id}_p0.{extension}")
-    manga_v2_fmt = ("http://i{thumbnail-url[8]}.pixiv.net/img-original/img"
-                    "/{url-date}/{illust-id}_p{index}.{extension}")
-
    def __init__(self, match, config):
-        AsynchronousExtractor.__init__(self, config)
+        SequentialExtractor.__init__(self, config)
        self.config = config
        self.artist_id = match.group(1)
-        self.api = PixivAPI(config["pixiv-cookies"]["PHPSESSID"])
-        self.session.headers.update({"Referer": "http://www.pixiv.net/"})
-        self.session.cookies.update(self.config["pixiv-cookies"])
+        self.api = PixivAPI(self.session)

    def items(self):
+        self.api.login(
+            self.config.get("pixiv", "username"),
+            self.config.get("pixiv", "password"),
+        )
+        metadata = self.get_job_metadata()
+
        yield Message.Version, 1
        yield Message.Headers, self.session.headers
        yield Message.Cookies, self.session.cookies
-        yield Message.Directory, self.get_job_metadata()
+        yield Message.Directory, metadata

-        for illust_id in self.get_illust_ids():
-            data = self.api.request(illust_id)
-            # debug
-            # for i, value in enumerate(data):
-                # print("{:02}: {}".format(i, value))
-            # return
-            # debug end
+        for work in self.get_works():
+            work.update(metadata)

-            # if "うごイラ" in data["tags"]:
-                # ugoira / animations
-                    # url, framelist = self.parse_ugoira(img)
-                    # data[2] = "zip"
-                    # yield (url, sname_fmt.format(*data))
-                    # data[2] = "txt"
-                    # yield (framelist, sname_fmt.format(*data))
-                    # continue
+            if work["type"] == "ugoira":
+                url, framelist = self.parse_ugoira(work["id"])
+                work["extension"] = "zip"
+                yield Message.Url, url, work.copy()
+                work["extension"] = "txt"
+                yield Message.Url, "text://"+framelist, work
+
+            elif work["page_count"] == 1:
+                yield Message.Url, work["url"], work

-            # images
-            if illust_id > 46270949:
-                big = ""
-                url_s_fmt = self.singl_v2_fmt
-                url_m_fmt = self.manga_v2_fmt
            else:
-                big = "_big" if illust_id > 11319935 else ""
-                url_s_fmt = self.singl_v1_fmt
-                url_m_fmt = self.manga_v1_fmt
+                url = work["url"]
+                ext = work["extension"]
+                if work["id"] > 11319935 and "/img-original/" not in url:
+                    big = "_big"
+                else:
+                    big = ""
+                if url[-6] == "p":
+                    part = url[:-7]
+                else:
+                    part = url[:-4]
+                for i in range(work["page_count"]):
+                    work["num"] = "_p{:02}".format(i)
+                    url = "{}{}_p{}.{}".format(part, big, i, ext)
+                    yield Message.Url, url, work.copy()

-            if not data["count"]:
-                yield Message.Url, url_s_fmt.format(**data), data
-            else:
-                for i in range(0, int(data["count"])):
-                    data["num"] = "_p{:02}".format(i)
-                    yield (Message.Url,
-                           url_m_fmt.format(index=i, big=big, **data),
-                           data.copy())
-
-    def get_illust_ids(self):
-        """Yield all illust-ids for a pixiv-member"""
-        needle = ('<li class="image-item "><a href="'
-                  '/member_illust.php?mode=medium&amp;illust_id=')
-        params = {"id": self.artist_id, "p": 1}
+    def get_works(self):
+        """Yield every work of the pixiv member, following API pagination"""
+        page = 1
        while True:
-            text = self.request(self.member_url, params=params).text
-            pos = 0
-            found = 0
-            while True:
-                illust_id, pos = self.extract(text, needle, '"', pos)
-                if illust_id is None:
-                    break
-                found += 1
-                yield int(illust_id)
-            if found != 20:
+            data = self.api.user_works(self.artist_id, page)
+            for work in data["response"]:
+                url = work["image_urls"]["large"]
+                work["num"] = ""
+                work["url"] = url
+                work["extension"] = url[url.rfind(".")+1:]
+                yield work
+            pinfo = data["pagination"]
+            if pinfo["current"] >= pinfo["pages"]:  # stop even if pages is 0
                return
-            params["p"] += 1
-
+            page = pinfo["next"]

    def parse_ugoira(self, illust_id):
        """Parse ugoira data"""
        # get illust page
        text = self.request(
-            self.illust_url,
-            params={"illust_id": illust_id},
+            self.illust_url, params={"illust_id": illust_id},
        ).text

        # parse page
@@ -127,57 +108,85 @@ class PixivExtractor(AsynchronousExtractor):
        url = url.replace("\\/", "/")

        # build framelist
-        framelist = "text://" + re.sub(
+        framelist = re.sub(
            r'\{"file":"([^"]+)","delay":(\d+)\},?',
-            r'\1 \2\n',
-            frames
+            r'\1 \2\n', frames
        )
        return url, framelist

    def get_job_metadata(self):
        """Collect metadata for extractor-job"""
+        data = self.api.user(self.artist_id)["response"][0]
        return {
            "category": info["category"],
            "artist-id": self.artist_id,
+            "artist-name": data["name"],
+            "artist-nick": data["account"],
        }


 class PixivAPI():
-    api_url = "http://spapi.pixiv.net/iphone/illust.php"
+    """Minimal interface for the Pixiv Public-API for mobile devices

-    def __init__(self, session_id):
-        self.session = requests.Session()
-        self.session.params["PHPSESSID"] = session_id
+    For a better and more complete implementation, see
+    - https://github.com/upbit/pixivpy
+    For in-depth information regarding the Pixiv Public-API, see
+    - http://blog.imaou.com/opensource/2014/10/09/pixiv_api_for_ios_update.html
+    """

-    def request(self, illust_id):
-        data = next(csv.reader(
-            [self.api_call(illust_id)]
-        ))
-        return {
-            "category": info["category"],
-            "illust-id": data[0],
-            "artist-id": data[1],
-            "extension": data[2],
-            "title": data[3],
-            "directory": data[4],
-            "artist-name": data[5],
-            "thumbnail-url": data[6],
-            "url-date": data[6][45:64],
-            # "thumbnail-mobile-url": data[9],
-            "date": data[12],
-            "tags": data[13],
-            # "description": data[18],
-            "count": data[19],
-            "artist-nick": data[24],
-            # "artist-avatar-url": data[29],
-            "num": "",
+    def __init__(self, session=None):
+        self.session = session or requests.Session()
+        self.session.headers.update({
+            "Referer": "http://www.pixiv.net/",
+            "User-Agent": "PixivIOSApp/5.1.1",
+            # the "Authorization" header is set by login()
+        })
+
+    def login(self, username, password):
+        """Log in and store a Pixiv Public-API access token on the session"""
+        data = {
+            "username": username,
+            "password": password,
+            "grant_type": "password",
+            "client_id": "bYGKuGVw91e0NMfPGp44euvGt59s",
+            "client_secret": "HP3RmkgAmEGro0gn1x9ioawQE8WMfvLXDz3ZqxpK",
        }
+        response = self.session.post(
+            "https://oauth.secure.pixiv.net/auth/token", data=data
+        )
+        if response.status_code not in (200, 301, 302):
+            raise Exception("login() failed! check username and password.\n"
+                            "HTTP %s: %s" % (response.status_code, response.text))
+        try:
+            token = self._parse(response)["response"]["access_token"]
+        except (ValueError, KeyError, TypeError):
+            # report the raw body: no parsed value exists if decoding failed
+            raise Exception("Get access_token error! Response: %s"
+                            % response.text)
+        self.session.headers["Authorization"] = "Bearer " + token

-    def api_call(self, illust_id):
-        text = ""
-        while len(text) < 32:
-            text = safe_request(
-                self.session, self.api_url,
-                params={"illust_id": illust_id}
-            ).text
-        return text
+    def user(self, user_id):
+        """Fetch the Public-API record describing a single pixiv user"""
+        url = (
+            "https://public-api.secure.pixiv.net/v1/users/"
+            "{user}.json".format(user=user_id)
+        )
+        return self._parse(self.session.get(url))
+
+    def user_works(self, user_id, page, per_page=20):
+        """Fetch one result page listing the works of a pixiv user"""
+        url = (
+            "https://public-api.secure.pixiv.net/v1/users/"
+            "{user}/works.json".format(user=user_id)
+        )
+        query = {
+            "page": page,
+            "per_page": per_page,
+            "image_sizes": "large",
+        }
+        return self._parse(self.session.get(url, params=query))
+
+    @staticmethod
+    def _parse(response):
+        """Decode the JSON text of a Pixiv Public-API response"""
+        return json.loads(response.text)