[tumblr] use API v2 (#48)

2017-11-03 22:16:57 +01:00
parent d6bed9f36f
commit 980fd3616d
1 changed files with 95 additions and 83 deletions
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -9,119 +9,131 @@
 """Extract images from https://www.tumblr.com/"""
 from .common import Extractor, Message
-from .. import text
+from .. import text, exception
-import json
+from ..cache import memcache
-class TumblrUserExtractor(Extractor):
+class TumblrExtractor(Extractor):
-    """Extractor for all images from a tumblr-user"""
+    """Base class for tumblr extractors"""
    category = "tumblr"
-    subcategory = "user"
+    directory_fmt = ["{category}", "{name}"]
-    directory_fmt = ["{category}", "{user}"]
+    filename_fmt = "{category}_{blog[name]}_{id}{offset}.{extension}"
    filename_fmt = "{category}_{user}_{id}{offset}.{extension}"
    pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com(?:/page/\d+)?/?$"]
    test = [("http://demo.tumblr.com/", {
        "keyword": "8f1b06c2a0a562b10df3e62ab2a8156e3da1855b",
        "pattern": "https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg",
        "count": 1,
    })]
    def __init__(self, match):
        Extractor.__init__(self)
        self.user = match.group(1)
-        self.api_url = "https://{}.tumblr.com/api/read/json".format(self.user)
+        self.api = TumblrAPI(self, "photo")
        self.api_params = {"start": 0, "type": "photo", "num": 20}
    def items(self):
-        images = self.get_image_data()
+        blog = self.api.info(self.user)
        data = self.get_job_metadata(images)
        yield Message.Version, 1
-        yield Message.Directory, data
+        yield Message.Directory, blog
        for image in images:
            url = image["photo-url-1280"]
            self.delete_keywords(image)
            image.update(data)
            image = text.nameext_from_url(url, image)
            image["hash"] = text.extract(image["name"], "_", "_")[0]
            image = {key.replace("-", "_"): value
                     for key, value in image.items()}
            yield Message.Url, url, image
-    def get_job_metadata(self, image_data):
+        for post in self.posts():
-        """Collect metadata for extractor-job"""
+            if "photos" not in post:
-        data = next(image_data)
+                continue
        data["user"] = self.user
        del data["cname"]
        del data["description"]
        del data["feeds"]
        return data
    def get_image_data(self):
        """Yield metadata for all images from a user"""
        params = self.api_params.copy()
        while True:
            page = self.request(self.api_url, params=params).text
            data = json.loads(page[22:-2])
            if params["start"] == 0:
                yield data["tumblelog"]
            for post in data["posts"]:
                yield from self.get_images_from_post(post)
            if not data["posts"] or "id" in params:
                return
            params["start"] += 20
    @staticmethod
    def get_images_from_post(post):
        """Yield all images from a single post"""
        try:
            photos = post["photos"]
        except KeyError:
            return
            del post["photos"]
-        if photos:
+            del post["trail"]
-            for photo in photos:
+            for offset, photo in enumerate(photos, 1):
-                post.update(photo)
+                photo.update(photo["original_size"])
-                yield post
+                del photo["original_size"]
-        else:
+                del photo["alt_sizes"]
-            post["offset"] = "o1"
+                post["extension"] = photo["url"].rpartition(".")[2]
-            yield post
+                post["offset"] = "o{}".format(offset)
                post["photo"] = photo
                post["blog"] = blog
                yield Message.Url, photo["url"], post
-    @staticmethod
+    def posts(self):
-    def delete_keywords(data):
+        """Return an iterable containing all relevant posts"""
        """Delete unnecessary keywords from dict"""
        keys = [
            k for k in data.keys()
            if k.startswith(("photo-url-", "note-")) or k.endswith("-button")
        ]
        for key in keys:
            del data[key]
-class TumblrPostExtractor(TumblrUserExtractor):
+class TumblrUserExtractor(TumblrExtractor):
    """Extractor for all images from a tumblr-user"""
    subcategory = "user"
    pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com(?:/page/\d+)?/?$"]
    test = [("http://demo.tumblr.com/", {
        "pattern": r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg",
        "count": 1,
    })]
    def posts(self):
        return self.api.posts(self.user, {})
 class TumblrPostExtractor(TumblrExtractor):
    """Extractor for images from a single post on tumblr"""
    subcategory = "post"
    pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com/post/(\d+)"]
    test = [("http://demo.tumblr.com/post/459265350", {
-        "keyword": "4d5bc44bf8ec334fdaf78696edf215574fa6d998",
+        "pattern": r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg",
        "pattern": "https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg",
        "count": 1,
    })]
    def __init__(self, match):
-        TumblrUserExtractor.__init__(self, match)
+        TumblrExtractor.__init__(self, match)
-        self.api_params["id"] = match.group(2)
+        self.post_id = match.group(2)
    def posts(self):
        return self.api.posts(self.user, {"id": self.post_id})
-class TumblrTagExtractor(TumblrUserExtractor):
+class TumblrTagExtractor(TumblrExtractor):
    """Extractor for images from a tumblr-user by tag"""
    subcategory = "tag"
    pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com/tagged/(.+)"]
    test = [("http://demo.tumblr.com/tagged/Times%20Square", {
-        "keyword": "b0465d131ecb097633127b79805432dacae06d14",
+        "pattern": r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg",
        "pattern": "https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg",
        "count": 1,
    })]
    def __init__(self, match):
-        TumblrUserExtractor.__init__(self, match)
+        TumblrExtractor.__init__(self, match)
-        self.api_params["tagged"] = text.unquote(match.group(2))
+        self.tag = text.unquote(match.group(2))
    def posts(self):
        return self.api.posts(self.user, {"tag": self.tag})
 class TumblrAPI():
    """Minimal interface for the Tumblr API v2

    Wraps the two blog endpoints used in this module ("info" and "posts")
    and sends every request through the owning extractor's 'request()'
    method.
    """
    # Built-in key passed as second argument to extractor.config("api-key", ...)
    # -- presumably the fallback when no key is configured; confirm against
    # Extractor.config.
    API_KEY = "O3hU2tMi5e4Qs5t3vezEi6L0qRORJ5y9oUpSGsrWu8iA3UCc3B"
    def __init__(self, extractor, typ=None):
        """Store the extractor and set up default query parameters

        'extractor' supplies config(), request() and log.
        'typ' is stored as the "type" query parameter for posts() requests.
        """
        self.api_key = extractor.config("api-key", TumblrAPI.API_KEY)
        # Defaults that posts() merges into every posts query
        self.params = {"offset": 0, "limit": 50, "type": typ}
        self.extractor = extractor
    # NOTE(review): presumably caches the result keyed on the 'blog'
    # argument (positional index 1) -- confirm against ..cache.memcache.
    @memcache(keyarg=1)
    def info(self, blog):
        """Return general information about a blog

        Calls the "info" endpoint for 'blog' and returns the "blog" entry
        of the response.
        """
        return self._call(blog, "info", {})["blog"]
    def posts(self, blog, params):
        """Retrieve published posts

        Returns a generator over the posts of 'blog'.
        'params' is modified in place: the defaults from self.params
        ("offset", "limit", "type") overwrite any same-named keys in it.
        """
        params.update(self.params)
        return self._pagination(blog, "posts", params)
    def _call(self, blog, endpoint, params):
        """Send one API request and return its "response" payload

        Adds the API key to 'params' (in place), requests
        https://api.tumblr.com/v2/blog/<blog>.tumblr.com/<endpoint>
        and decodes the reply as JSON.

        Raises exception.NotFoundError if the reported status is 404 and
        exception.StopExtraction (after logging the whole reply) for any
        other status that is not 200.
        """
        params["api_key"] = self.api_key
        url = "https://api.tumblr.com/v2/blog/{}.tumblr.com/{}".format(
            blog, endpoint)
        # fatal=False: presumably keeps request() from raising on HTTP error
        # codes so the status inside the JSON body can be checked below --
        # TODO confirm against Extractor.request.
        response = self.extractor.request(
            url, params=params, fatal=False).json()
        if response["meta"]["status"] == 404:
            raise exception.NotFoundError("user")
        elif response["meta"]["status"] != 200:
            self.extractor.log.error(response)
            raise exception.StopExtraction()
        return response["response"]
    def _pagination(self, blog, endpoint, params):
        """Yield every post of 'endpoint', requesting page after page

        'params' must contain "offset" and "limit"; "offset" is advanced
        in place by "limit" after each request.
        """
        while True:
            data = self._call(blog, endpoint, params)
            yield from data["posts"]
            params["offset"] += params["limit"]
            # Stop once the next offset reaches the total reported by the API
            if params["offset"] >= data["total_posts"]:
                return