From 980fd3616d1b48049cce895e8a78bac0caa8d036 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 3 Nov 2017 22:16:57 +0100 Subject: [PATCH] [tumblr] use API v2 (#48) --- gallery_dl/extractor/tumblr.py | 178 ++++++++++++++++++--------------- 1 file changed, 95 insertions(+), 83 deletions(-) diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index e6324980..2f0a807c 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -9,119 +9,131 @@ """Extract images from https://www.tumblr.com/""" from .common import Extractor, Message -from .. import text -import json +from .. import text, exception +from ..cache import memcache -class TumblrUserExtractor(Extractor): - """Extractor for all images from a tumblr-user""" +class TumblrExtractor(Extractor): + """Base class for tumblr extractors""" category = "tumblr" - subcategory = "user" - directory_fmt = ["{category}", "{user}"] - filename_fmt = "{category}_{user}_{id}{offset}.{extension}" - pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com(?:/page/\d+)?/?$"] - test = [("http://demo.tumblr.com/", { - "keyword": "8f1b06c2a0a562b10df3e62ab2a8156e3da1855b", - "pattern": "https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg", - "count": 1, - })] + directory_fmt = ["{category}", "{name}"] + filename_fmt = "{category}_{blog[name]}_{id}{offset}.{extension}" def __init__(self, match): Extractor.__init__(self) self.user = match.group(1) - self.api_url = "https://{}.tumblr.com/api/read/json".format(self.user) - self.api_params = {"start": 0, "type": "photo", "num": 20} + self.api = TumblrAPI(self, "photo") def items(self): - images = self.get_image_data() - data = self.get_job_metadata(images) + blog = self.api.info(self.user) yield Message.Version, 1 - yield Message.Directory, data - for image in images: - url = image["photo-url-1280"] - self.delete_keywords(image) - image.update(data) - image = text.nameext_from_url(url, image) - image["hash"] = text.extract(image["name"], "_", "_")[0] - image = {key.replace("-", "_"): value - for key, value in image.items()} - yield Message.Url, url, image + yield Message.Directory, blog - def get_job_metadata(self, image_data): - """Collect metadata for extractor-job""" - data = next(image_data) - data["user"] = self.user - del data["cname"] - del data["description"] - del data["feeds"] - return data - - def get_image_data(self): - """Yield metadata for all images from a user""" - params = self.api_params.copy() - while True: - page = self.request(self.api_url, params=params).text - data = json.loads(page[22:-2]) - if params["start"] == 0: - yield data["tumblelog"] - for post in data["posts"]: - yield from self.get_images_from_post(post) - if not data["posts"] or "id" in params: - return - params["start"] += 20 - - @staticmethod - def get_images_from_post(post): - """Yield all images from a single post""" - try: + for post in self.posts(): + if "photos" not in post: + continue photos = post["photos"] - except KeyError: - return - del post["photos"] - if photos: - for photo in photos: - post.update(photo) - yield post - else: - post["offset"] = "o1" - yield post + del post["photos"] + del post["trail"] + for offset, photo in enumerate(photos, 1): + photo.update(photo["original_size"]) + del photo["original_size"] + del photo["alt_sizes"] + post["extension"] = photo["url"].rpartition(".")[2] + post["offset"] = "o{}".format(offset) + post["photo"] = photo + post["blog"] = blog + yield Message.Url, photo["url"], post - @staticmethod - def delete_keywords(data): - """Delete unnecessary keywords from dict""" - keys = [ - k for k in data.keys() - if k.startswith(("photo-url-", "note-")) or k.endswith("-button") - ] - for key in keys: - del data[key] + def posts(self): + """Return an iterable containing all relevant posts""" -class TumblrPostExtractor(TumblrUserExtractor): +class TumblrUserExtractor(TumblrExtractor): + """Extractor for all images from a tumblr-user""" + subcategory = "user" + pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com(?:/page/\d+)?/?$"] + test = [("http://demo.tumblr.com/", { + "pattern": r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg", + "count": 1, + })] + + def posts(self): + return self.api.posts(self.user, {}) + + +class TumblrPostExtractor(TumblrExtractor): """Extractor for images from a single post on tumblr""" subcategory = "post" pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com/post/(\d+)"] test = [("http://demo.tumblr.com/post/459265350", { - "keyword": "4d5bc44bf8ec334fdaf78696edf215574fa6d998", - "pattern": "https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg", + "pattern": r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg", "count": 1, })] def __init__(self, match): - TumblrUserExtractor.__init__(self, match) - self.api_params["id"] = match.group(2) + TumblrExtractor.__init__(self, match) + self.post_id = match.group(2) + + def posts(self): + return self.api.posts(self.user, {"id": self.post_id}) -class TumblrTagExtractor(TumblrUserExtractor): +class TumblrTagExtractor(TumblrExtractor): """Extractor for images from a tumblr-user by tag""" subcategory = "tag" pattern = [r"(?:https?://)?([^.]+)\.tumblr\.com/tagged/(.+)"] test = [("http://demo.tumblr.com/tagged/Times%20Square", { - "keyword": "b0465d131ecb097633127b79805432dacae06d14", - "pattern": "https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg", + "pattern": r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg", "count": 1, })] def __init__(self, match): - TumblrUserExtractor.__init__(self, match) - self.api_params["tagged"] = text.unquote(match.group(2)) + TumblrExtractor.__init__(self, match) + self.tag = text.unquote(match.group(2)) + + def posts(self): + return self.api.posts(self.user, {"tag": self.tag}) + + +class TumblrAPI(): + """Minimal interface for the Tumblr API v2""" + API_KEY = "O3hU2tMi5e4Qs5t3vezEi6L0qRORJ5y9oUpSGsrWu8iA3UCc3B" + + def __init__(self, extractor, typ=None): + self.api_key = extractor.config("api-key", TumblrAPI.API_KEY) + self.params = {"offset": 0, "limit": 50, "type": typ} + self.extractor = extractor + + @memcache(keyarg=1) + def info(self, blog): + """Return general information about a blog""" + return self._call(blog, "info", {})["blog"] + + def posts(self, blog, params): + """Retrieve published posts""" + params.update(self.params) + return self._pagination(blog, "posts", params) + + def _call(self, blog, endpoint, params): + params["api_key"] = self.api_key + url = "https://api.tumblr.com/v2/blog/{}.tumblr.com/{}".format( + blog, endpoint) + + response = self.extractor.request( + url, params=params, fatal=False).json() + if response["meta"]["status"] == 404: + raise exception.NotFoundError("user") + elif response["meta"]["status"] != 200: + self.extractor.log.error(response) + raise exception.StopExtraction() + + return response["response"] + + def _pagination(self, blog, endpoint, params): + while True: + data = self._call(blog, endpoint, params) + yield from data["posts"] + params["offset"] += params["limit"] + if params["offset"] >= data["total_posts"]: + return