[tumblr] add search extractor

2024-09-03 08:18:58 +02:00
parent 57da9ebfb5
commit d2ef9a590f
2 changed files with 80 additions and 9 deletions
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -11,6 +11,7 @@
 from .common import Extractor, Message
 from .. import text, util, oauth, exception
 from datetime import datetime, date, timedelta
 from urllib.parse import urlparse
 import re
@@ -22,7 +23,7 @@ BASE_PATTERN = (
 )
 POST_TYPES = frozenset((
-    "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
+    "text", "quote", "link", "answer", "video", "audio", "photo", "chat", "search"))
 class TumblrExtractor(Extractor):
@@ -37,11 +38,14 @@ class TumblrExtractor(Extractor):
        Extractor.__init__(self, match)
        name = match.group(2)
        if name:
            self.blog = name + ".tumblr.com"
        else:
            self.blog = match.group(1) or match.group(3)
        self.is_timeline = False
    def _init(self):
        self.api = TumblrAPI(self)
        self.types = self._setup_posttypes()
@@ -83,12 +87,23 @@ class TumblrExtractor(Extractor):
                return
            if post["type"] not in self.types:
                continue
            if not blog:
-                blog = self.api.info(self.blog)
+                if self.is_timeline:
-                blog["uuid"] = self.blog
+                    blog = post.get("blog")
                    self.blog = blog.get("name") + ".tumblr.com"
                    for image in blog.get("avatar", []):
                        if int(image.get("width")) == 512:
                            avatar_url = image.get("url")
                            break
                else:
                    blog = self.api.info(self.blog)
                    blog["uuid"] = self.blog
                if self.avatar:
-                    url = self.api.avatar(self.blog)
+                    url = avatar_url or self.api.avatar(self.blog)
                    yield Message.Directory, {"blog": blog}
                    yield self._prepare_avatar(url, post.copy(), blog)
@@ -349,6 +364,38 @@ class TumblrLikesExtractor(TumblrExtractor):
        return self.api.likes(self.blog)
 class TumblrSearchExtractor(TumblrExtractor):
    """Extractor for a Tumblr search"""
    subcategory = "search"
    # Matches URLs such as
    #   https://www.tumblr.com/search/nathan%20fielder?src=suggested_tag
    # Groups 1-3 belong to BASE_PATTERN and are read by
    # TumblrExtractor.__init__; group 4 is the search term and the
    # optional group 5 is the URL's query string (including the '?').
    pattern = BASE_PATTERN + r'/search/(.*?)(\?.*)?$'
    example = "https://www.tumblr.com/search/QUERY"
    def __init__(self, match):
        TumblrExtractor.__init__(self, match)
        # When this flag is set, the post loop of the base class takes the
        # blog data from each post instead of calling api.info() once.
        self.is_timeline = True
        # The search term arrives percent-encoded in the URL path.
        self.query = text.unquote(match.group(4))
        # Keep the URL's query parameters; search() reads 't' from them.
        parsed_url = urlparse(self.url)
        self.params = text.parse_query(parsed_url.query)
    def search(self, query, params):
        """Return an iterator over the posts matching 'query'

        'params' is filled in place with the request parameters for the
        '/v2/timeline/search' endpoint and then handed to the API's
        paginator, so the caller's dict is modified.
        """
        params["limit"] = 50
        # 't' is taken from the search URL (e.g. '?t=90') and sent as
        # 'days'; 0 is used when it is missing or empty.
        params["days"] = self.params.get("t") or 0
        params["query"] = query
        params["mode"] = "top"
        params["reblog_info"] = "true" if self.reblogs else "false"
        endpoint = "/v2/timeline/search"
        return self.api._pagination(endpoint, params, cache=True)
    def posts(self):
        """Return the posts found for this extractor's search term"""
        return self.search(self.query, {})
 class TumblrAPI(oauth.OAuth1API):
    """Interface for the Tumblr API v2
@@ -394,7 +441,8 @@ class TumblrAPI(oauth.OAuth1API):
        if self.before and params["offset"]:
            self.log.warning("'offset' and 'date-max' cannot be used together")
-        return self._pagination(blog, "/posts", params, cache=True)
+        endpoint = "/v2/blog/{}/posts".format(blog)
        return self._pagination(endpoint, params, cache=True)
    def likes(self, blog):
        """Retrieve liked posts"""
@@ -478,20 +526,32 @@ class TumblrAPI(oauth.OAuth1API):
            raise exception.StopExtraction(data)
-    def _pagination(self, blog, endpoint, params, key="posts", cache=False):
+    def _pagination(self, full_endpoint, params, key="posts", cache=False):
-        endpoint = "/v2/blog/{}{}".format(blog, endpoint)
+        if not full_endpoint.endswith("?"):
            full_endpoint = full_endpoint + "?"
        endpoint = full_endpoint
        if self.api_key:
            params["api_key"] = self.api_key
        strategy = self.extractor.config("pagination")
        while True:
            data = self._call(endpoint, params)
            if "/timeline/" in endpoint:
                key = "elements"
                posts = data.get("timeline", {}).get(key, [])
            else:
                posts = data[key]
            if cache:
-                self.BLOG_CACHE[blog] = data["blog"]
+                for post in posts:
                    p_blog = post.get("blog", {})
                    self.BLOG_CACHE[p_blog.get("name", "")] = p_blog
                cache = False
            posts = data[key]
            yield from posts
            if strategy == "api":
--- a/test/results/tumblr.py
+++ b/test/results/tumblr.py
@@ -360,4 +360,15 @@ __tests__ = (
    "#class"   : tumblr.TumblrLikesExtractor,
 },
 {
    "#url"     : "https://www.tumblr.com/search/nathan fielder?src=typed_query",
    "#category": ("", "tumblr", "search"),
    "#class"   : tumblr.TumblrSearchExtractor,
 },
 {
    "#url"     : "https://www.tumblr.com/search/nathan%20fielder?t=90",
    "#category": ("", "tumblr", "search"),
    "#class"   : tumblr.TumblrSearchExtractor,
 },
 )