[tumblr] add search extractor

This commit is contained in:
Allen
2024-09-03 08:18:58 +02:00
parent 57da9ebfb5
commit d2ef9a590f
2 changed files with 80 additions and 9 deletions

View File

@@ -11,6 +11,7 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util, oauth, exception from .. import text, util, oauth, exception
from datetime import datetime, date, timedelta from datetime import datetime, date, timedelta
from urllib.parse import urlparse
import re import re
@@ -22,7 +23,7 @@ BASE_PATTERN = (
) )
POST_TYPES = frozenset(( POST_TYPES = frozenset((
"text", "quote", "link", "answer", "video", "audio", "photo", "chat")) "text", "quote", "link", "answer", "video", "audio", "photo", "chat", "search"))
class TumblrExtractor(Extractor): class TumblrExtractor(Extractor):
@@ -37,11 +38,14 @@ class TumblrExtractor(Extractor):
Extractor.__init__(self, match) Extractor.__init__(self, match)
name = match.group(2) name = match.group(2)
if name: if name:
self.blog = name + ".tumblr.com" self.blog = name + ".tumblr.com"
else: else:
self.blog = match.group(1) or match.group(3) self.blog = match.group(1) or match.group(3)
self.is_timeline = False
def _init(self): def _init(self):
self.api = TumblrAPI(self) self.api = TumblrAPI(self)
self.types = self._setup_posttypes() self.types = self._setup_posttypes()
@@ -83,12 +87,23 @@ class TumblrExtractor(Extractor):
return return
if post["type"] not in self.types: if post["type"] not in self.types:
continue continue
if not blog: if not blog:
blog = self.api.info(self.blog) if self.is_timeline:
blog["uuid"] = self.blog blog = post.get("blog")
self.blog = blog.get("name") + ".tumblr.com"
for image in blog.get("avatar", []):
if int(image.get("width")) == 512:
avatar_url = image.get("url")
break
else:
blog = self.api.info(self.blog)
blog["uuid"] = self.blog
if self.avatar: if self.avatar:
url = self.api.avatar(self.blog) url = avatar_url or self.api.avatar(self.blog)
yield Message.Directory, {"blog": blog} yield Message.Directory, {"blog": blog}
yield self._prepare_avatar(url, post.copy(), blog) yield self._prepare_avatar(url, post.copy(), blog)
@@ -349,6 +364,38 @@ class TumblrLikesExtractor(TumblrExtractor):
return self.api.likes(self.blog) return self.api.likes(self.blog)
class TumblrSearchExtractor(TumblrExtractor):
    """Extractor for the posts returned by a Tumblr search query"""
    subcategory = "search"
    # e.g. https://www.tumblr.com/search/nathan%20fielder?src=suggested_tag
    pattern = BASE_PATTERN + r'/search/(.*?)(\?.*)?$'
    example = "https://www.tumblr.com/search/QUERY"

    def __init__(self, match):
        """Store the decoded search term and the URL's query parameters"""
        TumblrExtractor.__init__(self, match)

        # Results come from many blogs; with this flag set, the base
        # extractor reads blog metadata from the post itself instead of
        # requesting it through 'api.info()'.
        self.is_timeline = True

        # Groups 1-3 are read by TumblrExtractor.__init__; group 4 is the
        # first group of this class's own pattern, i.e. the search term
        # (presumably BASE_PATTERN defines exactly three groups -- verify).
        self.query = text.unquote(match.group(4))

        # Keep the URL's query-string options (e.g. 't') for 'search()'.
        parsed_url = urlparse(self.url)
        self.params = text.parse_query(parsed_url.query)

    def search(self, query, params):
        """Retrieve posts matching 'query' from the timeline search endpoint

        'params' provides extra request parameters. It is copied before
        the search-specific keys are set, so the caller's mapping is not
        modified. The value of the URL's 't' parameter, if any, is sent
        as 'days' (0 otherwise).
        """
        # Work on a private copy rather than mutating the argument.
        params = dict(params)
        params["limit"] = 50
        params["days"] = self.params.get("t") or 0
        params["query"] = query
        params["mode"] = "top"
        params["reblog_info"] = "true" if self.reblogs else "false"

        endpoint = "/v2/timeline/search"
        return self.api._pagination(endpoint, params, cache=True)

    def posts(self):
        """Return the posts found for this extractor's search query"""
        return self.search(self.query, {})
class TumblrAPI(oauth.OAuth1API): class TumblrAPI(oauth.OAuth1API):
"""Interface for the Tumblr API v2 """Interface for the Tumblr API v2
@@ -394,7 +441,8 @@ class TumblrAPI(oauth.OAuth1API):
if self.before and params["offset"]: if self.before and params["offset"]:
self.log.warning("'offset' and 'date-max' cannot be used together") self.log.warning("'offset' and 'date-max' cannot be used together")
return self._pagination(blog, "/posts", params, cache=True) endpoint = "/v2/blog/{}/posts".format(blog)
return self._pagination(endpoint, params, cache=True)
def likes(self, blog): def likes(self, blog):
"""Retrieve liked posts""" """Retrieve liked posts"""
@@ -478,20 +526,32 @@ class TumblrAPI(oauth.OAuth1API):
raise exception.StopExtraction(data) raise exception.StopExtraction(data)
def _pagination(self, blog, endpoint, params, key="posts", cache=False): def _pagination(self, full_endpoint, params, key="posts", cache=False):
endpoint = "/v2/blog/{}{}".format(blog, endpoint) if not full_endpoint.endswith("?"):
full_endpoint = full_endpoint + "?"
endpoint = full_endpoint
if self.api_key: if self.api_key:
params["api_key"] = self.api_key params["api_key"] = self.api_key
strategy = self.extractor.config("pagination") strategy = self.extractor.config("pagination")
while True: while True:
data = self._call(endpoint, params) data = self._call(endpoint, params)
if "/timeline/" in endpoint:
key = "elements"
posts = data.get("timeline", {}).get(key, [])
else:
posts = data[key]
if cache: if cache:
self.BLOG_CACHE[blog] = data["blog"] for post in posts:
p_blog = post.get("blog", {})
self.BLOG_CACHE[p_blog.get("name", "")] = p_blog
cache = False cache = False
posts = data[key]
yield from posts yield from posts
if strategy == "api": if strategy == "api":

View File

@@ -360,4 +360,15 @@ __tests__ = (
"#class" : tumblr.TumblrLikesExtractor, "#class" : tumblr.TumblrLikesExtractor,
}, },
{
"#url" : "https://www.tumblr.com/search/nathan fielder?src=typed_query",
"#category": ("", "tumblr", "search"),
"#class" : tumblr.TumblrSearchExtractor,
},
{
"#url" : "https://www.tumblr.com/search/nathan%20fielder?t=90",
"#category": ("", "tumblr", "search"),
"#class" : tumblr.TumblrSearchExtractor,
},
) )