[tumblr] add search extractor
This commit is contained in:
@@ -11,6 +11,7 @@
|
|||||||
from .common import Extractor, Message
|
from .common import Extractor, Message
|
||||||
from .. import text, util, oauth, exception
|
from .. import text, util, oauth, exception
|
||||||
from datetime import datetime, date, timedelta
|
from datetime import datetime, date, timedelta
|
||||||
|
from urllib.parse import urlparse
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
@@ -22,7 +23,7 @@ BASE_PATTERN = (
|
|||||||
)
|
)
|
||||||
|
|
||||||
POST_TYPES = frozenset((
|
POST_TYPES = frozenset((
|
||||||
"text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
|
"text", "quote", "link", "answer", "video", "audio", "photo", "chat", "search"))
|
||||||
|
|
||||||
|
|
||||||
class TumblrExtractor(Extractor):
|
class TumblrExtractor(Extractor):
|
||||||
@@ -37,11 +38,14 @@ class TumblrExtractor(Extractor):
|
|||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
|
|
||||||
name = match.group(2)
|
name = match.group(2)
|
||||||
|
|
||||||
if name:
|
if name:
|
||||||
self.blog = name + ".tumblr.com"
|
self.blog = name + ".tumblr.com"
|
||||||
else:
|
else:
|
||||||
self.blog = match.group(1) or match.group(3)
|
self.blog = match.group(1) or match.group(3)
|
||||||
|
|
||||||
|
self.is_timeline = False
|
||||||
|
|
||||||
def _init(self):
|
def _init(self):
|
||||||
self.api = TumblrAPI(self)
|
self.api = TumblrAPI(self)
|
||||||
self.types = self._setup_posttypes()
|
self.types = self._setup_posttypes()
|
||||||
@@ -83,12 +87,23 @@ class TumblrExtractor(Extractor):
|
|||||||
return
|
return
|
||||||
if post["type"] not in self.types:
|
if post["type"] not in self.types:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not blog:
|
if not blog:
|
||||||
blog = self.api.info(self.blog)
|
if self.is_timeline:
|
||||||
blog["uuid"] = self.blog
|
blog = post.get("blog")
|
||||||
|
self.blog = blog.get("name") + ".tumblr.com"
|
||||||
|
|
||||||
|
for image in blog.get("avatar", []):
|
||||||
|
if int(image.get("width")) == 512:
|
||||||
|
avatar_url = image.get("url")
|
||||||
|
break
|
||||||
|
|
||||||
|
else:
|
||||||
|
blog = self.api.info(self.blog)
|
||||||
|
blog["uuid"] = self.blog
|
||||||
|
|
||||||
if self.avatar:
|
if self.avatar:
|
||||||
url = self.api.avatar(self.blog)
|
url = avatar_url or self.api.avatar(self.blog)
|
||||||
yield Message.Directory, {"blog": blog}
|
yield Message.Directory, {"blog": blog}
|
||||||
yield self._prepare_avatar(url, post.copy(), blog)
|
yield self._prepare_avatar(url, post.copy(), blog)
|
||||||
|
|
||||||
@@ -349,6 +364,38 @@ class TumblrLikesExtractor(TumblrExtractor):
|
|||||||
return self.api.likes(self.blog)
|
return self.api.likes(self.blog)
|
||||||
|
|
||||||
|
|
||||||
|
class TumblrSearchExtractor(TumblrExtractor):
|
||||||
|
"""Extractor for a Tumblr search"""
|
||||||
|
subcategory = "search"
|
||||||
|
""" https://www.tumblr.com/search/nathan%20fielder?src=suggested_tag """
|
||||||
|
pattern = BASE_PATTERN + r'/search/(.*?)(\?.*)?$'
|
||||||
|
example = "https://www.tumblr.com/search/QUERY"
|
||||||
|
|
||||||
|
def __init__(self, match):
|
||||||
|
TumblrExtractor.__init__(self, match)
|
||||||
|
|
||||||
|
self.is_timeline = True
|
||||||
|
self.query = text.unquote(match.group(4))
|
||||||
|
|
||||||
|
parsed_url = urlparse(self.url)
|
||||||
|
self.params = text.parse_query(parsed_url.query)
|
||||||
|
|
||||||
|
def search(self, query, params):
|
||||||
|
"""Retrieve published posts"""
|
||||||
|
|
||||||
|
params["limit"] = 50
|
||||||
|
params["days"] = self.params.get("t") or 0
|
||||||
|
params["query"] = query
|
||||||
|
params["mode"] = "top"
|
||||||
|
params["reblog_info"] = "true" if self.reblogs else "false"
|
||||||
|
|
||||||
|
endpoint = "/v2/timeline/search"
|
||||||
|
return self.api._pagination(endpoint, params, cache=True)
|
||||||
|
|
||||||
|
def posts(self):
|
||||||
|
return self.search(self.query, {})
|
||||||
|
|
||||||
|
|
||||||
class TumblrAPI(oauth.OAuth1API):
|
class TumblrAPI(oauth.OAuth1API):
|
||||||
"""Interface for the Tumblr API v2
|
"""Interface for the Tumblr API v2
|
||||||
|
|
||||||
@@ -394,7 +441,8 @@ class TumblrAPI(oauth.OAuth1API):
|
|||||||
if self.before and params["offset"]:
|
if self.before and params["offset"]:
|
||||||
self.log.warning("'offset' and 'date-max' cannot be used together")
|
self.log.warning("'offset' and 'date-max' cannot be used together")
|
||||||
|
|
||||||
return self._pagination(blog, "/posts", params, cache=True)
|
endpoint = "/v2/blog/{}/posts".format(blog)
|
||||||
|
return self._pagination(endpoint, params, cache=True)
|
||||||
|
|
||||||
def likes(self, blog):
|
def likes(self, blog):
|
||||||
"""Retrieve liked posts"""
|
"""Retrieve liked posts"""
|
||||||
@@ -478,20 +526,32 @@ class TumblrAPI(oauth.OAuth1API):
|
|||||||
|
|
||||||
raise exception.StopExtraction(data)
|
raise exception.StopExtraction(data)
|
||||||
|
|
||||||
def _pagination(self, blog, endpoint, params, key="posts", cache=False):
|
def _pagination(self, full_endpoint, params, key="posts", cache=False):
|
||||||
endpoint = "/v2/blog/{}{}".format(blog, endpoint)
|
if not full_endpoint.endswith("?"):
|
||||||
|
full_endpoint = full_endpoint + "?"
|
||||||
|
|
||||||
|
endpoint = full_endpoint
|
||||||
|
|
||||||
if self.api_key:
|
if self.api_key:
|
||||||
params["api_key"] = self.api_key
|
params["api_key"] = self.api_key
|
||||||
|
|
||||||
strategy = self.extractor.config("pagination")
|
strategy = self.extractor.config("pagination")
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
data = self._call(endpoint, params)
|
data = self._call(endpoint, params)
|
||||||
|
|
||||||
|
if "/timeline/" in endpoint:
|
||||||
|
key = "elements"
|
||||||
|
posts = data.get("timeline", {}).get(key, [])
|
||||||
|
else:
|
||||||
|
posts = data[key]
|
||||||
|
|
||||||
if cache:
|
if cache:
|
||||||
self.BLOG_CACHE[blog] = data["blog"]
|
for post in posts:
|
||||||
|
p_blog = post.get("blog", {})
|
||||||
|
self.BLOG_CACHE[p_blog.get("name", "")] = p_blog
|
||||||
cache = False
|
cache = False
|
||||||
|
|
||||||
posts = data[key]
|
|
||||||
yield from posts
|
yield from posts
|
||||||
|
|
||||||
if strategy == "api":
|
if strategy == "api":
|
||||||
|
|||||||
@@ -360,4 +360,15 @@ __tests__ = (
|
|||||||
"#class" : tumblr.TumblrLikesExtractor,
|
"#class" : tumblr.TumblrLikesExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://www.tumblr.com/search/nathan fielder?src=typed_query",
|
||||||
|
"#category": ("", "tumblr", "search"),
|
||||||
|
"#class" : tumblr.TumblrSearchExtractor,
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://www.tumblr.com/search/nathan%20fielder?t=90",
|
||||||
|
"#category": ("", "tumblr", "search"),
|
||||||
|
"#class" : tumblr.TumblrSearchExtractor,
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user