[tumblr] pre-compile regular expressions
This commit is contained in:
@@ -14,17 +14,6 @@ from datetime import datetime, timedelta
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
def _original_video(url):
|
|
||||||
return re.sub(
|
|
||||||
(r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
|
|
||||||
r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)"),
|
|
||||||
r"https://\1.\2", url
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
POST_TYPES = frozenset((
|
|
||||||
"text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
|
|
||||||
|
|
||||||
BASE_PATTERN = (
|
BASE_PATTERN = (
|
||||||
r"(?:tumblr:(?:https?://)?([^/]+)|"
|
r"(?:tumblr:(?:https?://)?([^/]+)|"
|
||||||
r"(?:https?://)?"
|
r"(?:https?://)?"
|
||||||
@@ -32,6 +21,9 @@ BASE_PATTERN = (
|
|||||||
r"([\w-]+\.tumblr\.com)))"
|
r"([\w-]+\.tumblr\.com)))"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
POST_TYPES = frozenset((
|
||||||
|
"text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
|
||||||
|
|
||||||
|
|
||||||
class TumblrExtractor(Extractor):
|
class TumblrExtractor(Extractor):
|
||||||
"""Base class for tumblr extractors"""
|
"""Base class for tumblr extractors"""
|
||||||
@@ -71,6 +63,18 @@ class TumblrExtractor(Extractor):
|
|||||||
def items(self):
|
def items(self):
|
||||||
blog = None
|
blog = None
|
||||||
|
|
||||||
|
# pre-compile regular expressions
|
||||||
|
self._sub_video = re.compile(
|
||||||
|
r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
|
||||||
|
r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub
|
||||||
|
if self.inline:
|
||||||
|
self._sub_image = re.compile(
|
||||||
|
r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
|
||||||
|
r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)").sub
|
||||||
|
self._subn_orig_image = re.compile(r"/s\d+x\d+/").subn
|
||||||
|
_findall_image = re.compile('<img src="([^"]+)"').findall
|
||||||
|
_findall_video = re.compile('<source src="([^"]+)"').findall
|
||||||
|
|
||||||
for post in self.posts():
|
for post in self.posts():
|
||||||
if self.date_min > post["timestamp"]:
|
if self.date_min > post["timestamp"]:
|
||||||
return
|
return
|
||||||
@@ -112,7 +116,7 @@ class TumblrExtractor(Extractor):
|
|||||||
|
|
||||||
if self.original and "/s2048x3072/" in photo["url"] and (
|
if self.original and "/s2048x3072/" in photo["url"] and (
|
||||||
photo["width"] == 2048 or photo["height"] == 3072):
|
photo["width"] == 2048 or photo["height"] == 3072):
|
||||||
photo["url"] = self._original_image(photo["url"])
|
photo["url"] = self._original_photo(photo["url"])
|
||||||
|
|
||||||
del photo["original_size"]
|
del photo["original_size"]
|
||||||
del photo["alt_sizes"]
|
del photo["alt_sizes"]
|
||||||
@@ -126,17 +130,18 @@ class TumblrExtractor(Extractor):
|
|||||||
|
|
||||||
url = post.get("video_url") # type "video"
|
url = post.get("video_url") # type "video"
|
||||||
if url:
|
if url:
|
||||||
posts.append(self._prepare(_original_video(url), post.copy()))
|
posts.append(self._prepare(
|
||||||
|
self._original_video(url), post.copy()))
|
||||||
|
|
||||||
if self.inline and "reblog" in post: # inline media
|
if self.inline and "reblog" in post: # inline media
|
||||||
# only "chat" posts are missing a "reblog" key in their
|
# only "chat" posts are missing a "reblog" key in their
|
||||||
# API response, but they can't contain images/videos anyway
|
# API response, but they can't contain images/videos anyway
|
||||||
body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
|
body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
|
||||||
for url in re.findall('<img src="([^"]+)"', body):
|
for url in _findall_image(body):
|
||||||
url = self._original_inline_image(url)
|
url = self._original_inline_image(url)
|
||||||
posts.append(self._prepare_image(url, post.copy()))
|
posts.append(self._prepare_image(url, post.copy()))
|
||||||
for url in re.findall('<source src="([^"]+)"', body):
|
for url in _findall_video(body):
|
||||||
url = _original_video(url)
|
url = self._original_video(url)
|
||||||
posts.append(self._prepare(url, post.copy()))
|
posts.append(self._prepare(url, post.copy()))
|
||||||
|
|
||||||
if self.external: # external links
|
if self.external: # external links
|
||||||
@@ -212,20 +217,19 @@ class TumblrExtractor(Extractor):
|
|||||||
def _skip_reblog_same_blog(self, post):
|
def _skip_reblog_same_blog(self, post):
|
||||||
return self.blog != post.get("reblogged_root_uuid")
|
return self.blog != post.get("reblogged_root_uuid")
|
||||||
|
|
||||||
def _original_image(self, url):
|
def _original_photo(self, url):
|
||||||
return self._update_image_token(
|
return self._update_image_token(
|
||||||
url.replace("/s2048x3072/", "/s99999x99999/", 1))
|
url.replace("/s2048x3072/", "/s99999x99999/", 1))
|
||||||
|
|
||||||
def _original_inline_image(self, url):
|
def _original_inline_image(self, url):
|
||||||
if self.original:
|
if self.original:
|
||||||
url, n = re.subn(r"/s\d+x\d+/", "/s99999x99999/", url, 1)
|
url, n = self._subn_orig_image("/s99999x99999/", url, 1)
|
||||||
if n:
|
if n:
|
||||||
return self._update_image_token(url)
|
return self._update_image_token(url)
|
||||||
return re.sub(
|
return self._sub_image(r"https://\1_1280.\2", url)
|
||||||
(r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
|
|
||||||
r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
|
def _original_video(self, url):
|
||||||
r"https://\1_1280.\2", url
|
return self._sub_video(r"https://\1.\2", url)
|
||||||
)
|
|
||||||
|
|
||||||
def _update_image_token(self, url):
|
def _update_image_token(self, url):
|
||||||
headers = {"Accept": "text/html,*/*;q=0.8"}
|
headers = {"Accept": "text/html,*/*;q=0.8"}
|
||||||
|
|||||||
Reference in New Issue
Block a user