diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index fa65862c..4a130e87 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -14,17 +14,6 @@ from datetime import datetime, timedelta import re -def _original_video(url): - return re.sub( - (r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com" - r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)"), - r"https://\1.\2", url - ) - - -POST_TYPES = frozenset(( - "text", "quote", "link", "answer", "video", "audio", "photo", "chat")) - BASE_PATTERN = ( r"(?:tumblr:(?:https?://)?([^/]+)|" r"(?:https?://)?" @@ -32,6 +21,9 @@ BASE_PATTERN = ( r"([\w-]+\.tumblr\.com)))" ) +POST_TYPES = frozenset(( + "text", "quote", "link", "answer", "video", "audio", "photo", "chat")) + class TumblrExtractor(Extractor): """Base class for tumblr extractors""" @@ -71,6 +63,18 @@ class TumblrExtractor(Extractor): def items(self): blog = None + # pre-compile regular expressions + self._sub_video = re.compile( + r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com" + r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub + if self.inline: + self._sub_image = re.compile( + r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?" + r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)").sub + self._subn_orig_image = re.compile(r"/s\d+x\d+/").subn + _findall_image = re.compile(' post["timestamp"]: return @@ -112,7 +116,7 @@ class TumblrExtractor(Extractor): if self.original and "/s2048x3072/" in photo["url"] and ( photo["width"] == 2048 or photo["height"] == 3072): - photo["url"] = self._original_image(photo["url"]) + photo["url"] = self._original_photo(photo["url"]) del photo["original_size"] del photo["alt_sizes"] @@ -126,17 +130,18 @@ class TumblrExtractor(Extractor): url = post.get("video_url") # type "video" if url: - posts.append(self._prepare(_original_video(url), post.copy())) + posts.append(self._prepare( + self._original_video(url), post.copy())) if self.inline and "reblog" in post: # inline media # only "chat" posts are missing a "reblog" key in their # API response, but they can't contain images/videos anyway body = post["reblog"]["comment"] + post["reblog"]["tree_html"] - for url in re.findall('