diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index fa65862c..4a130e87 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -14,17 +14,6 @@ from datetime import datetime, timedelta
import re
-def _original_video(url):
- return re.sub(
- (r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
- r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)"),
- r"https://\1.\2", url
- )
-
-
-POST_TYPES = frozenset((
- "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
-
BASE_PATTERN = (
r"(?:tumblr:(?:https?://)?([^/]+)|"
r"(?:https?://)?"
@@ -32,6 +21,9 @@ BASE_PATTERN = (
r"([\w-]+\.tumblr\.com)))"
)
+POST_TYPES = frozenset((
+ "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
+
class TumblrExtractor(Extractor):
"""Base class for tumblr extractors"""
@@ -71,6 +63,18 @@ class TumblrExtractor(Extractor):
def items(self):
blog = None
+ # pre-compile regular expressions
+ self._sub_video = re.compile(
+ r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
+ r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub
+ if self.inline:
+ self._sub_image = re.compile(
+ r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
+ r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)").sub
+ self._subn_orig_image = re.compile(r"/s\d+x\d+/").subn
+ _findall_image = re.compile('
post["timestamp"]:
return
@@ -112,7 +116,7 @@ class TumblrExtractor(Extractor):
if self.original and "/s2048x3072/" in photo["url"] and (
photo["width"] == 2048 or photo["height"] == 3072):
- photo["url"] = self._original_image(photo["url"])
+ photo["url"] = self._original_photo(photo["url"])
del photo["original_size"]
del photo["alt_sizes"]
@@ -126,17 +130,18 @@ class TumblrExtractor(Extractor):
url = post.get("video_url") # type "video"
if url:
- posts.append(self._prepare(_original_video(url), post.copy()))
+ posts.append(self._prepare(
+ self._original_video(url), post.copy()))
if self.inline and "reblog" in post: # inline media
# only "chat" posts are missing a "reblog" key in their
# API response, but they can't contain images/videos anyway
body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
- for url in re.findall('