[tumblr] warn when unable to fetch higher-resolution images (#2957)
and download the smaller version instead of failing with a 404 error
This commit is contained in:
@@ -116,8 +116,10 @@ class TumblrExtractor(Extractor):
|
|||||||
|
|
||||||
if self.original and "/s2048x3072/" in photo["url"] and (
|
if self.original and "/s2048x3072/" in photo["url"] and (
|
||||||
photo["width"] == 2048 or photo["height"] == 3072):
|
photo["width"] == 2048 or photo["height"] == 3072):
|
||||||
photo["url"] = self._original_photo(photo["url"])
|
try:
|
||||||
|
photo["url"] = self._original_photo(photo["url"])
|
||||||
|
except Exception:
|
||||||
|
self._warn_original(photo["url"], post)
|
||||||
del photo["original_size"]
|
del photo["original_size"]
|
||||||
del photo["alt_sizes"]
|
del photo["alt_sizes"]
|
||||||
posts.append(
|
posts.append(
|
||||||
@@ -138,7 +140,10 @@ class TumblrExtractor(Extractor):
|
|||||||
# API response, but they can't contain images/videos anyway
|
# API response, but they can't contain images/videos anyway
|
||||||
body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
|
body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
|
||||||
for url in _findall_image(body):
|
for url in _findall_image(body):
|
||||||
url = self._original_inline_image(url)
|
try:
|
||||||
|
url = self._original_inline_image(url)
|
||||||
|
except Exception:
|
||||||
|
self._warn_original(url, post)
|
||||||
posts.append(self._prepare_image(url, post.copy()))
|
posts.append(self._prepare_image(url, post.copy()))
|
||||||
for url in _findall_video(body):
|
for url in _findall_video(body):
|
||||||
url = self._original_video(url)
|
url = self._original_video(url)
|
||||||
@@ -218,23 +223,33 @@ class TumblrExtractor(Extractor):
|
|||||||
return self.blog != post.get("reblogged_root_uuid")
|
return self.blog != post.get("reblogged_root_uuid")
|
||||||
|
|
||||||
def _original_photo(self, url):
|
def _original_photo(self, url):
|
||||||
return self._update_image_token(
|
resized = url.replace("/s2048x3072/", "/s99999x99999/", 1)
|
||||||
url.replace("/s2048x3072/", "/s99999x99999/", 1))
|
return self._update_image_token(resized)
|
||||||
|
|
||||||
def _original_inline_image(self, url):
|
def _original_inline_image(self, url):
|
||||||
if self.original:
|
if self.original:
|
||||||
url, n = self._subn_orig_image("/s99999x99999/", url, 1)
|
resized, n = self._subn_orig_image("/s99999x99999/", url, 1)
|
||||||
if n:
|
if n:
|
||||||
return self._update_image_token(url)
|
return self._update_image_token(resized)
|
||||||
return self._sub_image(r"https://\1_1280.\2", url)
|
return self._sub_image(r"https://\1_1280.\2", url)
|
||||||
|
|
||||||
def _original_video(self, url):
|
def _original_video(self, url):
|
||||||
return self._sub_video(r"https://\1.\2", url)
|
return self._sub_video(r"https://\1.\2", url)
|
||||||
|
|
||||||
def _update_image_token(self, url):
|
def _update_image_token(self, resized):
|
||||||
headers = {"Accept": "text/html,*/*;q=0.8"}
|
headers = {"Accept": "text/html,*/*;q=0.8"}
|
||||||
response = self.request(url, headers=headers)
|
|
||||||
return text.extract(response.text, '" src="', '"')[0]
|
for _ in range(3):
|
||||||
|
response = self.request(resized, headers=headers)
|
||||||
|
updated = text.extract(response.text, '" src="', '"')[0]
|
||||||
|
if updated != resized:
|
||||||
|
return updated
|
||||||
|
|
||||||
|
raise RuntimeError("invalid token")
|
||||||
|
|
||||||
|
def _warn_original(self, url, post):
|
||||||
|
self.log.warning("Unable to fetch higher-resolution "
|
||||||
|
"version of %s (%s)", url, post["id"])
|
||||||
|
|
||||||
|
|
||||||
class TumblrUserExtractor(TumblrExtractor):
|
class TumblrUserExtractor(TumblrExtractor):
|
||||||
|
|||||||
Reference in New Issue
Block a user