diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 5e68f138..4838844c 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -13,6 +13,7 @@ from .. import text, util, exception from ..cache import cache import itertools import json +import re BASE_PATTERN = r"(?:https?://)?(?:www\.|mobile\.)?(?:[fv]x)?twitter\.com" @@ -75,6 +76,10 @@ class TwitterExtractor(Extractor): else: seen_tweets = None + if self.twitpic: + self._find_twitpic = re.compile( + r"https?://(twitpic\.com/\w+)").finditer + for tweet in self.tweets(): if "legacy" in tweet: @@ -231,12 +236,27 @@ class TwitterExtractor(Extractor): files.append({"url": url}) def _extract_twitpic(self, tweet, files): - for url in tweet["entities"].get("urls", ()): + # collect urls + urls = [] + for url in tweet["entities"].get("urls") or (): url = url["expanded_url"] if "//twitpic.com/" not in url or "/photos/" in url: continue if url.startswith("http:"): url = "https" + url[4:] + urls.append(url) + tget = tweet.get + for match in self._find_twitpic( + tget("full_text") or tget("text") or ""): + urls.append(text.ensure_http_scheme(match.group(1))) + + # extract actual urls + seen = set() + for url in urls: + if url in seen: + self.log.debug("Skipping %s (previously seen)", url) + continue + seen.add(url) response = self.request(url, fatal=False) if response.status_code >= 400: continue @@ -781,7 +801,13 @@ class TwitterTweetExtractor(TwitterExtractor): ("https://twitter.com/i/web/status/112900228289540096", { "options": (("twitpic", True), ("cards", False)), "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg", - "count": 3, + "count": 2, # 1 duplicate + }), + # TwitPic URL not in 'urls' (#3792) + ("https://twitter.com/shimoigusaP/status/8138669971", { + "options": (("twitpic", True),), + "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.png", + "count": 1, }), # Twitter card (#1005) ("https://twitter.com/billboard/status/1306599586602135555", {