merge #3796: [twitter] extract TwitPic URLs in text (#3792)

This commit is contained in:
Mike Fährmann
2023-05-25 14:59:07 +02:00

View File

@@ -13,6 +13,7 @@ from .. import text, util, exception
from ..cache import cache
import itertools
import json
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.|mobile\.)?(?:[fv]x)?twitter\.com"
@@ -75,6 +76,10 @@ class TwitterExtractor(Extractor):
else:
seen_tweets = None
if self.twitpic:
self._find_twitpic = re.compile(
r"https?://(twitpic\.com/\w+)").finditer
for tweet in self.tweets():
if "legacy" in tweet:
@@ -231,12 +236,27 @@ class TwitterExtractor(Extractor):
files.append({"url": url})
def _extract_twitpic(self, tweet, files):
for url in tweet["entities"].get("urls", ()):
# collect urls
urls = []
for url in tweet["entities"].get("urls") or ():
url = url["expanded_url"]
if "//twitpic.com/" not in url or "/photos/" in url:
continue
if url.startswith("http:"):
url = "https" + url[4:]
urls.append(url)
tget = tweet.get
for match in self._find_twitpic(
tget("full_text") or tget("text") or ""):
urls.append(text.ensure_http_scheme(match.group(1)))
# extract actual urls
seen = set()
for url in urls:
if url in seen:
self.log.debug("Skipping %s (previously seen)", url)
continue
seen.add(url)
response = self.request(url, fatal=False)
if response.status_code >= 400:
continue
@@ -781,7 +801,13 @@ class TwitterTweetExtractor(TwitterExtractor):
("https://twitter.com/i/web/status/112900228289540096", {
"options": (("twitpic", True), ("cards", False)),
"pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
"count": 3,
"count": 2, # 1 duplicate
}),
# TwitPic URL only in the tweet text, not in entities 'urls' (#3792)
("https://twitter.com/shimoigusaP/status/8138669971", {
"options": (("twitpic", True),),
"pattern": r"https://\w+.cloudfront.net/photos/large/\d+.png",
"count": 1,
}),
# Twitter card (#1005)
("https://twitter.com/billboard/status/1306599586602135555", {