@@ -13,6 +13,7 @@ from .. import text, util, exception
|
|||||||
from ..cache import cache
|
from ..cache import cache
|
||||||
import itertools
|
import itertools
|
||||||
import json
|
import json
|
||||||
|
import re
|
||||||
|
|
||||||
BASE_PATTERN = r"(?:https?://)?(?:www\.|mobile\.)?(?:[fv]x)?twitter\.com"
|
BASE_PATTERN = r"(?:https?://)?(?:www\.|mobile\.)?(?:[fv]x)?twitter\.com"
|
||||||
|
|
||||||
@@ -75,6 +76,10 @@ class TwitterExtractor(Extractor):
|
|||||||
else:
|
else:
|
||||||
seen_tweets = None
|
seen_tweets = None
|
||||||
|
|
||||||
|
if self.twitpic:
|
||||||
|
self._find_twitpic = re.compile(
|
||||||
|
r"https?://(twitpic\.com/\w+)").finditer
|
||||||
|
|
||||||
for tweet in self.tweets():
|
for tweet in self.tweets():
|
||||||
|
|
||||||
if "legacy" in tweet:
|
if "legacy" in tweet:
|
||||||
@@ -231,12 +236,27 @@ class TwitterExtractor(Extractor):
|
|||||||
files.append({"url": url})
|
files.append({"url": url})
|
||||||
|
|
||||||
def _extract_twitpic(self, tweet, files):
|
def _extract_twitpic(self, tweet, files):
|
||||||
for url in tweet["entities"].get("urls", ()):
|
# collect urls
|
||||||
|
urls = []
|
||||||
|
for url in tweet["entities"].get("urls") or ():
|
||||||
url = url["expanded_url"]
|
url = url["expanded_url"]
|
||||||
if "//twitpic.com/" not in url or "/photos/" in url:
|
if "//twitpic.com/" not in url or "/photos/" in url:
|
||||||
continue
|
continue
|
||||||
if url.startswith("http:"):
|
if url.startswith("http:"):
|
||||||
url = "https" + url[4:]
|
url = "https" + url[4:]
|
||||||
|
urls.append(url)
|
||||||
|
tget = tweet.get
|
||||||
|
for match in self._find_twitpic(
|
||||||
|
tget("full_text") or tget("text") or ""):
|
||||||
|
urls.append(text.ensure_http_scheme(match.group(1)))
|
||||||
|
|
||||||
|
# extract actual urls
|
||||||
|
seen = set()
|
||||||
|
for url in urls:
|
||||||
|
if url in seen:
|
||||||
|
self.log.debug("Skipping %s (previously seen)", url)
|
||||||
|
continue
|
||||||
|
seen.add(url)
|
||||||
response = self.request(url, fatal=False)
|
response = self.request(url, fatal=False)
|
||||||
if response.status_code >= 400:
|
if response.status_code >= 400:
|
||||||
continue
|
continue
|
||||||
@@ -781,7 +801,13 @@ class TwitterTweetExtractor(TwitterExtractor):
|
|||||||
("https://twitter.com/i/web/status/112900228289540096", {
|
("https://twitter.com/i/web/status/112900228289540096", {
|
||||||
"options": (("twitpic", True), ("cards", False)),
|
"options": (("twitpic", True), ("cards", False)),
|
||||||
"pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
|
"pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg",
|
||||||
"count": 3,
|
"count": 2, # 1 duplicate
|
||||||
|
}),
|
||||||
|
# TwitPic URL not in 'urls' (#3792)
|
||||||
|
("https://twitter.com/shimoigusaP/status/8138669971", {
|
||||||
|
"options": (("twitpic", True),),
|
||||||
|
"pattern": r"https://\w+.cloudfront.net/photos/large/\d+.png",
|
||||||
|
"count": 1,
|
||||||
}),
|
}),
|
||||||
# Twitter card (#1005)
|
# Twitter card (#1005)
|
||||||
("https://twitter.com/billboard/status/1306599586602135555", {
|
("https://twitter.com/billboard/status/1306599586602135555", {
|
||||||
|
|||||||
Reference in New Issue
Block a user