[twitter] improve twitpic extraction (fixes #1019)

- ignore twitpic.com/photos/… URLs
- ignore empty image URLs
This commit is contained in:
Mike Fährmann
2020-09-21 22:21:16 +02:00
parent 2184ec5d78
commit aeb0d32333
3 changed files with 9 additions and 6 deletions

View File

@@ -110,16 +110,17 @@ class TwitterExtractor(Extractor):
 twitpics = []
 for url in tweet["entities"].get("urls", ()):
     url = url["expanded_url"]
-    if "//twitpic.com/" in url:
+    if "//twitpic.com/" in url and "/photos/" not in url:
         response = self.request(url, fatal=False)
         if response.status_code >= 400:
             continue
         url = text.extract(
             response.text, 'name="twitter:image" value="', '"')[0]
-        twitpics.append({
-            "original_info": {},
-            "media_url" : url,
-        })
+        if url:
+            twitpics.append({
+                "original_info": {},
+                "media_url" : url,
+            })
 if twitpics:
     if "extended_entities" in tweet:
         tweet["extended_entities"]["media"].extend(twitpics)