[twitter] extract 'source_id' and 'source_user' metadata (#7470 #7640)

This commit is contained in:
Mike Fährmann
2025-06-12 18:54:34 +02:00
parent 06e2f2cd91
commit e2d104a110
2 changed files with 42 additions and 1 deletions

View File

@@ -123,6 +123,11 @@ class TwitterExtractor(Extractor):
tdata.update(metadata)
tdata["count"] = len(files)
yield Message.Directory, tdata
del tdata["source_id"]
if "source_user" in tdata:
del tdata["source_user"]
for tdata["num"], file in enumerate(files, 1):
file.update(tdata)
url = file.pop("url")
@@ -212,8 +217,22 @@ class TwitterExtractor(Extractor):
file["width"] = media["original_info"].get("width", 0)
file["height"] = media["original_info"].get("height", 0)
file["description"] = media.get("ext_alt_text")
self._extract_media_source(file, media)
files.append(file)
def _extract_media_source(self, dest, media):
dest["source_id"] = 0
if "source_status_id_str" in media:
try:
dest["source_id"] = text.parse_int(
media["source_status_id_str"])
dest["source_user"] = self._transform_user(
media["additional_media_info"]["source_user"]
["user_results"]["result"])
except Exception:
pass
def _image_fallback(self, base):
for fmt in self._size_fallback:
yield base + fmt
@@ -338,6 +357,7 @@ class TwitterExtractor(Extractor):
tget("in_reply_to_status_id_str")),
"conversation_id": text.parse_int(
tget("conversation_id_str")),
"source_id" : 0,
"date" : date,
"author" : author,
"user" : self._user or author,
@@ -401,6 +421,9 @@ class TwitterExtractor(Extractor):
tdata["reply_to"] = legacy["in_reply_to_screen_name"]
if "quoted_by" in legacy:
tdata["quote_by"] = legacy["quoted_by"]
if "extended_entities" in legacy:
self._extract_media_source(
tdata, legacy["extended_entities"]["media"][0])
if tdata["retweet_id"]:
tdata["content"] = "RT @{}: {}".format(
author["name"], tdata["content"])