[twitter] extract 'source_id' and 'source_user' metadata (#7470 #7640)

2025-06-12 18:54:34 +02:00
parent 06e2f2cd91
commit e2d104a110
2 changed files with 42 additions and 1 deletions
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -123,6 +123,11 @@ class TwitterExtractor(Extractor):
            tdata.update(metadata)
            tdata["count"] = len(files)
            yield Message.Directory, tdata
+
+            del tdata["source_id"]
+            if "source_user" in tdata:
+                del tdata["source_user"]
+
            for tdata["num"], file in enumerate(files, 1):
                file.update(tdata)
                url = file.pop("url")
@@ -212,8 +217,22 @@ class TwitterExtractor(Extractor):
            file["width"] = media["original_info"].get("width", 0)
            file["height"] = media["original_info"].get("height", 0)
            file["description"] = media.get("ext_alt_text")
+            self._extract_media_source(file, media)
            files.append(file)

+    def _extract_media_source(self, dest, media):
+        dest["source_id"] = 0
+
+        if "source_status_id_str" in media:
+            try:
+                dest["source_id"] = text.parse_int(
+                    media["source_status_id_str"])
+                dest["source_user"] = self._transform_user(
+                    media["additional_media_info"]["source_user"]
+                    ["user_results"]["result"])
+            except Exception:
+                pass
+
    def _image_fallback(self, base):
        for fmt in self._size_fallback:
            yield base + fmt
@@ -338,6 +357,7 @@ class TwitterExtractor(Extractor):
                tget("in_reply_to_status_id_str")),
            "conversation_id": text.parse_int(
                tget("conversation_id_str")),
+            "source_id"     : 0,
            "date"          : date,
            "author"        : author,
            "user"          : self._user or author,
@@ -401,6 +421,9 @@ class TwitterExtractor(Extractor):
            tdata["reply_to"] = legacy["in_reply_to_screen_name"]
        if "quoted_by" in legacy:
            tdata["quote_by"] = legacy["quoted_by"]
+        if "extended_entities" in legacy:
+            self._extract_media_source(
+                tdata, legacy["extended_entities"]["media"][0])
        if tdata["retweet_id"]:
            tdata["content"] = "RT @{}: {}".format(
                author["name"], tdata["content"])