[twitter] extract 'sensitive_flags' metadata (#2523)

Adds a list of 'sensitive_media_warning' flags to each file
and a sorted combination of all file flags to each Tweet.
This commit is contained in:
Mike Fährmann
2025-07-08 21:42:39 +02:00
parent b6bd675a9e
commit cfafbc0675

View File

@@ -125,6 +125,7 @@ class TwitterExtractor(Extractor):
yield Message.Directory, tdata
del tdata["source_id"]
del tdata["sensitive_flags"]
if "source_user" in tdata:
del tdata["source_user"]
@@ -169,8 +170,29 @@ class TwitterExtractor(Extractor):
return files
def _extract_media(self, tweet, entities, files):
flags_tweet = None
for media in entities:
if "sensitive_media_warning" in media:
flags_media = media["sensitive_media_warning"]
flags = []
if "adult_content" in flags_media:
flags.append("Nudity")
if "other" in flags_media:
flags.append("Sensitive")
if "graphic_violence" in flags_media:
flags.append("Violence")
if flags_tweet is None:
flags_tweet = set(flags)
else:
flags_tweet.update(flags)
flags_media = flags
else:
flags_media = ()
if "ext_media_availability" in media:
ext = media["ext_media_availability"]
if ext.get("status") == "Unavailable":
@@ -214,9 +236,13 @@ class TwitterExtractor(Extractor):
file["width"] = media["original_info"].get("width", 0)
file["height"] = media["original_info"].get("height", 0)
file["description"] = media.get("ext_alt_text")
file["sensitive_flags"] = flags_media
self._extract_media_source(file, media)
files.append(file)
tweet["sensitive_flags"] = \
() if flags_tweet is None else sorted(flags_tweet)
def _extract_media_source(self, dest, media):
dest["source_id"] = 0
@@ -361,6 +387,7 @@ class TwitterExtractor(Extractor):
"lang" : legacy["lang"],
"source" : text.extr(source, ">", "<") if source else "",
"sensitive" : tget("possibly_sensitive"),
"sensitive_flags": tget("sensitive_flags"),
"favorite_count": tget("favorite_count"),
"quote_count" : tget("quote_count"),
"reply_count" : tget("reply_count"),