[reddit] ensure 'comment' metadata field for media files (#8228)

This commit is contained in:
Mike Fährmann
2025-09-19 17:22:33 +02:00
parent 713a65923a
commit b225018eda

View File

@@ -56,6 +56,7 @@ class RedditExtractor(Extractor):
 urls = []
 if submission:
+submission["comment"] = None
 submission["date"] = text.parse_timestamp(
 submission["created_utc"])
 yield Message.Directory, submission
@@ -99,14 +100,13 @@ class RedditExtractor(Extractor):
 elif not submission["is_self"]:
 urls.append((url, submission))
+if selftext and (txt := submission["selftext_html"]):
+for url in text.extract_iter(txt, ' href="', '"'):
+urls.append((url, submission))
 elif parentdir:
 yield Message.Directory, comments[0]
-if selftext and submission:
-for url in text.extract_iter(
-submission["selftext_html"] or "", ' href="', '"'):
-urls.append((url, submission))
 if self.api.comments:
 if comments and not submission:
 submission = comments[0]
@@ -115,24 +115,24 @@ class RedditExtractor(Extractor):
 yield Message.Directory, submission
 for comment in comments:
+media = (embeds and "media_metadata" in comment)
 html = comment["body_html"] or ""
 href = (' href="' in html)
-media = (embeds and "media_metadata" in comment)
-if media or href:
-comment["date"] = text.parse_timestamp(
-comment["created_utc"])
-if submission:
-data = submission.copy()
-data["comment"] = comment
-else:
-data = comment
+if not media and not href:
+continue
+data = submission.copy()
+data["comment"] = comment
+comment["date"] = text.parse_timestamp(
+comment["created_utc"])
 if media:
-for embed in self._extract_embed(comment):
-submission["num"] += 1
-text.nameext_from_url(embed, submission)
-yield Message.Url, embed, submission
+for url in self._extract_embed(comment):
+data["num"] += 1
+text.nameext_from_url(url, data)
+yield Message.Url, url, data
+submission["num"] = data["num"]
 if href:
 for url in text.extract_iter(html, ' href="', '"'):