[reddit] ensure 'comment' metadata field for media files (#8228)
This commit is contained in:
@@ -56,6 +56,7 @@ class RedditExtractor(Extractor):
|
|||||||
urls = []
|
urls = []
|
||||||
|
|
||||||
if submission:
|
if submission:
|
||||||
|
submission["comment"] = None
|
||||||
submission["date"] = text.parse_timestamp(
|
submission["date"] = text.parse_timestamp(
|
||||||
submission["created_utc"])
|
submission["created_utc"])
|
||||||
yield Message.Directory, submission
|
yield Message.Directory, submission
|
||||||
@@ -99,14 +100,13 @@ class RedditExtractor(Extractor):
|
|||||||
elif not submission["is_self"]:
|
elif not submission["is_self"]:
|
||||||
urls.append((url, submission))
|
urls.append((url, submission))
|
||||||
|
|
||||||
|
if selftext and (txt := submission["selftext_html"]):
|
||||||
|
for url in text.extract_iter(txt, ' href="', '"'):
|
||||||
|
urls.append((url, submission))
|
||||||
|
|
||||||
elif parentdir:
|
elif parentdir:
|
||||||
yield Message.Directory, comments[0]
|
yield Message.Directory, comments[0]
|
||||||
|
|
||||||
if selftext and submission:
|
|
||||||
for url in text.extract_iter(
|
|
||||||
submission["selftext_html"] or "", ' href="', '"'):
|
|
||||||
urls.append((url, submission))
|
|
||||||
|
|
||||||
if self.api.comments:
|
if self.api.comments:
|
||||||
if comments and not submission:
|
if comments and not submission:
|
||||||
submission = comments[0]
|
submission = comments[0]
|
||||||
@@ -115,24 +115,24 @@ class RedditExtractor(Extractor):
|
|||||||
yield Message.Directory, submission
|
yield Message.Directory, submission
|
||||||
|
|
||||||
for comment in comments:
|
for comment in comments:
|
||||||
|
media = (embeds and "media_metadata" in comment)
|
||||||
html = comment["body_html"] or ""
|
html = comment["body_html"] or ""
|
||||||
href = (' href="' in html)
|
href = (' href="' in html)
|
||||||
media = (embeds and "media_metadata" in comment)
|
|
||||||
|
|
||||||
if media or href:
|
if not media and not href:
|
||||||
comment["date"] = text.parse_timestamp(
|
continue
|
||||||
comment["created_utc"])
|
|
||||||
if submission:
|
data = submission.copy()
|
||||||
data = submission.copy()
|
data["comment"] = comment
|
||||||
data["comment"] = comment
|
comment["date"] = text.parse_timestamp(
|
||||||
else:
|
comment["created_utc"])
|
||||||
data = comment
|
|
||||||
|
|
||||||
if media:
|
if media:
|
||||||
for embed in self._extract_embed(comment):
|
for url in self._extract_embed(comment):
|
||||||
submission["num"] += 1
|
data["num"] += 1
|
||||||
text.nameext_from_url(embed, submission)
|
text.nameext_from_url(url, data)
|
||||||
yield Message.Url, embed, submission
|
yield Message.Url, url, data
|
||||||
|
submission["num"] = data["num"]
|
||||||
|
|
||||||
if href:
|
if href:
|
||||||
for url in text.extract_iter(html, ' href="', '"'):
|
for url in text.extract_iter(html, ' href="', '"'):
|
||||||
|
|||||||
Reference in New Issue
Block a user