[kemonoparty] add 'comments' option (#1980)

This commit is contained in:
Mike Fährmann
2021-11-03 22:52:15 +01:00
parent 1fac74b14d
commit f0fc3b0ba1
2 changed files with 31 additions and 0 deletions

View File

@@ -1328,6 +1328,16 @@ Description
Download video files. Download video files.
extractor.kemonoparty.comments
-----------------------------
Type
``bool``
Default
``false``
Description
Extract ``comments`` metadata. Note: This requires 1 additional HTTP request per post.
extractor.kemonoparty.max-posts extractor.kemonoparty.max-posts
------------------------------- -------------------------------
Type Type

View File

@@ -35,6 +35,7 @@ class KemonopartyExtractor(Extractor):
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
skip_service = \ skip_service = \
"patreon" if self.config("patreon-skip-file", True) else None "patreon" if self.config("patreon-skip-file", True) else None
comments = self.config("comments")
if self.config("metadata"): if self.config("metadata"):
username = text.unescape(text.extract( username = text.unescape(text.extract(
@@ -68,6 +69,8 @@ class KemonopartyExtractor(Extractor):
post["published"], "%a, %d %b %Y %H:%M:%S %Z") post["published"], "%a, %d %b %Y %H:%M:%S %Z")
if username: if username:
post["username"] = username post["username"] = username
if comments:
post["comments"] = self._extract_comments(post)
yield Message.Directory, post yield Message.Directory, post
for post["num"], file in enumerate(files, 1): for post["num"], file in enumerate(files, 1):
@@ -100,6 +103,24 @@ class KemonopartyExtractor(Extractor):
return {c.name: c.value for c in response.history[0].cookies} return {c.name: c.value for c in response.history[0].cookies}
def _extract_comments(self, post):
    """Return the comments found on a post's HTML page as a list of dicts"""
    def parse(article):
        # Fields are pulled in the order id -> user -> body -> date.
        # NOTE(review): 'extr' presumably advances through the markup on
        # each call, so this order should be kept -- confirm against
        # text.extract_from before reordering.
        extr = text.extract_from(article)
        comment_id = extr('id="', '"')
        anchor = 'href="#' + comment_id + '"'
        user = extr(anchor, '</').strip(" \n\r>")
        body = extr('<section class="comment__body">', '</section>').strip()
        date = extr('datetime="', '"')
        return {"id": comment_id, "user": user, "body": body, "date": date}

    # Comments are read from the post's own page,
    # which costs one additional request per post.
    post_url = "{}/{}/user/{}/post/{}".format(
        self.root, post["service"], post["user"], post["id"])
    html = self.request(post_url).text

    # Each comment is the content between '<article' and '</article>'.
    return [
        parse(article)
        for article in text.extract_iter(html, "<article", "</article>")
    ]
class KemonopartyUserExtractor(KemonopartyExtractor): class KemonopartyUserExtractor(KemonopartyExtractor):
"""Extractor for all posts from a kemono.party user listing""" """Extractor for all posts from a kemono.party user listing"""