From 3d68eda4abcfde18ecf377f140b8ad6ec4c2de6d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Tue, 16 Jan 2024 00:24:30 +0100
Subject: [PATCH] [kemonoparty] add 'revision_hash' metadata (#4706, #4727, #5013)

A SHA1 hexdigest of other relevant metadata fields like
title, content, file and attachment URLs.

This value does NOT reflect which revisions are listed on the website.
Neither does 'edited' or any other metadata field (combinations).
---
 gallery_dl/extractor/kemonoparty.py | 35 +++++++++++++++++++++++++++++++----
 test/results/kemonoparty.py         |  2 ++
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index c24e57d1..10228b5c 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -9,9 +9,10 @@
 """Extractors for https://kemono.party/"""
 
 from .common import Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
 from ..cache import cache, memcache
 import itertools
+import json
 import re
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)"
@@ -37,10 +38,14 @@ class KemonopartyExtractor(Extractor):
         Extractor.__init__(self, match)
 
     def _init(self):
+        self.revisions = self.config("revisions")
         self._prepare_ddosguard_cookies()
         self._find_inline = re.compile(
             r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+'
             r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
+        self._json_dumps = json.JSONEncoder(
+            ensure_ascii=False, check_circular=False,
+            sort_keys=True, separators=(",", ":")).encode
 
     def items(self):
         find_hash = re.compile(HASH_PATTERN).match
@@ -223,11 +228,32 @@ class KemonopartyExtractor(Extractor):
 
         idx = len(revs)
         for rev in revs:
+            rev["revision_hash"] = self._revision_hash(rev)
             rev["revision_index"] = idx
             idx -= 1
 
         return revs
 
+    def _revision_hash(self, revision):
+        """Return a SHA1 hexdigest computed from the data of 'revision'
+
+        The 'revision_id', 'added', 'next', and 'prev' fields as well as
+        all file and attachment names are excluded from the digest.
+        """
+        rev = revision.copy()
+        rev.pop("revision_id", None)
+        rev.pop("added", None)
+        rev.pop("next", None)
+        rev.pop("prev", None)
+        # dict.copy() is shallow: copy the nested objects before removing
+        # 'name', so the caller's 'file' and 'attachments' stay untouched
+        rev["file"] = rev["file"].copy()
+        rev["file"].pop("name", None)
+        rev["attachments"] = [a.copy() for a in rev["attachments"]]
+        for a in rev["attachments"]:
+            a.pop("name", None)
+        return util.sha1(self._json_dumps(rev))
+
 
 def _validate(response):
     return (response.headers["content-length"] != "9" or
@@ -252,13 +278,13 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
         url = self.api_url
         params = text.parse_query(self.query)
         params["o"] = text.parse_int(params.get("o"))
-        revisions = self.config("revisions")
 
         while True:
             posts = self.request(url, params=params).json()
 
-            if revisions:
+            if self.revisions:
                 for post in posts:
+                    post["revision_hash"] = self._revision_hash(post)
                     post["revision_id"] = 0
                     post_url = "{}/post/{}".format(self.api_url, post["id"])
                     try:
@@ -296,7 +322,8 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
     def posts(self):
         if not self.revision:
             post = self.request(self.api_url).json()
-            if self.config("revisions"):
+            if self.revisions:
+                post["revision_hash"] = self._revision_hash(post)
                 post["revision_id"] = 0
                 try:
                     revs = self._post_revisions(self.api_url)
diff --git a/test/results/kemonoparty.py b/test/results/kemonoparty.py
index 5bd541a3..c3dbdf73 100644
--- a/test/results/kemonoparty.py
+++ b/test/results/kemonoparty.py
@@ -177,6 +177,7 @@ __tests__ = (
 
     "revision_id": 142470,
     "revision_index": 2,
+    "revision_hash": "e0e93281495e151b11636c156e52bfe9234c2a40",
 },
 
 {
@@ -190,6 +191,7 @@ __tests__ = (
 
     "revision_id": range(134996, 3052965),
     "revision_index": range(1, 9),
+    "revision_hash": r"re:^[0-9a-f]{40}$",
 },