[kemonoparty] add 'revision_hash' metadata (#4706, #4727, #5013)

'revision_hash' is a SHA1 hexdigest of other relevant metadata fields like
title, content, file and attachment URLs.

This value does NOT reflect which revisions are listed on the website.
Neither does 'edited', nor any other metadata field or combination of fields.
This commit is contained in:
Mike Fährmann
2024-01-16 00:24:30 +01:00
parent 4d6ec6958d
commit 3d68eda4ab
2 changed files with 24 additions and 4 deletions

View File

@@ -9,9 +9,10 @@
"""Extractors for https://kemono.party/""" """Extractors for https://kemono.party/"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, exception from .. import text, util, exception
from ..cache import cache, memcache from ..cache import cache, memcache
import itertools import itertools
import json
import re import re
BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)" BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)"
@@ -37,10 +38,14 @@ class KemonopartyExtractor(Extractor):
Extractor.__init__(self, match) Extractor.__init__(self, match)
def _init(self): def _init(self):
self.revisions = self.config("revisions")
self._prepare_ddosguard_cookies() self._prepare_ddosguard_cookies()
self._find_inline = re.compile( self._find_inline = re.compile(
r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+' r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+'
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
self._json_dumps = json.JSONEncoder(
ensure_ascii=False, check_circular=False,
sort_keys=True, separators=(",", ":")).encode
def items(self): def items(self):
find_hash = re.compile(HASH_PATTERN).match find_hash = re.compile(HASH_PATTERN).match
@@ -223,11 +228,23 @@ class KemonopartyExtractor(Extractor):
idx = len(revs) idx = len(revs)
for rev in revs: for rev in revs:
rev["revision_hash"] = self._revision_hash(rev)
rev["revision_index"] = idx rev["revision_index"] = idx
idx -= 1 idx -= 1
return revs return revs
def _revision_hash(self, revision):
    """Return a SHA1 hexdigest computed from the content of 'revision'.

    The keys 'revision_id', 'added', 'next' and 'prev', as well as the
    'name' of the file and of every attachment, are left out of the
    digest. The remaining data is serialized with 'self._json_dumps'
    and hashed with 'util.sha1'.

    'revision' itself is not modified: the nested 'file' and
    'attachments' objects are copied before their 'name' entries are
    dropped.

    Raises KeyError if 'revision' has no 'file' or 'attachments' key.
    """
    rev = revision.copy()
    for key in ("revision_id", "added", "next", "prev"):
        rev.pop(key, None)

    # dict.copy() is shallow - without copying the nested objects,
    # pop("name") would strip the names from the caller's post data
    file_info = rev["file"].copy()
    file_info.pop("name", None)
    rev["file"] = file_info

    attachments = [a.copy() for a in rev["attachments"]]
    for attachment in attachments:
        attachment.pop("name", None)
    rev["attachments"] = attachments

    return util.sha1(self._json_dumps(rev))
def _validate(response): def _validate(response):
return (response.headers["content-length"] != "9" or return (response.headers["content-length"] != "9" or
@@ -252,13 +269,13 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
url = self.api_url url = self.api_url
params = text.parse_query(self.query) params = text.parse_query(self.query)
params["o"] = text.parse_int(params.get("o")) params["o"] = text.parse_int(params.get("o"))
revisions = self.config("revisions")
while True: while True:
posts = self.request(url, params=params).json() posts = self.request(url, params=params).json()
if revisions: if self.revisions:
for post in posts: for post in posts:
post["revision_hash"] = self._revision_hash(post)
post["revision_id"] = 0 post["revision_id"] = 0
post_url = "{}/post/{}".format(self.api_url, post["id"]) post_url = "{}/post/{}".format(self.api_url, post["id"])
try: try:
@@ -296,7 +313,8 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
def posts(self): def posts(self):
if not self.revision: if not self.revision:
post = self.request(self.api_url).json() post = self.request(self.api_url).json()
if self.config("revisions"): if self.revisions:
post["revision_hash"] = self._revision_hash(post)
post["revision_id"] = 0 post["revision_id"] = 0
try: try:
revs = self._post_revisions(self.api_url) revs = self._post_revisions(self.api_url)

View File

@@ -177,6 +177,7 @@ __tests__ = (
"revision_id": 142470, "revision_id": 142470,
"revision_index": 2, "revision_index": 2,
"revision_hash": "e0e93281495e151b11636c156e52bfe9234c2a40",
}, },
{ {
@@ -190,6 +191,7 @@ __tests__ = (
"revision_id": range(134996, 3052965), "revision_id": range(134996, 3052965),
"revision_index": range(1, 9), "revision_index": range(1, 9),
"revision_hash": r"re:^[0-9a-f]{40}$",
}, },