'revision_hash' is a SHA1 hexdigest of the other relevant metadata fields, such as title, content, and the file and attachment URLs. This value does NOT reflect which revisions are listed on the website. Neither does 'edited', nor any other metadata field or combination of fields.
This commit is contained in:
@@ -9,9 +9,10 @@
|
|||||||
"""Extractors for https://kemono.party/"""
|
"""Extractors for https://kemono.party/"""
|
||||||
|
|
||||||
from .common import Extractor, Message
|
from .common import Extractor, Message
|
||||||
from .. import text, exception
|
from .. import text, util, exception
|
||||||
from ..cache import cache, memcache
|
from ..cache import cache, memcache
|
||||||
import itertools
|
import itertools
|
||||||
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)"
|
BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)"
|
||||||
@@ -37,10 +38,14 @@ class KemonopartyExtractor(Extractor):
|
|||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
|
|
||||||
def _init(self):
|
def _init(self):
|
||||||
|
self.revisions = self.config("revisions")
|
||||||
self._prepare_ddosguard_cookies()
|
self._prepare_ddosguard_cookies()
|
||||||
self._find_inline = re.compile(
|
self._find_inline = re.compile(
|
||||||
r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+'
|
r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+'
|
||||||
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
|
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
|
||||||
|
self._json_dumps = json.JSONEncoder(
|
||||||
|
ensure_ascii=False, check_circular=False,
|
||||||
|
sort_keys=True, separators=(",", ":")).encode
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
find_hash = re.compile(HASH_PATTERN).match
|
find_hash = re.compile(HASH_PATTERN).match
|
||||||
@@ -223,11 +228,23 @@ class KemonopartyExtractor(Extractor):
|
|||||||
|
|
||||||
idx = len(revs)
|
idx = len(revs)
|
||||||
for rev in revs:
|
for rev in revs:
|
||||||
|
rev["revision_hash"] = self._revision_hash(rev)
|
||||||
rev["revision_index"] = idx
|
rev["revision_index"] = idx
|
||||||
idx -= 1
|
idx -= 1
|
||||||
|
|
||||||
return revs
|
return revs
|
||||||
|
|
||||||
|
def _revision_hash(self, revision):
    """Return a SHA1 digest identifying the content of 'revision'.

    The digest is computed over the compact, key-sorted JSON
    serialization (self._json_dumps) of the post/revision metadata after
    removing fields that do not describe its content: 'revision_id',
    'added', 'next', 'prev', and the 'name' of the file and of every
    attachment.

    'revision' itself is left unmodified; it must contain a 'file' dict
    and an 'attachments' list of dicts (KeyError otherwise).
    """
    rev = revision.copy()
    for key in ("revision_id", "added", "next", "prev"):
        rev.pop(key, None)

    # dict.copy() is shallow: 'file' and the attachment dicts are still
    # shared with the caller's object. Copy them before dropping 'name',
    # otherwise the filenames vanish from the post metadata itself.
    file = rev["file"].copy()
    file.pop("name", None)
    rev["file"] = file

    attachments = []
    for attachment in rev["attachments"]:
        attachment = attachment.copy()
        attachment.pop("name", None)
        attachments.append(attachment)
    rev["attachments"] = attachments

    return util.sha1(self._json_dumps(rev))
|
||||||
|
|
||||||
|
|
||||||
def _validate(response):
|
def _validate(response):
|
||||||
return (response.headers["content-length"] != "9" or
|
return (response.headers["content-length"] != "9" or
|
||||||
@@ -252,13 +269,13 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
|
|||||||
url = self.api_url
|
url = self.api_url
|
||||||
params = text.parse_query(self.query)
|
params = text.parse_query(self.query)
|
||||||
params["o"] = text.parse_int(params.get("o"))
|
params["o"] = text.parse_int(params.get("o"))
|
||||||
revisions = self.config("revisions")
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
posts = self.request(url, params=params).json()
|
posts = self.request(url, params=params).json()
|
||||||
|
|
||||||
if revisions:
|
if self.revisions:
|
||||||
for post in posts:
|
for post in posts:
|
||||||
|
post["revision_hash"] = self._revision_hash(post)
|
||||||
post["revision_id"] = 0
|
post["revision_id"] = 0
|
||||||
post_url = "{}/post/{}".format(self.api_url, post["id"])
|
post_url = "{}/post/{}".format(self.api_url, post["id"])
|
||||||
try:
|
try:
|
||||||
@@ -296,7 +313,8 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
|
|||||||
def posts(self):
|
def posts(self):
|
||||||
if not self.revision:
|
if not self.revision:
|
||||||
post = self.request(self.api_url).json()
|
post = self.request(self.api_url).json()
|
||||||
if self.config("revisions"):
|
if self.revisions:
|
||||||
|
post["revision_hash"] = self._revision_hash(post)
|
||||||
post["revision_id"] = 0
|
post["revision_id"] = 0
|
||||||
try:
|
try:
|
||||||
revs = self._post_revisions(self.api_url)
|
revs = self._post_revisions(self.api_url)
|
||||||
|
|||||||
@@ -177,6 +177,7 @@ __tests__ = (
|
|||||||
|
|
||||||
"revision_id": 142470,
|
"revision_id": 142470,
|
||||||
"revision_index": 2,
|
"revision_index": 2,
|
||||||
|
"revision_hash": "e0e93281495e151b11636c156e52bfe9234c2a40",
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
@@ -190,6 +191,7 @@ __tests__ = (
|
|||||||
|
|
||||||
"revision_id": range(134996, 3052965),
|
"revision_id": range(134996, 3052965),
|
||||||
"revision_index": range(1, 9),
|
"revision_index": range(1, 9),
|
||||||
|
"revision_hash": r"re:^[0-9a-f]{40}$",
|
||||||
},
|
},
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user