[kemono] extend 'duplicates' option (#7696)

allow selecting which file type(s) ('file'/'attachment'/'inline')
are exempt from duplicate removal, i.e. are downloaded even when duplicated
This commit is contained in:
Mike Fährmann
2025-06-27 20:52:32 +02:00
parent c1db879b6c
commit fde6110d96
2 changed files with 31 additions and 15 deletions

View File

@@ -693,9 +693,7 @@ Description
specified after a ``:`` (``windows``, ``linux``, or ``macos``).
Note:
This option overrides
`user-agent <extractor.*.user-agent_>`__
and sets custom
This option sets custom
`headers <extractor.*.headers_>`__
and
`ciphers <extractor.*.ciphers_>`__
@@ -3205,14 +3203,24 @@ Description
extractor.kemono.duplicates
---------------------------
Type
``bool``
* ``bool``
* ``string``
* ``list`` of ``strings``
Default
``false``
Example
* ``"attachment,inline"``
* ``["file", "attachment"]``
Description
Controls how to handle duplicate files in a post.
* ``true``: Download duplicates
* ``false``: Ignore duplicates
``true``
Download duplicates
``false``
Ignore duplicates
any ``list`` or ``string``
| Download a duplicate file if its ``type`` is in the given ``list``
  (a ``string`` is treated as a comma-separated list of types)
| Ignore it otherwise
extractor.kemono.dms

View File

@@ -56,12 +56,21 @@ class KemonoExtractor(Extractor):
announcements = True if self.config("announcements") else None
archives = True if self.config("archives") else False
comments = True if self.config("comments") else False
duplicates = True if self.config("duplicates") else False
dms = True if self.config("dms") else None
max_posts = self.config("max-posts")
creator_info = {} if self.config("metadata", True) else None
exts_archive = util.EXTS_ARCHIVE
if duplicates := self.config("duplicates"):
if isinstance(duplicates, str):
duplicates = set(duplicates.split(","))
elif isinstance(duplicates, (list, tuple)):
duplicates = set(duplicates)
else:
duplicates = {"file", "attachment", "inline"}
else:
duplicates = ()
# prevent files from being sent with gzip compression
headers = {"Accept-Encoding": "identity"}
@@ -123,14 +132,13 @@ class KemonoExtractor(Extractor):
if "\\" in url:
file["path"] = url = url.replace("\\", "/")
match = find_hash(url)
if match:
if match := find_hash(url):
file["hash"] = hash = match[1]
if not duplicates:
if hash in hashes:
self.log.debug("Skipping %s (duplicate)", url)
continue
hashes.add(hash)
if file["type"] not in duplicates and hash in hashes:
self.log.debug("Skipping %s %s (duplicate)",
file["type"], url)
continue
hashes.add(hash)
else:
file["hash"] = hash = ""
@@ -150,7 +158,7 @@ class KemonoExtractor(Extractor):
file["type"] = "archive"
if archives:
try:
data = self.api.file(file["hash"])
data = self.api.file(hash)
data.update(file)
post_archives.append(data)
except Exception as exc: