[wikimedia] add ability to fetch image revisions

This works as long as the number of revisions of a single image does not exceed 500.
This commit is contained in:
ClosedPort22
2025-10-02 15:46:17 +08:00
parent 0a76cbc8c7
commit 55742c6a2c
2 changed files with 27 additions and 5 deletions

View File

@@ -6548,6 +6548,20 @@ Description
Download video files.
extractor.wikimedia.image-revisions
-----------------------------------
Type
``integer``
Default
``1``
Description
Number of revisions to return for a single image.
The default value of ``1`` returns only the latest revision.
The value must be between 1 and 500.
extractor.wikimedia.limit
-------------------------
Type

View File

@@ -46,6 +46,12 @@ class WikimediaExtractor(BaseExtractor):
else:
self.api_url = None
# note: image revisions are different from page revisions
# ref:
# https://www.mediawiki.org/wiki/API:Revisions
# https://www.mediawiki.org/wiki/API:Imageinfo
self.image_revisions = self.config("image-revisions", 1)
@cache(maxage=36500*86400, keyarg=1)
def _search_api_path(self, root):
self.log.debug("Probing possible API endpoints")
@@ -74,14 +80,15 @@ class WikimediaExtractor(BaseExtractor):
def items(self):
for info in self._pagination(self.params):
try:
image = info["imageinfo"][0]
except LookupError:
images = info["imageinfo"]
except KeyError:
self.log.debug("Missing 'imageinfo' for %s", info)
continue
self.prepare(image)
yield Message.Directory, image
yield Message.Url, image["url"], image
for image in images:
self.prepare(image)
yield Message.Directory, image
yield Message.Url, image["url"], image
if self.subcategories:
base = self.root + "/wiki/"
@@ -108,6 +115,7 @@ class WikimediaExtractor(BaseExtractor):
"timestamp|user|userid|comment|canonicaltitle|url|size|"
"sha1|mime|metadata|commonmetadata|extmetadata|bitdepth"
)
params["iilimit"] = self.image_revisions
while True:
data = self.request_json(url, params=params)