[wikimedia] add ability to change directory before downloading revisions

This commit is contained in:
ClosedPort22
2025-10-02 18:43:57 +08:00
parent 55742c6a2c
commit 47857b447b
2 changed files with 28 additions and 9 deletions

View File

@@ -62,7 +62,10 @@ class WikimediaExtractor(BaseExtractor):
return url
raise exception.AbortExtraction("Unable to find API endpoint")
def prepare(self, image):
def prepare_info(self, page):
"""Adjust the content of an image info object"""
def prepare_image(self, image):
"""Adjust the content of an image object"""
image["metadata"] = {
m["name"]: m["value"]
@@ -80,14 +83,18 @@ class WikimediaExtractor(BaseExtractor):
def items(self):
for info in self._pagination(self.params):
try:
images = info["imageinfo"]
images = info.pop("imageinfo")
except KeyError:
self.log.debug("Missing 'imageinfo' for %s", info)
continue
images = ()
for image in images:
self.prepare(image)
yield Message.Directory, image
info["count"] = len(images)
self.prepare_info(info)
yield Message.Directory, info
for info["num"], image in enumerate(images, 1):
self.prepare_image(image)
image.update(info)
yield Message.Url, image["url"], image
if self.subcategories:
@@ -245,9 +252,8 @@ class WikimediaArticleExtractor(WikimediaExtractor):
"titles" : path,
}
def prepare(self, image):
WikimediaExtractor.prepare(self, image)
image["page"] = self.title
def prepare_info(self, info):
info["page"] = self.title
class WikimediaWikiExtractor(WikimediaExtractor):

View File

@@ -44,6 +44,19 @@ __tests__ = (
"width" : int,
},
{
"#url" : "https://tl.wikipedia.org/wiki/Sitosol",
"#comment" : "revisions of an image in an article",
"#category": ("wikimedia", "wikipedia", "article"),
"#class" : wikimedia.WikimediaArticleExtractor,
"#options" : {"image-revisions": 10},
"#count" : 2,
"#pattern" : (
r"https://upload.wikimedia.org/wikipedia/commons/2/2e/Crowded_cytosol.png",
r"https://upload.wikimedia.org/wikipedia/commons/archive/2/2e/20080911161129%21Crowded_cytosol.png",
),
},
{
"#url" : "https://en.wikipedia.org/wiki/Category:Physics",
"#category": ("wikimedia", "wikipedia", "category"),