[wikimedia] fix 'limit' config lookup

rework config handling in general
This commit is contained in:
Mike Fährmann
2025-10-23 20:16:46 +02:00
parent 98d3354575
commit da4b8c1e83
2 changed files with 39 additions and 39 deletions

View File

@@ -966,18 +966,16 @@ class BaseExtractor(Extractor):
def __init__(self, match):
if not self.category:
self.groups = match.groups()
self.match = match
self._init_category()
self._init_category(match)
Extractor.__init__(self, match)
def _init_category(self):
for index, group in enumerate(self.groups):
def _init_category(self, match):
for index, group in enumerate(match.groups()):
if group is not None:
if index:
self.category, self.root, info = self.instances[index-1]
if not self.root:
self.root = text.root_from_url(self.match[0])
self.root = text.root_from_url(match[0])
self.config_instance = info.get
else:
self.root = group

View File

@@ -22,7 +22,7 @@ class WikimediaExtractor(BaseExtractor):
request_interval = (1.0, 2.0)
def __init__(self, match):
BaseExtractor.__init__(self, match)
self._init_category(match)
if self.category == "wikimedia":
self.category = self.root.split(".")[-2]
@@ -32,16 +32,15 @@ class WikimediaExtractor(BaseExtractor):
f"{self.category}-"
f"{self.root.partition('.')[0].rpartition('/')[2]}")
self.per_page = self.config("limit", 50)
self.subcategories = False
if useragent := self.config_instance("useragent"):
self.useragent = useragent
BaseExtractor.__init__(self, match)
def _init(self):
if api_path := self.config_instance("api-path"):
if api_path[0] == "/":
self.api_url = self.root + api_path
self.api_url = f"{self.root}{api_path}"
else:
self.api_url = api_path
else:
@@ -53,12 +52,14 @@ class WikimediaExtractor(BaseExtractor):
# https://www.mediawiki.org/wiki/API:Imageinfo
self.image_revisions = self.config("image-revisions", 1)
self.format = self.config("format", "original")
self.per_page = self.config("limit", 50)
self.subcategories = False
@cache(maxage=36500*86400, keyarg=1)
def _search_api_path(self, root):
self.log.debug("Probing possible API endpoints")
for path in ("/api.php", "/w/api.php", "/wiki/api.php"):
url = root + path
url = f"{root}{path}"
response = self.request(url, method="HEAD", fatal=None)
if response.status_code < 400:
return url
@@ -86,7 +87,9 @@ class WikimediaExtractor(BaseExtractor):
f"format={self.format}")
def items(self):
for info in self._pagination(self.params):
params = self.params()
for info in self._pagination(params):
try:
images = info.pop("imageinfo")
except KeyError:
@@ -112,10 +115,10 @@ class WikimediaExtractor(BaseExtractor):
yield Message.Url, image["url"], image
if self.subcategories:
base = self.root + "/wiki/"
self.params["gcmtype"] = "subcat"
for subcat in self._pagination(self.params):
url = base + subcat["title"].replace(" ", "_")
base = f"{self.root}/wiki/"
params["gcmtype"] = "subcat"
for subcat in self._pagination(params):
url = f"{base}{subcat['title'].replace(' ', '_')}"
subcat["_extractor"] = WikimediaArticleExtractor
yield Message.Queue, url, subcat
@@ -234,40 +237,41 @@ class WikimediaArticleExtractor(WikimediaExtractor):
path = self.groups[-1]
if path[2] == "/":
self.root = self.root + "/" + path[:2]
self.root = f"{self.root}/{path[:2]}"
path = path[3:]
if path.startswith("wiki/"):
path = path[5:]
self.path = text.unquote(path)
pre, sep, _ = path.partition(":")
prefix = pre.lower() if sep else None
self.title = path = text.unquote(path)
if prefix:
self.prefix = prefix = pre.lower() if sep else None
if prefix is not None:
self.subcategory = prefix
if prefix == "category":
def params(self):
if self.prefix == "category":
if self.config("subcategories", True):
self.subcategories = True
self.params = {
return {
"generator": "categorymembers",
"gcmtitle" : path,
"gcmtitle" : self.path,
"gcmtype" : "file",
"gcmlimit" : self.per_page,
}
elif prefix == "file":
self.params = {
"titles" : path,
}
else:
self.params = {
"generator": "images",
"gimlimit" : self.per_page,
"titles" : path,
if self.prefix == "file":
return {
"titles": self.path,
}
return {
"generator": "images",
"gimlimit" : self.per_page,
"titles" : self.path,
}
def prepare_info(self, info):
info["page"] = self.title
info["page"] = self.path
class WikimediaWikiExtractor(WikimediaExtractor):
@@ -276,11 +280,9 @@ class WikimediaWikiExtractor(WikimediaExtractor):
pattern = rf"{BASE_PATTERN}/?$"
example = "https://en.wikipedia.org/"
def __init__(self, match):
WikimediaExtractor.__init__(self, match)
def params(self):
# ref: https://www.mediawiki.org/wiki/API:Allpages
self.params = {
return {
"generator" : "allpages",
"gapnamespace": 6, # "File" namespace
"gaplimit" : self.per_page,