[wikimedia] fix 'limit' config lookup

rework config handling in general
2025-10-23 20:16:46 +02:00
parent 98d3354575
commit da4b8c1e83
2 changed files with 39 additions and 39 deletions
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -966,18 +966,16 @@ class BaseExtractor(Extractor):

    def __init__(self, match):
        if not self.category:
-            self.groups = match.groups()
-            self.match = match
-            self._init_category()
+            self._init_category(match)
        Extractor.__init__(self, match)

-    def _init_category(self):
-        for index, group in enumerate(self.groups):
+    def _init_category(self, match):
+        for index, group in enumerate(match.groups()):
            if group is not None:
                if index:
                    self.category, self.root, info = self.instances[index-1]
                    if not self.root:
-                        self.root = text.root_from_url(self.match[0])
+                        self.root = text.root_from_url(match[0])
                    self.config_instance = info.get
                else:
                    self.root = group
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -22,7 +22,7 @@ class WikimediaExtractor(BaseExtractor):
    request_interval = (1.0, 2.0)

    def __init__(self, match):
-        BaseExtractor.__init__(self, match)
+        self._init_category(match)

        if self.category == "wikimedia":
            self.category = self.root.split(".")[-2]
@@ -32,16 +32,15 @@ class WikimediaExtractor(BaseExtractor):
                f"{self.category}-"
                f"{self.root.partition('.')[0].rpartition('/')[2]}")

-        self.per_page = self.config("limit", 50)
-        self.subcategories = False
-
        if useragent := self.config_instance("useragent"):
            self.useragent = useragent

+        BaseExtractor.__init__(self, match)
+
    def _init(self):
        if api_path := self.config_instance("api-path"):
            if api_path[0] == "/":
-                self.api_url = self.root + api_path
+                self.api_url = f"{self.root}{api_path}"
            else:
                self.api_url = api_path
        else:
@@ -53,12 +52,14 @@ class WikimediaExtractor(BaseExtractor):
        # https://www.mediawiki.org/wiki/API:Imageinfo
        self.image_revisions = self.config("image-revisions", 1)
        self.format = self.config("format", "original")
+        self.per_page = self.config("limit", 50)
+        self.subcategories = False

    @cache(maxage=36500*86400, keyarg=1)
    def _search_api_path(self, root):
        self.log.debug("Probing possible API endpoints")
        for path in ("/api.php", "/w/api.php", "/wiki/api.php"):
-            url = root + path
+            url = f"{root}{path}"
            response = self.request(url, method="HEAD", fatal=None)
            if response.status_code < 400:
                return url
@@ -86,7 +87,9 @@ class WikimediaExtractor(BaseExtractor):
                            f"format={self.format}")

    def items(self):
-        for info in self._pagination(self.params):
+        params = self.params()
+
+        for info in self._pagination(params):
            try:
                images = info.pop("imageinfo")
            except KeyError:
@@ -112,10 +115,10 @@ class WikimediaExtractor(BaseExtractor):
                yield Message.Url, image["url"], image

        if self.subcategories:
-            base = self.root + "/wiki/"
-            self.params["gcmtype"] = "subcat"
-            for subcat in self._pagination(self.params):
-                url = base + subcat["title"].replace(" ", "_")
+            base = f"{self.root}/wiki/"
+            params["gcmtype"] = "subcat"
+            for subcat in self._pagination(params):
+                url = f"{base}{subcat['title'].replace(' ', '_')}"
                subcat["_extractor"] = WikimediaArticleExtractor
                yield Message.Queue, url, subcat

@@ -234,40 +237,41 @@ class WikimediaArticleExtractor(WikimediaExtractor):

        path = self.groups[-1]
        if path[2] == "/":
-            self.root = self.root + "/" + path[:2]
+            self.root = f"{self.root}/{path[:2]}"
            path = path[3:]
        if path.startswith("wiki/"):
            path = path[5:]
+        self.path = text.unquote(path)

        pre, sep, _ = path.partition(":")
-        prefix = pre.lower() if sep else None
-
-        self.title = path = text.unquote(path)
-        if prefix:
+        self.prefix = prefix = pre.lower() if sep else None
+        if prefix is not None:
            self.subcategory = prefix

-        if prefix == "category":
+    def params(self):
+        if self.prefix == "category":
            if self.config("subcategories", True):
                self.subcategories = True
-            self.params = {
+            return {
                "generator": "categorymembers",
-                "gcmtitle" : path,
+                "gcmtitle" : self.path,
                "gcmtype"  : "file",
                "gcmlimit" : self.per_page,
            }
-        elif prefix == "file":
-            self.params = {
-                "titles"   : path,
-            }
-        else:
-            self.params = {
-                "generator": "images",
-                "gimlimit" : self.per_page,
-                "titles"   : path,
+
+        if self.prefix == "file":
+            return {
+                "titles": self.path,
            }

+        return {
+            "generator": "images",
+            "gimlimit" : self.per_page,
+            "titles"   : self.path,
+        }
+
    def prepare_info(self, info):
-        info["page"] = self.title
+        info["page"] = self.path


 class WikimediaWikiExtractor(WikimediaExtractor):
@@ -276,11 +280,9 @@ class WikimediaWikiExtractor(WikimediaExtractor):
    pattern = rf"{BASE_PATTERN}/?$"
    example = "https://en.wikipedia.org/"

-    def __init__(self, match):
-        WikimediaExtractor.__init__(self, match)
-
+    def params(self):
        # ref: https://www.mediawiki.org/wiki/API:Allpages
-        self.params = {
+        return {
            "generator"   : "allpages",
            "gapnamespace": 6,  # "File" namespace
            "gaplimit"    : self.per_page,