diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 31a49344..f2c4c78d 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -966,18 +966,16 @@ class BaseExtractor(Extractor): def __init__(self, match): if not self.category: - self.groups = match.groups() - self.match = match - self._init_category() + self._init_category(match) Extractor.__init__(self, match) - def _init_category(self): - for index, group in enumerate(self.groups): + def _init_category(self, match): + for index, group in enumerate(match.groups()): if group is not None: if index: self.category, self.root, info = self.instances[index-1] if not self.root: - self.root = text.root_from_url(self.match[0]) + self.root = text.root_from_url(match[0]) self.config_instance = info.get else: self.root = group diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 97caebc9..2f2b49d0 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -22,7 +22,7 @@ class WikimediaExtractor(BaseExtractor): request_interval = (1.0, 2.0) def __init__(self, match): - BaseExtractor.__init__(self, match) + self._init_category(match) if self.category == "wikimedia": self.category = self.root.split(".")[-2] @@ -32,16 +32,15 @@ class WikimediaExtractor(BaseExtractor): f"{self.category}-" f"{self.root.partition('.')[0].rpartition('/')[2]}") - self.per_page = self.config("limit", 50) - self.subcategories = False - if useragent := self.config_instance("useragent"): self.useragent = useragent + BaseExtractor.__init__(self, match) + def _init(self): if api_path := self.config_instance("api-path"): if api_path[0] == "/": - self.api_url = self.root + api_path + self.api_url = f"{self.root}{api_path}" else: self.api_url = api_path else: @@ -53,12 +52,14 @@ class WikimediaExtractor(BaseExtractor): # https://www.mediawiki.org/wiki/API:Imageinfo self.image_revisions = self.config("image-revisions", 1) self.format = self.config("format", "original") + self.per_page = self.config("limit", 50) + self.subcategories = False @cache(maxage=36500*86400, keyarg=1) def _search_api_path(self, root): self.log.debug("Probing possible API endpoints") for path in ("/api.php", "/w/api.php", "/wiki/api.php"): - url = root + path + url = f"{root}{path}" response = self.request(url, method="HEAD", fatal=None) if response.status_code < 400: return url @@ -86,7 +87,9 @@ class WikimediaExtractor(BaseExtractor): f"format={self.format}") def items(self): - for info in self._pagination(self.params): + params = self.params() + + for info in self._pagination(params): try: images = info.pop("imageinfo") except KeyError: @@ -112,10 +115,10 @@ class WikimediaExtractor(BaseExtractor): yield Message.Url, image["url"], image if self.subcategories: - base = self.root + "/wiki/" - self.params["gcmtype"] = "subcat" - for subcat in self._pagination(self.params): - url = base + subcat["title"].replace(" ", "_") + base = f"{self.root}/wiki/" + params["gcmtype"] = "subcat" + for subcat in self._pagination(params): + url = f"{base}{subcat['title'].replace(' ', '_')}" subcat["_extractor"] = WikimediaArticleExtractor yield Message.Queue, url, subcat @@ -234,40 +237,41 @@ class WikimediaArticleExtractor(WikimediaExtractor): path = self.groups[-1] if path[2] == "/": - self.root = self.root + "/" + path[:2] + self.root = f"{self.root}/{path[:2]}" path = path[3:] if path.startswith("wiki/"): path = path[5:] + self.path = text.unquote(path) pre, sep, _ = path.partition(":") - prefix = pre.lower() if sep else None - - self.title = path = text.unquote(path) - if prefix: + self.prefix = prefix = pre.lower() if sep else None + if prefix is not None: self.subcategory = prefix - if prefix == "category": + def params(self): + if self.prefix == "category": if self.config("subcategories", True): self.subcategories = True - self.params = { + return { "generator": "categorymembers", - "gcmtitle" : path, + "gcmtitle" : self.path, "gcmtype" : "file", "gcmlimit" : self.per_page, } - elif prefix == "file": - self.params = { - "titles" : path, - } - else: - self.params = { - "generator": "images", - "gimlimit" : self.per_page, - "titles" : path, + + if self.prefix == "file": + return { + "titles": self.path, } + return { + "generator": "images", + "gimlimit" : self.per_page, + "titles" : self.path, + } + def prepare_info(self, info): - info["page"] = self.title + info["page"] = self.path class WikimediaWikiExtractor(WikimediaExtractor): @@ -276,11 +280,9 @@ class WikimediaWikiExtractor(WikimediaExtractor): pattern = rf"{BASE_PATTERN}/?$" example = "https://en.wikipedia.org/" - def __init__(self, match): - WikimediaExtractor.__init__(self, match) - + def params(self): # ref: https://www.mediawiki.org/wiki/API:Allpages - self.params = { + return { "generator" : "allpages", "gapnamespace": 6, # "File" namespace "gaplimit" : self.per_page,