From a3b2c88fbe215fdc4d25cb4db9934a687356fdaa Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Sun, 18 Aug 2024 12:38:31 +0800 Subject: [PATCH 1/5] [wikimedia] refactor --- gallery_dl/extractor/wikimedia.py | 94 +++++++++++++++++-------------- 1 file changed, 52 insertions(+), 42 deletions(-) diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 9370cfb5..29671cb6 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -17,13 +17,11 @@ class WikimediaExtractor(BaseExtractor): """Base class for wikimedia extractors""" basecategory = "wikimedia" filename_fmt = "{filename} ({sha1[:8]}).{extension}" - directory_fmt = ("{category}", "{page}") archive_fmt = "{sha1}" request_interval = (1.0, 2.0) def __init__(self, match): BaseExtractor.__init__(self, match) - path = match.group(match.lastindex) if self.category == "wikimedia": self.category = self.root.split(".")[-2] @@ -31,32 +29,6 @@ class WikimediaExtractor(BaseExtractor): self.category = "{}-{}".format( self.category, self.root.partition(".")[0].rpartition("/")[2]) - if path.startswith("wiki/"): - path = path[5:] - - pre, sep, _ = path.partition(":") - prefix = pre.lower() if sep else None - - self.title = path = text.unquote(path) - if prefix: - self.subcategory = prefix - - if prefix == "category": - self.params = { - "generator": "categorymembers", - "gcmtitle" : path, - "gcmtype" : "file", - } - elif prefix == "file": - self.params = { - "titles" : path, - } - else: - self.params = { - "generator": "images", - "titles" : path, - } - def _init(self): api_path = self.config_instance("api-path") if api_path: @@ -67,6 +39,22 @@ class WikimediaExtractor(BaseExtractor): else: self.api_url = self.root + "/api.php" + @staticmethod + def prepare(image): + """Adjust the content of a image object""" + image["metadata"] = { + m["name"]: m["value"] + for m in image["metadata"] or ()} + image["commonmetadata"] = { + m["name"]: m["value"] + for m in image["commonmetadata"] or ()} + + filename = image["canonicaltitle"] + image["filename"], _, image["extension"] = \ + filename.partition(":")[2].rpartition(".") + image["date"] = text.parse_datetime( + image["timestamp"], "%Y-%m-%dT%H:%M:%SZ") + def items(self): for info in self._pagination(self.params): try: @@ -75,20 +63,7 @@ class WikimediaExtractor(BaseExtractor): self.log.debug("Missing 'imageinfo' for %s", info) continue - image["metadata"] = { - m["name"]: m["value"] - for m in image["metadata"] or ()} - image["commonmetadata"] = { - m["name"]: m["value"] - for m in image["commonmetadata"] or ()} - - filename = image["canonicaltitle"] - image["filename"], _, image["extension"] = \ - filename.partition(":")[2].rpartition(".") - image["date"] = text.parse_datetime( - image["timestamp"], "%Y-%m-%dT%H:%M:%SZ") - image["page"] = self.title - + self.prepare(image) yield Message.Directory, image yield Message.Url, image["url"], image @@ -181,5 +156,40 @@ BASE_PATTERN = WikimediaExtractor.update({ class WikimediaArticleExtractor(WikimediaExtractor): """Extractor for wikimedia articles""" subcategory = "article" + directory_fmt = ("{category}", "{page}") pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)" example = "https://en.wikipedia.org/wiki/TITLE" + + def __init__(self, match): + WikimediaExtractor.__init__(self, match) + + path = match.group(match.lastindex) + if path.startswith("wiki/"): + path = path[5:] + + pre, sep, _ = path.partition(":") + prefix = pre.lower() if sep else None + + self.title = path = text.unquote(path) + if prefix: + self.subcategory = prefix + + if prefix == "category": + self.params = { + "generator": "categorymembers", + "gcmtitle" : path, + "gcmtype" : "file", + } + elif prefix == "file": + self.params = { + "titles" : path, + } + else: + self.params = { + "generator": "images", + "titles" : path, + } + + def prepare(self, image): + WikimediaExtractor.prepare(image) + image["page"] = self.title From 33d2ddd9fbe669b284d88d6c1421e07a7843a23d Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Sun, 18 Aug 2024 13:00:59 +0800 Subject: [PATCH 2/5] [wikimedia] add 'wiki' extractor --- gallery_dl/extractor/wikimedia.py | 16 ++++++++++++++++ test/results/fandom.py | 8 ++++++++ test/results/wikipedia.py | 8 ++++++++ 3 files changed, 32 insertions(+) diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 29671cb6..9aaa88a5 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -193,3 +193,19 @@ class WikimediaArticleExtractor(WikimediaExtractor): def prepare(self, image): WikimediaExtractor.prepare(image) image["page"] = self.title + + +class WikimediaWikiExtractor(WikimediaExtractor): + """Extractor for all files on a MediaWiki instance""" + subcategory = "wiki" + pattern = BASE_PATTERN + r"/?$" + example = "https://en.wikipedia.org/" + + def __init__(self, match): + WikimediaExtractor.__init__(self, match) + + # ref: https://www.mediawiki.org/wiki/API:Allpages + self.params = { + "generator" : "allpages", + "gapnamespace": 6, # "File" namespace + } diff --git a/test/results/fandom.py b/test/results/fandom.py index c876a64c..69ce88d1 100644 --- a/test/results/fandom.py +++ b/test/results/fandom.py @@ -98,4 +98,12 @@ __tests__ = ( "#class" : wikimedia.WikimediaArticleExtractor, }, +{ + "#url" : "https://youtube.fandom.com", + "#category": ("wikimedia", "fandom-youtube", "wiki"), + "#class" : wikimedia.WikimediaWikiExtractor, + "#range" : "1-20", + "#count" : 20, +}, + ) diff --git a/test/results/wikipedia.py b/test/results/wikipedia.py index e8e8f694..f478a49d 100644 --- a/test/results/wikipedia.py +++ b/test/results/wikipedia.py @@ -50,4 +50,12 @@ __tests__ = ( "#class" : wikimedia.WikimediaArticleExtractor, }, +{ + "#url" : "https://en.wikipedia.org", + "#category": ("wikimedia", "wikipedia", "wiki"), + "#class" : wikimedia.WikimediaWikiExtractor, + "#range" : "1-10", + "#count" : 10, +}, + ) From 33fe1b68b7ea9e8e610f5d1b06b43be8c41903da Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Sun, 18 Aug 2024 13:32:26 +0800 Subject: [PATCH 3/5] [wikimedia] add 'limit' option --- docs/configuration.rst | 12 ++++++++++++ gallery_dl/extractor/wikimedia.py | 5 +++++ 2 files changed, 17 insertions(+) diff --git a/docs/configuration.rst b/docs/configuration.rst index 633c913f..9f4a81c7 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -4536,6 +4536,18 @@ Description Download video files. +extractor.wikimedia.limit +------------------------- +Type + ``integer`` +Default + ``10`` +Description + Number of results to return in a single API query. + + The value must be between 10 and 500. + + extractor.ytdl.cmdline-args --------------------------- Type diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 9aaa88a5..dbcc1e27 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -29,6 +29,8 @@ class WikimediaExtractor(BaseExtractor): self.category = "{}-{}".format( self.category, self.root.partition(".")[0].rpartition("/")[2]) + self.per_page = self.config("limit", 10) + def _init(self): api_path = self.config_instance("api-path") if api_path: @@ -179,6 +181,7 @@ class WikimediaArticleExtractor(WikimediaExtractor): "generator": "categorymembers", "gcmtitle" : path, "gcmtype" : "file", + "gcmlimit" : self.per_page, } elif prefix == "file": self.params = { @@ -187,6 +190,7 @@ class WikimediaArticleExtractor(WikimediaExtractor): else: self.params = { "generator": "images", + "gimlimit" : self.per_page, "titles" : path, } @@ -208,4 +212,5 @@ class WikimediaWikiExtractor(WikimediaExtractor): self.params = { "generator" : "allpages", "gapnamespace": 6, # "File" namespace + "gaplimit" : self.per_page, } From 5704024662ffe037650b16c9148bbe9538d0d4cd Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Sun, 18 Aug 2024 16:31:49 +0800 Subject: [PATCH 4/5] [wikimedia] improve handling of warnings and errors --- gallery_dl/extractor/wikimedia.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index dbcc1e27..97a12b05 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -87,6 +87,17 @@ class WikimediaExtractor(BaseExtractor): while True: data = self.request(url, params=params).json() + # ref: https://www.mediawiki.org/wiki/API:Errors_and_warnings + error = data.get("error") + if error: + self.log.error("%s: %s", error["code"], error["info"]) + return + # MediaWiki will emit warnings for non-fatal mistakes such as + # invalid parameter instead of raising an error + warnings = data.get("warnings") + if warnings: + self.log.debug("MediaWiki returned warnings: %s", warnings) + try: pages = data["query"]["pages"] except KeyError: From 968c04a27c4d350ec7a97122a68e1c7e611a840d Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Sun, 18 Aug 2024 16:33:13 +0800 Subject: [PATCH 5/5] [wikimedia] raise default value for 'limit' from 10 to 50 --- docs/configuration.rst | 2 +- gallery_dl/extractor/wikimedia.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 9f4a81c7..455fecf3 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -4541,7 +4541,7 @@ extractor.wikimedia.limit Type ``integer`` Default - ``10`` + ``50`` Description Number of results to return in a single API query. diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 97a12b05..7a62e01a 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -29,7 +29,7 @@ class WikimediaExtractor(BaseExtractor): self.category = "{}-{}".format( self.category, self.root.partition(".")[0].rpartition("/")[2]) - self.per_page = self.config("limit", 10) + self.per_page = self.config("limit", 50) def _init(self): api_path = self.config_instance("api-path")