From ea553a1d55e8e633019a3128ef6337c54b8f9031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 18 Jan 2024 15:36:16 +0100 Subject: [PATCH] [wikimedia] generalize (#1443) - support mediawiki.org - support mariowiki.com (#3660) - combine code into a single extractor (use prefix as subcategory) - handle non-wiki instances - unescape titles --- docs/supportedsites.md | 30 ++++++++---- gallery_dl/extractor/wikimedia.py | 80 ++++++++++++++++++++----------- scripts/supportedsites.py | 2 + test/results/mariowiki.py | 19 ++++++++ test/results/mediawiki.py | 24 ++++++++++ test/results/wikibooks.py | 2 +- test/results/wikimediacommons.py | 2 +- test/results/wikinews.py | 2 +- test/results/wikipedia.py | 2 +- test/results/wikiquote.py | 2 +- test/results/wikisource.py | 2 +- test/results/wikispecies.py | 2 +- test/results/wikiversity.py | 2 +- test/results/wiktionary.py | 2 +- 14 files changed, 126 insertions(+), 47 deletions(-) create mode 100644 test/results/mariowiki.py create mode 100644 test/results/mediawiki.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index d3d2a8a3..4a6d8bd2 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1484,55 +1484,67 @@ Consider all listed sites to potentially be NSFW. 
Wikipedia https://www.wikipedia.org/ - Articles, Categories + Articles Wiktionary https://www.wiktionary.org/ - Articles, Categories + Articles Wikiquote https://www.wikiquote.org/ - Articles, Categories + Articles Wikibooks https://www.wikibooks.org/ - Articles, Categories + Articles Wikisource https://www.wikisource.org/ - Articles, Categories + Articles Wikinews https://www.wikinews.org/ - Articles, Categories + Articles Wikiversity https://www.wikiversity.org/ - Articles, Categories + Articles Wikispecies https://species.wikimedia.org/ - Articles, Categories + Articles Wikimedia Commons https://commons.wikimedia.org/ - Articles, Categories + Articles + + + + MediaWiki + https://www.mediawiki.org/ + Articles + + + + Super Mario Wiki + https://www.mariowiki.com/ + Articles diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 1a896515..ffbf950e 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -7,7 +7,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for Wikimedia and Wikipedia""" +"""Extractors for Wikimedia sites""" from .common import BaseExtractor, Message from .. 
import text @@ -22,7 +22,41 @@ class WikimediaExtractor(BaseExtractor): def __init__(self, match): BaseExtractor.__init__(self, match) - self.title = match.group(match.lastindex) + path = match.group(match.lastindex) + + if path.startswith("wiki/"): + path = path[5:] + self.api_path = "/w/api.php" + else: + self.api_path = "/api.php" + + pre, sep, _ = path.partition(":") + prefix = pre.lower() if sep else None + + self.title = path = text.unquote(path) + self.subcategory = prefix or self.subcategory + + if prefix == "category": + self.params = { + "generator": "categorymembers", + "gcmtitle" : path, + "gcmtype" : "file", + } + else: + self.params = { + "generator": "images", + "titles" : path, + } + + def _init(self): + api_path = self.config_instance("api-path") + if api_path: + if api_path[0] == "/": + self.api_url = self.root + api_path + else: + self.api_url = api_path + else: + self.api_url = self.root + self.api_path def items(self): for info in self._pagination(self.params): @@ -51,9 +85,14 @@ class WikimediaExtractor(BaseExtractor): https://opendata.stackexchange.com/questions/13381 """ - url = self.root + "/w/api.php" + url = self.api_url params["action"] = "query" params["format"] = "json" + params["prop"] = "imageinfo" + params["iiprop"] = ( + "timestamp|user|userid|comment|canonicaltitle|url|size|" + "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth" + ) while True: data = self.request(url, params=params).json() @@ -109,36 +148,19 @@ BASE_PATTERN = WikimediaExtractor.update({ "root": "https://commons.wikimedia.org", "pattern": r"commons\.wikimedia\.org", }, + "mediawiki": { + "root": "https://www.mediawiki.org", + "pattern": r"(?:www\.)?mediawiki\.org", + }, + "mariowiki": { + "root": "https://www.mariowiki.com", + "pattern": r"(?:www\.)?mariowiki\.com", + }, }) class WikimediaArticleExtractor(WikimediaExtractor): """Extractor for wikimedia articles""" subcategory = "article" - pattern = BASE_PATTERN + r"/wiki/(?!Category:)([^/?#]+)" + pattern = BASE_PATTERN + 
r"/(?!static/)([^?#]+)" example = "https://en.wikipedia.org/wiki/TITLE" - - def _init(self): - self.params = { - "generator": "images", - "titles" : self.title, - "prop" : "imageinfo", - "iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|" - "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth", - } - - -class WikimediaCategoryExtractor(WikimediaExtractor): - subcategory = "category" - pattern = BASE_PATTERN + r"/wiki/(Category:[^/?#]+)" - example = "https://commons.wikimedia.org/wiki/Category:NAME" - - def _init(self): - self.params = { - "generator": "categorymembers", - "gcmtitle" : self.title, - "gcmtype" : "file", - "prop" : "imageinfo", - "iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|" - "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth", - } diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 859cbbad..50b6e5d8 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -88,7 +88,9 @@ CATEGORY_MAP = { "mangapark" : "MangaPark", "mangaread" : "MangaRead", "mangasee" : "MangaSee", + "mariowiki" : "Super Mario Wiki", "mastodon.social": "mastodon.social", + "mediawiki" : "MediaWiki", "micmicidol" : "MIC MIC IDOL", "myhentaigallery": "My Hentai Gallery", "myportfolio" : "Adobe Portfolio", diff --git a/test/results/mariowiki.py b/test/results/mariowiki.py new file mode 100644 index 00000000..ebb8d6e6 --- /dev/null +++ b/test/results/mariowiki.py @@ -0,0 +1,19 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import wikimedia + + +__tests__ = ( +{ + "#url" : "https://www.mariowiki.com/Rabbit", + "#category": ("wikimedia", "mariowiki", "article"), + "#class" : wikimedia.WikimediaArticleExtractor, + "#pattern" : r"https://mario\.wiki\.gallery/images/.+", + "#count" : range(20, 50), +}, + +) diff --git a/test/results/mediawiki.py b/test/results/mediawiki.py new file mode 100644 index 00000000..683d0d36 --- /dev/null +++ b/test/results/mediawiki.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation.
"#url" : "https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro", "#category": ("wikimedia", "wikimediacommons", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) diff --git a/test/results/wikinews.py b/test/results/wikinews.py index 8a2af25e..79817fdb 100644 --- a/test/results/wikinews.py +++ b/test/results/wikinews.py @@ -17,7 +17,7 @@ __tests__ = ( { "#url" : "https://en.wikinews.org/wiki/Category:Title", "#category": ("wikimedia", "wikinews", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) diff --git a/test/results/wikipedia.py b/test/results/wikipedia.py index 87499878..e8e8f694 100644 --- a/test/results/wikipedia.py +++ b/test/results/wikipedia.py @@ -47,7 +47,7 @@ __tests__ = ( { "#url" : "https://en.wikipedia.org/wiki/Category:Physics", "#category": ("wikimedia", "wikipedia", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) diff --git a/test/results/wikiquote.py b/test/results/wikiquote.py index 5e6fb321..8365e3b7 100644 --- a/test/results/wikiquote.py +++ b/test/results/wikiquote.py @@ -17,7 +17,7 @@ __tests__ = ( { "#url" : "https://en.wikiquote.org/wiki/Category:Title", "#category": ("wikimedia", "wikiquote", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) diff --git a/test/results/wikisource.py b/test/results/wikisource.py index afdee23e..0ac1bb0f 100644 --- a/test/results/wikisource.py +++ b/test/results/wikisource.py @@ -17,7 +17,7 @@ __tests__ = ( { "#url" : "https://en.wikisource.org/wiki/Category:Title", "#category": ("wikimedia", "wikisource", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) diff --git a/test/results/wikispecies.py b/test/results/wikispecies.py index 
d455fbac..26aca84b 100644 --- a/test/results/wikispecies.py +++ b/test/results/wikispecies.py @@ -19,7 +19,7 @@ __tests__ = ( { "#url" : "https://species.wikimedia.org/wiki/Category:Names", "#category": ("wikimedia", "wikispecies", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) diff --git a/test/results/wikiversity.py b/test/results/wikiversity.py index 58565f49..2e64ca31 100644 --- a/test/results/wikiversity.py +++ b/test/results/wikiversity.py @@ -17,7 +17,7 @@ __tests__ = ( { "#url" : "https://en.wikiversity.org/wiki/Category:Title", "#category": ("wikimedia", "wikiversity", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, ) diff --git a/test/results/wiktionary.py b/test/results/wiktionary.py index c7a016f5..4a643ab5 100644 --- a/test/results/wiktionary.py +++ b/test/results/wiktionary.py @@ -17,7 +17,7 @@ __tests__ = ( { "#url" : "https://en.wiktionary.org/wiki/Category:Words", "#category": ("wikimedia", "wiktionary", "category"), - "#class" : wikimedia.WikimediaCategoryExtractor, + "#class" : wikimedia.WikimediaArticleExtractor, }, )