[wikimedia] generalize (#1443)

- support mediawiki.org
- support mariowiki.com (#3660)

- combine code into a single extractor
  (use prefix as subcategory)
- handle non-wiki instances
- unescape titles
This commit is contained in:
Mike Fährmann
2024-01-18 15:36:16 +01:00
parent 89066844f4
commit ea553a1d55
14 changed files with 126 additions and 47 deletions

View File

@@ -1484,55 +1484,67 @@ Consider all listed sites to potentially be NSFW.
<tr> <tr>
<td>Wikipedia</td> <td>Wikipedia</td>
<td>https://www.wikipedia.org/</td> <td>https://www.wikipedia.org/</td>
<td>Articles, Categories</td> <td>Articles</td>
<td></td> <td></td>
</tr> </tr>
<tr> <tr>
<td>Wiktionary</td> <td>Wiktionary</td>
<td>https://www.wiktionary.org/</td> <td>https://www.wiktionary.org/</td>
<td>Articles, Categories</td> <td>Articles</td>
<td></td> <td></td>
</tr> </tr>
<tr> <tr>
<td>Wikiquote</td> <td>Wikiquote</td>
<td>https://www.wikiquote.org/</td> <td>https://www.wikiquote.org/</td>
<td>Articles, Categories</td> <td>Articles</td>
<td></td> <td></td>
</tr> </tr>
<tr> <tr>
<td>Wikibooks</td> <td>Wikibooks</td>
<td>https://www.wikibooks.org/</td> <td>https://www.wikibooks.org/</td>
<td>Articles, Categories</td> <td>Articles</td>
<td></td> <td></td>
</tr> </tr>
<tr> <tr>
<td>Wikisource</td> <td>Wikisource</td>
<td>https://www.wikisource.org/</td> <td>https://www.wikisource.org/</td>
<td>Articles, Categories</td> <td>Articles</td>
<td></td> <td></td>
</tr> </tr>
<tr> <tr>
<td>Wikinews</td> <td>Wikinews</td>
<td>https://www.wikinews.org/</td> <td>https://www.wikinews.org/</td>
<td>Articles, Categories</td> <td>Articles</td>
<td></td> <td></td>
</tr> </tr>
<tr> <tr>
<td>Wikiversity</td> <td>Wikiversity</td>
<td>https://www.wikiversity.org/</td> <td>https://www.wikiversity.org/</td>
<td>Articles, Categories</td> <td>Articles</td>
<td></td> <td></td>
</tr> </tr>
<tr> <tr>
<td>Wikispecies</td> <td>Wikispecies</td>
<td>https://species.wikimedia.org/</td> <td>https://species.wikimedia.org/</td>
<td>Articles, Categories</td> <td>Articles</td>
<td></td> <td></td>
</tr> </tr>
<tr> <tr>
<td>Wikimedia Commons</td> <td>Wikimedia Commons</td>
<td>https://commons.wikimedia.org/</td> <td>https://commons.wikimedia.org/</td>
<td>Articles, Categories</td> <td>Articles</td>
<td></td>
</tr>
<tr>
<td>MediaWiki</td>
<td>https://www.mediawiki.org/</td>
<td>Articles</td>
<td></td>
</tr>
<tr>
<td>Super Mario Wiki</td>
<td>https://www.mariowiki.com/</td>
<td>Articles</td>
<td></td> <td></td>
</tr> </tr>

View File

@@ -7,7 +7,7 @@
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
"""Extractors for Wikimedia and Wikipedia""" """Extractors for Wikimedia sites"""
from .common import BaseExtractor, Message from .common import BaseExtractor, Message
from .. import text from .. import text
@@ -22,7 +22,41 @@ class WikimediaExtractor(BaseExtractor):
def __init__(self, match):
    """Derive page title, API path and query parameters from the URL.

    The last regex group of 'match' holds the URL path after the host.
    A leading "wiki/" is stripped and selects "/w/api.php" as the
    default API path; every other path selects "/api.php".

    When the path contains a ":", the lowercased text before the first
    ":" replaces the subcategory. Paths without such a prefix keep the
    subcategory defined on the class instead of overwriting it with
    None.
    """
    BaseExtractor.__init__(self, match)
    path = match.group(match.lastindex)

    if path.startswith("wiki/"):
        path = path[5:]
        self.api_path = "/w/api.php"
    else:
        self.api_path = "/api.php"

    # The prefix is taken from the still-quoted path, so a
    # percent-encoded ":" is not treated as a separator.
    pre, sep, _ = path.partition(":")
    prefix = pre.lower() if sep else None

    self.title = path = text.unquote(path)
    if prefix:
        # only override the class-level default when a prefix exists
        self.subcategory = prefix

    if prefix == "category":
        # "Category:" pages: query the files that belong to the category
        self.params = {
            "generator": "categorymembers",
            "gcmtitle" : path,
            "gcmtype"  : "file",
        }
    else:
        # any other page: query the images used on that page
        self.params = {
            "generator": "images",
            "titles"   : path,
        }

def _init(self):
api_path = self.config_instance("api-path")
if api_path:
if api_path[0] == "/":
self.api_url = self.root + api_path
else:
self.api_url = api_path
else:
self.api_url = self.root + self.api_path
def items(self): def items(self):
for info in self._pagination(self.params): for info in self._pagination(self.params):
@@ -51,9 +85,14 @@ class WikimediaExtractor(BaseExtractor):
https://opendata.stackexchange.com/questions/13381 https://opendata.stackexchange.com/questions/13381
""" """
url = self.root + "/w/api.php" url = self.api_url
params["action"] = "query" params["action"] = "query"
params["format"] = "json" params["format"] = "json"
params["prop"] = "imageinfo"
params["iiprop"] = (
"timestamp|user|userid|comment|canonicaltitle|url|size|"
"sha1|mime|metadata|commonmetadata|extmetadata|bitdepth"
)
while True: while True:
data = self.request(url, params=params).json() data = self.request(url, params=params).json()
@@ -109,36 +148,19 @@ BASE_PATTERN = WikimediaExtractor.update({
"root": "https://commons.wikimedia.org", "root": "https://commons.wikimedia.org",
"pattern": r"commons\.wikimedia\.org", "pattern": r"commons\.wikimedia\.org",
}, },
"mediawiki": {
"root": "https://www.mediawiki.org",
"pattern": r"(?:www\.)?mediawiki\.org",
},
"mariowiki": {
"root": "https://www.mariowiki.com",
"pattern": r"(?:www\.)?mariowiki\.com",
},
}) })
class WikimediaArticleExtractor(WikimediaExtractor): class WikimediaArticleExtractor(WikimediaExtractor):
"""Extractor for wikimedia articles""" """Extractor for wikimedia articles"""
subcategory = "article" subcategory = "article"
pattern = BASE_PATTERN + r"/wiki/(?!Category:)([^/?#]+)" pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
example = "https://en.wikipedia.org/wiki/TITLE" example = "https://en.wikipedia.org/wiki/TITLE"
def _init(self):
self.params = {
"generator": "images",
"titles" : self.title,
"prop" : "imageinfo",
"iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|"
"sha1|mime|metadata|commonmetadata|extmetadata|bitdepth",
}
class WikimediaCategoryExtractor(WikimediaExtractor):
subcategory = "category"
pattern = BASE_PATTERN + r"/wiki/(Category:[^/?#]+)"
example = "https://commons.wikimedia.org/wiki/Category:NAME"
def _init(self):
self.params = {
"generator": "categorymembers",
"gcmtitle" : self.title,
"gcmtype" : "file",
"prop" : "imageinfo",
"iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|"
"sha1|mime|metadata|commonmetadata|extmetadata|bitdepth",
}

View File

@@ -88,7 +88,9 @@ CATEGORY_MAP = {
"mangapark" : "MangaPark", "mangapark" : "MangaPark",
"mangaread" : "MangaRead", "mangaread" : "MangaRead",
"mangasee" : "MangaSee", "mangasee" : "MangaSee",
"mariowiki" : "Super Mario Wiki",
"mastodon.social": "mastodon.social", "mastodon.social": "mastodon.social",
"mediawiki" : "MediaWiki",
"micmicidol" : "MIC MIC IDOL", "micmicidol" : "MIC MIC IDOL",
"myhentaigallery": "My Hentai Gallery", "myhentaigallery": "My Hentai Gallery",
"myportfolio" : "Adobe Portfolio", "myportfolio" : "Adobe Portfolio",

19
test/results/mariowiki.py Normal file
View File

@@ -0,0 +1,19 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import wikimedia
__tests__ = (
{
    "#url"     : "https://www.mariowiki.com/Rabbit",
    # the second element names the mariowiki.com instance,
    # not "wikibooks" (copy-paste leftover)
    "#category": ("wikimedia", "mariowiki", "article"),
    "#class"   : wikimedia.WikimediaArticleExtractor,
    "#pattern" : r"https://mario\.wiki\.gallery/images/.+",
    "#count"   : range(20, 50),
},

)

24
test/results/mediawiki.py Normal file
View File

@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import wikimedia
# Expected results for the "mediawiki" instance (www.mediawiki.org).
__tests__ = (
{
# Page title with a "Help:" prefix.
"#url" : "https://www.mediawiki.org/wiki/Help:Navigation",
# The third element is the lowercased title prefix ("help").
"#category": ("wikimedia", "mediawiki", "help"),
"#class" : wikimedia.WikimediaArticleExtractor,
# NOTE(review): presumably the exact file URLs this page must yield;
# confirm against the test runner's handling of "#urls".
"#urls" : (
"https://upload.wikimedia.org/wikipedia/commons/e/ec/OOjs_UI_icon_information-progressive.svg",
"https://upload.wikimedia.org/wikipedia/commons/6/62/PD-icon.svg",
"https://upload.wikimedia.org/wikipedia/commons/0/0e/Vector_Sidebar.png",
"https://upload.wikimedia.org/wikipedia/commons/7/77/Vector_page_tabs.png",
"https://upload.wikimedia.org/wikipedia/commons/6/6e/Vector_user_links.png",
),
},
)

View File

@@ -17,7 +17,7 @@ __tests__ = (
{ {
"#url" : "https://en.wikibooks.org/wiki/Category:Title", "#url" : "https://en.wikibooks.org/wiki/Category:Title",
"#category": ("wikimedia", "wikibooks", "category"), "#category": ("wikimedia", "wikibooks", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor, "#class" : wikimedia.WikimediaArticleExtractor,
}, },
) )

View File

@@ -17,7 +17,7 @@ __tests__ = (
{ {
"#url" : "https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro", "#url" : "https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro",
"#category": ("wikimedia", "wikimediacommons", "category"), "#category": ("wikimedia", "wikimediacommons", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor, "#class" : wikimedia.WikimediaArticleExtractor,
}, },
) )

View File

@@ -17,7 +17,7 @@ __tests__ = (
{ {
"#url" : "https://en.wikinews.org/wiki/Category:Title", "#url" : "https://en.wikinews.org/wiki/Category:Title",
"#category": ("wikimedia", "wikinews", "category"), "#category": ("wikimedia", "wikinews", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor, "#class" : wikimedia.WikimediaArticleExtractor,
}, },
) )

View File

@@ -47,7 +47,7 @@ __tests__ = (
{ {
"#url" : "https://en.wikipedia.org/wiki/Category:Physics", "#url" : "https://en.wikipedia.org/wiki/Category:Physics",
"#category": ("wikimedia", "wikipedia", "category"), "#category": ("wikimedia", "wikipedia", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor, "#class" : wikimedia.WikimediaArticleExtractor,
}, },
) )

View File

@@ -17,7 +17,7 @@ __tests__ = (
{ {
"#url" : "https://en.wikiquote.org/wiki/Category:Title", "#url" : "https://en.wikiquote.org/wiki/Category:Title",
"#category": ("wikimedia", "wikiquote", "category"), "#category": ("wikimedia", "wikiquote", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor, "#class" : wikimedia.WikimediaArticleExtractor,
}, },
) )

View File

@@ -17,7 +17,7 @@ __tests__ = (
{ {
"#url" : "https://en.wikisource.org/wiki/Category:Title", "#url" : "https://en.wikisource.org/wiki/Category:Title",
"#category": ("wikimedia", "wikisource", "category"), "#category": ("wikimedia", "wikisource", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor, "#class" : wikimedia.WikimediaArticleExtractor,
}, },
) )

View File

@@ -19,7 +19,7 @@ __tests__ = (
{ {
"#url" : "https://species.wikimedia.org/wiki/Category:Names", "#url" : "https://species.wikimedia.org/wiki/Category:Names",
"#category": ("wikimedia", "wikispecies", "category"), "#category": ("wikimedia", "wikispecies", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor, "#class" : wikimedia.WikimediaArticleExtractor,
}, },
) )

View File

@@ -17,7 +17,7 @@ __tests__ = (
{ {
"#url" : "https://en.wikiversity.org/wiki/Category:Title", "#url" : "https://en.wikiversity.org/wiki/Category:Title",
"#category": ("wikimedia", "wikiversity", "category"), "#category": ("wikimedia", "wikiversity", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor, "#class" : wikimedia.WikimediaArticleExtractor,
}, },
) )

View File

@@ -17,7 +17,7 @@ __tests__ = (
{ {
"#url" : "https://en.wiktionary.org/wiki/Category:Words", "#url" : "https://en.wiktionary.org/wiki/Category:Words",
"#category": ("wikimedia", "wiktionary", "category"), "#category": ("wikimedia", "wiktionary", "category"),
"#class" : wikimedia.WikimediaCategoryExtractor, "#class" : wikimedia.WikimediaArticleExtractor,
}, },
) )