[wikimedia] generalize (#1443)
- support mediawiki.org
- support mariowiki.com (#3660)
- combine code into a single extractor (use prefix as subcategory)
- handle non-wiki instances
- unescape titles
This commit is contained in:
@@ -1484,55 +1484,67 @@ Consider all listed sites to potentially be NSFW.
|
|||||||
<tr>
|
<tr>
|
||||||
<td>Wikipedia</td>
|
<td>Wikipedia</td>
|
||||||
<td>https://www.wikipedia.org/</td>
|
<td>https://www.wikipedia.org/</td>
|
||||||
<td>Articles, Categories</td>
|
<td>Articles</td>
|
||||||
<td></td>
|
<td></td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>Wiktionary</td>
|
<td>Wiktionary</td>
|
||||||
<td>https://www.wiktionary.org/</td>
|
<td>https://www.wiktionary.org/</td>
|
||||||
<td>Articles, Categories</td>
|
<td>Articles</td>
|
||||||
<td></td>
|
<td></td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>Wikiquote</td>
|
<td>Wikiquote</td>
|
||||||
<td>https://www.wikiquote.org/</td>
|
<td>https://www.wikiquote.org/</td>
|
||||||
<td>Articles, Categories</td>
|
<td>Articles</td>
|
||||||
<td></td>
|
<td></td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>Wikibooks</td>
|
<td>Wikibooks</td>
|
||||||
<td>https://www.wikibooks.org/</td>
|
<td>https://www.wikibooks.org/</td>
|
||||||
<td>Articles, Categories</td>
|
<td>Articles</td>
|
||||||
<td></td>
|
<td></td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>Wikisource</td>
|
<td>Wikisource</td>
|
||||||
<td>https://www.wikisource.org/</td>
|
<td>https://www.wikisource.org/</td>
|
||||||
<td>Articles, Categories</td>
|
<td>Articles</td>
|
||||||
<td></td>
|
<td></td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>Wikinews</td>
|
<td>Wikinews</td>
|
||||||
<td>https://www.wikinews.org/</td>
|
<td>https://www.wikinews.org/</td>
|
||||||
<td>Articles, Categories</td>
|
<td>Articles</td>
|
||||||
<td></td>
|
<td></td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>Wikiversity</td>
|
<td>Wikiversity</td>
|
||||||
<td>https://www.wikiversity.org/</td>
|
<td>https://www.wikiversity.org/</td>
|
||||||
<td>Articles, Categories</td>
|
<td>Articles</td>
|
||||||
<td></td>
|
<td></td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>Wikispecies</td>
|
<td>Wikispecies</td>
|
||||||
<td>https://species.wikimedia.org/</td>
|
<td>https://species.wikimedia.org/</td>
|
||||||
<td>Articles, Categories</td>
|
<td>Articles</td>
|
||||||
<td></td>
|
<td></td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<td>Wikimedia Commons</td>
|
<td>Wikimedia Commons</td>
|
||||||
<td>https://commons.wikimedia.org/</td>
|
<td>https://commons.wikimedia.org/</td>
|
||||||
<td>Articles, Categories</td>
|
<td>Articles</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>MediaWiki</td>
|
||||||
|
<td>https://www.mediawiki.org/</td>
|
||||||
|
<td>Articles</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>Super Mario Wiki</td>
|
||||||
|
<td>https://www.mariowiki.com/</td>
|
||||||
|
<td>Articles</td>
|
||||||
<td></td>
|
<td></td>
|
||||||
</tr>
|
</tr>
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
# it under the terms of the GNU General Public License version 2 as
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
# published by the Free Software Foundation.
|
# published by the Free Software Foundation.
|
||||||
|
|
||||||
"""Extractors for Wikimedia and Wikipedia"""
|
"""Extractors for Wikimedia sites"""
|
||||||
|
|
||||||
from .common import BaseExtractor, Message
|
from .common import BaseExtractor, Message
|
||||||
from .. import text
|
from .. import text
|
||||||
@@ -22,7 +22,41 @@ class WikimediaExtractor(BaseExtractor):
|
|||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
BaseExtractor.__init__(self, match)
|
BaseExtractor.__init__(self, match)
|
||||||
self.title = match.group(match.lastindex)
|
path = match.group(match.lastindex)
|
||||||
|
|
||||||
|
if path.startswith("wiki/"):
|
||||||
|
path = path[5:]
|
||||||
|
self.api_path = "/w/api.php"
|
||||||
|
else:
|
||||||
|
self.api_path = "/api.php"
|
||||||
|
|
||||||
|
pre, sep, _ = path.partition(":")
|
||||||
|
prefix = pre.lower() if sep else None
|
||||||
|
|
||||||
|
self.title = path = text.unquote(path)
|
||||||
|
self.subcategory = prefix
|
||||||
|
|
||||||
|
if prefix == "category":
|
||||||
|
self.params = {
|
||||||
|
"generator": "categorymembers",
|
||||||
|
"gcmtitle" : path,
|
||||||
|
"gcmtype" : "file",
|
||||||
|
}
|
||||||
|
else:
|
||||||
|
self.params = {
|
||||||
|
"generator": "images",
|
||||||
|
"titles" : path,
|
||||||
|
}
|
||||||
|
|
||||||
|
def _init(self):
|
||||||
|
api_path = self.config_instance("api-path")
|
||||||
|
if api_path:
|
||||||
|
if api_path[0] == "/":
|
||||||
|
self.api_url = self.root + api_path
|
||||||
|
else:
|
||||||
|
self.api_url = api_path
|
||||||
|
else:
|
||||||
|
self.api_url = self.root + self.api_path
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
for info in self._pagination(self.params):
|
for info in self._pagination(self.params):
|
||||||
@@ -51,9 +85,14 @@ class WikimediaExtractor(BaseExtractor):
|
|||||||
https://opendata.stackexchange.com/questions/13381
|
https://opendata.stackexchange.com/questions/13381
|
||||||
"""
|
"""
|
||||||
|
|
||||||
url = self.root + "/w/api.php"
|
url = self.api_url
|
||||||
params["action"] = "query"
|
params["action"] = "query"
|
||||||
params["format"] = "json"
|
params["format"] = "json"
|
||||||
|
params["prop"] = "imageinfo"
|
||||||
|
params["iiprop"] = (
|
||||||
|
"timestamp|user|userid|comment|canonicaltitle|url|size|"
|
||||||
|
"sha1|mime|metadata|commonmetadata|extmetadata|bitdepth"
|
||||||
|
)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
data = self.request(url, params=params).json()
|
data = self.request(url, params=params).json()
|
||||||
@@ -109,36 +148,19 @@ BASE_PATTERN = WikimediaExtractor.update({
|
|||||||
"root": "https://commons.wikimedia.org",
|
"root": "https://commons.wikimedia.org",
|
||||||
"pattern": r"commons\.wikimedia\.org",
|
"pattern": r"commons\.wikimedia\.org",
|
||||||
},
|
},
|
||||||
|
"mediawiki": {
|
||||||
|
"root": "https://www.mediawiki.org",
|
||||||
|
"pattern": r"(?:www\.)?mediawiki\.org",
|
||||||
|
},
|
||||||
|
"mariowiki": {
|
||||||
|
"root": "https://www.mariowiki.com",
|
||||||
|
"pattern": r"(?:www\.)?mariowiki\.com",
|
||||||
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
class WikimediaArticleExtractor(WikimediaExtractor):
|
class WikimediaArticleExtractor(WikimediaExtractor):
|
||||||
"""Extractor for wikimedia articles"""
|
"""Extractor for wikimedia articles"""
|
||||||
subcategory = "article"
|
subcategory = "article"
|
||||||
pattern = BASE_PATTERN + r"/wiki/(?!Category:)([^/?#]+)"
|
pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
|
||||||
example = "https://en.wikipedia.org/wiki/TITLE"
|
example = "https://en.wikipedia.org/wiki/TITLE"
|
||||||
|
|
||||||
def _init(self):
|
|
||||||
self.params = {
|
|
||||||
"generator": "images",
|
|
||||||
"titles" : self.title,
|
|
||||||
"prop" : "imageinfo",
|
|
||||||
"iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|"
|
|
||||||
"sha1|mime|metadata|commonmetadata|extmetadata|bitdepth",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class WikimediaCategoryExtractor(WikimediaExtractor):
|
|
||||||
subcategory = "category"
|
|
||||||
pattern = BASE_PATTERN + r"/wiki/(Category:[^/?#]+)"
|
|
||||||
example = "https://commons.wikimedia.org/wiki/Category:NAME"
|
|
||||||
|
|
||||||
def _init(self):
|
|
||||||
self.params = {
|
|
||||||
"generator": "categorymembers",
|
|
||||||
"gcmtitle" : self.title,
|
|
||||||
"gcmtype" : "file",
|
|
||||||
"prop" : "imageinfo",
|
|
||||||
"iiprop": "timestamp|user|userid|comment|canonicaltitle|url|size|"
|
|
||||||
"sha1|mime|metadata|commonmetadata|extmetadata|bitdepth",
|
|
||||||
}
|
|
||||||
|
|||||||
@@ -88,7 +88,9 @@ CATEGORY_MAP = {
|
|||||||
"mangapark" : "MangaPark",
|
"mangapark" : "MangaPark",
|
||||||
"mangaread" : "MangaRead",
|
"mangaread" : "MangaRead",
|
||||||
"mangasee" : "MangaSee",
|
"mangasee" : "MangaSee",
|
||||||
|
"mariowiki" : "Super Mario Wiki",
|
||||||
"mastodon.social": "mastodon.social",
|
"mastodon.social": "mastodon.social",
|
||||||
|
"mediawiki" : "MediaWiki",
|
||||||
"micmicidol" : "MIC MIC IDOL",
|
"micmicidol" : "MIC MIC IDOL",
|
||||||
"myhentaigallery": "My Hentai Gallery",
|
"myhentaigallery": "My Hentai Gallery",
|
||||||
"myportfolio" : "Adobe Portfolio",
|
"myportfolio" : "Adobe Portfolio",
|
||||||
|
|||||||
19
test/results/mariowiki.py
Normal file
19
test/results/mariowiki.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
|
# published by the Free Software Foundation.
|
||||||
|
|
||||||
|
from gallery_dl.extractor import wikimedia
|
||||||
|
|
||||||
|
|
||||||
|
__tests__ = (
|
||||||
|
{
|
||||||
|
"#url" : "https://www.mariowiki.com/Rabbit",
|
||||||
|
"#category": ("wikimedia", "wikibooks", "article"),
|
||||||
|
"#class" : wikimedia.WikimediaArticleExtractor,
|
||||||
|
"#pattern" : r"https://mario\.wiki\.gallery/images/.+",
|
||||||
|
"#count" : range(20, 50),
|
||||||
|
},
|
||||||
|
|
||||||
|
)
|
||||||
24
test/results/mediawiki.py
Normal file
24
test/results/mediawiki.py
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
|
# published by the Free Software Foundation.
|
||||||
|
|
||||||
|
from gallery_dl.extractor import wikimedia
|
||||||
|
|
||||||
|
|
||||||
|
__tests__ = (
|
||||||
|
{
|
||||||
|
"#url" : "https://www.mediawiki.org/wiki/Help:Navigation",
|
||||||
|
"#category": ("wikimedia", "mediawiki", "help"),
|
||||||
|
"#class" : wikimedia.WikimediaArticleExtractor,
|
||||||
|
"#urls" : (
|
||||||
|
"https://upload.wikimedia.org/wikipedia/commons/e/ec/OOjs_UI_icon_information-progressive.svg",
|
||||||
|
"https://upload.wikimedia.org/wikipedia/commons/6/62/PD-icon.svg",
|
||||||
|
"https://upload.wikimedia.org/wikipedia/commons/0/0e/Vector_Sidebar.png",
|
||||||
|
"https://upload.wikimedia.org/wikipedia/commons/7/77/Vector_page_tabs.png",
|
||||||
|
"https://upload.wikimedia.org/wikipedia/commons/6/6e/Vector_user_links.png",
|
||||||
|
),
|
||||||
|
},
|
||||||
|
|
||||||
|
)
|
||||||
@@ -17,7 +17,7 @@ __tests__ = (
|
|||||||
{
|
{
|
||||||
"#url" : "https://en.wikibooks.org/wiki/Category:Title",
|
"#url" : "https://en.wikibooks.org/wiki/Category:Title",
|
||||||
"#category": ("wikimedia", "wikibooks", "category"),
|
"#category": ("wikimedia", "wikibooks", "category"),
|
||||||
"#class" : wikimedia.WikimediaCategoryExtractor,
|
"#class" : wikimedia.WikimediaArticleExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ __tests__ = (
|
|||||||
{
|
{
|
||||||
"#url" : "https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro",
|
"#url" : "https://commons.wikimedia.org/wiki/Category:Network_maps_of_the_Paris_Metro",
|
||||||
"#category": ("wikimedia", "wikimediacommons", "category"),
|
"#category": ("wikimedia", "wikimediacommons", "category"),
|
||||||
"#class" : wikimedia.WikimediaCategoryExtractor,
|
"#class" : wikimedia.WikimediaArticleExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ __tests__ = (
|
|||||||
{
|
{
|
||||||
"#url" : "https://en.wikinews.org/wiki/Category:Title",
|
"#url" : "https://en.wikinews.org/wiki/Category:Title",
|
||||||
"#category": ("wikimedia", "wikinews", "category"),
|
"#category": ("wikimedia", "wikinews", "category"),
|
||||||
"#class" : wikimedia.WikimediaCategoryExtractor,
|
"#class" : wikimedia.WikimediaArticleExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -47,7 +47,7 @@ __tests__ = (
|
|||||||
{
|
{
|
||||||
"#url" : "https://en.wikipedia.org/wiki/Category:Physics",
|
"#url" : "https://en.wikipedia.org/wiki/Category:Physics",
|
||||||
"#category": ("wikimedia", "wikipedia", "category"),
|
"#category": ("wikimedia", "wikipedia", "category"),
|
||||||
"#class" : wikimedia.WikimediaCategoryExtractor,
|
"#class" : wikimedia.WikimediaArticleExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ __tests__ = (
|
|||||||
{
|
{
|
||||||
"#url" : "https://en.wikiquote.org/wiki/Category:Title",
|
"#url" : "https://en.wikiquote.org/wiki/Category:Title",
|
||||||
"#category": ("wikimedia", "wikiquote", "category"),
|
"#category": ("wikimedia", "wikiquote", "category"),
|
||||||
"#class" : wikimedia.WikimediaCategoryExtractor,
|
"#class" : wikimedia.WikimediaArticleExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ __tests__ = (
|
|||||||
{
|
{
|
||||||
"#url" : "https://en.wikisource.org/wiki/Category:Title",
|
"#url" : "https://en.wikisource.org/wiki/Category:Title",
|
||||||
"#category": ("wikimedia", "wikisource", "category"),
|
"#category": ("wikimedia", "wikisource", "category"),
|
||||||
"#class" : wikimedia.WikimediaCategoryExtractor,
|
"#class" : wikimedia.WikimediaArticleExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ __tests__ = (
|
|||||||
{
|
{
|
||||||
"#url" : "https://species.wikimedia.org/wiki/Category:Names",
|
"#url" : "https://species.wikimedia.org/wiki/Category:Names",
|
||||||
"#category": ("wikimedia", "wikispecies", "category"),
|
"#category": ("wikimedia", "wikispecies", "category"),
|
||||||
"#class" : wikimedia.WikimediaCategoryExtractor,
|
"#class" : wikimedia.WikimediaArticleExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ __tests__ = (
|
|||||||
{
|
{
|
||||||
"#url" : "https://en.wikiversity.org/wiki/Category:Title",
|
"#url" : "https://en.wikiversity.org/wiki/Category:Title",
|
||||||
"#category": ("wikimedia", "wikiversity", "category"),
|
"#category": ("wikimedia", "wikiversity", "category"),
|
||||||
"#class" : wikimedia.WikimediaCategoryExtractor,
|
"#class" : wikimedia.WikimediaArticleExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -17,7 +17,7 @@ __tests__ = (
|
|||||||
{
|
{
|
||||||
"#url" : "https://en.wiktionary.org/wiki/Category:Words",
|
"#url" : "https://en.wiktionary.org/wiki/Category:Words",
|
||||||
"#category": ("wikimedia", "wiktionary", "category"),
|
"#category": ("wikimedia", "wiktionary", "category"),
|
||||||
"#class" : wikimedia.WikimediaCategoryExtractor,
|
"#class" : wikimedia.WikimediaArticleExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user