280 lines
8.9 KiB
Python
280 lines
8.9 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright 2022 Ailothaen
|
|
# Copyright 2024-2025 Mike Fährmann
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
# published by the Free Software Foundation.
|
|
|
|
"""Extractors for Wikimedia sites"""
|
|
|
|
from .common import BaseExtractor, Message
|
|
from .. import text, exception
|
|
from ..cache import cache
|
|
|
|
|
|
class WikimediaExtractor(BaseExtractor):
|
|
"""Base class for wikimedia extractors"""
|
|
basecategory = "wikimedia"
|
|
filename_fmt = "{filename} ({sha1[:8]}).{extension}"
|
|
archive_fmt = "{sha1}"
|
|
request_interval = (1.0, 2.0)
|
|
|
|
def __init__(self, match):
|
|
BaseExtractor.__init__(self, match)
|
|
|
|
if self.category == "wikimedia":
|
|
self.category = self.root.split(".")[-2]
|
|
elif self.category in ("fandom", "wikigg"):
|
|
self.category = (
|
|
f"{self.category}-"
|
|
f"{self.root.partition('.')[0].rpartition('/')[2]}")
|
|
|
|
self.per_page = self.config("limit", 50)
|
|
self.subcategories = False
|
|
|
|
if useragent := self.config_instance("useragent"):
|
|
self.useragent = useragent
|
|
|
|
def _init(self):
|
|
if api_path := self.config_instance("api-path"):
|
|
if api_path[0] == "/":
|
|
self.api_url = self.root + api_path
|
|
else:
|
|
self.api_url = api_path
|
|
else:
|
|
self.api_url = None
|
|
|
|
# note: image revisions are different from page revisions
|
|
# ref:
|
|
# https://www.mediawiki.org/wiki/API:Revisions
|
|
# https://www.mediawiki.org/wiki/API:Imageinfo
|
|
self.image_revisions = self.config("image-revisions", 1)
|
|
|
|
@cache(maxage=36500*86400, keyarg=1)
|
|
def _search_api_path(self, root):
|
|
self.log.debug("Probing possible API endpoints")
|
|
for path in ("/api.php", "/w/api.php", "/wiki/api.php"):
|
|
url = root + path
|
|
response = self.request(url, method="HEAD", fatal=None)
|
|
if response.status_code < 400:
|
|
return url
|
|
raise exception.AbortExtraction("Unable to find API endpoint")
|
|
|
|
def prepare_info(self, info):
|
|
"""Adjust the content of an image info object"""
|
|
|
|
def prepare_image(self, image):
|
|
"""Adjust the content of an image object"""
|
|
image["metadata"] = {
|
|
m["name"]: m["value"]
|
|
for m in image["metadata"] or ()}
|
|
image["commonmetadata"] = {
|
|
m["name"]: m["value"]
|
|
for m in image["commonmetadata"] or ()}
|
|
|
|
text.nameext_from_url(image["canonicaltitle"].partition(":")[2], image)
|
|
image["date"] = self.parse_datetime_iso(image["timestamp"])
|
|
|
|
def items(self):
|
|
for info in self._pagination(self.params):
|
|
try:
|
|
images = info.pop("imageinfo")
|
|
except KeyError:
|
|
self.log.debug("Missing 'imageinfo' for %s", info)
|
|
images = ()
|
|
|
|
info["count"] = len(images)
|
|
self.prepare_info(info)
|
|
yield Message.Directory, info
|
|
|
|
num = 0
|
|
for image in images:
|
|
# https://www.mediawiki.org/wiki/Release_notes/1.34
|
|
if "filemissing" in image:
|
|
self.log.warning(
|
|
"File %s (or its revision) is missing",
|
|
image["canonicaltitle"].partition(":")[2])
|
|
continue
|
|
num += 1
|
|
image["num"] = num
|
|
self.prepare_image(image)
|
|
image.update(info)
|
|
yield Message.Url, image["url"], image
|
|
|
|
if self.subcategories:
|
|
base = self.root + "/wiki/"
|
|
self.params["gcmtype"] = "subcat"
|
|
for subcat in self._pagination(self.params):
|
|
url = base + subcat["title"].replace(" ", "_")
|
|
subcat["_extractor"] = WikimediaArticleExtractor
|
|
yield Message.Queue, url, subcat
|
|
|
|
def _pagination(self, params):
|
|
"""
|
|
https://www.mediawiki.org/wiki/API:Query
|
|
https://opendata.stackexchange.com/questions/13381
|
|
"""
|
|
|
|
url = self.api_url
|
|
if not url:
|
|
url = self._search_api_path(self.root)
|
|
|
|
params["action"] = "query"
|
|
params["format"] = "json"
|
|
params["prop"] = "imageinfo"
|
|
params["iiprop"] = (
|
|
"timestamp|user|userid|comment|canonicaltitle|url|size|"
|
|
"sha1|mime|metadata|commonmetadata|extmetadata|bitdepth"
|
|
)
|
|
params["iilimit"] = self.image_revisions
|
|
|
|
while True:
|
|
data = self.request_json(url, params=params)
|
|
|
|
# ref: https://www.mediawiki.org/wiki/API:Errors_and_warnings
|
|
if error := data.get("error"):
|
|
self.log.error("%s: %s", error["code"], error["info"])
|
|
return
|
|
# MediaWiki will emit warnings for non-fatal mistakes such as
|
|
# invalid parameter instead of raising an error
|
|
if warnings := data.get("warnings"):
|
|
self.log.debug("MediaWiki returned warnings: %s", warnings)
|
|
|
|
try:
|
|
pages = data["query"]["pages"]
|
|
except KeyError:
|
|
pass
|
|
else:
|
|
yield from pages.values()
|
|
|
|
try:
|
|
continuation = data["continue"]
|
|
except KeyError:
|
|
break
|
|
params.update(continuation)
|
|
|
|
|
|
BASE_PATTERN = WikimediaExtractor.update({
|
|
"wikimedia": {
|
|
"root": None,
|
|
"pattern": r"[a-z]{2,}\."
|
|
r"wik(?:i(?:pedia|quote|books|source|news|versity|data"
|
|
r"|voyage)|tionary)"
|
|
r"\.org",
|
|
"api-path": "/w/api.php",
|
|
},
|
|
"wikispecies": {
|
|
"root": "https://species.wikimedia.org",
|
|
"pattern": r"species\.wikimedia\.org",
|
|
"api-path": "/w/api.php",
|
|
},
|
|
"wikimediacommons": {
|
|
"root": "https://commons.wikimedia.org",
|
|
"pattern": r"commons\.wikimedia\.org",
|
|
"api-path": "/w/api.php",
|
|
},
|
|
"mediawiki": {
|
|
"root": "https://www.mediawiki.org",
|
|
"pattern": r"(?:www\.)?mediawiki\.org",
|
|
"api-path": "/w/api.php",
|
|
},
|
|
"fandom": {
|
|
"root": None,
|
|
"pattern": r"[\w-]+\.fandom\.com",
|
|
"api-path": "/api.php",
|
|
},
|
|
"wikigg": {
|
|
"root": None,
|
|
"pattern": r"\w+\.wiki\.gg",
|
|
"api-path": "/api.php",
|
|
},
|
|
"mariowiki": {
|
|
"root": "https://www.mariowiki.com",
|
|
"pattern": r"(?:www\.)?mariowiki\.com",
|
|
"api-path": "/api.php",
|
|
},
|
|
"bulbapedia": {
|
|
"root": "https://bulbapedia.bulbagarden.net",
|
|
"pattern": r"(?:bulbapedia|archives)\.bulbagarden\.net",
|
|
"api-path": "/w/api.php",
|
|
},
|
|
"pidgiwiki": {
|
|
"root": "https://www.pidgi.net",
|
|
"pattern": r"(?:www\.)?pidgi\.net",
|
|
"api-path": "/wiki/api.php",
|
|
},
|
|
"azurlanewiki": {
|
|
"root": "https://azurlane.koumakan.jp",
|
|
"pattern": r"azurlane\.koumakan\.jp",
|
|
"api-path": "/w/api.php",
|
|
"useragent": "Googlebot-Image/1.0",
|
|
},
|
|
})
|
|
|
|
|
|
class WikimediaArticleExtractor(WikimediaExtractor):
|
|
"""Extractor for wikimedia articles"""
|
|
subcategory = "article"
|
|
directory_fmt = ("{category}", "{page}")
|
|
pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
|
|
example = "https://en.wikipedia.org/wiki/TITLE"
|
|
|
|
def __init__(self, match):
|
|
WikimediaExtractor.__init__(self, match)
|
|
|
|
path = self.groups[-1]
|
|
if path[2] == "/":
|
|
self.root = self.root + "/" + path[:2]
|
|
path = path[3:]
|
|
if path.startswith("wiki/"):
|
|
path = path[5:]
|
|
|
|
pre, sep, _ = path.partition(":")
|
|
prefix = pre.lower() if sep else None
|
|
|
|
self.title = path = text.unquote(path)
|
|
if prefix:
|
|
self.subcategory = prefix
|
|
|
|
if prefix == "category":
|
|
if self.config("subcategories", True):
|
|
self.subcategories = True
|
|
self.params = {
|
|
"generator": "categorymembers",
|
|
"gcmtitle" : path,
|
|
"gcmtype" : "file",
|
|
"gcmlimit" : self.per_page,
|
|
}
|
|
elif prefix == "file":
|
|
self.params = {
|
|
"titles" : path,
|
|
}
|
|
else:
|
|
self.params = {
|
|
"generator": "images",
|
|
"gimlimit" : self.per_page,
|
|
"titles" : path,
|
|
}
|
|
|
|
def prepare_info(self, info):
|
|
info["page"] = self.title
|
|
|
|
|
|
class WikimediaWikiExtractor(WikimediaExtractor):
|
|
"""Extractor for all files on a MediaWiki instance"""
|
|
subcategory = "wiki"
|
|
pattern = BASE_PATTERN + r"/?$"
|
|
example = "https://en.wikipedia.org/"
|
|
|
|
def __init__(self, match):
|
|
WikimediaExtractor.__init__(self, match)
|
|
|
|
# ref: https://www.mediawiki.org/wiki/API:Allpages
|
|
self.params = {
|
|
"generator" : "allpages",
|
|
"gapnamespace": 6, # "File" namespace
|
|
"gaplimit" : self.per_page,
|
|
}
|