[wikimedia] fix 'limit' config lookup
rework config handling in general
This commit is contained in:
@@ -966,18 +966,16 @@ class BaseExtractor(Extractor):
|
|||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
if not self.category:
|
if not self.category:
|
||||||
self.groups = match.groups()
|
self._init_category(match)
|
||||||
self.match = match
|
|
||||||
self._init_category()
|
|
||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
|
|
||||||
def _init_category(self):
|
def _init_category(self, match):
|
||||||
for index, group in enumerate(self.groups):
|
for index, group in enumerate(match.groups()):
|
||||||
if group is not None:
|
if group is not None:
|
||||||
if index:
|
if index:
|
||||||
self.category, self.root, info = self.instances[index-1]
|
self.category, self.root, info = self.instances[index-1]
|
||||||
if not self.root:
|
if not self.root:
|
||||||
self.root = text.root_from_url(self.match[0])
|
self.root = text.root_from_url(match[0])
|
||||||
self.config_instance = info.get
|
self.config_instance = info.get
|
||||||
else:
|
else:
|
||||||
self.root = group
|
self.root = group
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ class WikimediaExtractor(BaseExtractor):
|
|||||||
request_interval = (1.0, 2.0)
|
request_interval = (1.0, 2.0)
|
||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
BaseExtractor.__init__(self, match)
|
self._init_category(match)
|
||||||
|
|
||||||
if self.category == "wikimedia":
|
if self.category == "wikimedia":
|
||||||
self.category = self.root.split(".")[-2]
|
self.category = self.root.split(".")[-2]
|
||||||
@@ -32,16 +32,15 @@ class WikimediaExtractor(BaseExtractor):
|
|||||||
f"{self.category}-"
|
f"{self.category}-"
|
||||||
f"{self.root.partition('.')[0].rpartition('/')[2]}")
|
f"{self.root.partition('.')[0].rpartition('/')[2]}")
|
||||||
|
|
||||||
self.per_page = self.config("limit", 50)
|
|
||||||
self.subcategories = False
|
|
||||||
|
|
||||||
if useragent := self.config_instance("useragent"):
|
if useragent := self.config_instance("useragent"):
|
||||||
self.useragent = useragent
|
self.useragent = useragent
|
||||||
|
|
||||||
|
BaseExtractor.__init__(self, match)
|
||||||
|
|
||||||
def _init(self):
|
def _init(self):
|
||||||
if api_path := self.config_instance("api-path"):
|
if api_path := self.config_instance("api-path"):
|
||||||
if api_path[0] == "/":
|
if api_path[0] == "/":
|
||||||
self.api_url = self.root + api_path
|
self.api_url = f"{self.root}{api_path}"
|
||||||
else:
|
else:
|
||||||
self.api_url = api_path
|
self.api_url = api_path
|
||||||
else:
|
else:
|
||||||
@@ -53,12 +52,14 @@ class WikimediaExtractor(BaseExtractor):
|
|||||||
# https://www.mediawiki.org/wiki/API:Imageinfo
|
# https://www.mediawiki.org/wiki/API:Imageinfo
|
||||||
self.image_revisions = self.config("image-revisions", 1)
|
self.image_revisions = self.config("image-revisions", 1)
|
||||||
self.format = self.config("format", "original")
|
self.format = self.config("format", "original")
|
||||||
|
self.per_page = self.config("limit", 50)
|
||||||
|
self.subcategories = False
|
||||||
|
|
||||||
@cache(maxage=36500*86400, keyarg=1)
|
@cache(maxage=36500*86400, keyarg=1)
|
||||||
def _search_api_path(self, root):
|
def _search_api_path(self, root):
|
||||||
self.log.debug("Probing possible API endpoints")
|
self.log.debug("Probing possible API endpoints")
|
||||||
for path in ("/api.php", "/w/api.php", "/wiki/api.php"):
|
for path in ("/api.php", "/w/api.php", "/wiki/api.php"):
|
||||||
url = root + path
|
url = f"{root}{path}"
|
||||||
response = self.request(url, method="HEAD", fatal=None)
|
response = self.request(url, method="HEAD", fatal=None)
|
||||||
if response.status_code < 400:
|
if response.status_code < 400:
|
||||||
return url
|
return url
|
||||||
@@ -86,7 +87,9 @@ class WikimediaExtractor(BaseExtractor):
|
|||||||
f"format={self.format}")
|
f"format={self.format}")
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
for info in self._pagination(self.params):
|
params = self.params()
|
||||||
|
|
||||||
|
for info in self._pagination(params):
|
||||||
try:
|
try:
|
||||||
images = info.pop("imageinfo")
|
images = info.pop("imageinfo")
|
||||||
except KeyError:
|
except KeyError:
|
||||||
@@ -112,10 +115,10 @@ class WikimediaExtractor(BaseExtractor):
|
|||||||
yield Message.Url, image["url"], image
|
yield Message.Url, image["url"], image
|
||||||
|
|
||||||
if self.subcategories:
|
if self.subcategories:
|
||||||
base = self.root + "/wiki/"
|
base = f"{self.root}/wiki/"
|
||||||
self.params["gcmtype"] = "subcat"
|
params["gcmtype"] = "subcat"
|
||||||
for subcat in self._pagination(self.params):
|
for subcat in self._pagination(params):
|
||||||
url = base + subcat["title"].replace(" ", "_")
|
url = f"{base}{subcat['title'].replace(' ', '_')}"
|
||||||
subcat["_extractor"] = WikimediaArticleExtractor
|
subcat["_extractor"] = WikimediaArticleExtractor
|
||||||
yield Message.Queue, url, subcat
|
yield Message.Queue, url, subcat
|
||||||
|
|
||||||
@@ -234,40 +237,41 @@ class WikimediaArticleExtractor(WikimediaExtractor):
|
|||||||
|
|
||||||
path = self.groups[-1]
|
path = self.groups[-1]
|
||||||
if path[2] == "/":
|
if path[2] == "/":
|
||||||
self.root = self.root + "/" + path[:2]
|
self.root = f"{self.root}/{path[:2]}"
|
||||||
path = path[3:]
|
path = path[3:]
|
||||||
if path.startswith("wiki/"):
|
if path.startswith("wiki/"):
|
||||||
path = path[5:]
|
path = path[5:]
|
||||||
|
self.path = text.unquote(path)
|
||||||
|
|
||||||
pre, sep, _ = path.partition(":")
|
pre, sep, _ = path.partition(":")
|
||||||
prefix = pre.lower() if sep else None
|
self.prefix = prefix = pre.lower() if sep else None
|
||||||
|
if prefix is not None:
|
||||||
self.title = path = text.unquote(path)
|
|
||||||
if prefix:
|
|
||||||
self.subcategory = prefix
|
self.subcategory = prefix
|
||||||
|
|
||||||
if prefix == "category":
|
def params(self):
|
||||||
|
if self.prefix == "category":
|
||||||
if self.config("subcategories", True):
|
if self.config("subcategories", True):
|
||||||
self.subcategories = True
|
self.subcategories = True
|
||||||
self.params = {
|
return {
|
||||||
"generator": "categorymembers",
|
"generator": "categorymembers",
|
||||||
"gcmtitle" : path,
|
"gcmtitle" : self.path,
|
||||||
"gcmtype" : "file",
|
"gcmtype" : "file",
|
||||||
"gcmlimit" : self.per_page,
|
"gcmlimit" : self.per_page,
|
||||||
}
|
}
|
||||||
elif prefix == "file":
|
|
||||||
self.params = {
|
if self.prefix == "file":
|
||||||
"titles" : path,
|
return {
|
||||||
}
|
"titles": self.path,
|
||||||
else:
|
|
||||||
self.params = {
|
|
||||||
"generator": "images",
|
|
||||||
"gimlimit" : self.per_page,
|
|
||||||
"titles" : path,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
"generator": "images",
|
||||||
|
"gimlimit" : self.per_page,
|
||||||
|
"titles" : self.path,
|
||||||
|
}
|
||||||
|
|
||||||
def prepare_info(self, info):
|
def prepare_info(self, info):
|
||||||
info["page"] = self.title
|
info["page"] = self.path
|
||||||
|
|
||||||
|
|
||||||
class WikimediaWikiExtractor(WikimediaExtractor):
|
class WikimediaWikiExtractor(WikimediaExtractor):
|
||||||
@@ -276,11 +280,9 @@ class WikimediaWikiExtractor(WikimediaExtractor):
|
|||||||
pattern = rf"{BASE_PATTERN}/?$"
|
pattern = rf"{BASE_PATTERN}/?$"
|
||||||
example = "https://en.wikipedia.org/"
|
example = "https://en.wikipedia.org/"
|
||||||
|
|
||||||
def __init__(self, match):
|
def params(self):
|
||||||
WikimediaExtractor.__init__(self, match)
|
|
||||||
|
|
||||||
# ref: https://www.mediawiki.org/wiki/API:Allpages
|
# ref: https://www.mediawiki.org/wiki/API:Allpages
|
||||||
self.params = {
|
return {
|
||||||
"generator" : "allpages",
|
"generator" : "allpages",
|
||||||
"gapnamespace": 6, # "File" namespace
|
"gapnamespace": 6, # "File" namespace
|
||||||
"gaplimit" : self.per_page,
|
"gaplimit" : self.per_page,
|
||||||
|
|||||||
Reference in New Issue
Block a user