[wikimedia] provide 'lang' metadata (#7283)

This commit is contained in:
Mike Fährmann
2025-10-23 20:42:45 +02:00
parent da4b8c1e83
commit 5e1862871e
3 changed files with 12 additions and 2 deletions

View File

@@ -25,12 +25,17 @@ class WikimediaExtractor(BaseExtractor):
self._init_category(match)
if self.category == "wikimedia":
self.category = self.root.split(".")[-2]
labels = self.root.split(".")
self.lang = labels[-3][-2:]
self.category = labels[-2]
elif self.category in ("fandom", "wikigg"):
self.lang = "en"
self.basesubcategory = self.category
self.category = (
f"{self.category}-"
f"{self.root.partition('.')[0].rpartition('/')[2]}")
else:
self.lang = ""
if useragent := self.config_instance("useragent"):
self.useragent = useragent
@@ -237,7 +242,8 @@ class WikimediaArticleExtractor(WikimediaExtractor):
path = self.groups[-1]
if path[2] == "/":
self.root = f"{self.root}/{path[:2]}"
self.lang = lang = path[:2]
self.root = f"{self.root}/{lang}"
path = path[3:]
if path.startswith("wiki/"):
path = path[5:]
@@ -272,6 +278,7 @@ class WikimediaArticleExtractor(WikimediaExtractor):
def prepare_info(self, info):
    """Record this extractor's ``path`` and ``lang`` values in ``info``.

    ``info`` is modified in place: its "page" entry is set to
    ``self.path`` and its "lang" entry to ``self.lang``.
    Nothing is returned.
    """
    info.update(page=self.path, lang=self.lang)
class WikimediaWikiExtractor(WikimediaExtractor):

View File

@@ -107,6 +107,8 @@ __tests__ = (
"#category": ("wikimedia", "fandom-discogs", "file"),
"#class" : wikimedia.WikimediaArticleExtractor,
"#results" : "https://static.wikia.nocookie.net/discogs/images/a/ab/CH-0430D2.jpg/revision/latest?cb=20241007150151&path-prefix=zh&format=original",
"lang": "zh",
},
{

View File

@@ -32,6 +32,7 @@ __tests__ = (
"extmetadata" : dict,
"filename" : str,
"height" : int,
"lang" : "en",
"metadata" : dict,
"mime" : r"re:image/\w+",
"page" : "Athena",