[wikimedia] refactor

2024-08-18 12:38:31 +08:00
parent 8ea75202ed
commit a3b2c88fbe
1 changed files with 52 additions and 42 deletions
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -17,13 +17,11 @@ class WikimediaExtractor(BaseExtractor):
    """Base class for wikimedia extractors"""
    basecategory = "wikimedia"
    filename_fmt = "{filename} ({sha1[:8]}).{extension}"
-    directory_fmt = ("{category}", "{page}")
    archive_fmt = "{sha1}"
    request_interval = (1.0, 2.0)

    def __init__(self, match):
        BaseExtractor.__init__(self, match)
-        path = match.group(match.lastindex)

        if self.category == "wikimedia":
            self.category = self.root.split(".")[-2]
@@ -31,32 +29,6 @@ class WikimediaExtractor(BaseExtractor):
            self.category = "{}-{}".format(
                self.category, self.root.partition(".")[0].rpartition("/")[2])

-        if path.startswith("wiki/"):
-            path = path[5:]
-
-        pre, sep, _ = path.partition(":")
-        prefix = pre.lower() if sep else None
-
-        self.title = path = text.unquote(path)
-        if prefix:
-            self.subcategory = prefix
-
-        if prefix == "category":
-            self.params = {
-                "generator": "categorymembers",
-                "gcmtitle" : path,
-                "gcmtype"  : "file",
-            }
-        elif prefix == "file":
-            self.params = {
-                "titles"   : path,
-            }
-        else:
-            self.params = {
-                "generator": "images",
-                "titles"   : path,
-            }
-
    def _init(self):
        api_path = self.config_instance("api-path")
        if api_path:
@@ -67,6 +39,22 @@ class WikimediaExtractor(BaseExtractor):
        else:
            self.api_url = self.root + "/api.php"

+    @staticmethod
+    def prepare(image):
+        """Adjust the content of a image object"""
+        image["metadata"] = {
+            m["name"]: m["value"]
+            for m in image["metadata"] or ()}
+        image["commonmetadata"] = {
+            m["name"]: m["value"]
+            for m in image["commonmetadata"] or ()}
+
+        filename = image["canonicaltitle"]
+        image["filename"], _, image["extension"] = \
+            filename.partition(":")[2].rpartition(".")
+        image["date"] = text.parse_datetime(
+            image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
+
    def items(self):
        for info in self._pagination(self.params):
            try:
@@ -75,20 +63,7 @@ class WikimediaExtractor(BaseExtractor):
                self.log.debug("Missing 'imageinfo' for %s", info)
                continue

-            image["metadata"] = {
-                m["name"]: m["value"]
-                for m in image["metadata"] or ()}
-            image["commonmetadata"] = {
-                m["name"]: m["value"]
-                for m in image["commonmetadata"] or ()}
-
-            filename = image["canonicaltitle"]
-            image["filename"], _, image["extension"] = \
-                filename.partition(":")[2].rpartition(".")
-            image["date"] = text.parse_datetime(
-                image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
-            image["page"] = self.title
-
+            self.prepare(image)
            yield Message.Directory, image
            yield Message.Url, image["url"], image

@@ -181,5 +156,40 @@ BASE_PATTERN = WikimediaExtractor.update({
 class WikimediaArticleExtractor(WikimediaExtractor):
    """Extractor for wikimedia articles"""
    subcategory = "article"
+    directory_fmt = ("{category}", "{page}")
    pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
    example = "https://en.wikipedia.org/wiki/TITLE"
+
+    def __init__(self, match):
+        WikimediaExtractor.__init__(self, match)
+
+        path = match.group(match.lastindex)
+        if path.startswith("wiki/"):
+            path = path[5:]
+
+        pre, sep, _ = path.partition(":")
+        prefix = pre.lower() if sep else None
+
+        self.title = path = text.unquote(path)
+        if prefix:
+            self.subcategory = prefix
+
+        if prefix == "category":
+            self.params = {
+                "generator": "categorymembers",
+                "gcmtitle" : path,
+                "gcmtype"  : "file",
+            }
+        elif prefix == "file":
+            self.params = {
+                "titles"   : path,
+            }
+        else:
+            self.params = {
+                "generator": "images",
+                "titles"   : path,
+            }
+
+    def prepare(self, image):
+        WikimediaExtractor.prepare(image)
+        image["page"] = self.title