[exhentai] rename metadata fields to match API results (#1325)

- gallery_id -> gid - gallery_token -> token - title_jp -> title_jpn - visible -> expunged - gallery_size -> filesize - count -> filecount Also changes the function of the 'metadata' option. It is now boolean and causes extra data fields from the API to be added instead of completely replacing the data from HTML when activated.
2021-02-26 17:53:27 +01:00
parent 996bfe4d4b
commit 61fbbd2dae
2 changed files with 61 additions and 31 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -927,14 +927,15 @@ Description
 extractor.exhentai.metadata
 ---------------------------
 Type
-    ``string``
+    ``bool``
 Default
-    ``"html"``
+    ``false``
 Description
-    Select the gallery metadata source.
+    Load extended gallery metadata from the
+    `API <https://ehwiki.org/wiki/API#Gallery_Metadata>`_.

-    * ``"api"``: Get data from the `API <https://ehwiki.org/wiki/API>`_.
-    * ``"html"``: Extract data from HTML.
+    Adds ``archiver_key``, ``posted``, and ``torrents``.
+    Makes ``date`` and ``filesize`` more precise.


 extractor.exhentai.original
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -22,10 +22,10 @@ BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org"
 class ExhentaiExtractor(Extractor):
    """Base class for exhentai extractors"""
    category = "exhentai"
-    directory_fmt = ("{category}", "{gallery_id} {title[:247]}")
+    directory_fmt = ("{category}", "{gid} {title[:247]}")
    filename_fmt = (
-        "{gallery_id}_{num:>04}_{image_token}_{filename}.{extension}")
-    archive_fmt = "{gallery_id}_{num}"
+        "{gid}_{num:>04}_{image_token}_{filename}.{extension}")
+    archive_fmt = "{gid}_{num}"
    cookienames = ("ipb_member_id", "ipb_pass_hash")
    cookiedomain = ".exhentai.org"
    root = "https://exhentai.org"
@@ -131,7 +131,39 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
               r"|/s/([\da-f]{10})/(\d+)-(\d+))")
    test = (
        ("https://exhentai.org/g/1200119/d55c44d3d0/", {
-            "keyword": "199db053b4ccab94463b459e1cfe079df8cdcdd1",
+            "keyword": {
+                "cost": int,
+                "date": "dt:2018-03-18 20:15:00",
+                "eh_category": "Non-H",
+                "expunged": False,
+                "favorites": "17",
+                "filecount": "4",
+                "filesize": 1488978,
+                "gid": 1200119,
+                "height": int,
+                "image_token": "re:[0-9a-f]{10}",
+                "lang": "jp",
+                "language": "Japanese",
+                "parent": "",
+                "rating": r"re:\d\.\d+",
+                "size": int,
+                "tags": [
+                    "parody:komi-san wa komyushou desu.",
+                    "character:shouko komi",
+                    "group:seventh lowlife",
+                    "sample",
+                ],
+                "thumb": "https://exhentai.org/t/ce/0a/ce0a5bcb583229a9b07c0f8"
+                         "3bcb1630ab1350640-624622-736-1036-jpg_250.jpg",
+                "title": "C93 [Seventh_Lowlife] Komi-san ha Tokidoki Daitan de"
+                         "su (Komi-san wa Komyushou desu) [Sample]",
+                "title_jpn": "(C93) [Comiketjack (わ！)] 古見さんは、時々大胆"
+                             "です。 (古見さんは、コミュ症です。) [見本]",
+                "token": "d55c44d3d0",
+                "torrentcount": "0",
+                "uploader": "klorpa",
+                "width": int,
+            },
            "content": "e9891a4c017ed0bb734cd1efba5cd03f594d31ff",
        }),
        ("https://exhentai.org/g/960461/4f0e369d82/", {
@@ -182,6 +214,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
            gpage = self._gallery_page()

        data = self.get_metadata(gpage)
+        self.count = text.parse_int(data["filecount"])
        yield Message.Directory, data

        images = itertools.chain(
@@ -197,37 +230,38 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):

    def get_metadata(self, page):
        """Extract gallery metadata"""
-        if self.config("metadata") == "api":
-            return self.metadata_from_api()
-        return self.metadata_from_page(page)
+        data = self.metadata_from_page(page)
+        if self.config("metadata", False):
+            data.update(self.metadata_from_api())
+            data["date"] = text.parse_timestamp(data["posted"])
+        return data

    def metadata_from_page(self, page):
        extr = text.extract_from(page)
        data = {
-            "gallery_id"   : self.gallery_id,
-            "gallery_token": self.gallery_token,
+            "gid"          : self.gallery_id,
+            "token"        : self.gallery_token,
            "thumb"        : extr("background:transparent url(", ")"),
            "title"        : text.unescape(extr('<h1 id="gn">', '</h1>')),
-            "title_jp"     : text.unescape(extr('<h1 id="gj">', '</h1>')),
+            "title_jpn"    : text.unescape(extr('<h1 id="gj">', '</h1>')),
+            "_"            : extr('<div id="gdc"><div class="cs ct', '"'),
+            "eh_category"  : extr('>', '<'),
            "uploader"     : text.unquote(extr('/uploader/', '"')),
            "date"         : text.parse_datetime(extr(
                '>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"),
            "parent"       : extr(
                '>Parent:</td><td class="gdt2"><a href="', '"'),
-            "visible"      : extr(
+            "expunged"     : "Yes" != extr(
                '>Visible:</td><td class="gdt2">', '<'),
-            "language"     : extr(
-                '>Language:</td><td class="gdt2">', ' '),
-            "gallery_size" : text.parse_bytes(extr(
+            "language"     : extr('>Language:</td><td class="gdt2">', ' '),
+            "filesize"     : text.parse_bytes(extr(
                '>File Size:</td><td class="gdt2">', '<').rstrip("Bb")),
-            "count"        : text.parse_int(extr(
-                '>Length:</td><td class="gdt2">', ' ')),
-            "favorites"    : text.parse_int(extr('id="favcount">', ' ')),
-            "rating"       : text.parse_float(extr(">Average: ", "<")),
-            "torrentcount" : text.parse_int(extr('>Torrent Download (', ')')),
+            "filecount"    : extr('>Length:</td><td class="gdt2">', ' '),
+            "favorites"    : extr('id="favcount">', ' '),
+            "rating"       : extr(">Average: ", "<"),
+            "torrentcount" : extr('>Torrent Download (', ')'),
        }

-        self.count = data["count"]
        data["lang"] = util.language_to_code(data["language"])
        data["tags"] = [
            text.unquote(tag.replace("+", " "))
@@ -248,12 +282,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
        if "error" in data:
            raise exception.StopExtraction(data["error"])

-        data = data["gmetadata"][0]
-        data["eh_category"] = data["category"]
-        data["date"] = text.parse_timestamp(data["posted"])
-        self.count = data["filecount"]
-
-        return data
+        return data["gmetadata"][0]

    def image_from_page(self, page):
        """Get image url and data from webpage"""