[hitomi] restore metadata fields from before f33b13a

... and add a 'metadata' option to disable visiting the gallery page and extracting data from it if this is not needed.
2020-03-12 23:39:29 +01:00
parent 2d5703c493
commit 59edcdc822
3 changed files with 64 additions and 13 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -790,6 +790,17 @@ Description The name of the preferred animation format, which can be one of
 =========== =====


+extractor.hitomi.metadata
+-------------------------
+=========== =====
+Type        ``bool``
+Default     ``true``
+Description Try to extract
+            ``artist``, ``group``, ``parody``, and ``characters``
+            metadata.
+=========== =====
+
+
 extractor.imgur.mp4
 -------------------
 =========== =====
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -63,6 +63,10 @@
        {
            "format": "mp4"
        },
+        "hitomi":
+        {
+            "metadata": true
+        },
        "idolcomplex":
        {
            "username": null,
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -10,6 +10,7 @@

 from .common import GalleryExtractor
 from .. import text, util
+import string
 import json


@@ -23,7 +24,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
    test = (
        ("https://hitomi.la/galleries/867789.html", {
            "pattern": r"https://[a-c]a.hitomi.la/images/./../[0-9a-f]+.jpg",
-            "keyword": "3314105a0b344ea1461c43257b14b0de415b88bb",
+            "keyword": "6701f8f588f119ef84cd29bdf99a399417b0a6a2",
            "count": 16,
        }),
        # download test
@@ -57,40 +58,75 @@ class HitomiGalleryExtractor(GalleryExtractor):
        gid = match.group(1)
        url = "https://ltn.hitomi.la/galleries/{}.js".format(gid)
        GalleryExtractor.__init__(self, match, url)
-        self.data = None
+        self.info = None
        self.session.headers["Referer"] = "{}/reader/{}.html".format(
            self.root, gid)

    def metadata(self, page):
-        self.data = data = json.loads(page.partition("=")[2])
+        self.info = info = json.loads(page.partition("=")[2])

-        language = data.get("language")
+        data = self._data_from_gallery_info(info)
+        if self.config("metadata", True):
+            data.update(self._data_from_gallery_page(info))
+        return data
+
+    def _data_from_gallery_info(self, info):
+        language = info.get("language")
        if language:
            language = language.capitalize()

        tags = []
-        for tinfo in data["tags"]:
+        for tinfo in info["tags"]:
            tag = tinfo["tag"]
            if tinfo.get("female"):
-                tag = "female:" + tag
+                tag += " ♀"
            elif tinfo.get("male"):
-                tag = "male:" + tag
-            tags.append(tag)
+                tag += " ♂"
+            tags.append(string.capwords(tag))

        return {
-            "gallery_id": text.parse_int(data["id"]),
-            "title"     : data["title"],
-            "type"      : data["type"],
+            "gallery_id": text.parse_int(info["id"]),
+            "title"     : info["title"],
+            "type"      : info["type"].capitalize(),
            "language"  : language,
            "lang"      : util.language_to_code(language),
            "tags"      : tags,
            "date"      : text.parse_datetime(
-                data["date"] + ":00", "%Y-%m-%d %H:%M:%S%z"),
+                info["date"] + ":00", "%Y-%m-%d %H:%M:%S%z"),
+        }
+
+    def _data_from_gallery_page(self, info):
+        url = "{}/galleries/{}.html".format(self.root, info["id"])
+
+        # follow redirects
+        while True:
+            response = self.request(url, fatal=False)
+            if b"<title>Redirect</title>" not in response.content:
+                break
+            url = text.extract(response.text, "href='", "'")[0]
+            if not url.startswith("http"):
+                url = text.urljoin(self.root, url)
+
+        if response.status_code >= 400:
+            return {}
+
+        def prep(value):
+            return [
+                text.unescape(string.capwords(v))
+                for v in text.extract_iter(value or "", '.html">', '<')
+            ]
+
+        extr = text.extract_from(response.text)
+        return {
+            "artist"    : prep(extr('<h2>', '</h2>')),
+            "group"     : prep(extr('<td>Group</td><td>', '</td>')),
+            "parody"    : prep(extr('<td>Series</td><td>', '</td>')),
+            "characters": prep(extr('<td>Characters</td><td>', '</td>')),
        }

    def images(self, _):
        result = []
-        for image in self.data["files"]:
+        for image in self.info["files"]:
            ihash = image["hash"]
            idata = text.nameext_from_url(image["name"])