adjust metadata types for GalleryExtractors

2019-03-01 23:13:40 +01:00
parent 13e0f2a78f
commit 26c4365baa
6 changed files with 92 additions and 89 deletions
--- a/gallery_dl/extractor/hentaifox.py
+++ b/gallery_dl/extractor/hentaifox.py
@@ -12,16 +12,20 @@ from .common import GalleryExtractor, Extractor, Message
 from .. import text
-class HentaifoxGalleryExtractor(GalleryExtractor):
+class HentaifoxBase():
-    """Extractor for image galleries on hentaifox.com"""
+    """Base class for hentaifox extractors"""
    category = "hentaifox"
    root = "https://hentaifox.com"
 class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
    """Extractor for image galleries on hentaifox.com"""
    pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
    test = ("https://hentaifox.com/gallery/56622/", {
        "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
        "count": 24,
-        "keyword": "d0df47e073e32a7752236ab151949c3820f9d81e",
+        "keyword": "38f8517605feb6854d48833297da6b05c6541b69",
    })
    root = "https://hentaifox.com"
    def __init__(self, match):
        GalleryExtractor.__init__(self, match)
@@ -30,7 +34,7 @@ class HentaifoxGalleryExtractor(GalleryExtractor):
    def metadata(self, page):
        title, pos = text.extract(page, "<h1>", "</h1>")
        data = text.extract_all(page, (
-            ("parodies"  , ">Parodies:"  , "</a></span>"),
+            ("parody"    , ">Parodies:"  , "</a></span>"),
            ("characters", ">Characters:", "</a></span>"),
            ("tags"      , ">Tags:"      , "</a></span>"),
            ("artist"    , ">Artists:"   , "</a></span>"),
@@ -39,9 +43,10 @@ class HentaifoxGalleryExtractor(GalleryExtractor):
        ), pos)[0]
        for key, value in data.items():
-            data[key] = text.remove_html(value).replace(" , ", ", ")
+            data[key] = text.split_html(value)[::2]
        data["gallery_id"] = text.parse_int(self.gallery_id)
        data["title"] = text.unescape(title)
        data["type"] = data["type"][0] if data["type"] else ""
        data["language"] = "English"
        data["lang"] = "en"
        return data
@@ -53,9 +58,8 @@ class HentaifoxGalleryExtractor(GalleryExtractor):
        ]
-class HentaifoxSearchExtractor(Extractor):
+class HentaifoxSearchExtractor(HentaifoxBase, Extractor):
    """Extractor for search results and listings on hentaifox.com"""
    category = "hentaifox"
    subcategory = "search"
    pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com"
               r"(/(?:parody|tag|artist|character|search)/[^/?%#]+)")
@@ -76,7 +80,6 @@ class HentaifoxSearchExtractor(Extractor):
            },
        }),
    )
    root = "https://hentaifox.com"
    def __init__(self, match):
        Extractor.__init__(self, match)
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -20,7 +20,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
    test = (
        ("https://hitomi.la/galleries/867789.html", {
            "url": "cb759868d090fe0e2655c3e29ebf146054322b6d",
-            "keyword": "52951edb50163180eb669a78aef0bab0522d32b7",
+            "keyword": "07536afc5696cb4983a4831ab4c70c1d155f875c",
        }),
        ("https://hitomi.la/galleries/1036181.html", {
            # "aa" subdomain for gallery-id ending in 1 (#142)
@@ -30,8 +30,8 @@ class HitomiGalleryExtractor(GalleryExtractor):
    )
    def __init__(self, match):
-        self.gid = text.parse_int(match.group(1))
+        self.gallery_id = text.parse_int(match.group(1))
-        url = "https://hitomi.la/galleries/{}.html".format(self.gid)
+        url = "https://hitomi.la/galleries/{}.html".format(self.gallery_id)
        GalleryExtractor.__init__(self, match, url)
    def metadata(self, page):
@@ -49,23 +49,22 @@ class HitomiGalleryExtractor(GalleryExtractor):
        lang = None if lang == "N/A" else text.remove_html(lang)
        return {
-            "gallery_id": self.gid,
+            "gallery_id": self.gallery_id,
-            "title": text.unescape(" ".join(title.split())),
+            "title"     : text.unescape(title.strip()),
-            "artist": self._prepare(artist),
+            "artist"    : self._prepare(artist),
-            "group": self._prepare(group),
+            "group"     : self._prepare(group),
-            "type": text.remove_html(gtype).capitalize(),
+            "parody"    : self._prepare(series),
            "lang": util.language_to_code(lang),
            "language": lang,
            "date": date,
            "series": self._prepare(series),
            "characters": self._prepare(chars),
-            "tags": self._prepare(tags),
+            "tags"      : self._prepare(tags),
            "type"      : text.remove_html(gtype).capitalize(),
            "lang"      : util.language_to_code(lang),
            "language"  : lang,
            "date"      : date,
        }
    def images(self, page):
        # see https://ltn.hitomi.la/common.js
-        frontends = 2
+        offset = self.gallery_id % 2 if self.gallery_id % 10 != 1 else 0
        offset = self.gid % frontends if self.gid % 10 != 1 else 0
        subdomain = chr(97 + offset) + "a"
        base = "https://" + subdomain + ".hitomi.la/galleries/"
@@ -78,10 +77,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
    @staticmethod
    def _prepare(value):
-        if not value or "<ul " not in value:
+        return [
-            return ""
+            text.unescape(string.capwords(v))
-        value = ", ".join(text.extract_iter(
+            for v in text.extract_iter(value or "", '.html">', '<')
-            value, '.html">', '<'))
+        ]
        return string.capwords(
            text.unescape(value)
        )
--- a/gallery_dl/extractor/nhentai.py
+++ b/gallery_dl/extractor/nhentai.py
@@ -32,6 +32,7 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
            "title_ja"  : str,
            "gallery_id": 147850,
            "media_id"  : 867789,
            "count"     : 16,
            "date"      : 1446050915,
            "scanlator" : "",
            "artist"    : ["morris"],
@@ -40,8 +41,8 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
            "characters": list,
            "tags"      : list,
            "type"      : "manga",
            "language"  : ["translated", "english"],
            "lang"      : "en",
            "language"  : "English",
            "width"     : int,
            "height"    : int,
        },
@@ -63,12 +64,11 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
        for tag in data["tags"]:
            info[tag["type"]].append(tag["name"])
        language = ""
        for language in info["language"]:
            if language != "translated":
-                lang = util.language_to_code(language)
+                language = language.capitalize()
                break
        else:
            lang = ""
        return {
            "title"     : title_en or title_ja,
@@ -84,8 +84,8 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
            "characters": info["character"],
            "tags"      : info["tag"],
            "type"      : info["category"][0] if info["category"] else "",
-            "language"  : info["language"],
+            "lang"      : util.language_to_code(language),
-            "lang"      : lang,
+            "language"  : language,
        }
    def images(self, _):
--- a/gallery_dl/extractor/pururin.py
+++ b/gallery_dl/extractor/pururin.py
@@ -20,24 +20,24 @@ class PururinGalleryExtractor(GalleryExtractor):
    test = ("https://pururin.io/gallery/38661/iowant-2", {
        "pattern": r"https://cdn.pururin.io/assets/images/data/38661/\d+\.jpg",
        "keyword": {
-            "artist": "Shoda Norihiro",
+            "title"     : "Iowant 2!!",
            "title_en"  : "Iowant 2!!",
            "title_jp"  : "",
            "gallery_id": 38661,
            "count"     : 19,
            "artist"    : ["Shoda Norihiro"],
            "group"     : ["Obsidian Order"],
            "parody"    : ["Kantai Collection"],
            "characters": ["Iowa", "Teitoku"],
            "tags"      : list,
            "type"      : "Doujinshi",
            "collection": "",
            "convention": "C92",
-            "count": 19,
+            "rating"    : float,
-            "extension": "jpg",
+            "uploader"  : "demo",
-            "gallery_id": 38661,
+            "scanlator" : "",
-            "group": "Obsidian Order",
+            "lang"      : "en",
-            "lang": "en",
+            "language"  : "English",
            "language": "English",
            "parody": "Kantai Collection",
            "rating": float,
            "scanlator": "",
            "tags": list,
            "title": "Iowant 2!!",
            "title_jp": str,
            "type": "Doujinshi",
            "uploader": "demo"
        }
    })
    root = "https://pururin.io"
@@ -74,18 +74,19 @@ class PururinGalleryExtractor(GalleryExtractor):
        self._ext = info["image_extension"]
        self._cnt = info["total_pages"]
-        for key in ("tags", "characters"):
+        for key in ("artist", "group", "parody", "tags", "characters"):
            data[key] = [
                text.unescape(item)
                for item in text.extract_iter(data[key], 'title="', '"')
            ]
-        for key in ("artist", "group", "parody", "type", "collection",
+        for key in ("type", "collection", "language", "scanlator",
-                    "language", "scanlator", "convention"):
+                    "convention"):
            data[key] = text.unescape(text.extract(
                data[key], 'title="', '"')[0] or "")
        data["gallery_id"] = text.parse_int(self.gallery_id)
-        data["title"] = info["title"]
+        data["title"] = info["title"] or info.get("j_title") or ""
        data["title_en"] = info["title"]
        data["title_jp"] = info.get("j_title") or ""
        data["uploader"] = text.remove_html(data["uploader"])
        data["rating"] = text.parse_float(data["rating"])
--- a/gallery_dl/extractor/simplyhentai.py
+++ b/gallery_dl/extractor/simplyhentai.py
@@ -23,7 +23,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
        (("https://original-work.simply-hentai.com"
          "/amazon-no-hiyaku-amazon-elixir"), {
            "url": "258289249990502c3138719cb89e995a60861e49",
-            "keyword": "468a0a3db4fc6ad7fcae0facefb9753831c0404d",
+            "keyword": "18ab9defca53dbb2aeb7965193e93e0ea125b76b",
        }),
        ("https://www.simply-hentai.com/notfound", {
            "exception": exception.GalleryDLException,
@@ -55,14 +55,14 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
        return {
            "gallery_id": text.parse_int(gid),
-            "title": text.unescape(title),
+            "title"     : text.unescape(title),
-            "series": text.remove_html(series),
+            "artist"    : text.split_html(artist),
-            "characters": ", ".join(text.split_html(chars)),
+            "parody"    : text.split_html(series),
-            "tags": text.split_html(tags),
+            "characters": text.split_html(chars),
-            "artist": ", ".join(text.split_html(artist)),
+            "tags"      : text.split_html(tags),
-            "lang": util.language_to_code(lang),
+            "lang"      : util.language_to_code(lang),
-            "language": lang,
+            "language"  : lang,
-            "date": text.remove_html(date),
+            "date"      : text.remove_html(date),
        }
    def images(self, _):
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -48,24 +48,24 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
        ("https://www.tsumino.com/Book/Info/40996", {
            "url": "84bf30a86623039fc87855680fada884dc8a1ddd",
            "keyword": {
-                "artist": "Itou Life",
+                "title"     : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
-                "characters": "Carmilla, Gudako, Gudao, Lancelot, Nightingale",
+                "title_en"  : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
-                "collection": "",
+                "title_jp"  : "シコシコ大好きナイチンゲール + 会場限定おまけ本",
                "count": 42,
                "date": "2018 June 29",
                "gallery_id": 40996,
-                "group": "Itou Life",
+                "date"      : "2018 June 29",
-                "lang": "en",
+                "count"     : 42,
-                "language": "English",
+                "collection": "",
-                "page": int,
+                "artist"    : ["Itou Life"],
-                "parodies": "Fate/Grand Order",
+                "group"     : ["Itou Life"],
-                "rating": float,
+                "parody"    : ["Fate/Grand Order"],
-                "tags": str,
+                "characters": list,
-                "thumbnail": "http://www.tsumino.com/Image/Thumb/40996",
+                "tags"      : list,
-                "title": r"re:Shikoshiko Daisuki Nightingale \+ Kaijou Gentei",
+                "type"      : "Doujinshi",
-                "title_jp": "シコシコ大好きナイチンゲール + 会場限定おまけ本",
+                "rating"    : float,
-                "type": "Doujinshi",
+                "uploader"  : "sehki",
-                "uploader": "sehki"
+                "lang"      : "en",
                "language"  : "English",
                "thumbnail" : "http://www.tsumino.com/Image/Thumb/40996",
            },
        }),
        ("https://www.tsumino.com/Read/View/45834"),
@@ -81,6 +81,8 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
        title, pos = extr(page, '"og:title" content="', '"')
        thumb, pos = extr(page, '"og:image" content="', '"', pos)
        title_en, _, title_jp = text.unescape(title).partition("/")
        title_en = title_en.strip()
        title_jp = title_jp.strip()
        uploader  , pos = extr(page, 'id="Uploader">'  , '</div>', pos)
        date      , pos = extr(page, 'id="Uploaded">'  , '</div>', pos)
@@ -95,19 +97,20 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
        return {
            "gallery_id": text.parse_int(self.gallery_id),
-            "title": title_en.strip(),
+            "title": title_en or title_jp,
-            "title_jp": title_jp.strip(),
+            "title_en": title_en,
            "title_jp": title_jp,
            "thumbnail": thumb,
            "uploader": text.remove_html(uploader),
            "date": date.strip(),
            "rating": text.parse_float(rating.partition(" ")[0]),
            "type": text.remove_html(gtype),
            "collection": text.remove_html(collection),
-            "group": text.remove_html(group),
+            "group": text.split_html(group),
-            "artist": ", ".join(text.split_html(artist)),
+            "artist": text.split_html(artist),
-            "parodies": ", ".join(text.split_html(parody)),
+            "parody": text.split_html(parody),
-            "characters": ", ".join(text.split_html(character)),
+            "characters": text.split_html(character),
-            "tags": ", ".join(text.split_html(tags)),
+            "tags": text.split_html(tags),
            "language": "English",
            "lang": "en",
        }