[imhentai] add support (#1660 #3046 #3824 #4338 #5936)

2025-02-10 21:42:07 +01:00
parent be77465e1b
commit 55034d9638
5 changed files with 258 additions and 0 deletions
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -439,6 +439,12 @@ Consider all listed sites to potentially be NSFW.
    <td>Albums, Favorites, Favorites Folders, Galleries, individual Images, Search Results, Subreddits, Tag Searches, User Profiles</td>
    <td></td>
 </tr>
+<tr>
+    <td>IMHentai</td>
+    <td>https://imhentai.xxx/</td>
+    <td>Galleries, Search Results, Tag Searches</td>
+    <td></td>
+</tr>
 <tr>
    <td>Imxto</td>
    <td>https://imx.to/</td>
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -80,6 +80,7 @@ modules = [
    "imgbox",
    "imgth",
    "imgur",
+    "imhentai",
    "inkbunny",
    "instagram",
    "issuu",
--- a/gallery_dl/extractor/imhentai.py
+++ b/gallery_dl/extractor/imhentai.py
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://imhentai.xxx/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, util
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?imhentai\.xxx"
+
+
+class ImhentaiExtractor(Extractor):  # shared base for the imhentai extractors below
+    category = "imhentai"
+    root = "https://imhentai.xxx"
+
+    def _pagination(self, url):  # generator: one Message.Queue item per gallery link, page after page
+        base = self.root + "/gallery/"
+        data = {"_extractor": ImhentaiGalleryExtractor}  # attached to every queued URL
+
+        while True:
+            page = self.request(url).text
+            extr = text.extract_from(page)
+
+            while True:
+                gallery_id = extr('<a href="/gallery/', '"')
+                if not gallery_id:  # falsy result ends the scan of this page
+                    break
+                yield Message.Queue, base + gallery_id, data
+                extr('<a href="/gallery/', '"')  # skip duplicate GIDs; assumes each gallery is linked twice in a row - TODO confirm
+
+            href = text.rextract(page, "class='page-link' href='", "'")[0]  # presumably the last page-link in the page - TODO confirm
+            if not href or href == "#":  # no link found, or a "#" placeholder: stop paginating
+                return
+            url = text.ensure_http_scheme(href)  # NOTE(review): a root-relative href is presumably not resolved against self.root - confirm
+
+
+class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor):
+    """Extractor for imhentai galleries"""
+    pattern = BASE_PATTERN + r"/(?:gallery|view)/(\d+)"  # group 1: numeric gallery ID
+    example = "https://imhentai.xxx/gallery/12345/"
+
+    def __init__(self, match):
+        self.gallery_id = match.group(1)
+        url = "{}/gallery/{}/".format(self.root, self.gallery_id)  # /view/ URLs are mapped to the /gallery/ page too
+        GalleryExtractor.__init__(self, match, url)
+
+    def metadata(self, page):  # builds the gallery-level metadata dict from the gallery page HTML
+        extr = text.extract_from(page)  # NOTE(review): presumably consumes the page sequentially, so field order below matters - confirm
+
+        data = {
+            "gallery_id": text.parse_int(self.gallery_id),
+            "title"     : text.unescape(extr("<h1>", "<")),
+            "title_alt" : text.unescape(extr('class="subtitle">', "<")),
+            "parody"    : self._split(extr(">Parodies:</span>", "</li>")),
+            "character" : self._split(extr(">Characters:</span>", "</li>")),
+            "tags"      : self._split(extr(">Tags:</span>", "</li>")),
+            "artist"    : self._split(extr(">Artists:</span>", "</li>")),
+            "group"     : self._split(extr(">Groups:</span>", "</li>")),
+            "language"  : self._split(extr(">Languages:</span>", "</li>")),
+            "type"      : text.remove_html(extr(">Category:</span>", "<span")),
+        }
+
+        if data["language"]:  # "lang" is only set when at least one language was listed
+            data["lang"] = util.language_to_code(data["language"][0])  # derived from the first listed language only
+
+        return data
+
+    def _split(self, html):  # turns an HTML fragment of links into a list of plain names
+        results = []
+        for tag in text.extract_iter(html, ">", "</a>"):
+            tag = tag.partition(" <span class='badge'>")[0]  # drop everything from the badge span onward
+            if "<" in tag:  # leftover markup: strip it (remove_html is only called when needed)
+                tag = text.remove_html(tag)
+            results.append(tag)
+        return results
+
+    def images(self, _):  # returns a list of (image URL, {"width", "height"}) tuples; the argument is ignored
+        url = "{}/view/{}/1/".format(self.root, self.gallery_id)  # viewer page number 1
+        page = self.request(url).text
+        data = util.json_loads(text.extr(page, "$.parseJSON('", "'"))  # mapping "1".."N" -> "ext,width,height" (as read by the loop below)
+        base = text.extr(page, 'data-src="', '"').rpartition("/")[0] + "/"  # directory part of the first data-src URL
+        exts = {"j": "jpg", "p": "png", "g": "gif", "w": "webp", "a": "avif"}  # one-letter code -> file extension
+
+        results = []
+        for i in map(str, range(1, len(data)+1)):  # keys are 1-based page numbers as strings
+            ext, width, height = data[i].split(",")
+            url = base + i + "." + exts[ext]  # an unknown extension code raises KeyError here
+            results.append((url, {
+                "width" : text.parse_int(width),
+                "height": text.parse_int(height),
+            }))
+        return results
+
+
+class ImhentaiTagExtractor(ImhentaiExtractor):
+    """Extractor for imhentai tag searches"""
+    subcategory = "tag"
+    pattern = (BASE_PATTERN + r"(/(?:"  # group 1: the whole "/<kind>/<name>" path
+               r"artist|category|character|group|language|parody|tag"
+               r")/([^/?#]+))")  # group 2: the name alone (not used in this class)
+    example = "https://imhentai.xxx/tag/TAG/"
+
+    def items(self):
+        url = self.root + self.groups[0] + "/"  # presumably groups[0] is regex group 1 from `pattern` - confirm
+        return self._pagination(url)
+
+
+class ImhentaiSearchExtractor(ImhentaiExtractor):
+    """Extractor for imhentai search results"""
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"/search/?\?([^#]+)"  # group 1: query string up to an optional fragment
+    example = "https://imhentai.xxx/search/?key=QUERY"
+
+    def items(self):
+        url = self.root + "/search/?" + self.groups[0]  # the captured query string is passed through unchanged
+        return self._pagination(url)
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -79,6 +79,7 @@ CATEGORY_MAP = {
    "imgkiwi"        : "IMG.Kiwi",
    "imgth"          : "imgth",
    "imgur"          : "imgur",
+    "imhentai"       : "IMHentai",
    "joyreactor"     : "JoyReactor",
    "itchio"         : "itch.io",
    "jpgfish"        : "JPG Fish",
--- a/test/results/imhentai.py
+++ b/test/results/imhentai.py
@@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import imhentai
+
+
+__tests__ = (  # presumably "#"-prefixed keys configure the test run and the other keys are expected metadata - confirm against the test runner
+{
+    "#url"    : "https://imhentai.xxx/gallery/12/",
+    "#class"  : imhentai.ImhentaiGalleryExtractor,
+    "#pattern": r"https://m1\.imhentai\.xxx/001/3x907ntq18/\d+\.jpg",
+    "#count"  : 94,
+
+    "count"     : 94,
+    "extension" : "jpg",
+    "filename"  : str,
+    "gallery_id": 12,
+    "lang"      : "en",
+    "num"       : range(1, 94),
+    "title"     : "(C67) [Studio Kimigabuchi (Kimimaru)] RE-TAKE 2 (Neon Genesis Evangelion) [English]",
+    "title_alt" : "(C67) [スタジオKIMIGABUCHI (きみまる)] RE-TAKE2 (新世紀エヴァンゲリオン) [英訳]",
+    "type"      : "doujinshi",
+    "width"     : {835, 838, 841, 1200},
+    "height"    : {862, 865, 1200},
+
+    "artist":    [
+        "kimimaru | entokkun",
+    ],
+    "character": [
+        "asuka langley soryu",
+        "gendo ikari",
+        "makoto hyuga",
+        "maya ibuki",
+        "misato katsuragi",
+        "rei ayanami",
+        "shigeru aoba",
+        "shinji ikari",
+    ],
+    "group": [
+        "studio kimigabuchi",
+    ],
+    "language": [
+        "english",
+        "translated",
+    ],
+    "parody": [
+        "neon genesis evangelion | shin seiki evangelion",
+    ],
+    "tags": [
+        "multi-work series",
+        "schoolboy uniform",
+        "schoolgirl uniform",
+        "sole female",
+        "sole male",
+        "story arc",
+        "twintails",
+    ],
+},
+
+{
+    "#url"    : "https://imhentai.xxx/gallery/1396508/",
+    "#class"  : imhentai.ImhentaiGalleryExtractor,
+    "#pattern": r"https://m9\.imhentai\.xxx/028/po9f4w3jzx/\d+\.webp",
+    "#count"  : 34,
+
+    "count"     : 34,
+    "extension" : "webp",
+    "filename"  : str,
+    "gallery_id": 1396508,
+    "lang"      : "ko",
+    "num"       : range(1, 34),
+    "title"     : "[Beruennea (Skylader)] Tada no Kouhai ni Natta Kimi | 그냥 후배가 돼 버린 너 [Korean] [Digital]",
+    "title_alt" : "[ベルエンネーア (すかいれーだー)] ただの後輩になった君 [韓国翻訳] [DL版]",
+    "type"      : "doujinshi",
+    "width"     : 1280,
+    "height"    : {1790, 1791},
+
+    "artist": [
+        "skylader",
+    ],
+    "character": [],
+    "group": [
+        "beruennea",
+    ],
+    "language": [
+        "korean",
+        "translated",
+    ],
+    "parody": [
+        "original",
+    ],
+    "tags": [
+        "ahegao",
+        "big ass",
+        "big breasts",
+        "big nipples",
+        "big penis",
+        "bike shorts",
+        "blowjob",
+        "gokkun",
+        "hairy",
+        "huge breasts",
+        "mosaic censorship",
+        "muscle",
+        "nakadashi",
+        "netorare",
+        "schoolgirl uniform",
+        "tanlines",
+    ],
+},
+
+{
+    "#url"    : "https://imhentai.xxx/artist/asutora/",
+    "#class"  : imhentai.ImhentaiTagExtractor,
+    "#pattern": imhentai.ImhentaiGalleryExtractor.pattern,  # every queued URL must match the gallery pattern
+    "#count"  : range(30, 50),  # presumably a range because the live listing can change - confirm
+},
+
+{
+    "#url"    : "https://imhentai.xxx/search/?lt=1&pp=0&m=1&d=1&w=1&i=1&a=1&g=1&key=asutora&apply=Search&en=1&jp=1&es=1&fr=1&kr=1&de=1&ru=1&dl=0&tr=0",
+    "#class"  : imhentai.ImhentaiSearchExtractor,
+    "#pattern": imhentai.ImhentaiGalleryExtractor.pattern,  # every queued URL must match the gallery pattern
+    "#count"  : range(30, 50),  # presumably a range because the live search results can change - confirm
+},
+
+)