gallery-dl/gallery_dl/extractor/imhentai.py

# -*- coding: utf-8 -*-

# Copyright 2025-2026 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://imhentai.xxx/ and mirror sites"""

from .common import GalleryExtractor, BaseExtractor, Message
from .. import text, util


class ImhentaiExtractor(BaseExtractor):
    basecategory = "IMHentai"

    def _pagination(self, url):
        prev = None
        base = self.root + "/gallery/"
        data = {"_extractor": ImhentaiGalleryExtractor}

        while True:
            page = self.request(url).text

            pos = page.find('class="ranking_list"')
            if pos >= 0:
                page = page[:pos]

            extr = text.extract_from(page)

            while True:
                gallery_id = extr('href="/gallery/', '"')
                if gallery_id == prev:
                    continue
                if not gallery_id:
                    break
                yield Message.Queue, base + gallery_id, data
                prev = gallery_id

            href = text.rextr(page, "class='page-link' href='", "'")
            if not href or href == "#":
                return
            if href[0] == "/":
                if href[1] == "/":
                    href = "https:" + href
                else:
                    href = self.root + href
            url = href


BASE_PATTERN = ImhentaiExtractor.update({
    "imhentai": {
        "root": "https://imhentai.xxx",
        "pattern": r"(?:www\.)?imhentai\.xxx",
    },
    "hentaiera": {
        "root": "https://hentaiera.com",
        "pattern": r"(?:www\.)?hentaiera\.com",
    },
    "hentairox": {
        "root": "https://hentairox.com",
        "pattern": r"(?:www\.)?hentairox\.com",
    },
    "hentaifox": {
        "root": "https://hentaifox.com",
        "pattern": r"(?:www\.)?hentaifox\.com",
    },
    "hentaienvy": {
        "root": "https://hentaienvy.com",
        "pattern": r"(?:www\.)?hentaienvy\.com",
    },
    "hentaizap": {
        "root": "https://hentaizap.com",
        "pattern": r"(?:www\.)?hentaizap\.com",
    },
})


class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor):
    """Extractor for imhentai galleries"""
    pattern = BASE_PATTERN + r"/(?:gallery|view)/(\d+)"
    example = "https://imhentai.xxx/gallery/12345/"

    def __init__(self, match):
        ImhentaiExtractor.__init__(self, match)
        self.gallery_id = self.groups[-1]
        self.page_url = f"{self.root}/gallery/{self.gallery_id}/"

    def metadata(self, page):
        extr = text.extract_from(page)
        title = extr("<h1>", "<")
        title_alt = extr('class="subtitle">', "<")
        end = "</li>" if extr('<ul class="galleries_info', ">") else "</ul>"

        data = {
            "gallery_id": text.parse_int(self.gallery_id),
            "title"     : text.unescape(title),
            "title_alt" : text.unescape(title_alt),
            "parody"    : self._split(extr(">Parodies", end)),
            "character" : self._split(extr(">Characters", end)),
            "tags"      : self._split(extr(">Tags", end)),
            "artist"    : self._split(extr(">Artists", end)),
            "group"     : self._split(extr(">Groups", end)),
            "language"  : self._split(extr(">Languages", end)),
            "type"      : extr("href='/category/", "/"),
        }

        if data["language"]:
            data["lang"] = util.language_to_code(data["language"][0])

        return data

    def _split(self, html):
        results = []
        for tag in text.extract_iter(html, ">", "</a>"):
            badge = ("badge'>" in tag or "class='badge" in tag)
            tag = text.remove_html(tag)
            if badge:
                tag = tag.rpartition(" ")[0]
            results.append(tag)
        results.sort()
        return results

    def images(self, page):
        base = text.extr(page, 'data-src="', '"').rpartition("/")[0] + "/"
        exts = {"j": "jpg", "p": "png", "g": "gif", "w": "webp", "a": "avif"}

        try:
            data = util.json_loads(text.extr(page, "$.parseJSON('", "'"))
        except Exception:
            data = None

        if data is None:
            self.log.warning("%s: Missing image data", self.gallery_id)

            def _fallback_exts(i):
                for ext in util.advance(exts.values(), 1):
                    yield f"{base}{i}.{ext}"
            cnt = text.parse_int(text.extr(
                page, 'id="load_pages" value="', '"'))
            return [(f"{base}{i}.jpg", {"_fallback": _fallback_exts(i)})
                    for i in range(1, cnt+1)]

        results = []
        for i in map(str, range(1, len(data)+1)):
            ext, width, height = data[i].split(",")
            url = f"{base}{i}.{exts[ext]}"
            results.append((url, {
                "width" : text.parse_int(width),
                "height": text.parse_int(height),
            }))
        return results


class ImhentaiTagExtractor(ImhentaiExtractor):
    """Extractor for imhentai tag searches"""
    subcategory = "tag"
    pattern = (BASE_PATTERN + r"(/(?:"
               r"artist|category|character|group|language|parody|tag"
               r")/([^/?#]+))")
    example = "https://imhentai.xxx/tag/TAG/"

    def items(self):
        url = self.root + self.groups[-2] + "/"
        return self._pagination(url)


class ImhentaiSearchExtractor(ImhentaiExtractor):
    """Extractor for imhentai search results"""
    subcategory = "search"
    pattern = BASE_PATTERN + r"(/(?:advanced-)?search/?\?[^#]+|/[^/?#]+/?)"
    example = "https://imhentai.xxx/search/?key=QUERY"

    def items(self):
        return self._pagination(self.root + self.groups[-1])