gallery-dl/gallery_dl/extractor/thehentaiworld.py

# -*- coding: utf-8 -*-

# Copyright 2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://thehentaiworld.com/"""

from .common import Extractor, Message
from .. import text, util
import collections

# Shared URL prefix for the extractor patterns below: optional scheme,
# optional "www." and the thehentaiworld.com host name
BASE_PATTERN = r"(?:https?://)?(?:www\.)?thehentaiworld\.com"


class ThehentaiworldExtractor(Extractor):
    """Base class for thehentaiworld extractors

    Subclasses supply posts(), an iterable of post page URLs.
    items() requests each of them and emits the file(s) it contains.
    """
    category = "thehentaiworld"
    root = "https://thehentaiworld.com"
    filename_fmt = "{title} ({id}{num:?-//}).{extension}"
    archive_fmt = "{id}_{num}"
    request_interval = (0.5, 1.5)

    def items(self):
        """Yield a Directory message and the Url message(s) of each post

        Posts whose extraction raises are logged as a warning and
        skipped; the remaining posts are still processed.
        """
        for url in self.posts():
            try:
                post = self._extract_post(url)
            except Exception as exc:
                # best effort: record the failure in the status bits
                # and carry on with the next post
                self.status |= 1
                self.log.warning("Failed to extract post %s (%s: %s)",
                                 url, exc.__class__.__name__, exc)
                continue

            if "file_urls" in post:
                # multi-file post: 'count' is the number of files and
                # 'num' the 1-based position of the current file
                file_urls = post["file_urls"]
                post["count"] = len(file_urls)
                yield Message.Directory, "", post
                for post["num"], file_url in enumerate(file_urls, 1):
                    text.nameext_from_url(file_url, post)
                    yield Message.Url, file_url, post
            else:
                # single-file post: keeps num=0 / count=1
                # as set by _extract_post()
                yield Message.Directory, "", post
                file_url = post["file_url"]
                text.nameext_from_url(file_url, post)
                yield Message.Url, file_url, post

    def _extract_post(self, url):
        """Request the post page at 'url' and return its metadata dict

        The result always has a 'file_url' entry. It additionally has
        a 'file_urls' list when the page contains a
        '<a id="prev-page"' section with 'class="border"' images.
        Tags are returned as a flat 'tags' list plus one
        'tags_<class>' list per tag link class.
        """
        # NOTE: the extr() calls below are order-sensitive (each one
        # presumably continues where the previous one stopped), so
        # they must stay in page order
        extr = text.extract_from(self.request(url).text)

        post = {
            "num"     : 0,
            "count"   : 1,
            "title"   : text.unescape(extr("<title>", "<").strip()),
            "id"      : text.parse_int(extr(" postid-", " ")),
            "slug"    : extr(" post-", '"'),
            "tags"    : extr('id="tagsHead">', "</ul>"),
            "date"    : self.parse_datetime_iso(extr("<li>Posted: ", "<")),
        }

        # The first character of the URL path selects the post type
        # ("v" video, "g" animated, "3" 3d cgi, anything else image).
        # Take it from the path itself instead of a fixed string
        # offset, so URLs using 'http://' or a 'www.' host are
        # classified correctly as well.
        kind = url.partition("://")[2].partition("/")[2][:1]

        if kind == "v":
            post["type"] = "video"
            post["width"] = post["height"] = 0
            post["votes"] = text.parse_int(extr("(<strong>", "</strong>"))
            post["score"] = text.parse_float(extr("<strong>", "<"))
            post["file_url"] = extr('<source src="', '"')
        else:
            post["type"] = ("animated" if kind == "g" else
                            "3d cgi" if kind == "3" else
                            "image")
            post["width"] = text.parse_int(extr("<li>Size: ", " "))
            post["height"] = text.parse_int(extr("x ", "<"))
            post["file_url"] = extr('a href="', '"')
            post["votes"] = text.parse_int(extr("(<strong>", "</strong>"))
            post["score"] = text.parse_float(extr("<strong>", "<"))

            if doujin := extr('<a id="prev-page"', "</div></div><"):
                # drop the '-220x<N>' size suffix from each image URL
                # (presumably turning thumbnails into full-size files)
                repl = text.re(r"-220x\d+\.").sub
                post["file_urls"] = [
                    repl(".", src)
                    for src in text.extract_iter(
                        doujin, 'class="border" src="', '"')
                ]

        # group tag names by the 'class' attribute of their link
        tags = collections.defaultdict(list)
        pattern = text.re(r'<li><a class="([^"]*)" href="[^"]*">([^<]+)')
        for tag_type, tag_name in pattern.findall(post["tags"]):
            tags[tag_type].append(tag_name)

        # replace the raw tag HTML with a flat list of all names and
        # add one 'tags_<class>' list per class; tags with an empty
        # class end up in 'tags_general'
        post["tags"] = tags_list = []
        for key, value in tags.items():
            tags_list.extend(value)
            post["tags_" + key if key else "tags_general"] = value

        return post

    def _pagination(self, endpoint):
        """Yield the link targets of all listing pages under 'endpoint'

        Starts at page number self.page_start, which is not defined
        in this class (ThehentaiworldTagExtractor sets it), and
        continues as long as a page contains 'class="next"'.
        """
        base = self.root + endpoint
        pnum = self.page_start

        while True:
            # page 1 (or lower) is the bare endpoint without 'page/N/'
            url = base if pnum < 2 else f"{base}page/{pnum}/"
            page = self.request(url).text

            # only consider links inside the 'thumbContainer' section
            yield from text.extract_iter(text.extr(
                page, 'id="thumbContainer"', "<script"), ' href="', '"')

            if 'class="next"' not in page:
                return
            pnum += 1


class ThehentaiworldTagExtractor(ThehentaiworldExtractor):
    """Extractor for the posts listed on a thehentaiworld tag page"""
    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    pattern = BASE_PATTERN + r"/tag/([^/?#]+)"
    example = "https://thehentaiworld.com/tag/TAG/"
    per_page = 24
    page_start = 1
    post_start = 0

    def posts(self):
        """Return the tag's post URLs, minus the first 'post_start' ones

        The tag name taken from the URL is also stored as
        'search_tags' in self.kwdict.
        """
        tag = self.groups[0]
        self.kwdict["search_tags"] = tag
        post_urls = self._pagination(f"/tag/{tag}/")
        return util.advance(post_urls, self.post_start)

    def skip(self, num):
        """Arrange for the first 'num' posts to be skipped; return 'num'

        Whole multiples of 'per_page' move the start page forward,
        the remainder is added to 'post_start'.
        """
        full_pages, leftover = divmod(num, self.per_page)
        self.page_start = self.page_start + full_pages
        self.post_start = self.post_start + leftover
        return num


class ThehentaiworldPostExtractor(ThehentaiworldExtractor):
    """Extractor for a single thehentaiworld post page"""
    subcategory = "post"
    pattern = (BASE_PATTERN +
               r"(/(?:video|(?:[\w-]+-)?hentai-image)s/([^/?#]+))")
    example = "https://thehentaiworld.com/hentai-images/SLUG/"

    def posts(self):
        """Return a 1-tuple holding the URL of the matched post

        The URL is rebuilt from self.root and the path captured by
        the first pattern group, with a trailing slash appended.
        """
        post_path = self.groups[0]
        post_url = f"{self.root}{post_path}/"
        return (post_url,)