From 96ce1926a4815ab04949fdab01dc0b5ea595903f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 19 Sep 2025 22:05:36 +0200 Subject: [PATCH] [thehentaiworld] add support (#274 #8237) --- docs/configuration.rst | 2 + docs/gallery-dl.conf | 4 + docs/supportedsites.md | 6 + gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/thehentaiworld.py | 132 ++++++++++++++++ scripts/supportedsites.py | 1 + test/results/thehentaiworld.py | 202 +++++++++++++++++++++++++ 7 files changed, 348 insertions(+) create mode 100644 gallery_dl/extractor/thehentaiworld.py create mode 100644 test/results/thehentaiworld.py diff --git a/docs/configuration.rst b/docs/configuration.rst index 88452f12..fa278861 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -427,6 +427,7 @@ Default ``[Danbooru]``, ``[E621]``, ``[foolfuuka]:search``, + ``hdoujin``, ``itaku``, ``newgrounds``, ``[philomena]``, @@ -438,6 +439,7 @@ Default ``scrolller``, ``sizebooru``, ``soundgasm``, + ``thehentaiworld``, ``urlgalleries``, ``vk``, ``webtoons``, diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 173b3119..e4cba423 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -769,6 +769,10 @@ { "format": ["gif", "mp4", "webm", "webp"] }, + "thehentaiworld": + { + "sleep-request": "0.5-1.5" + }, "tiktok": { "audio" : true, diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 727f4124..1eeb2a33 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -997,6 +997,12 @@ Consider all listed sites to potentially be NSFW. 
individual Images, Search Results, User Profiles + + The Hentai World + https://thehentaiworld.com/ + Posts, Tag Searches + + TikTok https://www.tiktok.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 83cebc1f..abdb6cc8 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -191,6 +191,7 @@ modules = [ "tcbscans", "telegraph", "tenor", + "thehentaiworld", "tiktok", "tmohentai", "toyhouse", diff --git a/gallery_dl/extractor/thehentaiworld.py b/gallery_dl/extractor/thehentaiworld.py new file mode 100644 index 00000000..c366f9c4 --- /dev/null +++ b/gallery_dl/extractor/thehentaiworld.py @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://thehentaiworld.com/""" + +from .common import Extractor, Message +from .. 
import text, util
+import collections
+
+# scheme and "www." prefix are both optional
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?thehentaiworld\.com"
+
+
+class ThehentaiworldExtractor(Extractor):
+    """Base class for thehentaiworld extractors"""
+    category = "thehentaiworld"
+    root = "https://thehentaiworld.com"
+    filename_fmt = "{title} ({id}{num:?-//}).{extension}"
+    archive_fmt = "{id}_{num}"
+    request_interval = (0.5, 1.5)
+
+    def items(self):
+        """Yield one Directory message per post and one Url message per file
+
+        Post URLs come from self.posts(), which subclasses provide.
+        """
+        for url in self.posts():
+            post = self._extract_post(url)
+
+            if "file_urls" in post:
+                # multi-file post: 'count' is the number of files and
+                # 'num' runs from 1 upwards
+                urls = post["file_urls"]
+                post["count"] = len(urls)
+                yield Message.Directory, post
+                for post["num"], url in enumerate(urls, 1):
+                    text.nameext_from_url(url, post)
+                    yield Message.Url, url, post
+            else:
+                # single-file post: keeps the defaults num=0 and count=1
+                yield Message.Directory, post
+                url = post["file_url"]
+                text.nameext_from_url(url, post)
+                yield Message.Url, url, post
+
+    def _extract_post(self, url):
+        """Request the post page at 'url' and return its metadata dict
+
+        'file_url' is always set. Image posts that contain a
+        'prev-page' section additionally get a 'file_urls' list.
+        """
+        # NOTE(review): the extr() calls are kept in their original order;
+        # presumably each call continues after the previous match - confirm
+        # against text.extract_from() before reordering anything here
+        extr = text.extract_from(self.request(url).text)
+
+        post = {
+            "num"   : 0,
+            "count" : 1,
+            # text of the page's <title> element
+            "title" : text.unescape(extr("<title>", "<").strip()),
+            "id"    : text.parse_int(extr(" postid-", " ")),
+            "slug"  : extr(" post-", '"'),
+            # raw tag list markup; replaced by parsed lists further down
+            "tags"  : extr('id="tagsHead">', "</ul>"),
+            "date"  : text.parse_datetime(extr(
+                "<li>Posted: ", "<"), "%Y-%m-%d"),
+        }
+
+        if "/videos/" in url:
+            post["type"] = "video"
+            # no size is read for videos
+            post["width"] = post["height"] = 0
+            post["votes"] = text.parse_int(extr("(<strong>", "</strong>"))
+            post["score"] = text.parse_float(extr("<strong>", "<"))
+            post["file_url"] = extr('<source src="', '"')
+        else:
+            post["type"] = "image"
+            post["width"] = text.parse_int(extr("<li>Size: ", " "))
+            post["height"] = text.parse_int(extr("x ", "<"))
+            post["file_url"] = extr('a href="', '"')
+            post["votes"] = text.parse_int(extr("(<strong>", "</strong>"))
+            post["score"] = text.parse_float(extr("<strong>", "<"))
+
+            if doujin := extr('<a id="prev-page"', "</div></div><"):
+                # drop the "-220x<digits>" part from each thumbnail URL
+                repl = text.re(r"-220x\d+\.").sub
+                post["file_urls"] = [
+                    repl(".", url)
+                    for url in text.extract_iter(
+                        doujin, 'class="border" src="', '"')
+                ]
+
+        # group tag names by the class attribute of their <a> element
+        tags = collections.defaultdict(list)
+        pattern = text.re(r'<li><a class="([^"]*)" href="[^"]*">([^<]+)')
+        for tag_type, tag_name in pattern.findall(post["tags"]):
+            tags[tag_type].append(tag_name)
+        # 'tags' becomes the flat list of all names; every group is also
+        # stored as 'tags_<class>', or 'tags_general' for an empty class
+        post["tags"] = tags_list = []
+        for key, value in tags.items():
+            tags_list.extend(value)
+            post[f"tags_{key}" if key else "tags_general"] = value
+
+        return post
+
+    def _pagination(self, endpoint):
+        """Yield the links of all listing pages below 'endpoint'
+
+        Starts at self.page_start and stops after the first page
+        that does not contain 'class="next"'.
+        """
+        # NOTE(review): page_start is only defined by the tag extractor
+        # in this file
+        base = f"{self.root}{endpoint}"
+        pnum = self.page_start
+
+        while True:
+            # the first page has no "page/N/" suffix
+            url = base if pnum < 2 else f"{base}page/{pnum}/"
+            page = self.request(url).text
+
+            # only consider links from the thumbnail container
+            yield from text.extract_iter(text.extr(
+                page, 'id="thumbContainer"', "<script"), ' href="', '"')
+
+            if 'class="next"' not in page:
+                return
+            pnum += 1
+
+
+class ThehentaiworldPostExtractor(ThehentaiworldExtractor):
+    """Extractor for single image and video post pages"""
+    subcategory = "post"
+    pattern = rf"{BASE_PATTERN}(/(?:hentai-image|video)s/([^/?#]+))"
+    example = "https://thehentaiworld.com/hentai-images/SLUG/"
+
+    def posts(self):
+        """Return the post URL rebuilt from the matched path"""
+        return (f"{self.root}{self.groups[0]}/",)
+
+
+class ThehentaiworldTagExtractor(ThehentaiworldExtractor):
+    """Extractor for the posts listed under a tag"""
+    subcategory = "tag"
+    per_page = 24
+    page_start = 1
+    post_start = 0
+    directory_fmt = ("{category}", "{search_tags}")
+    pattern = rf"{BASE_PATTERN}/tag/([^/?#]+)"
+    example = "https://thehentaiworld.com/tag/TAG/"
+
+    def posts(self):
+        """Store the tag as 'search_tags' and return its post URLs"""
+        self.kwdict["search_tags"] = tag = self.groups[0]
+        # presumably util.advance() skips the first post_start results
+        return util.advance(self._pagination(f"/tag/{tag}/"), self.post_start)
+
+    def skip(self, num):
+        """Skip 'num' posts; returns 'num'
+
+        Whole pages are added to page_start and the remainder to
+        post_start, using per_page posts per page.
+        """
+        pages, posts = divmod(num, self.per_page)
+        self.page_start += pages
+        self.post_start += posts
+        return num
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
index 419ec6b8..3963bd3d 100755
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -179,6 +179,7 @@ CATEGORY_MAP = {
     "thebarchive" : "The /b/ Archive",
     "thecollection" : "The /co/llection",
     "thecollectionS" : "The /co/llection",
+    "thehentaiworld" : "The Hentai World",
     "tiktok" : "TikTok",
     "tmohentai" : "TMOHentai",
     "tumblrgallery" : "TumblrGallery",
diff
--git a/test/results/thehentaiworld.py b/test/results/thehentaiworld.py
new file mode 100644
index 00000000..6cb4d9c5
--- /dev/null
+++ b/test/results/thehentaiworld.py
@@ -0,0 +1,202 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import thehentaiworld
+
+
+__tests__ = (
+{  # image post with a single file: "count" 1 and "num" 0
+    "#url" : "https://thehentaiworld.com/hentai-images/samus-aran-aurahack-metroid-2/",
+    "#class" : thehentaiworld.ThehentaiworldPostExtractor,
+    "#results" : "https://thehentaiworld.com/wp-content/uploads/2020/06/Samus-Aran-Aurahack-Metroid-Hentai.jpeg",
+
+    "count" : 1,
+    "num" : 0,
+    "date" : "dt:2020-06-05 00:00:00",
+    "extension" : "jpeg",
+    "file_url" : "https://thehentaiworld.com/wp-content/uploads/2020/06/Samus-Aran-Aurahack-Metroid-Hentai.jpeg",
+    "filename" : "Samus-Aran-Aurahack-Metroid-Hentai",
+    "height" : 2893,
+    "id" : 147048,
+    "score" : range(3, 5),
+    "slug" : "samus-aran-aurahack-metroid-2",
+    "title" : "Samus Aran – Aurahack – Metroid",
+    "type" : "image",
+    "votes" : range(5, 20),
+    "width" : 2000,
+    "tags" : [
+        "Metroid",
+        "Samus Aran",
+        "Aurahack18",
+        "Blonde",
+        "blush",
+        "sweat",
+    ],
+    "tags_general" : [
+        "Blonde",
+        "blush",
+        "sweat",
+    ],
+    "tags_artist" : ["Aurahack18"],
+    "tags_character": ["Samus Aran"],
+    "tags_origin" : ["Metroid"],
+},
+
+{  # image post with two files listed in "file_urls"
+    "#url" : "https://thehentaiworld.com/hentai-images/ubel-nt00-sousou-no-frieren/",
+    "#class" : thehentaiworld.ThehentaiworldPostExtractor,
+    "#results" : (
+        "https://thehentaiworld.com/wp-content/uploads/2024/04/Ubel-nt00-Sousou-no-Frieren-Hentai.jpg",
+        "https://thehentaiworld.com/wp-content/uploads/2024/04/Ubel-–-nt00-–-Sousou-no-Frieren-Hentai.jpg",
+    ),
+
+    "count" : 2,
+    "num" : range(1, 2),
+    "date" : "dt:2024-04-16 00:00:00",
+    "extension" : "jpg",
+    "file_url" : "https://thehentaiworld.com/wp-content/uploads/2024/04/Ubel-nt00-Sousou-no-Frieren-Hentai.jpg",
+    "filename" : {
+        "Ubel-nt00-Sousou-no-Frieren-Hentai",
+        "Ubel-–-nt00-–-Sousou-no-Frieren-Hentai",
+    },
+    "height" : 1422,
+    "id" : 226208,
+    "score" : range(3, 5),
+    "slug" : "ubel-nt00-sousou-no-frieren",
+    "title" : "Ubel – nt00 – Sousou no Frieren",
+    "type" : "image",
+    "votes" : range(10, 20),
+    "width" : 800,
+    "file_urls" : [
+        "https://thehentaiworld.com/wp-content/uploads/2024/04/Ubel-nt00-Sousou-no-Frieren-Hentai.jpg",
+        "https://thehentaiworld.com/wp-content/uploads/2024/04/Ubel-–-nt00-–-Sousou-no-Frieren-Hentai.jpg",
+    ],
+    "tags" : [
+        "Sousou no Frieren",
+        "Ubel",
+        "nt00",
+        "blush",
+        "Green Hair",
+        "pubic hair",
+        "smile",
+    ],
+    "tags_general" : [
+        "blush",
+        "Green Hair",
+        "pubic hair",
+        "smile",
+    ],
+    "tags_artist" : ["nt00"],
+    "tags_character": ["Ubel"],
+    "tags_origin" : ["Sousou no Frieren"],
+},
+
+{  # video post; the URL carries a "#comment-..." fragment
+    "#url" : "https://thehentaiworld.com/videos/lucy-heartfilia-and-natsu-dragneel-shiina-ecchi-fairy-tail/#comment-396839",
+    "#class" : thehentaiworld.ThehentaiworldPostExtractor,
+    "#results" : "https://thehentaiworld.com/wp-content/uploads/2025/09/Lucy-Heartfilia-and-Natsu-Dragneel-Shiina-Ecchi-Fairy-Tail-Animated-Hentai-Video.mp4",
+
+    "count" : 1,
+    "num" : 0,
+    "date" : "dt:2025-09-19 00:00:00",
+    "extension" : "mp4",
+    "file_url" : "https://thehentaiworld.com/wp-content/uploads/2025/09/Lucy-Heartfilia-and-Natsu-Dragneel-Shiina-Ecchi-Fairy-Tail-Animated-Hentai-Video.mp4",
+    "filename" : "Lucy-Heartfilia-and-Natsu-Dragneel-Shiina-Ecchi-Fairy-Tail-Animated-Hentai-Video",
+    "height" : 0,
+    "id" : 253263,
+    "score" : 5.0,
+    "slug" : "lucy-heartfilia-and-natsu-dragneel-shiina-ecchi-fairy-tail",
+    "title" : "Lucy Heartfilia and Natsu Dragneel – Shiina Ecchi – Fairy Tail",
+    "type" : "video",
+    "votes" : range(25, 50),
+    "width" : 0,
+    "tags" : [
+        "Fairy Tail",
+        "Animated",
+        "sound",
+        "video",
+        "lucy heartfilia",
+        "Natsu Dragneel",
+        "Shiina Ecchi",
+        "arse",
+        "blush",
+        "Cowgirl Ride",
+        "cum",
+        "cum inside",
+        "eye roll",
+        "Fingering",
+        "Jiggle",
+        "legs spread",
+        "masturbating",
+        "moan",
+        "panties",
+        "pov",
+        "ride",
+        "smile",
+        "squeeze",
+        "vagina",
+        "x-ray",
+    ],
+    "tags_character": [
+        "lucy heartfilia",
+        "Natsu Dragneel",
+    ],
+    "tags_general" : [
+        "arse",
+        "blush",
+        "Cowgirl Ride",
+        "cum",
+        "cum inside",
+        "eye roll",
+        "Fingering",
+        "Jiggle",
+        "legs spread",
+        "masturbating",
+        "moan",
+        "panties",
+        "pov",
+        "ride",
+        "smile",
+        "squeeze",
+        "vagina",
+        "x-ray",
+    ],
+    "tags_media" : [
+        "Animated",
+        "sound",
+        "video",
+    ],
+    "tags_artist" : ["Shiina Ecchi"],
+    "tags_origin" : ["Fairy Tail"],
+},
+
+{  # tag listing, limited through "#range" and "#count"
+    "#url" : "https://thehentaiworld.com/tag/aurahack/",
+    "#class" : thehentaiworld.ThehentaiworldTagExtractor,
+    "#pattern" : r"https://thehentaiworld\.com/wp\-content/uploads/20\d\d/.+",
+    "#range" : "20-",
+    "#count" : 10,
+
+    "count" : {1, 2},
+    "num" : {1, 2, 0},
+    "date" : "type:datetime",
+    "extension" : {"jpg", "png"},
+    "file_url" : str,
+    "filename" : str,
+    "height" : int,
+    "id" : int,
+    "score" : float,
+    "search_tags" : "aurahack",
+    "slug" : str,
+    "tags_artist" : ["Aurahack18"],
+    "title" : str,
+    "type" : "image",
+    "votes" : int,
+    "width" : int,
+    "tags" : list,
+},
+
+)