[thehentaiworld] add support (#274 #8237)

This commit is contained in:
Mike Fährmann
2025-09-19 22:05:36 +02:00
parent e5db6efca9
commit 96ce1926a4
7 changed files with 348 additions and 0 deletions

View File

@@ -427,6 +427,7 @@ Default
``[Danbooru]``,
``[E621]``,
``[foolfuuka]:search``,
``hdoujin``,
``itaku``,
``newgrounds``,
``[philomena]``,
@@ -438,6 +439,7 @@ Default
``scrolller``,
``sizebooru``,
``soundgasm``,
``thehentaiworld``,
``urlgalleries``,
``vk``,
``webtoons``,

View File

@@ -769,6 +769,10 @@
{
"format": ["gif", "mp4", "webm", "webp"]
},
"thehentaiworld":
{
"sleep-request": "0.5-1.5"
},
"tiktok":
{
"audio" : true,

View File

@@ -997,6 +997,12 @@ Consider all listed sites to potentially be NSFW.
<td>individual Images, Search Results, User Profiles</td>
<td></td>
</tr>
<tr id="thehentaiworld" title="thehentaiworld">
<td>The Hentai World</td>
<td>https://thehentaiworld.com/</td>
<td>Posts, Tag Searches</td>
<td></td>
</tr>
<tr id="tiktok" title="tiktok">
<td>TikTok</td>
<td>https://www.tiktok.com/</td>

View File

@@ -191,6 +191,7 @@ modules = [
"tcbscans",
"telegraph",
"tenor",
"thehentaiworld",
"tiktok",
"tmohentai",
"toyhouse",

View File

@@ -0,0 +1,132 @@
# -*- coding: utf-8 -*-
# Copyright 2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://thehentaiworld.com/"""
from .common import Extractor, Message
from .. import text, util
import collections
BASE_PATTERN = r"(?:https?://)?(?:www\.)?thehentaiworld\.com"
class ThehentaiworldExtractor(Extractor):
    """Base class for thehentaiworld extractors"""
    category = "thehentaiworld"
    root = "https://thehentaiworld.com"
    # '{num:?-//}' only contributes to the filename for non-zero 'num'
    # NOTE(review): relies on the project's '?' format-spec semantics
    filename_fmt = "{title} ({id}{num:?-//}).{extension}"
    archive_fmt = "{id}_{num}"
    # same range as the documented "sleep-request" default ("0.5-1.5")
    request_interval = (0.5, 1.5)

    def items(self):
        """Yield one directory message per post, then a URL per file

        Posts are taken from self.posts(), which subclasses provide.
        A post with a 'file_urls' list is emitted as a numbered series
        (num = 1..count); every other post yields its single 'file_url'.
        """
        for post_url in self.posts():
            post = self._extract_post(post_url)

            if "file_urls" not in post:
                # single-file post: 'num' and 'count' keep their defaults
                yield Message.Directory, post
                file_url = post["file_url"]
                text.nameext_from_url(file_url, post)
                yield Message.Url, file_url, post
                continue

            gallery = post["file_urls"]
            post["count"] = len(gallery)
            yield Message.Directory, post
            for index, file_url in enumerate(gallery, 1):
                post["num"] = index
                text.nameext_from_url(file_url, post)
                yield Message.Url, file_url, post

    def _extract_post(self, url):
        """Request the post page at 'url' and parse it into a dict

        The result always carries 'file_url'. Image posts that embed a
        gallery section additionally get a 'file_urls' list. Tags are
        returned both flat ('tags') and grouped per type ('tags_<type>').
        """
        html = self.request(url).text
        # NOTE(review): extract_from() presumably returns a cursor that
        # advances through the page, so the order of calls below matters
        extr = text.extract_from(html)

        post = {"num": 0, "count": 1}
        post["title"] = text.unescape(extr("<title>", "<").strip())
        post["id"] = text.parse_int(extr(" postid-", " "))
        post["slug"] = extr(" post-", '"')
        # raw tag markup for now; replaced by the flat tag list below
        post["tags"] = tags_html = extr('id="tagsHead">', "</ul>")
        posted = extr("<li>Posted: ", "<")
        post["date"] = text.parse_datetime(posted, "%Y-%m-%d")

        if "/videos/" not in url:
            post["type"] = "image"
            post["width"] = text.parse_int(extr("<li>Size: ", " "))
            post["height"] = text.parse_int(extr("x ", "<"))
            post["file_url"] = extr('a href="', '"')
            post["votes"] = text.parse_int(extr("(<strong>", "</strong>"))
            post["score"] = text.parse_float(extr("<strong>", "<"))

            # NOTE(review): gallery detection is assumed to apply to
            # image posts only - confirm against the upstream file
            gallery = extr('<a id="prev-page"', "</div></div><")
            if gallery:
                # drop the '-220x<digits>' suffix of each thumbnail src
                strip_size = text.re(r"-220x\d+\.").sub
                thumbs = text.extract_iter(
                    gallery, 'class="border" src="', '"')
                post["file_urls"] = [
                    strip_size(".", thumb) for thumb in thumbs]
        else:
            # no dimensions are parsed for video pages
            post["type"] = "video"
            post["width"] = 0
            post["height"] = 0
            post["votes"] = text.parse_int(extr("(<strong>", "</strong>"))
            post["score"] = text.parse_float(extr("<strong>", "<"))
            post["file_url"] = extr('<source src="', '"')

        # group tag names by the CSS class of their link
        by_type = collections.defaultdict(list)
        find_tags = text.re(
            r'<li><a class="([^"]*)" href="[^"]*">([^<]+)').findall
        for kind, name in find_tags(tags_html):
            by_type[kind].append(name)

        flat = []
        for kind, names in by_type.items():
            flat += names
            # tags with an empty class attribute are filed as 'general'
            post[f"tags_{kind}" if kind else "tags_general"] = names
        post["tags"] = flat

        return post

    def _pagination(self, endpoint):
        """Yield every href found in the thumbnail container of a listing

        Starts at self.page_start and keeps requesting follow-up pages
        for as long as the current page contains a 'class="next"' marker.
        """
        listing = f"{self.root}{endpoint}"
        page_num = self.page_start

        has_next = True
        while has_next:
            # the first page lives at the bare listing URL
            if page_num >= 2:
                url = f"{listing}page/{page_num}/"
            else:
                url = listing
            html = self.request(url).text

            container = text.extr(html, 'id="thumbContainer"', "<script")
            for href in text.extract_iter(container, ' href="', '"'):
                yield href

            has_next = 'class="next"' in html
            page_num += 1
class ThehentaiworldPostExtractor(ThehentaiworldExtractor):
    # Extractor for a single image or video post
    subcategory = "post"
    # group 0: URL path ('/hentai-images/SLUG' or '/videos/SLUG')
    # group 1: the slug on its own
    pattern = rf"{BASE_PATTERN}(/(?:hentai-image|video)s/([^/?#]+))"
    example = "https://thehentaiworld.com/hentai-images/SLUG/"

    def posts(self):
        """Return a 1-tuple holding the canonical URL of the matched post"""
        path = self.groups[0]
        post_url = f"{self.root}{path}/"
        return (post_url,)
class ThehentaiworldTagExtractor(ThehentaiworldExtractor):
    # Extractor for all posts listed under a tag
    subcategory = "tag"
    # listing entries per page; used by skip() to turn a post count
    # into whole pages plus a remainder
    per_page = 24
    # first listing page to request / posts to drop from that page
    page_start = 1
    post_start = 0
    directory_fmt = ("{category}", "{search_tags}")
    pattern = rf"{BASE_PATTERN}/tag/([^/?#]+)"
    example = "https://thehentaiworld.com/tag/TAG/"

    def posts(self):
        """Return an iterator over the post URLs of the matched tag

        Also stores the tag as 'search_tags' in self.kwdict.
        """
        tag = self.groups[0]
        self.kwdict["search_tags"] = tag
        results = self._pagination(f"/tag/{tag}/")
        return util.advance(results, self.post_start)

    def skip(self, num):
        """Arrange for the first 'num' posts to be skipped; return 'num'"""
        whole_pages = num // self.per_page
        remainder = num % self.per_page
        self.page_start += whole_pages
        self.post_start += remainder
        return num

View File

@@ -179,6 +179,7 @@ CATEGORY_MAP = {
"thebarchive" : "The /b/ Archive",
"thecollection" : "The /co/llection",
"thecollectionS" : "The /co/llection",
"thehentaiworld" : "The Hentai World",
"tiktok" : "TikTok",
"tmohentai" : "TMOHentai",
"tumblrgallery" : "TumblrGallery",

View File

@@ -0,0 +1,202 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import thehentaiworld
# Expected results for the thehentaiworld extractors.
# Keys starting with '#' describe the test itself (URL, extractor class,
# expected file URLs, ...); all other keys are expected metadata values.
# NOTE(review): the exact semantics of the '#...' keys and of range/set/type
# values are defined by the test runner, which is not visible here.
__tests__ = (
    # image post with a single file: 'num' and 'count' stay at 0 and 1
    {
        "#url" : "https://thehentaiworld.com/hentai-images/samus-aran-aurahack-metroid-2/",
        "#class" : thehentaiworld.ThehentaiworldPostExtractor,
        "#results" : "https://thehentaiworld.com/wp-content/uploads/2020/06/Samus-Aran-Aurahack-Metroid-Hentai.jpeg",
        "count" : 1,
        "num" : 0,
        "date" : "dt:2020-06-05 00:00:00",
        "extension" : "jpeg",
        "file_url" : "https://thehentaiworld.com/wp-content/uploads/2020/06/Samus-Aran-Aurahack-Metroid-Hentai.jpeg",
        "filename" : "Samus-Aran-Aurahack-Metroid-Hentai",
        "height" : 2893,
        "id" : 147048,
        # presumably given as ranges because votes can change over time
        "score" : range(3, 5),
        "slug" : "samus-aran-aurahack-metroid-2",
        "title" : "Samus Aran Aurahack Metroid",
        "type" : "image",
        "votes" : range(5, 20),
        "width" : 2000,
        # flat list of all tags, followed by the per-type lists
        "tags" : [
            "Metroid",
            "Samus Aran",
            "Aurahack18",
            "Blonde",
            "blush",
            "sweat",
        ],
        "tags_general" : [
            "Blonde",
            "blush",
            "sweat",
        ],
        "tags_artist" : ["Aurahack18"],
        "tags_character": ["Samus Aran"],
        "tags_origin" : ["Metroid"],
    },
    # image post with two files: both are listed in 'file_urls'
    {
        "#url" : "https://thehentaiworld.com/hentai-images/ubel-nt00-sousou-no-frieren/",
        "#class" : thehentaiworld.ThehentaiworldPostExtractor,
        "#results" : (
            "https://thehentaiworld.com/wp-content/uploads/2024/04/Ubel-nt00-Sousou-no-Frieren-Hentai.jpg",
            "https://thehentaiworld.com/wp-content/uploads/2024/04/Ubel--nt00--Sousou-no-Frieren-Hentai.jpg",
        ),
        "count" : 2,
        "num" : range(1, 2),
        "date" : "dt:2024-04-16 00:00:00",
        "extension" : "jpg",
        "file_url" : "https://thehentaiworld.com/wp-content/uploads/2024/04/Ubel-nt00-Sousou-no-Frieren-Hentai.jpg",
        "filename" : {
            "Ubel-nt00-Sousou-no-Frieren-Hentai",
            "Ubel--nt00--Sousou-no-Frieren-Hentai",
        },
        "height" : 1422,
        "id" : 226208,
        "score" : range(3, 5),
        "slug" : "ubel-nt00-sousou-no-frieren",
        "title" : "Ubel nt00 Sousou no Frieren",
        "type" : "image",
        "votes" : range(10, 20),
        "width" : 800,
        "file_urls" : [
            "https://thehentaiworld.com/wp-content/uploads/2024/04/Ubel-nt00-Sousou-no-Frieren-Hentai.jpg",
            "https://thehentaiworld.com/wp-content/uploads/2024/04/Ubel--nt00--Sousou-no-Frieren-Hentai.jpg",
        ],
        "tags" : [
            "Sousou no Frieren",
            "Ubel",
            "nt00",
            "blush",
            "Green Hair",
            "pubic hair",
            "smile",
        ],
        "tags_general" : [
            "blush",
            "Green Hair",
            "pubic hair",
            "smile",
        ],
        "tags_artist" : ["nt00"],
        "tags_character": ["Ubel"],
        "tags_origin" : ["Sousou no Frieren"],
    },
    # video post; the URL carries a '#comment-...' fragment
    {
        "#url" : "https://thehentaiworld.com/videos/lucy-heartfilia-and-natsu-dragneel-shiina-ecchi-fairy-tail/#comment-396839",
        "#class" : thehentaiworld.ThehentaiworldPostExtractor,
        "#results" : "https://thehentaiworld.com/wp-content/uploads/2025/09/Lucy-Heartfilia-and-Natsu-Dragneel-Shiina-Ecchi-Fairy-Tail-Animated-Hentai-Video.mp4",
        "count" : 1,
        "num" : 0,
        "date" : "dt:2025-09-19 00:00:00",
        "extension" : "mp4",
        "file_url" : "https://thehentaiworld.com/wp-content/uploads/2025/09/Lucy-Heartfilia-and-Natsu-Dragneel-Shiina-Ecchi-Fairy-Tail-Animated-Hentai-Video.mp4",
        "filename" : "Lucy-Heartfilia-and-Natsu-Dragneel-Shiina-Ecchi-Fairy-Tail-Animated-Hentai-Video",
        # the extractor sets width and height to 0 for video posts
        "height" : 0,
        "id" : 253263,
        "score" : 5.0,
        "slug" : "lucy-heartfilia-and-natsu-dragneel-shiina-ecchi-fairy-tail",
        "title" : "Lucy Heartfilia and Natsu Dragneel Shiina Ecchi Fairy Tail",
        "type" : "video",
        "votes" : range(25, 50),
        "width" : 0,
        "tags" : [
            "Fairy Tail",
            "Animated",
            "sound",
            "video",
            "lucy heartfilia",
            "Natsu Dragneel",
            "Shiina Ecchi",
            "arse",
            "blush",
            "Cowgirl Ride",
            "cum",
            "cum inside",
            "eye roll",
            "Fingering",
            "Jiggle",
            "legs spread",
            "masturbating",
            "moan",
            "panties",
            "pov",
            "ride",
            "smile",
            "squeeze",
            "vagina",
            "x-ray",
        ],
        "tags_character": [
            "lucy heartfilia",
            "Natsu Dragneel",
        ],
        "tags_general" : [
            "arse",
            "blush",
            "Cowgirl Ride",
            "cum",
            "cum inside",
            "eye roll",
            "Fingering",
            "Jiggle",
            "legs spread",
            "masturbating",
            "moan",
            "panties",
            "pov",
            "ride",
            "smile",
            "squeeze",
            "vagina",
            "x-ray",
        ],
        "tags_media" : [
            "Animated",
            "sound",
            "video",
        ],
        "tags_artist" : ["Shiina Ecchi"],
        "tags_origin" : ["Fairy Tail"],
    },
    # tag listing
    # NOTE(review): '#range' "20-" with '#count' 10 presumably skips the
    # first results and expects 10 remaining files - confirm
    {
        "#url" : "https://thehentaiworld.com/tag/aurahack/",
        "#class" : thehentaiworld.ThehentaiworldTagExtractor,
        "#pattern" : r"https://thehentaiworld\.com/wp\-content/uploads/20\d\d/.+",
        "#range" : "20-",
        "#count" : 10,
        "count" : {1, 2},
        "num" : {1, 2, 0},
        "date" : "type:datetime",
        "extension" : {"jpg", "png"},
        "file_url" : str,
        "filename" : str,
        "height" : int,
        "id" : int,
        "score" : float,
        # the tag taken from the URL
        "search_tags" : "aurahack",
        "slug" : str,
        "tags_artist" : ["Aurahack18"],
        "title" : str,
        "type" : "image",
        "votes" : int,
        "width" : int,
        "tags" : list,
    },
)