[imhentai] add support (#1660 #3046 #3824 #4338 #5936)

This commit is contained in:
Mike Fährmann
2025-02-10 21:42:07 +01:00
parent be77465e1b
commit 55034d9638
5 changed files with 258 additions and 0 deletions

View File

@@ -439,6 +439,12 @@ Consider all listed sites to potentially be NSFW.
<td>Albums, Favorites, Favorites Folders, Galleries, individual Images, Search Results, Subreddits, Tag Searches, User Profiles</td>
<td></td>
</tr>
<tr>
<td>IMHentai</td>
<td>https://imhentai.xxx/</td>
<td>Galleries, Search Results, Tag Searches</td>
<td></td>
</tr>
<tr>
<td>Imxto</td>
<td>https://imx.to/</td>

View File

@@ -80,6 +80,7 @@ modules = [
"imgbox",
"imgth",
"imgur",
"imhentai",
"inkbunny",
"instagram",
"issuu",

View File

@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-
# Copyright 2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://imhentai.xxx/"""
from .common import GalleryExtractor, Extractor, Message
from .. import text, util
# URL prefix shared by all imhentai patterns below;
# the scheme and a leading "www." are both optional.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?imhentai\.xxx"
class ImhentaiExtractor(Extractor):
    """Base class for imhentai extractors"""
    category = "imhentai"
    root = "https://imhentai.xxx"

    def _pagination(self, url):
        """Yield a Message.Queue entry for each gallery linked from 'url'

        Every listing page is scanned for '<a href="/gallery/...' links.
        Afterwards the 'page-link' href located by text.rextract() is
        followed; iteration stops when that href is missing or "#".
        """
        base = self.root + "/gallery/"
        data = {"_extractor": ImhentaiGalleryExtractor}
        prev = None

        while True:
            page = self.request(url).text

            extr = text.extract_from(page)
            while True:
                gallery_id = extr('<a href="/gallery/', '"')
                if not gallery_id:
                    break
                # A gallery is usually linked more than once in a row.
                # Compare against the previous ID instead of blindly
                # discarding every other match, so a gallery that is
                # linked only once does not swallow its successor.
                if gallery_id == prev:
                    continue
                prev = gallery_id
                yield Message.Queue, base + gallery_id, data

            href = text.rextract(page, "class='page-link' href='", "'")[0]
            if not href or href == "#":
                return
            # Root-relative link ("/search/?page=2"): resolve it against
            # the site root. Protocol-relative ("//host/...") and absolute
            # links are left for ensure_http_scheme() as before.
            if href[0] == "/" and href[1:2] != "/":
                href = self.root + href
            url = text.ensure_http_scheme(href)
class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor):
    """Extractor for imhentai galleries"""
    pattern = BASE_PATTERN + r"/(?:gallery|view)/(\d+)"
    example = "https://imhentai.xxx/gallery/12345/"

    def __init__(self, match):
        """Remember the gallery ID and initialize with its gallery URL"""
        gid = self.gallery_id = match.group(1)
        GalleryExtractor.__init__(
            self, match, self.root + "/gallery/" + gid + "/")

    def metadata(self, page):
        """Build the gallery metadata dict from the gallery page HTML"""
        extr = text.extract_from(page)

        # 'extr' is called once per field, in the same sequence in which
        # the fields are stored below; keep that sequence intact.
        title = text.unescape(extr("<h1>", "<"))
        title_alt = text.unescape(extr('class="subtitle">', "<"))
        parody = self._split(extr(">Parodies:</span>", "</li>"))
        character = self._split(extr(">Characters:</span>", "</li>"))
        tags = self._split(extr(">Tags:</span>", "</li>"))
        artist = self._split(extr(">Artists:</span>", "</li>"))
        group = self._split(extr(">Groups:</span>", "</li>"))
        language = self._split(extr(">Languages:</span>", "</li>"))
        category_name = text.remove_html(extr(">Category:</span>", "<span"))

        data = {
            "gallery_id": text.parse_int(self.gallery_id),
            "title"     : title,
            "title_alt" : title_alt,
            "parody"    : parody,
            "character" : character,
            "tags"      : tags,
            "artist"    : artist,
            "group"     : group,
            "language"  : language,
            "type"      : category_name,
        }

        # 'lang' is only set when at least one language was found
        if language:
            data["lang"] = util.language_to_code(language[0])
        return data

    def _split(self, html):
        """Return the link texts found in 'html' as a list of strings"""
        names = []
        for chunk in text.extract_iter(html, ">", "</a>"):
            # drop the trailing badge <span> and everything after it
            name, _, _ = chunk.partition(" <span class='badge'>")
            names.append(text.remove_html(name) if "<" in name else name)
        return names

    def images(self, _):
        """Return a list of (URL, metadata) tuples for all gallery images

        Image info comes from the JSON object embedded in the first
        viewer page: one "ext,width,height" string per image, keyed by
        the image number counting from 1.
        """
        viewer = self.request(
            "{}/view/{}/1/".format(self.root, self.gallery_id)).text
        info = util.json_loads(text.extr(viewer, "$.parseJSON('", "'"))
        base = text.extr(viewer, 'data-src="', '"').rpartition("/")[0] + "/"
        exts = {"j": "jpg", "p": "png", "g": "gif", "w": "webp", "a": "avif"}

        results = []
        for num in range(1, len(info) + 1):
            key = str(num)
            ext, width, height = info[key].split(",")
            image_url = base + key + "." + exts[ext]
            results.append((image_url, {
                "width" : text.parse_int(width),
                "height": text.parse_int(height),
            }))
        return results
class ImhentaiTagExtractor(ImhentaiExtractor):
    """Extractor for imhentai tag searches"""
    subcategory = "tag"
    pattern = (
        BASE_PATTERN +
        r"(/(?:artist|category|character|group|language|parody|tag)"
        r"/([^/?#]+))"
    )
    example = "https://imhentai.xxx/tag/TAG/"

    def items(self):
        """Queue the galleries listed under the matched '/<kind>/<name>'"""
        path = self.groups[0]
        return self._pagination("{}{}/".format(self.root, path))
class ImhentaiSearchExtractor(ImhentaiExtractor):
    """Extractor for imhentai search results"""
    subcategory = "search"
    pattern = BASE_PATTERN + r"/search/?\?([^#]+)"
    example = "https://imhentai.xxx/search/?key=QUERY"

    def items(self):
        """Queue the galleries returned for the matched query string"""
        query = self.groups[0]
        return self._pagination(
            "{}/search/?{}".format(self.root, query))

View File

@@ -79,6 +79,7 @@ CATEGORY_MAP = {
"imgkiwi" : "IMG.Kiwi",
"imgth" : "imgth",
"imgur" : "imgur",
"imhentai" : "IMHentai",
"joyreactor" : "JoyReactor",
"itchio" : "itch.io",
"jpgfish" : "JPG Fish",

129
test/results/imhentai.py Normal file
View File

@@ -0,0 +1,129 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import imhentai
# Expected results for the imhentai extractors.
# Keys starting with "#" describe the test itself (URL, extractor class,
# URL pattern, result count); all other keys are expected metadata values.
__tests__ = (
# Gallery 12: 94 JPEG files, English translation
{
"#url" : "https://imhentai.xxx/gallery/12/",
"#class" : imhentai.ImhentaiGalleryExtractor,
"#pattern": r"https://m1\.imhentai\.xxx/001/3x907ntq18/\d+\.jpg",
"#count" : 94,
"count" : 94,
"extension" : "jpg",
"filename" : str,
"gallery_id": 12,
"lang" : "en",
# NOTE(review): range(1, 94) stops at 93 while "count" is 94 -
# confirm whether the upper bound should be 95
"num" : range(1, 94),
"title" : "(C67) [Studio Kimigabuchi (Kimimaru)] RE-TAKE 2 (Neon Genesis Evangelion) [English]",
"title_alt" : "(C67) [スタジオKIMIGABUCHI (きみまる)] RE-TAKE2 (新世紀エヴァンゲリオン) [英訳]",
"type" : "doujinshi",
"width" : {835, 838, 841, 1200},
"height" : {862, 865, 1200},
"artist": [
"kimimaru | entokkun",
],
"character": [
"asuka langley soryu",
"gendo ikari",
"makoto hyuga",
"maya ibuki",
"misato katsuragi",
"rei ayanami",
"shigeru aoba",
"shinji ikari",
],
"group": [
"studio kimigabuchi",
],
"language": [
"english",
"translated",
],
"parody": [
"neon genesis evangelion | shin seiki evangelion",
],
"tags": [
"multi-work series",
"schoolboy uniform",
"schoolgirl uniform",
"sole female",
"sole male",
"story arc",
"twintails",
],
},
# Gallery 1396508: 34 WebP files, Korean translation, no characters listed
{
"#url" : "https://imhentai.xxx/gallery/1396508/",
"#class" : imhentai.ImhentaiGalleryExtractor,
"#pattern": r"https://m9\.imhentai\.xxx/028/po9f4w3jzx/\d+\.webp",
"#count" : 34,
"count" : 34,
"extension" : "webp",
"filename" : str,
"gallery_id": 1396508,
"lang" : "ko",
# NOTE(review): range(1, 34) stops at 33 while "count" is 34 -
# confirm whether the upper bound should be 35
"num" : range(1, 34),
"title" : "[Beruennea (Skylader)] Tada no Kouhai ni Natta Kimi | 그냥 후배가 돼 버린 너 [Korean] [Digital]",
"title_alt" : "[ベルエンネーア (すかいれーだー)] ただの後輩になった君 [韓国翻訳] [DL版]",
"type" : "doujinshi",
"width" : 1280,
"height" : {1790, 1791},
"artist": [
"skylader",
],
"character": [],
"group": [
"beruennea",
],
"language": [
"korean",
"translated",
],
"parody": [
"original",
],
"tags": [
"ahegao",
"big ass",
"big breasts",
"big nipples",
"big penis",
"bike shorts",
"blowjob",
"gokkun",
"hairy",
"huge breasts",
"mosaic censorship",
"muscle",
"nakadashi",
"netorare",
"schoolgirl uniform",
"tanlines",
],
},
# Artist listing: every result must be a gallery URL; 30 to 49 results
{
"#url" : "https://imhentai.xxx/artist/asutora/",
"#class" : imhentai.ImhentaiTagExtractor,
"#pattern": imhentai.ImhentaiGalleryExtractor.pattern,
"#count" : range(30, 50),
},
# Search with a full query string: same expectations as the artist listing
{
"#url" : "https://imhentai.xxx/search/?lt=1&pp=0&m=1&d=1&w=1&i=1&a=1&g=1&key=asutora&apply=Search&en=1&jp=1&es=1&fr=1&kr=1&de=1&ru=1&dl=0&tr=0",
"#class" : imhentai.ImhentaiSearchExtractor,
"#pattern": imhentai.ImhentaiGalleryExtractor.pattern,
"#count" : range(30, 50),
},
)