[hitomi] fix extractors (#7230)

This commit is contained in:
Mike Fährmann
2025-03-23 20:32:27 +01:00
parent b20777e68b
commit fd8f652490
3 changed files with 36 additions and 44 deletions

View File

@@ -2793,9 +2793,6 @@ Description
Available formats are ``"webp"`` and ``"avif"``.
``"original"`` will try to download the original ``jpg`` or ``png`` versions,
but is most likely going to fail with ``403 Forbidden`` errors.
extractor.imagechest.access-token
---------------------------------

View File

@@ -16,19 +16,25 @@ import string
import re import re
class HitomiGalleryExtractor(GalleryExtractor): class HitomiExtractor(Extractor):
"""Extractor for image galleries from hitomi.la""" """Base class for hitomi extractors"""
category = "hitomi" category = "hitomi"
root = "https://hitomi.la" root = "https://hitomi.la"
domain = "gold-usergeneratedcontent.net"
class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor):
"""Extractor for hitomi.la galleries"""
pattern = (r"(?:https?://)?hitomi\.la" pattern = (r"(?:https?://)?hitomi\.la"
r"/(?:manga|doujinshi|cg|gamecg|imageset|galleries|reader)" r"/(?:manga|doujinshi|cg|gamecg|imageset|galleries|reader)"
r"/(?:[^/?#]+-)?(\d+)") r"/(?:[^/?#]+-)?(\d+)")
example = "https://hitomi.la/manga/TITLE-867789.html" example = "https://hitomi.la/manga/TITLE-867789.html"
def __init__(self, match): def __init__(self, match):
self.gid = match.group(1) GalleryExtractor.__init__(self, match, False)
url = "https://ltn.hitomi.la/galleries/{}.js".format(self.gid) self.gid = gid = self.groups[0]
GalleryExtractor.__init__(self, match, url) self.gallery_url = "https://ltn.{}/galleries/{}.js".format(
self.domain, gid)
def _init(self): def _init(self):
self.session.headers["Referer"] = "{}/reader/{}.html".format( self.session.headers["Referer"] = "{}/reader/{}.html".format(
@@ -71,43 +77,34 @@ class HitomiGalleryExtractor(GalleryExtractor):
} }
def images(self, _): def images(self, _):
# see https://ltn.hitomi.la/gg.js # https://ltn.gold-usergeneratedcontent.net/gg.js
gg_m, gg_b, gg_default = _parse_gg(self) gg_m, gg_b, gg_default = _parse_gg(self)
fmt = self.config("format") or "webp" fmt = ext = self.config("format") or "webp"
if fmt == "original": check = (fmt != "webp")
subdomain, path, ext, check = "b", "images", None, False
else:
subdomain, path, ext, check = "a", fmt, fmt, (fmt != "webp")
result = [] result = []
for image in self.info["files"]: for image in self.info["files"]:
if check: if check:
if image.get("has" + fmt): ext = fmt if image.get("has" + fmt) else "webp"
path = ext = fmt
else:
path = ext = "webp"
ihash = image["hash"] ihash = image["hash"]
idata = text.nameext_from_url(image["name"]) idata = text.nameext_from_url(image["name"])
idata["extension_original"] = idata["extension"] idata["extension_original"] = idata["extension"]
if ext: idata["extension"] = ext
idata["extension"] = ext
# see https://ltn.hitomi.la/common.js # https://ltn.gold-usergeneratedcontent.net/common.js
inum = int(ihash[-1] + ihash[-3:-1], 16) inum = int(ihash[-1] + ihash[-3:-1], 16)
url = "https://{}{}.hitomi.la/{}/{}/{}/{}.{}".format( url = "https://{}{}.{}/{}/{}/{}.{}".format(
chr(97 + gg_m.get(inum, gg_default)), ext[0], gg_m.get(inum, gg_default) + 1, self.domain,
subdomain, path, gg_b, inum, ihash, idata["extension"], gg_b, inum, ihash, ext,
) )
result.append((url, idata)) result.append((url, idata))
return result return result
class HitomiTagExtractor(Extractor): class HitomiTagExtractor(HitomiExtractor):
"""Extractor for galleries from tag searches on hitomi.la""" """Extractor for galleries from tag searches on hitomi.la"""
category = "hitomi"
subcategory = "tag" subcategory = "tag"
root = "https://hitomi.la"
pattern = (r"(?:https?://)?hitomi\.la" pattern = (r"(?:https?://)?hitomi\.la"
r"/(tag|artist|group|series|type|character)" r"/(tag|artist|group|series|type|character)"
r"/([^/?#]+)\.html") r"/([^/?#]+)\.html")
@@ -126,8 +123,8 @@ class HitomiTagExtractor(Extractor):
"_extractor": HitomiGalleryExtractor, "_extractor": HitomiGalleryExtractor,
"search_tags": text.unquote(self.tag.rpartition("-")[0]), "search_tags": text.unquote(self.tag.rpartition("-")[0]),
} }
nozomi_url = "https://ltn.hitomi.la/{}/{}.nozomi".format( nozomi_url = "https://ltn.{}/{}/{}.nozomi".format(
self.type, self.tag) self.domain, self.type, self.tag)
headers = { headers = {
"Origin": self.root, "Origin": self.root,
"Cache-Control": "max-age=0", "Cache-Control": "max-age=0",
@@ -166,8 +163,8 @@ class HitomiIndexExtractor(HitomiTagExtractor):
def items(self): def items(self):
data = {"_extractor": HitomiGalleryExtractor} data = {"_extractor": HitomiGalleryExtractor}
nozomi_url = "https://ltn.hitomi.la/{}-{}.nozomi".format( nozomi_url = "https://ltn.{}/{}-{}.nozomi".format(
self.tag, self.language) self.domain, self.tag, self.language)
headers = { headers = {
"Origin": self.root, "Origin": self.root,
"Cache-Control": "max-age=0", "Cache-Control": "max-age=0",
@@ -194,11 +191,9 @@ class HitomiIndexExtractor(HitomiTagExtractor):
return return
class HitomiSearchExtractor(Extractor): class HitomiSearchExtractor(HitomiExtractor):
"""Extractor for galleries from multiple tag searches on hitomi.la""" """Extractor for galleries from multiple tag searches on hitomi.la"""
category = "hitomi"
subcategory = "search" subcategory = "search"
root = "https://hitomi.la"
pattern = r"(?:https?://)?hitomi\.la/search\.html\?([^/?#]+)" pattern = r"(?:https?://)?hitomi\.la/search\.html\?([^/?#]+)"
example = "https://hitomi.la/search.html?QUERY" example = "https://hitomi.la/search.html?QUERY"
@@ -224,11 +219,11 @@ class HitomiSearchExtractor(Extractor):
area, tag, language = self.get_nozomi_args(full_tag) area, tag, language = self.get_nozomi_args(full_tag)
if area: if area:
nozomi_url = "https://ltn.hitomi.la/n/{}/{}-{}.nozomi".format( nozomi_url = "https://ltn.{}/n/{}/{}-{}.nozomi".format(
area, tag, language) self.domain, area, tag, language)
else: else:
nozomi_url = "https://ltn.hitomi.la/n/{}-{}.nozomi".format( nozomi_url = "https://ltn.{}/n/{}-{}.nozomi".format(
tag, language) self.domain, tag, language)
headers = { headers = {
"Origin": self.root, "Origin": self.root,
@@ -257,7 +252,7 @@ class HitomiSearchExtractor(Extractor):
@memcache(maxage=1800) @memcache(maxage=1800)
def _parse_gg(extr): def _parse_gg(extr):
page = extr.request("https://ltn.hitomi.la/gg.js").text page = extr.request("https://ltn.gold-usergeneratedcontent.net/gg.js").text
m = {} m = {}
@@ -280,4 +275,4 @@ def _parse_gg(extr):
d = re.search(r"(?:var\s|default:)\s*o\s*=\s*(\d+)", page) d = re.search(r"(?:var\s|default:)\s*o\s*=\s*(\d+)", page)
b = re.search(r"b:\s*[\"'](.+)[\"']", page) b = re.search(r"b:\s*[\"'](.+)[\"']", page)
return m, b.group(1).strip("/"), int(d.group(1)) if d else 1 return m, b.group(1).strip("/"), int(d.group(1)) if d else 0

View File

@@ -12,8 +12,8 @@ __tests__ = (
"#url" : "https://hitomi.la/galleries/867789.html", "#url" : "https://hitomi.la/galleries/867789.html",
"#category": ("", "hitomi", "gallery"), "#category": ("", "hitomi", "gallery"),
"#class" : hitomi.HitomiGalleryExtractor, "#class" : hitomi.HitomiGalleryExtractor,
"#pattern" : r"https://[a-c]a\.hitomi\.la/webp/\d+/\d+/[0-9a-f]{64}\.webp", "#pattern" : r"https://w[1-3]\.gold-usergeneratedcontent\.net/\d+/\d+/[0-9a-f]{64}\.webp",
"#count" : 16, "#count" : 16,
"artist" : ["morris"], "artist" : ["morris"],
"characters": [], "characters": [],
@@ -82,7 +82,7 @@ __tests__ = (
"#category": ("", "hitomi", "gallery"), "#category": ("", "hitomi", "gallery"),
"#class" : hitomi.HitomiGalleryExtractor, "#class" : hitomi.HitomiGalleryExtractor,
"#options" : {"format": "avif"}, "#options" : {"format": "avif"},
"#pattern" : r"https://[a-c]a\.hitomi\.la/avif/\d+/\d+/[0-9a-f]{64}\.avif", "#pattern" : r"https://a[1-3]\.gold-usergeneratedcontent\.net/\d+/\d+/[0-9a-f]{64}\.avif",
"#count" : 22, "#count" : 22,
"artist" : ["sorairo len"], "artist" : ["sorairo len"],
@@ -96,7 +96,7 @@ __tests__ = (
"lang" : "ja", "lang" : "ja",
"language" : "Japanese", "language" : "Japanese",
"num" : range(1, 22), "num" : range(1, 22),
"parody" : [], "parody" : ["original"],
"tags" : [ "tags" : [
"Blowjob ♀", "Blowjob ♀",
"Focus Blowjob ♀", "Focus Blowjob ♀",