[hitomi] simplify metadata extraction

Use the data from https://ltn.hitomi.la/galleries/<id>.js for both
image URLs and metadata and ignore any gallery or reader pages.

This removes 'artist', 'characters', 'group', and 'parody' metadata
fields since this information is, as of now, only available in
gallery pages.
This commit is contained in:
Mike Fährmann
2020-03-03 23:18:58 +01:00
parent 115fd2c6f2
commit f33b13aacf

View File

@@ -10,7 +10,6 @@
from .common import GalleryExtractor from .common import GalleryExtractor
from .. import text, util from .. import text, util
import string
import json import json
@@ -24,7 +23,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
test = ( test = (
("https://hitomi.la/galleries/867789.html", { ("https://hitomi.la/galleries/867789.html", {
"pattern": r"https://[a-c]a.hitomi.la/images/./../[0-9a-f]+.jpg", "pattern": r"https://[a-c]a.hitomi.la/images/./../[0-9a-f]+.jpg",
"keyword": "6701f8f588f119ef84cd29bdf99a399417b0a6a2", "keyword": "3314105a0b344ea1461c43257b14b0de415b88bb",
"count": 16, "count": 16,
}), }),
# download test # download test
@@ -55,56 +54,43 @@ class HitomiGalleryExtractor(GalleryExtractor):
) )
def __init__(self, match):
    """Initialize the extractor for one hitomi.la gallery.

    Both metadata and image info come from the gallery's
    ltn.hitomi.la JSON endpoint, so that URL is handed to the
    base class instead of any gallery/reader HTML page.
    """
    gid = match.group(1)
    url = "https://ltn.hitomi.la/galleries/{}.js".format(gid)
    GalleryExtractor.__init__(self, match, url)
    # parsed gallery info; set by metadata(), reused by images()
    self.data = None
    # image servers require a reader-page Referer header
    self.session.headers["Referer"] = "{}/reader/{}.html".format(
        self.root, gid)
def metadata(self, page):
    """Build the gallery's metadata dict from the galleries/<id>.js body.

    The response has the form 'var galleryinfo = {...}'; everything
    after the first '=' is the JSON payload.  The parsed object is
    cached in self.data so images() can reuse it without a second
    request.
    """
    self.data = data = json.loads(page.partition("=")[2])

    language = data.get("language")
    if language:
        language = language.capitalize()

    # flatten tag objects into plain strings, keeping gender prefixes
    tags = []
    for tinfo in data["tags"]:
        tag = tinfo["tag"]
        if tinfo.get("female"):
            tag = "female:" + tag
        elif tinfo.get("male"):
            tag = "male:" + tag
        tags.append(tag)

    return {
        "gallery_id": text.parse_int(data["id"]),
        "title"     : data["title"],
        "type"      : data["type"],
        "language"  : language,
        "lang"      : util.language_to_code(language),
        "tags"      : tags,
        # the site omits seconds; append ':00' to match the format string
        "date"      : text.parse_datetime(
            data["date"] + ":00", "%Y-%m-%d %H:%M:%S%z"),
    }
def images(self, page): def images(self, _):
result = [] result = []
for image in json.loads(page.partition("=")[2])["files"]: for image in self.data["files"]:
ihash = image["hash"] ihash = image["hash"]
idata = text.nameext_from_url(image["name"]) idata = text.nameext_from_url(image["name"])
@@ -120,18 +106,3 @@ class HitomiGalleryExtractor(GalleryExtractor):
) )
result.append((url, idata)) result.append((url, idata))
return result return result
@staticmethod
def _prep(value):
    """Collect link texts from an HTML snippet as a title-cased list.

    *value* may be None/empty, in which case an empty list results.
    """
    names = []
    for chunk in text.extract_iter(value or "", '.html">', '<'):
        names.append(text.unescape(string.capwords(chunk)))
    return names
@staticmethod
def _prep_1(value):
    """Strip HTML tags from *value* and capitalize the result."""
    cleaned = text.remove_html(value)
    return cleaned.capitalize()
@staticmethod
def _date(value):
    """Parse a site timestamp (no seconds) into a datetime object."""
    # append ':00' seconds so the value matches the format string
    timestamp = value + ":00"
    return text.parse_datetime(timestamp, "%Y-%m-%d %H:%M:%S%z")