[hitomi] simplify metadata extraction
Use the data from https://ltn.hitomi.la/galleries/<id>.js for both image URLs and metadata, and ignore any gallery or reader pages. This removes the 'artist', 'characters', 'group', and 'parody' metadata fields, since this information is, as of now, only available in gallery pages.
This commit is contained in:
@@ -10,7 +10,6 @@
|
|||||||
|
|
||||||
from .common import GalleryExtractor
|
from .common import GalleryExtractor
|
||||||
from .. import text, util
|
from .. import text, util
|
||||||
import string
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
|
||||||
@@ -24,7 +23,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
|
|||||||
test = (
|
test = (
|
||||||
("https://hitomi.la/galleries/867789.html", {
|
("https://hitomi.la/galleries/867789.html", {
|
||||||
"pattern": r"https://[a-c]a.hitomi.la/images/./../[0-9a-f]+.jpg",
|
"pattern": r"https://[a-c]a.hitomi.la/images/./../[0-9a-f]+.jpg",
|
||||||
"keyword": "6701f8f588f119ef84cd29bdf99a399417b0a6a2",
|
"keyword": "3314105a0b344ea1461c43257b14b0de415b88bb",
|
||||||
"count": 16,
|
"count": 16,
|
||||||
}),
|
}),
|
||||||
# download test
|
# download test
|
||||||
@@ -55,56 +54,43 @@ class HitomiGalleryExtractor(GalleryExtractor):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
self.gallery_id = match.group(1)
|
gid = match.group(1)
|
||||||
url = "https://ltn.hitomi.la/galleries/{}.js".format(self.gallery_id)
|
url = "https://ltn.hitomi.la/galleries/{}.js".format(gid)
|
||||||
GalleryExtractor.__init__(self, match, url)
|
GalleryExtractor.__init__(self, match, url)
|
||||||
|
self.data = None
|
||||||
self.session.headers["Referer"] = "{}/reader/{}.html".format(
|
self.session.headers["Referer"] = "{}/reader/{}.html".format(
|
||||||
self.root, self.gallery_id)
|
self.root, gid)
|
||||||
|
|
||||||
def metadata(self, _):
|
def metadata(self, page):
|
||||||
# try galleries page first
|
self.data = data = json.loads(page.partition("=")[2])
|
||||||
url = "{}/galleries/{}.html".format(self.root, self.gallery_id)
|
|
||||||
response = self.request(url, fatal=False)
|
|
||||||
|
|
||||||
# follow redirects
|
language = data.get("language")
|
||||||
while b"<title>Redirect</title>" in response.content:
|
if language:
|
||||||
url = text.extract(response.text, "href='", "'")[0]
|
language = language.capitalize()
|
||||||
if not url.startswith("http"):
|
|
||||||
url = text.urljoin(self.root, url)
|
|
||||||
response = self.request(url, fatal=False)
|
|
||||||
|
|
||||||
# fallback to reader page
|
tags = []
|
||||||
if response.status_code >= 400:
|
for tinfo in data["tags"]:
|
||||||
url = "{}/reader/{}.html".format(self.root, self.gallery_id)
|
tag = tinfo["tag"]
|
||||||
page = self.request(url).text
|
if tinfo.get("female"):
|
||||||
return {
|
tag = "female:" + tag
|
||||||
"gallery_id": text.parse_int(self.gallery_id),
|
elif tinfo.get("male"):
|
||||||
"title": text.unescape(text.extract(
|
tag = "male:" + tag
|
||||||
page, "<title>", "<")[0].rpartition(" | ")[0]),
|
tags.append(tag)
|
||||||
}
|
|
||||||
|
|
||||||
page = response.text
|
return {
|
||||||
extr = text.extract_from(page, page.index('<h1><a href="/reader/'))
|
"gallery_id": text.parse_int(data["id"]),
|
||||||
data = {
|
"title" : data["title"],
|
||||||
"gallery_id": text.parse_int(self.gallery_id),
|
"type" : data["type"],
|
||||||
"title" : text.unescape(extr('.html">', '<').strip()),
|
"language" : language,
|
||||||
"artist" : self._prep(extr('<h2>', '</h2>')),
|
"lang" : util.language_to_code(language),
|
||||||
"group" : self._prep(extr('<td>Group</td><td>', '</td>')),
|
"tags" : tags,
|
||||||
"type" : self._prep_1(extr('<td>Type</td><td>', '</td>')),
|
"date" : text.parse_datetime(
|
||||||
"language" : self._prep_1(extr('<td>Language</td><td>', '</td>')),
|
data["date"] + ":00", "%Y-%m-%d %H:%M:%S%z"),
|
||||||
"parody" : self._prep(extr('<td>Series</td><td>', '</td>')),
|
|
||||||
"characters": self._prep(extr('<td>Characters</td><td>', '</td>')),
|
|
||||||
"tags" : self._prep(extr('<td>Tags</td><td>', '</td>')),
|
|
||||||
"date" : self._date(extr('<span class="date">', '</span>')),
|
|
||||||
}
|
}
|
||||||
if data["language"] == "N/a":
|
|
||||||
data["language"] = None
|
|
||||||
data["lang"] = util.language_to_code(data["language"])
|
|
||||||
return data
|
|
||||||
|
|
||||||
def images(self, page):
|
def images(self, _):
|
||||||
result = []
|
result = []
|
||||||
for image in json.loads(page.partition("=")[2])["files"]:
|
for image in self.data["files"]:
|
||||||
ihash = image["hash"]
|
ihash = image["hash"]
|
||||||
idata = text.nameext_from_url(image["name"])
|
idata = text.nameext_from_url(image["name"])
|
||||||
|
|
||||||
@@ -120,18 +106,3 @@ class HitomiGalleryExtractor(GalleryExtractor):
|
|||||||
)
|
)
|
||||||
result.append((url, idata))
|
result.append((url, idata))
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _prep(value):
|
|
||||||
return [
|
|
||||||
text.unescape(string.capwords(v))
|
|
||||||
for v in text.extract_iter(value or "", '.html">', '<')
|
|
||||||
]
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _prep_1(value):
|
|
||||||
return text.remove_html(value).capitalize()
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _date(value):
|
|
||||||
return text.parse_datetime(value + ":00", "%Y-%m-%d %H:%M:%S%z")
|
|
||||||
|
|||||||
Reference in New Issue
Block a user