[hitomi] fix image URLs and improve metadata

- use '?a.hitomi.la' as subdomain depending on gallery-id
- add 'characters', 'tags' and 'date' information
- support multiple entries per metadata-value
- rename 'num' to 'page'
This commit is contained in:
Mike Fährmann
2018-03-20 17:36:06 +01:00
parent d75dc71394
commit b2ba2b821d

View File

@@ -8,77 +8,75 @@
"""Extract images from https://hitomi.la/""" """Extract images from https://hitomi.la/"""
from .common import Extractor, Message from .common import ChapterExtractor
from .. import text, util from .. import text, util
import string import string
class HitomiGalleryExtractor(Extractor): class HitomiGalleryExtractor(ChapterExtractor):
"""Extractor for image galleries from hitomi.la""" """Extractor for image galleries from hitomi.la"""
category = "hitomi" category = "hitomi"
subcategory = "gallery" subcategory = "gallery"
directory_fmt = ["{category}", "{gallery_id} {title}"] directory_fmt = ["{category}", "{gallery_id} {title}"]
filename_fmt = "{category}_{gallery_id}_{num:>03}_{name}.{extension}" filename_fmt = "{category}_{gallery_id}_{page:>03}_{name}.{extension}"
archive_fmt = "{gallery_id}_{num}" archive_fmt = "{gallery_id}_{page}"
pattern = [r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)\.html"] pattern = [r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)"]
test = [("https://hitomi.la/galleries/867789.html", { test = [
"url": "e42a47dfadda93e4bf37e82b1dc9ad29edfa9130", ("https://hitomi.la/galleries/867789.html", {
"keyword": "c007cd41229d727b2ced3b364350561444738351", "url": "cb759868d090fe0e2655c3e29ebf146054322b6d",
})] "keyword": "b1e66ff971fc8cb80240a687f508f3b74053f799",
}),
("https://hitomi.la/reader/867789.html", None),
]
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) self.gid = util.safe_int(match.group(1))
self.gid = match.group(1) url = "https://hitomi.la/galleries/{}.html".format(self.gid)
ChapterExtractor.__init__(self, url)
def items(self): def get_metadata(self, page, extr=text.extract):
url = "https://hitomi.la/galleries/" + self.gid + ".html" pos = page.index('<h1><a href="/reader/')
page = self.request(url).text title , pos = extr(page, '.html">', '<', pos)
data = self.get_job_metadata(page) artist, pos = extr(page, '<h2>', '</h2>', pos)
images = self.get_image_urls(page) group , pos = extr(page, '<td>Group</td><td>', '</td>', pos)
data["count"] = len(images) gtype , pos = extr(page, '<td>Type</td><td>', '</td>', pos)
yield Message.Version, 1 lang , pos = extr(page, '<td>Language</td><td>', '</td>', pos)
yield Message.Directory, data series, pos = extr(page, '<td>Series</td><td>', '</td>', pos)
for data["num"], url in enumerate(images, 1): chars , pos = extr(page, '<td>Characters</td><td>', '</td>', pos)
yield Message.Url, url, text.nameext_from_url(url, data) tags , pos = extr(page, '<td>Tags</td><td>', '</td>', pos)
date , pos = extr(page, '<span class="date">', '</span>', pos)
lang = None if lang == "N/A" else text.remove_html(lang)
def get_job_metadata(self, page):
"""Collect metadata for extractor-job"""
group = ""
gtype = ""
series = ""
_ , pos = text.extract(page, '<h1><a href="/reader/', '')
title , pos = text.extract(page, '.html">', "</a>", pos)
_ , pos = text.extract(page, '<li><a href="/artist/', '', pos)
artist, pos = text.extract(page, '.html">', '</a>', pos)
test , pos = text.extract(page, '<li><a href="/group/', '', pos)
if test is not None:
group , pos = text.extract(page, '.html">', '</a>', pos)
test , pos = text.extract(page, '<a href="/type/', '', pos)
if test is not None:
gtype , pos = text.extract(page, '.html">', '</a>', pos)
_ , pos = text.extract(page, '<tdLanguage</td>', '', pos)
lang , pos = text.extract(page, '.html">', '</a>', pos)
test , pos = text.extract(page, '<a href="/series/', '', pos)
if test is not None:
series, pos = text.extract(page, '.html">', '</a>', pos)
lang = lang.capitalize()
return { return {
"gallery_id": self.gid, "gallery_id": self.gid,
"title": " ".join(title.split()), "title": text.unescape(" ".join(title.split())),
"artist": string.capwords(artist), "artist": self._prepare(artist),
"group": string.capwords(group), "group": self._prepare(group),
"type": gtype.strip().capitalize(), "type": text.remove_html(gtype).capitalize(),
"lang": util.language_to_code(lang), "lang": util.language_to_code(lang),
"language": lang, "language": lang,
"series": string.capwords(series), "date": date,
"series": self._prepare(series),
"characters": self._prepare(chars),
"tags": self._prepare(tags),
} }
@staticmethod def get_images(self, page):
def get_image_urls(page): subdomain = chr(97 + self.gid % 2) + "a"
"""Extract and return a list of all image-urls""" base = "https://" + subdomain + ".hitomi.la/galleries/"
return [ return [
"https://la.hitomi.la/galleries/" + urlpart (base + urlpart, None)
for urlpart in text.extract_iter( for urlpart in text.extract_iter(
page, "'//tn.hitomi.la/smalltn/", ".jpg'," page, "'//tn.hitomi.la/smalltn/", ".jpg',"
) )
] ]
@staticmethod
def _prepare(value):
    """Join the entries of a metadata HTML fragment into one string"""
    # empty/missing fragments and fragments without a '<ul ' list
    # contribute no entries at all
    if value and "<ul " in value:
        # every entry is the text between '.html">' and the next '<'
        entries = text.extract_iter(value, '.html">', '<')
        joined = ", ".join(entries)
        # unescape HTML entities first, then capitalize each word
        return string.capwords(text.unescape(joined))
    return ""