[hitomi] simplify data extraction code

This commit is contained in:
Mike Fährmann
2019-05-01 11:14:21 +02:00
parent 2756cc8dde
commit fc5e4f2b21

View File

@@ -40,32 +40,23 @@ class HitomiGalleryExtractor(GalleryExtractor):
GalleryExtractor.__init__(self, match, url) GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
    """Collect gallery metadata from a gallery's HTML page.

    Extraction starts at the '<h1><a href="/reader/' heading; str.index()
    raises ValueError when that marker is missing from 'page'.

    The extr() calls are written in the order the fields appear on the
    page (the dict display evaluates its values top to bottom), so they
    must not be reordered -- presumably extr() continues searching from
    where its previous call stopped; confirm against text.extract_from().

    Returns a dict with the keys gallery_id, title, artist, group, type,
    language, parody, characters, tags, date and lang.
    """
    extr = text.extract_from(page, page.index('<h1><a href="/reader/'))
    data = {
        "gallery_id": self.gallery_id,
        "title"     : text.unescape(extr('.html">', '<').strip()),
        "artist"    : self._prep(extr('<h2>', '</h2>')),
        "group"     : self._prep(extr('<td>Group</td><td>', '</td>')),
        "type"      : self._prep_1(extr('<td>Type</td><td>', '</td>')),
        "language"  : self._prep_1(extr('<td>Language</td><td>', '</td>')),
        "parody"    : self._prep(extr('<td>Series</td><td>', '</td>')),
        "characters": self._prep(extr('<td>Characters</td><td>', '</td>')),
        "tags"      : self._prep(extr('<td>Tags</td><td>', '</td>')),
        "date"      : extr('<span class="date">', '</span>'),
    }
    # _prep_1() capitalizes its result, which turns the "N/A" placeholder
    # into "N/a".  Compare case-insensitively so that a missing language
    # is reported as None instead of leaking through as the string "N/a".
    if data["language"].upper() == "N/A":
        data["language"] = None
    data["lang"] = util.language_to_code(data["language"])
    return data
def images(self, page): def images(self, page):
# see https://ltn.hitomi.la/common.js # see https://ltn.hitomi.la/common.js
@@ -84,8 +75,12 @@ class HitomiGalleryExtractor(GalleryExtractor):
] ]
@staticmethod
def _prep(value):
    """Return the names found in the HTML fragment 'value' as a list.

    Every piece of text located between '.html">' and '<' is passed
    through string.capwords() and then text.unescape().  A falsy 'value'
    (None or an empty string) is searched as an empty string.
    """
    names = []
    for raw in text.extract_iter(value or "", '.html">', '<'):
        names.append(text.unescape(string.capwords(raw)))
    return names
@staticmethod
def _prep_1(value):
    """Strip HTML from 'value' and return the text capitalized.

    Note that str.capitalize() lowercases everything after the first
    character, e.g. "N/A" comes back as "N/a".
    """
    plain = text.remove_html(value)
    return plain.capitalize()