adjust metadata types for GalleryExtractors

This commit is contained in:
Mike Fährmann
2019-03-01 23:13:40 +01:00
parent 13e0f2a78f
commit 26c4365baa
6 changed files with 92 additions and 89 deletions

View File

@@ -12,16 +12,20 @@ from .common import GalleryExtractor, Extractor, Message
from .. import text from .. import text
class HentaifoxGalleryExtractor(GalleryExtractor): class HentaifoxBase():
"""Extractor for image galleries on hentaifox.com""" """Base class for hentaifox extractors"""
category = "hentaifox" category = "hentaifox"
root = "https://hentaifox.com"
class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
"""Extractor for image galleries on hentaifox.com"""
pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))" pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
test = ("https://hentaifox.com/gallery/56622/", { test = ("https://hentaifox.com/gallery/56622/", {
"pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg", "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
"count": 24, "count": 24,
"keyword": "d0df47e073e32a7752236ab151949c3820f9d81e", "keyword": "38f8517605feb6854d48833297da6b05c6541b69",
}) })
root = "https://hentaifox.com"
def __init__(self, match): def __init__(self, match):
GalleryExtractor.__init__(self, match) GalleryExtractor.__init__(self, match)
@@ -30,7 +34,7 @@ class HentaifoxGalleryExtractor(GalleryExtractor):
def metadata(self, page): def metadata(self, page):
title, pos = text.extract(page, "<h1>", "</h1>") title, pos = text.extract(page, "<h1>", "</h1>")
data = text.extract_all(page, ( data = text.extract_all(page, (
("parodies" , ">Parodies:" , "</a></span>"), ("parody" , ">Parodies:" , "</a></span>"),
("characters", ">Characters:", "</a></span>"), ("characters", ">Characters:", "</a></span>"),
("tags" , ">Tags:" , "</a></span>"), ("tags" , ">Tags:" , "</a></span>"),
("artist" , ">Artists:" , "</a></span>"), ("artist" , ">Artists:" , "</a></span>"),
@@ -39,9 +43,10 @@ class HentaifoxGalleryExtractor(GalleryExtractor):
), pos)[0] ), pos)[0]
for key, value in data.items(): for key, value in data.items():
data[key] = text.remove_html(value).replace(" , ", ", ") data[key] = text.split_html(value)[::2]
data["gallery_id"] = text.parse_int(self.gallery_id) data["gallery_id"] = text.parse_int(self.gallery_id)
data["title"] = text.unescape(title) data["title"] = text.unescape(title)
data["type"] = data["type"][0] if data["type"] else ""
data["language"] = "English" data["language"] = "English"
data["lang"] = "en" data["lang"] = "en"
return data return data
@@ -53,9 +58,8 @@ class HentaifoxGalleryExtractor(GalleryExtractor):
] ]
class HentaifoxSearchExtractor(Extractor): class HentaifoxSearchExtractor(HentaifoxBase, Extractor):
"""Extractor for search results and listings on hentaifox.com""" """Extractor for search results and listings on hentaifox.com"""
category = "hentaifox"
subcategory = "search" subcategory = "search"
pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com" pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com"
r"(/(?:parody|tag|artist|character|search)/[^/?%#]+)") r"(/(?:parody|tag|artist|character|search)/[^/?%#]+)")
@@ -76,7 +80,6 @@ class HentaifoxSearchExtractor(Extractor):
}, },
}), }),
) )
root = "https://hentaifox.com"
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self, match) Extractor.__init__(self, match)

View File

@@ -20,7 +20,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
test = ( test = (
("https://hitomi.la/galleries/867789.html", { ("https://hitomi.la/galleries/867789.html", {
"url": "cb759868d090fe0e2655c3e29ebf146054322b6d", "url": "cb759868d090fe0e2655c3e29ebf146054322b6d",
"keyword": "52951edb50163180eb669a78aef0bab0522d32b7", "keyword": "07536afc5696cb4983a4831ab4c70c1d155f875c",
}), }),
("https://hitomi.la/galleries/1036181.html", { ("https://hitomi.la/galleries/1036181.html", {
# "aa" subdomain for gallery-id ending in 1 (#142) # "aa" subdomain for gallery-id ending in 1 (#142)
@@ -30,8 +30,8 @@ class HitomiGalleryExtractor(GalleryExtractor):
) )
def __init__(self, match): def __init__(self, match):
self.gid = text.parse_int(match.group(1)) self.gallery_id = text.parse_int(match.group(1))
url = "https://hitomi.la/galleries/{}.html".format(self.gid) url = "https://hitomi.la/galleries/{}.html".format(self.gallery_id)
GalleryExtractor.__init__(self, match, url) GalleryExtractor.__init__(self, match, url)
def metadata(self, page): def metadata(self, page):
@@ -49,23 +49,22 @@ class HitomiGalleryExtractor(GalleryExtractor):
lang = None if lang == "N/A" else text.remove_html(lang) lang = None if lang == "N/A" else text.remove_html(lang)
return { return {
"gallery_id": self.gid, "gallery_id": self.gallery_id,
"title": text.unescape(" ".join(title.split())), "title" : text.unescape(title.strip()),
"artist": self._prepare(artist), "artist" : self._prepare(artist),
"group": self._prepare(group), "group" : self._prepare(group),
"type": text.remove_html(gtype).capitalize(), "parody" : self._prepare(series),
"lang": util.language_to_code(lang),
"language": lang,
"date": date,
"series": self._prepare(series),
"characters": self._prepare(chars), "characters": self._prepare(chars),
"tags": self._prepare(tags), "tags" : self._prepare(tags),
"type" : text.remove_html(gtype).capitalize(),
"lang" : util.language_to_code(lang),
"language" : lang,
"date" : date,
} }
def images(self, page): def images(self, page):
# see https://ltn.hitomi.la/common.js # see https://ltn.hitomi.la/common.js
frontends = 2 offset = self.gallery_id % 2 if self.gallery_id % 10 != 1 else 0
offset = self.gid % frontends if self.gid % 10 != 1 else 0
subdomain = chr(97 + offset) + "a" subdomain = chr(97 + offset) + "a"
base = "https://" + subdomain + ".hitomi.la/galleries/" base = "https://" + subdomain + ".hitomi.la/galleries/"
@@ -78,10 +77,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
@staticmethod @staticmethod
def _prepare(value): def _prepare(value):
if not value or "<ul " not in value: return [
return "" text.unescape(string.capwords(v))
value = ", ".join(text.extract_iter( for v in text.extract_iter(value or "", '.html">', '<')
value, '.html">', '<')) ]
return string.capwords(
text.unescape(value)
)

View File

@@ -32,6 +32,7 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
"title_ja" : str, "title_ja" : str,
"gallery_id": 147850, "gallery_id": 147850,
"media_id" : 867789, "media_id" : 867789,
"count" : 16,
"date" : 1446050915, "date" : 1446050915,
"scanlator" : "", "scanlator" : "",
"artist" : ["morris"], "artist" : ["morris"],
@@ -40,8 +41,8 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
"characters": list, "characters": list,
"tags" : list, "tags" : list,
"type" : "manga", "type" : "manga",
"language" : ["translated", "english"],
"lang" : "en", "lang" : "en",
"language" : "English",
"width" : int, "width" : int,
"height" : int, "height" : int,
}, },
@@ -63,12 +64,11 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
for tag in data["tags"]: for tag in data["tags"]:
info[tag["type"]].append(tag["name"]) info[tag["type"]].append(tag["name"])
language = ""
for language in info["language"]: for language in info["language"]:
if language != "translated": if language != "translated":
lang = util.language_to_code(language) language = language.capitalize()
break break
else:
lang = ""
return { return {
"title" : title_en or title_ja, "title" : title_en or title_ja,
@@ -84,8 +84,8 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
"characters": info["character"], "characters": info["character"],
"tags" : info["tag"], "tags" : info["tag"],
"type" : info["category"][0] if info["category"] else "", "type" : info["category"][0] if info["category"] else "",
"language" : info["language"], "lang" : util.language_to_code(language),
"lang" : lang, "language" : language,
} }
def images(self, _): def images(self, _):

View File

@@ -20,24 +20,24 @@ class PururinGalleryExtractor(GalleryExtractor):
test = ("https://pururin.io/gallery/38661/iowant-2", { test = ("https://pururin.io/gallery/38661/iowant-2", {
"pattern": r"https://cdn.pururin.io/assets/images/data/38661/\d+\.jpg", "pattern": r"https://cdn.pururin.io/assets/images/data/38661/\d+\.jpg",
"keyword": { "keyword": {
"artist": "Shoda Norihiro", "title" : "Iowant 2!!",
"title_en" : "Iowant 2!!",
"title_jp" : "",
"gallery_id": 38661,
"count" : 19,
"artist" : ["Shoda Norihiro"],
"group" : ["Obsidian Order"],
"parody" : ["Kantai Collection"],
"characters": ["Iowa", "Teitoku"], "characters": ["Iowa", "Teitoku"],
"tags" : list,
"type" : "Doujinshi",
"collection": "", "collection": "",
"convention": "C92", "convention": "C92",
"count": 19, "rating" : float,
"extension": "jpg", "uploader" : "demo",
"gallery_id": 38661, "scanlator" : "",
"group": "Obsidian Order", "lang" : "en",
"lang": "en", "language" : "English",
"language": "English",
"parody": "Kantai Collection",
"rating": float,
"scanlator": "",
"tags": list,
"title": "Iowant 2!!",
"title_jp": str,
"type": "Doujinshi",
"uploader": "demo"
} }
}) })
root = "https://pururin.io" root = "https://pururin.io"
@@ -74,18 +74,19 @@ class PururinGalleryExtractor(GalleryExtractor):
self._ext = info["image_extension"] self._ext = info["image_extension"]
self._cnt = info["total_pages"] self._cnt = info["total_pages"]
for key in ("tags", "characters"): for key in ("artist", "group", "parody", "tags", "characters"):
data[key] = [ data[key] = [
text.unescape(item) text.unescape(item)
for item in text.extract_iter(data[key], 'title="', '"') for item in text.extract_iter(data[key], 'title="', '"')
] ]
for key in ("artist", "group", "parody", "type", "collection", for key in ("type", "collection", "language", "scanlator",
"language", "scanlator", "convention"): "convention"):
data[key] = text.unescape(text.extract( data[key] = text.unescape(text.extract(
data[key], 'title="', '"')[0] or "") data[key], 'title="', '"')[0] or "")
data["gallery_id"] = text.parse_int(self.gallery_id) data["gallery_id"] = text.parse_int(self.gallery_id)
data["title"] = info["title"] data["title"] = info["title"] or info.get("j_title") or ""
data["title_en"] = info["title"]
data["title_jp"] = info.get("j_title") or "" data["title_jp"] = info.get("j_title") or ""
data["uploader"] = text.remove_html(data["uploader"]) data["uploader"] = text.remove_html(data["uploader"])
data["rating"] = text.parse_float(data["rating"]) data["rating"] = text.parse_float(data["rating"])

View File

@@ -23,7 +23,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
(("https://original-work.simply-hentai.com" (("https://original-work.simply-hentai.com"
"/amazon-no-hiyaku-amazon-elixir"), { "/amazon-no-hiyaku-amazon-elixir"), {
"url": "258289249990502c3138719cb89e995a60861e49", "url": "258289249990502c3138719cb89e995a60861e49",
"keyword": "468a0a3db4fc6ad7fcae0facefb9753831c0404d", "keyword": "18ab9defca53dbb2aeb7965193e93e0ea125b76b",
}), }),
("https://www.simply-hentai.com/notfound", { ("https://www.simply-hentai.com/notfound", {
"exception": exception.GalleryDLException, "exception": exception.GalleryDLException,
@@ -55,14 +55,14 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
return { return {
"gallery_id": text.parse_int(gid), "gallery_id": text.parse_int(gid),
"title": text.unescape(title), "title" : text.unescape(title),
"series": text.remove_html(series), "artist" : text.split_html(artist),
"characters": ", ".join(text.split_html(chars)), "parody" : text.split_html(series),
"tags": text.split_html(tags), "characters": text.split_html(chars),
"artist": ", ".join(text.split_html(artist)), "tags" : text.split_html(tags),
"lang": util.language_to_code(lang), "lang" : util.language_to_code(lang),
"language": lang, "language" : lang,
"date": text.remove_html(date), "date" : text.remove_html(date),
} }
def images(self, _): def images(self, _):

View File

@@ -48,24 +48,24 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
("https://www.tsumino.com/Book/Info/40996", { ("https://www.tsumino.com/Book/Info/40996", {
"url": "84bf30a86623039fc87855680fada884dc8a1ddd", "url": "84bf30a86623039fc87855680fada884dc8a1ddd",
"keyword": { "keyword": {
"artist": "Itou Life", "title" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
"characters": "Carmilla, Gudako, Gudao, Lancelot, Nightingale", "title_en" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
"collection": "", "title_jp" : "シコシコ大好きナイチンゲール + 会場限定おまけ本",
"count": 42,
"date": "2018 June 29",
"gallery_id": 40996, "gallery_id": 40996,
"group": "Itou Life", "date" : "2018 June 29",
"lang": "en", "count" : 42,
"language": "English", "collection": "",
"page": int, "artist" : ["Itou Life"],
"parodies": "Fate/Grand Order", "group" : ["Itou Life"],
"rating": float, "parody" : ["Fate/Grand Order"],
"tags": str, "characters": list,
"thumbnail": "http://www.tsumino.com/Image/Thumb/40996", "tags" : list,
"title": r"re:Shikoshiko Daisuki Nightingale \+ Kaijou Gentei", "type" : "Doujinshi",
"title_jp": "シコシコ大好きナイチンゲール + 会場限定おまけ本", "rating" : float,
"type": "Doujinshi", "uploader" : "sehki",
"uploader": "sehki" "lang" : "en",
"language" : "English",
"thumbnail" : "http://www.tsumino.com/Image/Thumb/40996",
}, },
}), }),
("https://www.tsumino.com/Read/View/45834"), ("https://www.tsumino.com/Read/View/45834"),
@@ -81,6 +81,8 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
title, pos = extr(page, '"og:title" content="', '"') title, pos = extr(page, '"og:title" content="', '"')
thumb, pos = extr(page, '"og:image" content="', '"', pos) thumb, pos = extr(page, '"og:image" content="', '"', pos)
title_en, _, title_jp = text.unescape(title).partition("/") title_en, _, title_jp = text.unescape(title).partition("/")
title_en = title_en.strip()
title_jp = title_jp.strip()
uploader , pos = extr(page, 'id="Uploader">' , '</div>', pos) uploader , pos = extr(page, 'id="Uploader">' , '</div>', pos)
date , pos = extr(page, 'id="Uploaded">' , '</div>', pos) date , pos = extr(page, 'id="Uploaded">' , '</div>', pos)
@@ -95,19 +97,20 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
return { return {
"gallery_id": text.parse_int(self.gallery_id), "gallery_id": text.parse_int(self.gallery_id),
"title": title_en.strip(), "title": title_en or title_jp,
"title_jp": title_jp.strip(), "title_en": title_en,
"title_jp": title_jp,
"thumbnail": thumb, "thumbnail": thumb,
"uploader": text.remove_html(uploader), "uploader": text.remove_html(uploader),
"date": date.strip(), "date": date.strip(),
"rating": text.parse_float(rating.partition(" ")[0]), "rating": text.parse_float(rating.partition(" ")[0]),
"type": text.remove_html(gtype), "type": text.remove_html(gtype),
"collection": text.remove_html(collection), "collection": text.remove_html(collection),
"group": text.remove_html(group), "group": text.split_html(group),
"artist": ", ".join(text.split_html(artist)), "artist": text.split_html(artist),
"parodies": ", ".join(text.split_html(parody)), "parody": text.split_html(parody),
"characters": ", ".join(text.split_html(character)), "characters": text.split_html(character),
"tags": ", ".join(text.split_html(tags)), "tags": text.split_html(tags),
"language": "English", "language": "English",
"lang": "en", "lang": "en",
} }