adjust metadata types for GalleryExtractors
This commit is contained in:
@@ -12,16 +12,20 @@ from .common import GalleryExtractor, Extractor, Message
|
|||||||
from .. import text
|
from .. import text
|
||||||
|
|
||||||
|
|
||||||
class HentaifoxGalleryExtractor(GalleryExtractor):
|
class HentaifoxBase():
|
||||||
"""Extractor for image galleries on hentaifox.com"""
|
"""Base class for hentaifox extractors"""
|
||||||
category = "hentaifox"
|
category = "hentaifox"
|
||||||
|
root = "https://hentaifox.com"
|
||||||
|
|
||||||
|
|
||||||
|
class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
|
||||||
|
"""Extractor for image galleries on hentaifox.com"""
|
||||||
pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
|
pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
|
||||||
test = ("https://hentaifox.com/gallery/56622/", {
|
test = ("https://hentaifox.com/gallery/56622/", {
|
||||||
"pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
|
"pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
|
||||||
"count": 24,
|
"count": 24,
|
||||||
"keyword": "d0df47e073e32a7752236ab151949c3820f9d81e",
|
"keyword": "38f8517605feb6854d48833297da6b05c6541b69",
|
||||||
})
|
})
|
||||||
root = "https://hentaifox.com"
|
|
||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
GalleryExtractor.__init__(self, match)
|
GalleryExtractor.__init__(self, match)
|
||||||
@@ -30,7 +34,7 @@ class HentaifoxGalleryExtractor(GalleryExtractor):
|
|||||||
def metadata(self, page):
|
def metadata(self, page):
|
||||||
title, pos = text.extract(page, "<h1>", "</h1>")
|
title, pos = text.extract(page, "<h1>", "</h1>")
|
||||||
data = text.extract_all(page, (
|
data = text.extract_all(page, (
|
||||||
("parodies" , ">Parodies:" , "</a></span>"),
|
("parody" , ">Parodies:" , "</a></span>"),
|
||||||
("characters", ">Characters:", "</a></span>"),
|
("characters", ">Characters:", "</a></span>"),
|
||||||
("tags" , ">Tags:" , "</a></span>"),
|
("tags" , ">Tags:" , "</a></span>"),
|
||||||
("artist" , ">Artists:" , "</a></span>"),
|
("artist" , ">Artists:" , "</a></span>"),
|
||||||
@@ -39,9 +43,10 @@ class HentaifoxGalleryExtractor(GalleryExtractor):
|
|||||||
), pos)[0]
|
), pos)[0]
|
||||||
|
|
||||||
for key, value in data.items():
|
for key, value in data.items():
|
||||||
data[key] = text.remove_html(value).replace(" , ", ", ")
|
data[key] = text.split_html(value)[::2]
|
||||||
data["gallery_id"] = text.parse_int(self.gallery_id)
|
data["gallery_id"] = text.parse_int(self.gallery_id)
|
||||||
data["title"] = text.unescape(title)
|
data["title"] = text.unescape(title)
|
||||||
|
data["type"] = data["type"][0] if data["type"] else ""
|
||||||
data["language"] = "English"
|
data["language"] = "English"
|
||||||
data["lang"] = "en"
|
data["lang"] = "en"
|
||||||
return data
|
return data
|
||||||
@@ -53,9 +58,8 @@ class HentaifoxGalleryExtractor(GalleryExtractor):
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
class HentaifoxSearchExtractor(Extractor):
|
class HentaifoxSearchExtractor(HentaifoxBase, Extractor):
|
||||||
"""Extractor for search results and listings on hentaifox.com"""
|
"""Extractor for search results and listings on hentaifox.com"""
|
||||||
category = "hentaifox"
|
|
||||||
subcategory = "search"
|
subcategory = "search"
|
||||||
pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com"
|
pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com"
|
||||||
r"(/(?:parody|tag|artist|character|search)/[^/?%#]+)")
|
r"(/(?:parody|tag|artist|character|search)/[^/?%#]+)")
|
||||||
@@ -76,7 +80,6 @@ class HentaifoxSearchExtractor(Extractor):
|
|||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
)
|
)
|
||||||
root = "https://hentaifox.com"
|
|
||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
|
|||||||
test = (
|
test = (
|
||||||
("https://hitomi.la/galleries/867789.html", {
|
("https://hitomi.la/galleries/867789.html", {
|
||||||
"url": "cb759868d090fe0e2655c3e29ebf146054322b6d",
|
"url": "cb759868d090fe0e2655c3e29ebf146054322b6d",
|
||||||
"keyword": "52951edb50163180eb669a78aef0bab0522d32b7",
|
"keyword": "07536afc5696cb4983a4831ab4c70c1d155f875c",
|
||||||
}),
|
}),
|
||||||
("https://hitomi.la/galleries/1036181.html", {
|
("https://hitomi.la/galleries/1036181.html", {
|
||||||
# "aa" subdomain for gallery-id ending in 1 (#142)
|
# "aa" subdomain for gallery-id ending in 1 (#142)
|
||||||
@@ -30,8 +30,8 @@ class HitomiGalleryExtractor(GalleryExtractor):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
self.gid = text.parse_int(match.group(1))
|
self.gallery_id = text.parse_int(match.group(1))
|
||||||
url = "https://hitomi.la/galleries/{}.html".format(self.gid)
|
url = "https://hitomi.la/galleries/{}.html".format(self.gallery_id)
|
||||||
GalleryExtractor.__init__(self, match, url)
|
GalleryExtractor.__init__(self, match, url)
|
||||||
|
|
||||||
def metadata(self, page):
|
def metadata(self, page):
|
||||||
@@ -49,23 +49,22 @@ class HitomiGalleryExtractor(GalleryExtractor):
|
|||||||
lang = None if lang == "N/A" else text.remove_html(lang)
|
lang = None if lang == "N/A" else text.remove_html(lang)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"gallery_id": self.gid,
|
"gallery_id": self.gallery_id,
|
||||||
"title": text.unescape(" ".join(title.split())),
|
"title" : text.unescape(title.strip()),
|
||||||
"artist": self._prepare(artist),
|
"artist" : self._prepare(artist),
|
||||||
"group": self._prepare(group),
|
"group" : self._prepare(group),
|
||||||
"type": text.remove_html(gtype).capitalize(),
|
"parody" : self._prepare(series),
|
||||||
"lang": util.language_to_code(lang),
|
|
||||||
"language": lang,
|
|
||||||
"date": date,
|
|
||||||
"series": self._prepare(series),
|
|
||||||
"characters": self._prepare(chars),
|
"characters": self._prepare(chars),
|
||||||
"tags": self._prepare(tags),
|
"tags" : self._prepare(tags),
|
||||||
|
"type" : text.remove_html(gtype).capitalize(),
|
||||||
|
"lang" : util.language_to_code(lang),
|
||||||
|
"language" : lang,
|
||||||
|
"date" : date,
|
||||||
}
|
}
|
||||||
|
|
||||||
def images(self, page):
|
def images(self, page):
|
||||||
# see https://ltn.hitomi.la/common.js
|
# see https://ltn.hitomi.la/common.js
|
||||||
frontends = 2
|
offset = self.gallery_id % 2 if self.gallery_id % 10 != 1 else 0
|
||||||
offset = self.gid % frontends if self.gid % 10 != 1 else 0
|
|
||||||
subdomain = chr(97 + offset) + "a"
|
subdomain = chr(97 + offset) + "a"
|
||||||
base = "https://" + subdomain + ".hitomi.la/galleries/"
|
base = "https://" + subdomain + ".hitomi.la/galleries/"
|
||||||
|
|
||||||
@@ -78,10 +77,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _prepare(value):
|
def _prepare(value):
|
||||||
if not value or "<ul " not in value:
|
return [
|
||||||
return ""
|
text.unescape(string.capwords(v))
|
||||||
value = ", ".join(text.extract_iter(
|
for v in text.extract_iter(value or "", '.html">', '<')
|
||||||
value, '.html">', '<'))
|
]
|
||||||
return string.capwords(
|
|
||||||
text.unescape(value)
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
|
|||||||
"title_ja" : str,
|
"title_ja" : str,
|
||||||
"gallery_id": 147850,
|
"gallery_id": 147850,
|
||||||
"media_id" : 867789,
|
"media_id" : 867789,
|
||||||
|
"count" : 16,
|
||||||
"date" : 1446050915,
|
"date" : 1446050915,
|
||||||
"scanlator" : "",
|
"scanlator" : "",
|
||||||
"artist" : ["morris"],
|
"artist" : ["morris"],
|
||||||
@@ -40,8 +41,8 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
|
|||||||
"characters": list,
|
"characters": list,
|
||||||
"tags" : list,
|
"tags" : list,
|
||||||
"type" : "manga",
|
"type" : "manga",
|
||||||
"language" : ["translated", "english"],
|
|
||||||
"lang" : "en",
|
"lang" : "en",
|
||||||
|
"language" : "English",
|
||||||
"width" : int,
|
"width" : int,
|
||||||
"height" : int,
|
"height" : int,
|
||||||
},
|
},
|
||||||
@@ -63,12 +64,11 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
|
|||||||
for tag in data["tags"]:
|
for tag in data["tags"]:
|
||||||
info[tag["type"]].append(tag["name"])
|
info[tag["type"]].append(tag["name"])
|
||||||
|
|
||||||
|
language = ""
|
||||||
for language in info["language"]:
|
for language in info["language"]:
|
||||||
if language != "translated":
|
if language != "translated":
|
||||||
lang = util.language_to_code(language)
|
language = language.capitalize()
|
||||||
break
|
break
|
||||||
else:
|
|
||||||
lang = ""
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"title" : title_en or title_ja,
|
"title" : title_en or title_ja,
|
||||||
@@ -84,8 +84,8 @@ class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
|
|||||||
"characters": info["character"],
|
"characters": info["character"],
|
||||||
"tags" : info["tag"],
|
"tags" : info["tag"],
|
||||||
"type" : info["category"][0] if info["category"] else "",
|
"type" : info["category"][0] if info["category"] else "",
|
||||||
"language" : info["language"],
|
"lang" : util.language_to_code(language),
|
||||||
"lang" : lang,
|
"language" : language,
|
||||||
}
|
}
|
||||||
|
|
||||||
def images(self, _):
|
def images(self, _):
|
||||||
|
|||||||
@@ -20,24 +20,24 @@ class PururinGalleryExtractor(GalleryExtractor):
|
|||||||
test = ("https://pururin.io/gallery/38661/iowant-2", {
|
test = ("https://pururin.io/gallery/38661/iowant-2", {
|
||||||
"pattern": r"https://cdn.pururin.io/assets/images/data/38661/\d+\.jpg",
|
"pattern": r"https://cdn.pururin.io/assets/images/data/38661/\d+\.jpg",
|
||||||
"keyword": {
|
"keyword": {
|
||||||
"artist": "Shoda Norihiro",
|
"title" : "Iowant 2!!",
|
||||||
|
"title_en" : "Iowant 2!!",
|
||||||
|
"title_jp" : "",
|
||||||
|
"gallery_id": 38661,
|
||||||
|
"count" : 19,
|
||||||
|
"artist" : ["Shoda Norihiro"],
|
||||||
|
"group" : ["Obsidian Order"],
|
||||||
|
"parody" : ["Kantai Collection"],
|
||||||
"characters": ["Iowa", "Teitoku"],
|
"characters": ["Iowa", "Teitoku"],
|
||||||
|
"tags" : list,
|
||||||
|
"type" : "Doujinshi",
|
||||||
"collection": "",
|
"collection": "",
|
||||||
"convention": "C92",
|
"convention": "C92",
|
||||||
"count": 19,
|
"rating" : float,
|
||||||
"extension": "jpg",
|
"uploader" : "demo",
|
||||||
"gallery_id": 38661,
|
"scanlator" : "",
|
||||||
"group": "Obsidian Order",
|
"lang" : "en",
|
||||||
"lang": "en",
|
"language" : "English",
|
||||||
"language": "English",
|
|
||||||
"parody": "Kantai Collection",
|
|
||||||
"rating": float,
|
|
||||||
"scanlator": "",
|
|
||||||
"tags": list,
|
|
||||||
"title": "Iowant 2!!",
|
|
||||||
"title_jp": str,
|
|
||||||
"type": "Doujinshi",
|
|
||||||
"uploader": "demo"
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
root = "https://pururin.io"
|
root = "https://pururin.io"
|
||||||
@@ -74,18 +74,19 @@ class PururinGalleryExtractor(GalleryExtractor):
|
|||||||
self._ext = info["image_extension"]
|
self._ext = info["image_extension"]
|
||||||
self._cnt = info["total_pages"]
|
self._cnt = info["total_pages"]
|
||||||
|
|
||||||
for key in ("tags", "characters"):
|
for key in ("artist", "group", "parody", "tags", "characters"):
|
||||||
data[key] = [
|
data[key] = [
|
||||||
text.unescape(item)
|
text.unescape(item)
|
||||||
for item in text.extract_iter(data[key], 'title="', '"')
|
for item in text.extract_iter(data[key], 'title="', '"')
|
||||||
]
|
]
|
||||||
for key in ("artist", "group", "parody", "type", "collection",
|
for key in ("type", "collection", "language", "scanlator",
|
||||||
"language", "scanlator", "convention"):
|
"convention"):
|
||||||
data[key] = text.unescape(text.extract(
|
data[key] = text.unescape(text.extract(
|
||||||
data[key], 'title="', '"')[0] or "")
|
data[key], 'title="', '"')[0] or "")
|
||||||
|
|
||||||
data["gallery_id"] = text.parse_int(self.gallery_id)
|
data["gallery_id"] = text.parse_int(self.gallery_id)
|
||||||
data["title"] = info["title"]
|
data["title"] = info["title"] or info.get("j_title") or ""
|
||||||
|
data["title_en"] = info["title"]
|
||||||
data["title_jp"] = info.get("j_title") or ""
|
data["title_jp"] = info.get("j_title") or ""
|
||||||
data["uploader"] = text.remove_html(data["uploader"])
|
data["uploader"] = text.remove_html(data["uploader"])
|
||||||
data["rating"] = text.parse_float(data["rating"])
|
data["rating"] = text.parse_float(data["rating"])
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
|
|||||||
(("https://original-work.simply-hentai.com"
|
(("https://original-work.simply-hentai.com"
|
||||||
"/amazon-no-hiyaku-amazon-elixir"), {
|
"/amazon-no-hiyaku-amazon-elixir"), {
|
||||||
"url": "258289249990502c3138719cb89e995a60861e49",
|
"url": "258289249990502c3138719cb89e995a60861e49",
|
||||||
"keyword": "468a0a3db4fc6ad7fcae0facefb9753831c0404d",
|
"keyword": "18ab9defca53dbb2aeb7965193e93e0ea125b76b",
|
||||||
}),
|
}),
|
||||||
("https://www.simply-hentai.com/notfound", {
|
("https://www.simply-hentai.com/notfound", {
|
||||||
"exception": exception.GalleryDLException,
|
"exception": exception.GalleryDLException,
|
||||||
@@ -55,14 +55,14 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
"gallery_id": text.parse_int(gid),
|
"gallery_id": text.parse_int(gid),
|
||||||
"title": text.unescape(title),
|
"title" : text.unescape(title),
|
||||||
"series": text.remove_html(series),
|
"artist" : text.split_html(artist),
|
||||||
"characters": ", ".join(text.split_html(chars)),
|
"parody" : text.split_html(series),
|
||||||
"tags": text.split_html(tags),
|
"characters": text.split_html(chars),
|
||||||
"artist": ", ".join(text.split_html(artist)),
|
"tags" : text.split_html(tags),
|
||||||
"lang": util.language_to_code(lang),
|
"lang" : util.language_to_code(lang),
|
||||||
"language": lang,
|
"language" : lang,
|
||||||
"date": text.remove_html(date),
|
"date" : text.remove_html(date),
|
||||||
}
|
}
|
||||||
|
|
||||||
def images(self, _):
|
def images(self, _):
|
||||||
|
|||||||
@@ -48,24 +48,24 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
|
|||||||
("https://www.tsumino.com/Book/Info/40996", {
|
("https://www.tsumino.com/Book/Info/40996", {
|
||||||
"url": "84bf30a86623039fc87855680fada884dc8a1ddd",
|
"url": "84bf30a86623039fc87855680fada884dc8a1ddd",
|
||||||
"keyword": {
|
"keyword": {
|
||||||
"artist": "Itou Life",
|
"title" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
|
||||||
"characters": "Carmilla, Gudako, Gudao, Lancelot, Nightingale",
|
"title_en" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
|
||||||
"collection": "",
|
"title_jp" : "シコシコ大好きナイチンゲール + 会場限定おまけ本",
|
||||||
"count": 42,
|
|
||||||
"date": "2018 June 29",
|
|
||||||
"gallery_id": 40996,
|
"gallery_id": 40996,
|
||||||
"group": "Itou Life",
|
"date" : "2018 June 29",
|
||||||
"lang": "en",
|
"count" : 42,
|
||||||
"language": "English",
|
"collection": "",
|
||||||
"page": int,
|
"artist" : ["Itou Life"],
|
||||||
"parodies": "Fate/Grand Order",
|
"group" : ["Itou Life"],
|
||||||
"rating": float,
|
"parody" : ["Fate/Grand Order"],
|
||||||
"tags": str,
|
"characters": list,
|
||||||
"thumbnail": "http://www.tsumino.com/Image/Thumb/40996",
|
"tags" : list,
|
||||||
"title": r"re:Shikoshiko Daisuki Nightingale \+ Kaijou Gentei",
|
"type" : "Doujinshi",
|
||||||
"title_jp": "シコシコ大好きナイチンゲール + 会場限定おまけ本",
|
"rating" : float,
|
||||||
"type": "Doujinshi",
|
"uploader" : "sehki",
|
||||||
"uploader": "sehki"
|
"lang" : "en",
|
||||||
|
"language" : "English",
|
||||||
|
"thumbnail" : "http://www.tsumino.com/Image/Thumb/40996",
|
||||||
},
|
},
|
||||||
}),
|
}),
|
||||||
("https://www.tsumino.com/Read/View/45834"),
|
("https://www.tsumino.com/Read/View/45834"),
|
||||||
@@ -81,6 +81,8 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
|
|||||||
title, pos = extr(page, '"og:title" content="', '"')
|
title, pos = extr(page, '"og:title" content="', '"')
|
||||||
thumb, pos = extr(page, '"og:image" content="', '"', pos)
|
thumb, pos = extr(page, '"og:image" content="', '"', pos)
|
||||||
title_en, _, title_jp = text.unescape(title).partition("/")
|
title_en, _, title_jp = text.unescape(title).partition("/")
|
||||||
|
title_en = title_en.strip()
|
||||||
|
title_jp = title_jp.strip()
|
||||||
|
|
||||||
uploader , pos = extr(page, 'id="Uploader">' , '</div>', pos)
|
uploader , pos = extr(page, 'id="Uploader">' , '</div>', pos)
|
||||||
date , pos = extr(page, 'id="Uploaded">' , '</div>', pos)
|
date , pos = extr(page, 'id="Uploaded">' , '</div>', pos)
|
||||||
@@ -95,19 +97,20 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
"gallery_id": text.parse_int(self.gallery_id),
|
"gallery_id": text.parse_int(self.gallery_id),
|
||||||
"title": title_en.strip(),
|
"title": title_en or title_jp,
|
||||||
"title_jp": title_jp.strip(),
|
"title_en": title_en,
|
||||||
|
"title_jp": title_jp,
|
||||||
"thumbnail": thumb,
|
"thumbnail": thumb,
|
||||||
"uploader": text.remove_html(uploader),
|
"uploader": text.remove_html(uploader),
|
||||||
"date": date.strip(),
|
"date": date.strip(),
|
||||||
"rating": text.parse_float(rating.partition(" ")[0]),
|
"rating": text.parse_float(rating.partition(" ")[0]),
|
||||||
"type": text.remove_html(gtype),
|
"type": text.remove_html(gtype),
|
||||||
"collection": text.remove_html(collection),
|
"collection": text.remove_html(collection),
|
||||||
"group": text.remove_html(group),
|
"group": text.split_html(group),
|
||||||
"artist": ", ".join(text.split_html(artist)),
|
"artist": text.split_html(artist),
|
||||||
"parodies": ", ".join(text.split_html(parody)),
|
"parody": text.split_html(parody),
|
||||||
"characters": ", ".join(text.split_html(character)),
|
"characters": text.split_html(character),
|
||||||
"tags": ", ".join(text.split_html(tags)),
|
"tags": text.split_html(tags),
|
||||||
"language": "English",
|
"language": "English",
|
||||||
"lang": "en",
|
"lang": "en",
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user