[hentaifox] improve metadata extraction (fixes #1378)
This commit is contained in:
@@ -22,27 +22,56 @@ class HentaifoxBase():
|
||||
class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
|
||||
"""Extractor for image galleries on hentaifox.com"""
|
||||
pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
|
||||
test = ("https://hentaifox.com/gallery/56622/", {
|
||||
"pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
|
||||
"keyword": "bcd6b67284f378e5cc30b89b761140e3e60fcd92",
|
||||
"count": 24,
|
||||
})
|
||||
test = (
|
||||
("https://hentaifox.com/gallery/56622/", {
|
||||
"pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
|
||||
"keyword": "bcd6b67284f378e5cc30b89b761140e3e60fcd92",
|
||||
"count": 24,
|
||||
}),
|
||||
# gallery whose tag markup contains a 'split_tag' element (#1378)
|
||||
("https://hentaifox.com/gallery/630/", {
|
||||
"keyword": {
|
||||
"artist": ["beti", "betty", "magi", "mimikaki"],
|
||||
"characters": [
|
||||
"aerith gainsborough",
|
||||
"tifa lockhart",
|
||||
"yuffie kisaragi"
|
||||
],
|
||||
"count": 32,
|
||||
"gallery_id": 630,
|
||||
"group": ["cu-little2"],
|
||||
"parody": ["darkstalkers | vampire", "final fantasy vii"],
|
||||
"tags": ["femdom", "fingering", "masturbation", "yuri"],
|
||||
"title": "Cu-Little Bakanya~",
|
||||
"type": "doujinshi",
|
||||
},
|
||||
}),
|
||||
)
|
||||
|
||||
def __init__(self, match):
    """Set up the gallery extractor and remember the gallery ID.

    'match' is the regex match object produced by 'pattern'.
    """
    GalleryExtractor.__init__(self, match)
    # group 2 of 'pattern' is the run of digits following "/gallery/"
    gallery_id = match.group(2)
    self.gallery_id = gallery_id
|
||||
|
||||
def metadata(self, page, split=text.split_html):
|
||||
@staticmethod
def _split(txt):
    """Return a list with one cleaned-up string per tag button in 'txt'.

    Each entry is the text found between a "class='tag_btn" marker and
    the following "<span class='t_badge" marker, minus the remainder of
    the opening tag.
    """
    names = []
    for fragment in text.extract_iter(
            txt, "class='tag_btn", "<span class='t_badge"):
        # discard everything up to and including the first '>',
        # i.e. the rest of the element's opening tag
        inner = fragment.partition(">")[2]
        # NOTE(review): presumably strips any remaining markup using
        # empty replacement/separator strings - confirm against
        # text.remove_html
        names.append(text.remove_html(inner, "", ""))
    return names
|
||||
|
||||
def metadata(self, page):
|
||||
extr = text.extract_from(page)
|
||||
split = self._split
|
||||
|
||||
return {
|
||||
"gallery_id": text.parse_int(self.gallery_id),
|
||||
"title" : text.unescape(extr("<h1>", "</h1>")),
|
||||
"parody" : split(extr(">Parodies:" , "</ul>"))[::2],
|
||||
"characters": split(extr(">Characters:", "</ul>"))[::2],
|
||||
"tags" : split(extr(">Tags:" , "</ul>"))[::2],
|
||||
"artist" : split(extr(">Artists:" , "</ul>"))[::2],
|
||||
"group" : split(extr(">Groups:" , "</ul>"))[::2],
|
||||
"parody" : split(extr(">Parodies:" , "</ul>")),
|
||||
"characters": split(extr(">Characters:", "</ul>")),
|
||||
"tags" : split(extr(">Tags:" , "</ul>")),
|
||||
"artist" : split(extr(">Artists:" , "</ul>")),
|
||||
"group" : split(extr(">Groups:" , "</ul>")),
|
||||
"type" : text.remove_html(extr(">Category:", "<span")),
|
||||
"language" : "English",
|
||||
"lang" : "en",
|
||||
|
||||
Reference in New Issue
Block a user