[hentaicafe] extract 'tags' and 'artist' metadata (closes #238)
These metadata fields are only filled in when using a top-level URL, because that is the only place this information is available. Using a Foolslide URL (1) leaves these fields empty. (1) e.g. https://hentai.cafe/manga/read/.../en/0/1/
This commit is contained in:
@@ -10,6 +10,7 @@
|
||||
|
||||
from . import foolslide
|
||||
from .. import text
|
||||
from ..cache import memcache
|
||||
import re
|
||||
|
||||
|
||||
@@ -21,17 +22,22 @@ class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor):
|
||||
r"(/manga/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)")
|
||||
test = ("https://hentai.cafe/manga/read/saitom-box/en/0/1/", {
|
||||
"url": "8c6a8c56875ba3ed7ab0a74a64f9960077767fc2",
|
||||
"keyword": "7182d262810faa692827c947d2f360dfcb8d5e43",
|
||||
"keyword": "6913608267d883c82b887303b9ced13821188329",
|
||||
})
|
||||
root = "https://hentai.cafe"
|
||||
|
||||
def metadata(self, page):
|
||||
info = text.unescape(text.extract(page, '<title>', '</title>')[0])
|
||||
manga, _, chapter_string = info.partition(" :: ")
|
||||
return self.parse_chapter_url(self.chapter_url, {
|
||||
"manga": manga,
|
||||
"chapter_string": chapter_string.rstrip(" :"),
|
||||
})
|
||||
|
||||
data = self._data(self.chapter_url.split("/")[5])
|
||||
data["manga"] = manga
|
||||
data["chapter_string"] = chapter_string.rstrip(" :")
|
||||
return self.parse_chapter_url(self.chapter_url, data)
|
||||
|
||||
@memcache(keyarg=1)
|
||||
def _data(self, manga):
|
||||
return {"artist": [], "tags": []}
|
||||
|
||||
|
||||
class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor):
|
||||
@@ -43,10 +49,12 @@ class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor):
|
||||
# single chapter
|
||||
("https://hentai.cafe/hazuki-yuuto-summer-blues/", {
|
||||
"url": "f8e24a07d6fbb7c6a6ec5ad8ad8faf2436f8751b",
|
||||
"keyword": "eb9f98544098c961bd8cf5dbe69e6da51c4fb2f6",
|
||||
}),
|
||||
# multi-chapter
|
||||
("https://hentai.cafe/saitom-saitom-box/", {
|
||||
"url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
|
||||
"keyword": "28271062d7b4a2f99a0e1a894f69af8c5581a6bb",
|
||||
}),
|
||||
# foolslide URL
|
||||
("https://hentai.cafe/manga/series/saitom-box/", {
|
||||
@@ -64,8 +72,17 @@ class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor):
|
||||
chapters.reverse()
|
||||
return chapters
|
||||
|
||||
tags , pos = text.extract(page, "<p>Tags: ", "</br>")
|
||||
artist, pos = text.extract(page, "\nArtists: ", "</br>", pos)
|
||||
manga , pos = text.extract(page, "/manga/read/", "/", pos)
|
||||
data = {
|
||||
"tags" : text.split_html(tags)[::2],
|
||||
"artist": text.split_html(artist),
|
||||
}
|
||||
HentaicafeChapterExtractor._data(manga).update(data)
|
||||
|
||||
return [
|
||||
(url, {})
|
||||
(url, data)
|
||||
for url in re.findall(
|
||||
r'<a +class="x-btn[^"]*" +href="([^"]+)"', page)
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user