[exhentai] update data extraction code
- parse 'date' to datetime object - use 'text.extract_from()'
This commit is contained in:
@@ -110,7 +110,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
|
|||||||
r"|/s/([\da-f]{10})/(\d+)-(\d+))")
|
r"|/s/([\da-f]{10})/(\d+)-(\d+))")
|
||||||
test = (
|
test = (
|
||||||
("https://exhentai.org/g/960460/4f0e369d82/", {
|
("https://exhentai.org/g/960460/4f0e369d82/", {
|
||||||
"keyword": "993bfaf68b4823084fbd0d3339564666463b1432",
|
"keyword": "1532ca4d0e4e0738dc994ca725a228af04a4e480",
|
||||||
"content": "493d759de534355c9f55f8e365565b62411de146",
|
"content": "493d759de534355c9f55f8e365565b62411de146",
|
||||||
}),
|
}),
|
||||||
("https://exhentai.org/g/960461/4f0e369d82/", {
|
("https://exhentai.org/g/960461/4f0e369d82/", {
|
||||||
@@ -169,57 +169,55 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
|
|||||||
|
|
||||||
def get_metadata(self, page):
|
def get_metadata(self, page):
|
||||||
"""Extract gallery metadata"""
|
"""Extract gallery metadata"""
|
||||||
data, pos = text.extract_all(page, (
|
extr = text.extract_from(page)
|
||||||
("title" , '<h1 id="gn">', '</h1>'),
|
data = {
|
||||||
("title_jp" , '<h1 id="gj">', '</h1>'),
|
"gallery_id" : self.gallery_id,
|
||||||
("date" , '>Posted:</td><td class="gdt2">', '</td>'),
|
"gallery_token": self.gallery_token,
|
||||||
("parent" , '>Parent:</td><td class="gdt2"><a href="', '"'),
|
"title" : text.unescape(extr('<h1 id="gn">', '</h1>')),
|
||||||
("visible" , '>Visible:</td><td class="gdt2">', '<'),
|
"title_jp" : text.unescape(extr('<h1 id="gj">', '</h1>')),
|
||||||
("language" , '>Language:</td><td class="gdt2">', ' '),
|
"date" : text.parse_datetime(extr(
|
||||||
("gallery_size", '>File Size:</td><td class="gdt2">', '<'),
|
'>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"),
|
||||||
("count" , '>Length:</td><td class="gdt2">', ' '),
|
"parent" : extr(
|
||||||
))
|
'>Parent:</td><td class="gdt2"><a href="', '"'),
|
||||||
|
"visible" : extr(
|
||||||
|
'>Visible:</td><td class="gdt2">', '<'),
|
||||||
|
"language" : extr(
|
||||||
|
'>Language:</td><td class="gdt2">', ' '),
|
||||||
|
"gallery_size" : text.parse_bytes(extr(
|
||||||
|
'>File Size:</td><td class="gdt2">', '<').rstrip("Bb")),
|
||||||
|
"count" : text.parse_int(extr(
|
||||||
|
'>Length:</td><td class="gdt2">', ' ')),
|
||||||
|
}
|
||||||
|
|
||||||
data["lang"] = util.language_to_code(data["language"])
|
data["lang"] = util.language_to_code(data["language"])
|
||||||
data["title"] = text.unescape(data["title"])
|
|
||||||
data["title_jp"] = text.unescape(data["title_jp"])
|
|
||||||
data["count"] = text.parse_int(data["count"])
|
|
||||||
data["gallery_id"] = self.gallery_id
|
|
||||||
data["gallery_token"] = self.gallery_token
|
|
||||||
data["gallery_size"] = text.parse_bytes(
|
|
||||||
data["gallery_size"].rstrip("Bb"))
|
|
||||||
data["tags"] = [
|
data["tags"] = [
|
||||||
text.unquote(tag)
|
text.unquote(tag)
|
||||||
for tag in text.extract_iter(page, 'hentai.org/tag/', '"', pos)
|
for tag in text.extract_iter(page, 'hentai.org/tag/', '"')
|
||||||
]
|
]
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def image_from_page(self, page):
|
def image_from_page(self, page):
|
||||||
"""Get image url and data from webpage"""
|
"""Get image url and data from webpage"""
|
||||||
info = text.extract_all(page, (
|
pos = page.index('<div id="i3"><a onclick="return load_image(') + 26
|
||||||
(None , '<div id="i3"><a onclick="return load_image(', ''),
|
extr = text.extract_from(page, pos)
|
||||||
("nextkey" , "'", "'"),
|
|
||||||
("url" , '<img id="img" src="', '"'),
|
|
||||||
("origurl" , 'hentai.org/fullimg.php', '"'),
|
|
||||||
("originfo", 'ownload original', '<'),
|
|
||||||
("startkey", 'var startkey="', '";'),
|
|
||||||
("showkey" , 'var showkey="', '";'),
|
|
||||||
))[0]
|
|
||||||
self.key["start"] = info["startkey"]
|
|
||||||
self.key["show"] = info["showkey"]
|
|
||||||
self.key["next"] = info["nextkey"]
|
|
||||||
|
|
||||||
if self.original and info["origurl"]:
|
self.key["next"] = extr("'", "'")
|
||||||
part = text.unescape(info["origurl"])
|
iurl = extr('<img id="img" src="', '"')
|
||||||
url = self.root + "/fullimg.php" + part
|
orig = extr('hentai.org/fullimg.php', '"')
|
||||||
data = self._parse_original_info(info["originfo"])
|
|
||||||
|
if self.original and orig:
|
||||||
|
url = self.root + "/fullimg.php" + text.unescape(orig)
|
||||||
|
data = self._parse_original_info(extr('ownload original', '<'))
|
||||||
else:
|
else:
|
||||||
url = info["url"]
|
url = iurl
|
||||||
data = self._parse_image_info(url)
|
data = self._parse_image_info(url)
|
||||||
|
|
||||||
data["num"] = self.image_num
|
data["num"] = self.image_num
|
||||||
data["image_token"] = info["startkey"]
|
data["image_token"] = self.key["start"] = extr('var startkey="', '";')
|
||||||
return url, text.nameext_from_url(info["url"], data)
|
self.key["show"] = extr('var showkey="', '";')
|
||||||
|
|
||||||
|
return url, text.nameext_from_url(iurl, data)
|
||||||
|
|
||||||
def images_from_api(self):
|
def images_from_api(self):
|
||||||
"""Get image url and data from api calls"""
|
"""Get image url and data from api calls"""
|
||||||
|
|||||||
Reference in New Issue
Block a user