[exhentai] use text.extract_all
This commit is contained in:
@@ -10,7 +10,6 @@
|
|||||||
|
|
||||||
from .common import Extractor, Message
|
from .common import Extractor, Message
|
||||||
from .. import config, text
|
from .. import config, text
|
||||||
import re
|
|
||||||
import os.path
|
import os.path
|
||||||
import time
|
import time
|
||||||
import random
|
import random
|
||||||
@@ -27,7 +26,7 @@ info = {
|
|||||||
|
|
||||||
class ExhentaiExtractor(Extractor):
|
class ExhentaiExtractor(Extractor):
|
||||||
|
|
||||||
api_url = "http://exhentai.org/api.php"
|
api_url = "http://exhentai.org/api.php"
|
||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
Extractor.__init__(self)
|
Extractor.__init__(self)
|
||||||
@@ -64,39 +63,43 @@ class ExhentaiExtractor(Extractor):
|
|||||||
image["name"] = name
|
image["name"] = name
|
||||||
image["extension"] = ext[1:]
|
image["extension"] = ext[1:]
|
||||||
if "/fullimg.php" in image[urlkey]:
|
if "/fullimg.php" in image[urlkey]:
|
||||||
time.sleep( random.uniform(1, 2) )
|
time.sleep(random.uniform(1, 2))
|
||||||
yield Message.Url, image[urlkey], image
|
yield Message.Url, image[urlkey], image
|
||||||
|
|
||||||
def get_job_metadata(self, page):
|
def get_job_metadata(self, page):
|
||||||
title , pos = text.extract(page, '<h1 id="gn">', '</h1>')
|
"""Collect metadata for extractor-job"""
|
||||||
title_jp, pos = text.extract(page, '<h1 id="gj">', '</h1>', pos)
|
data = {
|
||||||
date , pos = text.extract(page, '>Posted:</td><td class="gdt2">', '</td>', pos)
|
"category" : info["category"],
|
||||||
language, pos = text.extract(page, '>Language:</td><td class="gdt2">', '</td>', pos)
|
"gallery-id" : self.gid,
|
||||||
size , pos = text.extract(page, '>File Size:</td><td class="gdt2">', ' ', pos)
|
|
||||||
url , pos = text.extract(page, 'hentai.org/s/', '"', pos)
|
|
||||||
return {
|
|
||||||
"category": info["category"],
|
|
||||||
"gallery-id": self.gid,
|
|
||||||
"gallery-token": self.token,
|
"gallery-token": self.token,
|
||||||
"title": title,
|
}
|
||||||
"title-jp": title_jp,
|
data, _ = text.extract_all(page, (
|
||||||
"date": date,
|
("title" , '<h1 id="gn">', '</h1>'),
|
||||||
"language": language,
|
("title_jp", '<h1 id="gj">', '</h1>'),
|
||||||
"size": size,
|
("date" , '>Posted:</td><td class="gdt2">', '</td>'),
|
||||||
}, "http://exhentai.org/s/" + url
|
("language", '>Language:</td><td class="gdt2">', '</td>'),
|
||||||
|
("size" , '>File Size:</td><td class="gdt2">', ' '),
|
||||||
|
("count" , '>Length:</td><td class="gdt2">', ' '),
|
||||||
|
("url" , 'hentai.org/s/', '"'),
|
||||||
|
), values=data)
|
||||||
|
url = "http://exhentai.org/s/" + data["url"]
|
||||||
|
del data["url"]
|
||||||
|
return data, url
|
||||||
|
|
||||||
def get_images(self, url):
|
def get_images(self, url):
|
||||||
time.sleep( random.uniform(3, 6) )
|
"""Collect url and metadata for all images in this gallery"""
|
||||||
|
time.sleep(random.uniform(3, 6))
|
||||||
page = self.request(url).text
|
page = self.request(url).text
|
||||||
data = {}
|
data, pos = text.extract_all(page, (
|
||||||
_ , pos = text.extract(page, '<div id="i3"><a onclick="return load_image(', '')
|
(None , '<div id="i3"><a onclick="return load_image(', ''),
|
||||||
data["imgkey"] , pos = text.extract(page, "'", "'", pos)
|
("imgkey" , "'", "'"),
|
||||||
data["url"] , pos = text.extract(page, '<img id="img" src="', '"', pos)
|
("url" , '<img id="img" src="', '"'),
|
||||||
data["title"] , pos = text.extract(page, '<div id="i4"><div>', ' :: ', pos)
|
("title" , '<div id="i4"><div>', ' :: '),
|
||||||
data["origurl"] , pos = text.extract(page, 'http://exhentai.org/fullimg.php', '"', pos)
|
("origurl" , 'http://exhentai.org/fullimg.php', '"'),
|
||||||
data["gid"] , pos = text.extract(page, 'var gid=' , ';', pos)
|
("gid" , 'var gid=', ';'),
|
||||||
data["startkey"], pos = text.extract(page, 'var startkey="', '";', pos)
|
("startkey", 'var startkey="', '";'),
|
||||||
data["showkey"] , pos = text.extract(page, 'var showkey="' , '";', pos)
|
("showkey" , 'var showkey="', '";'),
|
||||||
|
))
|
||||||
if data["origurl"]:
|
if data["origurl"]:
|
||||||
data["origurl"] = "http://exhentai.org/fullimg.php" + text.unescape(data["origurl"])
|
data["origurl"] = "http://exhentai.org/fullimg.php" + text.unescape(data["origurl"])
|
||||||
else:
|
else:
|
||||||
@@ -111,10 +114,7 @@ class ExhentaiExtractor(Extractor):
|
|||||||
"showkey": data["showkey"],
|
"showkey": data["showkey"],
|
||||||
}
|
}
|
||||||
while True:
|
while True:
|
||||||
time.sleep( random.uniform(3, 6) )
|
time.sleep(random.uniform(3, 6))
|
||||||
# page = safe_request(
|
|
||||||
# self.session, self.api_url, method="POST", json=request
|
|
||||||
# ).json
|
|
||||||
page = self.session.post(self.api_url, json=request).json()
|
page = self.session.post(self.api_url, json=request).json()
|
||||||
data["imgkey"] , pos = text.extract(page["i3"], "'", "'")
|
data["imgkey"] , pos = text.extract(page["i3"], "'", "'")
|
||||||
data["url"] , pos = text.extract(page["i3"], '<img id="img" src="', '"', pos)
|
data["url"] , pos = text.extract(page["i3"], '<img id="img" src="', '"', pos)
|
||||||
|
|||||||
Reference in New Issue
Block a user