[exhentai] use text.extract_all

This commit is contained in:
Mike Fährmann
2015-11-03 00:10:30 +01:00
parent 1fa6a99f18
commit 353ac1e00b

View File

@@ -10,7 +10,6 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import config, text from .. import config, text
import re
import os.path import os.path
import time import time
import random import random
@@ -27,7 +26,7 @@ info = {
class ExhentaiExtractor(Extractor): class ExhentaiExtractor(Extractor):
api_url = "http://exhentai.org/api.php" api_url = "http://exhentai.org/api.php"
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) Extractor.__init__(self)
@@ -64,39 +63,43 @@ class ExhentaiExtractor(Extractor):
image["name"] = name image["name"] = name
image["extension"] = ext[1:] image["extension"] = ext[1:]
if "/fullimg.php" in image[urlkey]: if "/fullimg.php" in image[urlkey]:
time.sleep( random.uniform(1, 2) ) time.sleep(random.uniform(1, 2))
yield Message.Url, image[urlkey], image yield Message.Url, image[urlkey], image
def get_job_metadata(self, page): def get_job_metadata(self, page):
title , pos = text.extract(page, '<h1 id="gn">', '</h1>') """Collect metadata for extractor-job"""
title_jp, pos = text.extract(page, '<h1 id="gj">', '</h1>', pos) data = {
date , pos = text.extract(page, '>Posted:</td><td class="gdt2">', '</td>', pos) "category" : info["category"],
language, pos = text.extract(page, '>Language:</td><td class="gdt2">', '</td>', pos) "gallery-id" : self.gid,
size , pos = text.extract(page, '>File Size:</td><td class="gdt2">', ' ', pos)
url , pos = text.extract(page, 'hentai.org/s/', '"', pos)
return {
"category": info["category"],
"gallery-id": self.gid,
"gallery-token": self.token, "gallery-token": self.token,
"title": title, }
"title-jp": title_jp, data, _ = text.extract_all(page, (
"date": date, ("title" , '<h1 id="gn">', '</h1>'),
"language": language, ("title_jp", '<h1 id="gj">', '</h1>'),
"size": size, ("date" , '>Posted:</td><td class="gdt2">', '</td>'),
}, "http://exhentai.org/s/" + url ("language", '>Language:</td><td class="gdt2">', '</td>'),
("size" , '>File Size:</td><td class="gdt2">', ' '),
("count" , '>Length:</td><td class="gdt2">', ' '),
("url" , 'hentai.org/s/', '"'),
), values=data)
url = "http://exhentai.org/s/" + data["url"]
del data["url"]
return data, url
def get_images(self, url): def get_images(self, url):
time.sleep( random.uniform(3, 6) ) """Collect url and metadata for all images in this gallery"""
time.sleep(random.uniform(3, 6))
page = self.request(url).text page = self.request(url).text
data = {} data, pos = text.extract_all(page, (
_ , pos = text.extract(page, '<div id="i3"><a onclick="return load_image(', '') (None , '<div id="i3"><a onclick="return load_image(', ''),
data["imgkey"] , pos = text.extract(page, "'", "'", pos) ("imgkey" , "'", "'"),
data["url"] , pos = text.extract(page, '<img id="img" src="', '"', pos) ("url" , '<img id="img" src="', '"'),
data["title"] , pos = text.extract(page, '<div id="i4"><div>', ' :: ', pos) ("title" , '<div id="i4"><div>', ' :: '),
data["origurl"] , pos = text.extract(page, 'http://exhentai.org/fullimg.php', '"', pos) ("origurl" , 'http://exhentai.org/fullimg.php', '"'),
data["gid"] , pos = text.extract(page, 'var gid=' , ';', pos) ("gid" , 'var gid=', ';'),
data["startkey"], pos = text.extract(page, 'var startkey="', '";', pos) ("startkey", 'var startkey="', '";'),
data["showkey"] , pos = text.extract(page, 'var showkey="' , '";', pos) ("showkey" , 'var showkey="', '";'),
))
if data["origurl"]: if data["origurl"]:
data["origurl"] = "http://exhentai.org/fullimg.php" + text.unescape(data["origurl"]) data["origurl"] = "http://exhentai.org/fullimg.php" + text.unescape(data["origurl"])
else: else:
@@ -111,10 +114,7 @@ class ExhentaiExtractor(Extractor):
"showkey": data["showkey"], "showkey": data["showkey"],
} }
while True: while True:
time.sleep( random.uniform(3, 6) ) time.sleep(random.uniform(3, 6))
# page = safe_request(
# self.session, self.api_url, method="POST", json=request
# ).json
page = self.session.post(self.api_url, json=request).json() page = self.session.post(self.api_url, json=request).json()
data["imgkey"] , pos = text.extract(page["i3"], "'", "'") data["imgkey"] , pos = text.extract(page["i3"], "'", "'")
data["url"] , pos = text.extract(page["i3"], '<img id="img" src="', '"', pos) data["url"] , pos = text.extract(page["i3"], '<img id="img" src="', '"', pos)