[exhentai] improve gallery extraction

- match image page URLs and extract galleries from that point onward
- add a few more metadata entries: 'parent', 'visible', 'cost'
This commit is contained in:
Mike Fährmann
2019-01-26 15:52:55 +01:00
parent a50e9faf0e
commit e868fb4393

View File

@@ -11,8 +11,10 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache from ..cache import cache
import time import itertools
import random import random
import time
import math
BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org" BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org"
@@ -102,10 +104,12 @@ class ExhentaiExtractor(Extractor):
class ExhentaiGalleryExtractor(ExhentaiExtractor): class ExhentaiGalleryExtractor(ExhentaiExtractor):
"""Extractor for image galleries from exhentai.org""" """Extractor for image galleries from exhentai.org"""
subcategory = "gallery" subcategory = "gallery"
pattern = [BASE_PATTERN + r"/g/(\d+)/([\da-f]{10})"] pattern = [BASE_PATTERN +
r"(?:/g/(\d+)/([\da-f]{10})"
r"|/s/([\da-f]{10})/(\d+)-(\d+))"]
test = [ test = [
("https://exhentai.org/g/960460/4f0e369d82/", { ("https://exhentai.org/g/960460/4f0e369d82/", {
"keyword": "900b8dccd23c41a76e915a8df70ae77c4e0f52c7", "keyword": "ba0785e49e3877cfa3f91c1ad9a5ac7816339bf5",
"content": "493d759de534355c9f55f8e365565b62411de146", "content": "493d759de534355c9f55f8e365565b62411de146",
}), }),
("https://exhentai.org/g/960461/4f0e369d82/", { ("https://exhentai.org/g/960461/4f0e369d82/", {
@@ -114,6 +118,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
("http://exhentai.org/g/962698/7f02358e00/", { ("http://exhentai.org/g/962698/7f02358e00/", {
"exception": exception.AuthorizationError, "exception": exception.AuthorizationError,
}), }),
("https://exhentai.org/s/3957343c3b/960460-5", {
"count": 2,
}),
("https://e-hentai.org/g/960460/4f0e369d82/", None), ("https://e-hentai.org/g/960460/4f0e369d82/", None),
("https://g.e-hentai.org/g/960460/4f0e369d82/", None), ("https://g.e-hentai.org/g/960460/4f0e369d82/", None),
] ]
@@ -122,52 +129,60 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
ExhentaiExtractor.__init__(self) ExhentaiExtractor.__init__(self)
self.key = {} self.key = {}
self.count = 0 self.count = 0
self.version, self.gid, self.token = match.groups() self.gallery_id = text.parse_int(match.group(2) or match.group(5))
self.gid = text.parse_int(self.gid) self.gallery_token = match.group(3)
self.image_token = match.group(4)
self.image_num = text.parse_int(match.group(6), 1)
def items(self):
    """Yield the extractor messages for one gallery.

    Works from either entry point accepted by the URL pattern:

    * gallery URL (``self.gallery_token`` is set): the gallery page is
      fetched first and the link to an image page is read from it.
    * image-page URL (``self.image_token`` is set): the image page is
      fetched first and the gallery token is read from its link back to
      the gallery.

    Afterwards both pages are available; gallery metadata comes from the
    gallery page, the first image from the image page and all following
    images from ``images_from_api()``.

    Yields:
        ``Message.Version``, ``Message.Directory`` and one
        ``Message.Url`` tuple per image.

    Raises:
        exception.NotFoundError: if the expected link to the image page
            or to the gallery cannot be found in the fetched HTML.
    """
    self.login()

    if self.gallery_token:
        gpage = self._gallery_page()
        link = text.extract(gpage, 'hentai.org/s/', '"')[0]
        if not link:
            raise exception.NotFoundError("image page")
        # The extracted value runs up to the closing quote of the href
        # and therefore has the shape "<token>/<gid>-<num>".  Keep only
        # the token: _image_page() appends "<gid>-<num>" on its own and
        # would otherwise build a URL with that segment duplicated.
        self.image_token = link.partition("/")[0]
        self.wait()
        ipage = self._image_page()
    else:
        ipage = self._image_page()
        link = text.extract(ipage, 'hentai.org/g/', '"')[0]
        # Expected shape: "<gid>/<token>/"; fail with a clear error
        # instead of crashing on a missing or malformed link.
        fields = link.split("/") if link else ()
        if len(fields) < 2 or not fields[1]:
            raise exception.NotFoundError("gallery")
        self.gallery_token = fields[1]
        self.wait()
        gpage = self._gallery_page()

    data = self.get_metadata(gpage)
    # images_from_api() uses the image count as the end of its page range
    self.count = data["count"]

    yield Message.Version, 1
    yield Message.Directory, data

    # First image comes from the already fetched HTML page, the rest
    # lazily from the API.
    images = itertools.chain(
        (self.image_from_page(ipage),), self.images_from_api())
    for url, image in images:
        data.update(image)
        if "/fullimg.php" in url:
            # Original-image links carry no usable extension in the URL;
            # an extra wait is inserted before handing them out.
            data["extension"] = ""
            self.wait(1.5)
        yield Message.Url, url, data
def get_job_metadata(self, page): def get_metadata(self, page):
"""Collect metadata for extractor-job""" """Extract gallery metadata"""
data = {
"gallery_id" : self.gid,
"gallery_token": self.token,
}
data, pos = text.extract_all(page, ( data, pos = text.extract_all(page, (
("title" , '<h1 id="gn">', '</h1>'), ("title" , '<h1 id="gn">', '</h1>'),
("title_jp" , '<h1 id="gj">', '</h1>'), ("title_jp" , '<h1 id="gj">', '</h1>'),
("date" , '>Posted:</td><td class="gdt2">', '</td>'), ("date" , '>Posted:</td><td class="gdt2">', '</td>'),
("language" , '>Language:</td><td class="gdt2">', ' '), ("parent" , '>Parent:</td><td class="gdt2"><a href="', '"'),
("visible" , '>Visible:</td><td class="gdt2">', '<'),
("language" , '>Language:</td><td class="gdt2">', ' '),
("gallery_size", '>File Size:</td><td class="gdt2">', '<'), ("gallery_size", '>File Size:</td><td class="gdt2">', '<'),
("count" , '>Length:</td><td class="gdt2">', ' '), ("count" , '>Length:</td><td class="gdt2">', ' '),
), values=data) ))
data["lang"] = util.language_to_code(data["language"]) data["lang"] = util.language_to_code(data["language"])
data["title"] = text.unescape(data["title"]) data["title"] = text.unescape(data["title"])
data["title_jp"] = text.unescape(data["title_jp"]) data["title_jp"] = text.unescape(data["title_jp"])
data["count"] = text.parse_int(data["count"]) data["count"] = text.parse_int(data["count"])
data["gallery_id"] = self.gallery_id
data["gallery_token"] = self.gallery_token
data["gallery_size"] = text.parse_bytes( data["gallery_size"] = text.parse_bytes(
data["gallery_size"].rstrip("Bb")) data["gallery_size"].rstrip("Bb"))
data["tags"] = [ data["tags"] = [
@@ -176,17 +191,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
] ]
return data return data
def get_images(self, page): def image_from_page(self, page):
"""Collect url and metadata for all images in this gallery"""
part = text.extract(page, 'hentai.org/s/', '"')[0]
yield self.image_from_page(self.root + "/s/" + part)
yield from self.images_from_api()
def image_from_page(self, url):
"""Get image url and data from webpage""" """Get image url and data from webpage"""
self.wait() info = text.extract_all(page, (
page = self.request(url).text
data = text.extract_all(page, (
(None , '<div id="i3"><a onclick="return load_image(', ''), (None , '<div id="i3"><a onclick="return load_image(', ''),
("nextkey" , "'", "'"), ("nextkey" , "'", "'"),
("url" , '<img id="img" src="', '"'), ("url" , '<img id="img" src="', '"'),
@@ -195,21 +202,21 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
("startkey", 'var startkey="', '";'), ("startkey", 'var startkey="', '";'),
("showkey" , 'var showkey="', '";'), ("showkey" , 'var showkey="', '";'),
))[0] ))[0]
self.key["start"] = data["startkey"] self.key["start"] = info["startkey"]
self.key["show"] = data["showkey"] self.key["show"] = info["showkey"]
self.key["next"] = data["nextkey"] self.key["next"] = info["nextkey"]
if self.original and data["origurl"]: if self.original and info["origurl"]:
part = text.unescape(data["origurl"]) part = text.unescape(info["origurl"])
url = self.root + "/fullimg.php" + part url = self.root + "/fullimg.php" + part
info = self._parse_original_info(data["originfo"]) data = self._parse_original_info(info["originfo"])
else: else:
url = data["url"] url = info["url"]
info = self._parse_image_info(url) data = self._parse_image_info(url)
info["num"] = 1 data["num"] = self.image_num
info["image_token"] = data["startkey"] data["image_token"] = info["startkey"]
return url, text.nameext_from_url(data["url"], info) return url, text.nameext_from_url(info["url"], data)
def images_from_api(self): def images_from_api(self):
"""Get image url and data from api calls""" """Get image url and data from api calls"""
@@ -217,11 +224,11 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
nextkey = self.key["next"] nextkey = self.key["next"]
request = { request = {
"method" : "showpage", "method" : "showpage",
"gid" : self.gid, "gid" : self.gallery_id,
"imgkey" : nextkey, "imgkey" : nextkey,
"showkey": self.key["show"], "showkey": self.key["show"],
} }
for request["page"] in range(2, self.count + 1): for request["page"] in range(self.image_num + 1, self.count + 1):
self.wait() self.wait()
page = self.request(api_url, method="POST", json=request).json() page = self.request(api_url, method="POST", json=request).json()
imgkey = nextkey imgkey = nextkey
@@ -232,8 +239,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
if self.original and origurl: if self.original and origurl:
url = text.unescape(origurl) url = text.unescape(origurl)
data = self._parse_original_info( data = self._parse_original_info(
text.extract(page["i7"], "ownload original", "<", pos)[0] text.extract(page["i7"], "ownload original", "<", pos)[0])
)
else: else:
url = imgurl url = imgurl
data = self._parse_image_info(url) data = self._parse_image_info(url)
@@ -244,6 +250,27 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
request["imgkey"] = nextkey request["imgkey"] = nextkey
def _gallery_page(self):
    """Download the gallery overview page and return its HTML text.

    The request is made with ``expect=range(400, 500)`` (presumably so
    that 4xx responses are returned instead of raising, allowing their
    body to be inspected -- confirm against ``Extractor.request``).

    Raises:
        exception.AuthorizationError: on a 404 response whose body
            contains "Gallery Not Available".
        exception.NotFoundError: if the body starts with "Key missing"
            or "Gallery not found".
    """
    gallery_url = "{}/g/{}/{}/".format(
        self.root, self.gallery_id, self.gallery_token)
    resp = self.request(gallery_url, expect=range(400, 500))
    html = resp.text

    not_found_markers = ("Key missing", "Gallery not found")
    if resp.status_code == 404 and "Gallery Not Available" in html:
        raise exception.AuthorizationError()
    if html.startswith(not_found_markers):
        raise exception.NotFoundError("gallery")

    return html
def _image_page(self):
    """Download the page of a single image and return its HTML text.

    The URL is built from ``self.image_token``, ``self.gallery_id`` and
    ``self.image_num``.

    Raises:
        exception.NotFoundError: if the response body starts with
            "Invalid page" or "Keep trying".
    """
    image_url = "{}/s/{}/{}-{}".format(
        self.root, self.image_token, self.gallery_id, self.image_num)
    html = self.request(image_url, expect=range(400, 500)).text

    error_prefixes = ("Invalid page", "Keep trying")
    if not html.startswith(error_prefixes):
        return html
    raise exception.NotFoundError("image page")
@staticmethod @staticmethod
def _parse_image_info(url): def _parse_image_info(url):
parts = url.split("/")[4].split("-") parts = url.split("/")[4].split("-")
@@ -251,15 +278,18 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
"width": text.parse_int(parts[2]), "width": text.parse_int(parts[2]),
"height": text.parse_int(parts[3]), "height": text.parse_int(parts[3]),
"size": text.parse_int(parts[1]), "size": text.parse_int(parts[1]),
"cost": 1,
} }
@staticmethod
def _parse_original_info(info):
    """Build image metadata from the "download original" link text.

    ``info`` is split on single spaces after stripping leading
    whitespace; field 0 is used as width, field 2 as height, and field 3
    plus the first character of field 4 as a size string for
    ``text.parse_bytes`` (presumably something like "1920 x 1080 2.5 MB"
    -- confirm against the site markup).

    Returns:
        dict with the keys "width", "height", "size" and "cost", where
        cost is 1 plus the size multiplied by 5, expressed in MiB and
        rounded up.
    """
    fields = info.lstrip().split(" ")
    nbytes = text.parse_bytes(fields[3] + fields[4][0])

    result = {
        "width": text.parse_int(fields[0]),
        "height": text.parse_int(fields[2]),
    }
    result["size"] = nbytes
    result["cost"] = 1 + math.ceil(nbytes * 5 / 1024 / 1024)
    return result