From e868fb4393b5317789fcb92e27617ffa87f06a38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 26 Jan 2019 15:52:55 +0100 Subject: [PATCH] [exhentai] improve gallery extraction - match image page URLs and extract galleries from that point onward - add a few more metadata entries: 'parent', 'visible', 'cost' --- gallery_dl/extractor/exhentai.py | 138 +++++++++++++++++++------------ 1 file changed, 84 insertions(+), 54 deletions(-) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index cb98b999..ee7e70db 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -11,8 +11,10 @@ from .common import Extractor, Message from .. import text, util, exception from ..cache import cache -import time +import itertools import random +import time +import math BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org" @@ -102,10 +104,12 @@ class ExhentaiExtractor(Extractor): class ExhentaiGalleryExtractor(ExhentaiExtractor): """Extractor for image galleries from exhentai.org""" subcategory = "gallery" - pattern = [BASE_PATTERN + r"/g/(\d+)/([\da-f]{10})"] + pattern = [BASE_PATTERN + + r"(?:/g/(\d+)/([\da-f]{10})" + r"|/s/([\da-f]{10})/(\d+)-(\d+))"] test = [ ("https://exhentai.org/g/960460/4f0e369d82/", { - "keyword": "900b8dccd23c41a76e915a8df70ae77c4e0f52c7", + "keyword": "ba0785e49e3877cfa3f91c1ad9a5ac7816339bf5", "content": "493d759de534355c9f55f8e365565b62411de146", }), ("https://exhentai.org/g/960461/4f0e369d82/", { @@ -114,6 +118,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): ("http://exhentai.org/g/962698/7f02358e00/", { "exception": exception.AuthorizationError, }), + ("https://exhentai.org/s/3957343c3b/960460-5", { + "count": 2, + }), ("https://e-hentai.org/g/960460/4f0e369d82/", None), ("https://g.e-hentai.org/g/960460/4f0e369d82/", None), ] @@ -122,52 +129,60 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): ExhentaiExtractor.__init__(self) self.key = {} self.count = 0 - self.version, self.gid, self.token = match.groups() - self.gid = text.parse_int(self.gid) + self.gallery_id = text.parse_int(match.group(2) or match.group(5)) + self.gallery_token = match.group(3) + self.image_token = match.group(4) + self.image_num = text.parse_int(match.group(6), 1) def items(self): self.login() - yield Message.Version, 1 - url = "{}/g/{}/{}/".format(self.root, self.gid, self.token) - response = self.request(url, expect=range(400, 500)) - page = response.text + if self.gallery_token: + gpage = self._gallery_page() + self.image_token = text.extract(gpage, 'hentai.org/s/', '"')[0] + self.wait() + ipage = self._image_page() + else: + ipage = self._image_page() + part = text.extract(ipage, 'hentai.org/g/', '"')[0] + self.gallery_token = part.split("/")[1] + self.wait() + gpage = self._gallery_page() - if response.status_code == 404 and "Gallery Not Available" in page: - raise exception.AuthorizationError() - if page.startswith(("Key missing", "Gallery not found")): - raise exception.NotFoundError("gallery") - - data = self.get_job_metadata(page) + data = self.get_metadata(gpage) self.count = data["count"] + + yield Message.Version, 1 yield Message.Directory, data - for url, image in self.get_images(page): + images = itertools.chain( + (self.image_from_page(ipage),), self.images_from_api()) + for url, image in images: data.update(image) if "/fullimg.php" in url: data["extension"] = "" self.wait(1.5) yield Message.Url, url, data - def get_job_metadata(self, page): - """Collect metadata for extractor-job""" - data = { - "gallery_id" : self.gid, - "gallery_token": self.token, - } + def get_metadata(self, page): + """Extract gallery metadata""" data, pos = text.extract_all(page, ( - ("title" , '

', '

'), - ("title_jp" , '

', '

'), - ("date" , '>Posted:', ''), - ("language" , '>Language:', ' '), + ("title" , '

', '

'), + ("title_jp" , '

', '

'), + ("date" , '>Posted:', ''), + ("parent" , '>Parent:Visible:', '<'), + ("language" , '>Language:', ' '), ("gallery_size", '>File Size:', '<'), - ("count" , '>Length:', ' '), - ), values=data) + ("count" , '>Length:', ' '), + )) data["lang"] = util.language_to_code(data["language"]) data["title"] = text.unescape(data["title"]) data["title_jp"] = text.unescape(data["title_jp"]) data["count"] = text.parse_int(data["count"]) + data["gallery_id"] = self.gallery_id + data["gallery_token"] = self.gallery_token data["gallery_size"] = text.parse_bytes( data["gallery_size"].rstrip("Bb")) data["tags"] = [ @@ -176,17 +191,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): ] return data - def get_images(self, page): - """Collect url and metadata for all images in this gallery""" - part = text.extract(page, 'hentai.org/s/', '"')[0] - yield self.image_from_page(self.root + "/s/" + part) - yield from self.images_from_api() - - def image_from_page(self, url): + def image_from_page(self, page): """Get image url and data from webpage""" - self.wait() - page = self.request(url).text - data = text.extract_all(page, ( + info = text.extract_all(page, ( (None , '