From e3d156078cac3e9fe00d6add043fecd270c97e6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 20 Sep 2016 19:01:16 +0200 Subject: [PATCH] [exhentai] rewrite --- gallery_dl/extractor/exhentai.py | 156 +++++++++++++++---------------- 1 file changed, 78 insertions(+), 78 deletions(-) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 39fdda7c..e72a249a 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -19,57 +19,54 @@ class ExhentaiGalleryExtractor(Extractor): category = "exhentai" subcategory = "gallery" directory_fmt = ["{category}", "{gallery-id}"] - filename_fmt = "{gallery-id}_{num:>04}_{imgkey}_{name}.{extension}" - pattern = [r"(?:https?://)?(g\.e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})"] + filename_fmt = "{gallery-id}_{num:>04}_{image-token}_{name}.{extension}" + pattern = [r"(?:https?://)?(?:g\.e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})"] test = [("https://exhentai.org/g/960460/4f0e369d82/", { - "keyword": "aaac45cad1897a9815384bc3a743ce7502c692f6", + "keyword": "c1282ffbe5d452c62dec9dbde4ecb7037525cd64", "content": "493d759de534355c9f55f8e365565b62411de146", })] api_url = "https://exhentai.org/api.php" def __init__(self, match): Extractor.__init__(self) + self.key = {} self.url = match.group(0) - self.version, self.gid, self.token = match.groups() - self.login() - self.session.headers.update({ - "User-Agent": "Mozilla/5.0", - "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", - "Accept-Language": "en-US,en;q=0.5", - "Referer": "https://exhentai.org/", - }) + self.gid, self.token = match.groups() + self.original = config.interpolate(("extractor", "exhentai", "download-original"), True) self.wait_min = config.interpolate(("extractor", "exhentai", "wait-min"), 3) self.wait_max = config.interpolate(("extractor", "exhentai", "wait-max"), 6) if self.wait_max < self.wait_min: self.wait_max = self.wait_min def items(self): + self.login() yield Message.Version, 1 - page = self.request(self.url).text - if page.startswith("Key missing") \ - or page.startswith("Gallery not found"): - raise exception.NotFoundError("gallery") - data, url = self.get_job_metadata(page) - - headers = self.session.headers.copy() - headers["Accept"] = "image/png,image/*;q=0.8,*/*;q=0.5" - yield Message.Headers, headers + yield Message.Headers, self.setup_headers() yield Message.Cookies, self.session.cookies + + page = self.request(self.url).text + if page.startswith(("Key missing", "Gallery not found")): + raise exception.NotFoundError("gallery") + data = self.get_job_metadata(page) yield Message.Directory, data - urlkey = "url" - if config.interpolate(("extractor", "exhentai", "download-original"), True): - urlkey = "origurl" - for num, image in enumerate(self.get_images(url), 1): - image.update(data) - image["num"] = num - text.nameext_from_url(image["url"], image) - url = image[urlkey] - del image["url"] - del image["origurl"] + for url, image in self.get_images(page): + data.update(image) if "/fullimg.php" in url: self.wait((1, 2)) - yield Message.Url, url, image + yield Message.Url, url, data + + def setup_headers(self): + """Initialize headers""" + self.session.headers.update({ + "User-Agent": "Mozilla/5.0", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.5", + "Referer": "https://exhentai.org/", + }) + headers = self.session.headers.copy() + headers["Accept"] = "image/png,image/*;q=0.8,*/*;q=0.5" + return headers def get_job_metadata(self, page): """Collect metadata for extractor-job""" @@ -78,71 +75,74 @@ class ExhentaiGalleryExtractor(Extractor): "gallery-id" : self.gid, "gallery-token": self.token, } - data, _ = text.extract_all(page, ( - ("title" , '

', '

'), - ("title_jp", '

', '

'), - ("date" , '>Posted:', ''), - ("language", '>Language:', ''), - ("size" , '>File Size:', ' '), - ("count" , '>Length:', ' '), - ("url" , 'hentai.org/s/', '"'), + text.extract_all(page, ( + ("title" , '

', '

'), + ("title_jp" , '

', '

'), + ("date" , '>Posted:', ''), + ("language" , '>Language:', ' '), + ("size" , '>File Size:', ' '), + ("size-units", '', '<'), + ("count" , '>Length:', ' '), ), values=data) - pos = data["language"].find(" ") - if pos != -1: - data["language"] = data["language"][:pos] data["lang"] = iso639_1.language_to_code(data["language"]) data["title"] = text.unescape(data["title"]) data["title_jp"] = text.unescape(data["title_jp"]) - url = "https://exhentai.org/s/" + data["url"] - del data["url"] - return data, url + return data - def get_images(self, url): + def get_images(self, page): """Collect url and metadata for all images in this gallery""" + url = "https://exhentai.org/s/" + text.extract(page, 'hentai.org/s/', '"')[0] + yield self.image_from_page(url) + yield from self.images_from_api() + + def image_from_page(self, url): + """Get image url and data from webpage""" self.wait() page = self.request(url).text - data, pos = text.extract_all(page, ( - (None , '
', ' :: '), - ("origurl" , 'https://exhentai.org/fullimg.php', '"'), - ("startkey" , 'var startkey="', '";'), - ("showkey" , 'var showkey="', '";'), - )) - data["imgkey"] = data["startkey"] + data = text.extract_all(page, ( + (None , '