diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 14121a5f..ce6b63ce 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -12,7 +12,7 @@ from .. import config modules = [ "pixiv", - # "exhentai", + "exhentai", "gelbooru", "3dbooru", "4chan", diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 43ad99d7..84497eb3 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -1,69 +1,131 @@ -from .common import BasicExtractor -from ..util import unescape, safe_request +# -*- coding: utf-8 -*- + +# Copyright 2014, 2015 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from galleries at http://exhentai.org/""" + +from .common import Extractor, Message +from .. import config, text +import re +import os.path import time import random -import json -class Extractor(BasicExtractor): +info = { + "category": "exhentai", + "extractor": "ExhentaiExtractor", + "directory": ["{category}", "{gallery-id}"], + "filename": "{gallery-id}_{num:>04}_{imgkey}_{name}.{extension}", + "pattern": [ + r"(?:https?://)?(g\.e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})", + ], +} + +class ExhentaiExtractor(Extractor): api_url = "http://exhentai.org/api.php" - name_fmt = "{}_{:>04}_{}_{}" - def __init__(self, match, config): - BasicExtractor.__init__(self, config) + def __init__(self, match): + Extractor.__init__(self) self.url = match.group(0) - self.gid, self.token = match.group(1).split("/") - self.category = "exhentai" - self.directory = self.gid - self.session.cookies.update(config["exhentai-cookies"]) + self.version, self.gid, self.token = match.groups() + self.session.headers.update({ + "User-Agent": "Mozilla/5.0", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + 
"Accept-Language": "en-US,en;q=0.5", + "Referer": "http://exhentai.org/", + }) + cookies = config.get(("extractor", "exhentai", "cookies"), {}) + for key, value in cookies.items(): + self.session.cookies.set(key, value, domain=".exhentai.org", path="/") - def images(self): - e = self.extract + def items(self): + yield Message.Version, 1 + page = self.request(self.url).text + data, url = self.get_job_metadata(page) - # get gallery page - text = self.request(self.url).text + headers = self.session.headers.copy() + headers["Accept"] = "image/png,image/*;q=0.8,*/*;q=0.5" + yield Message.Headers, headers + yield Message.Cookies, self.session.cookies + yield Message.Directory, data - # get first image page - url, pos = self.extract_all(text, "http://exhentai.org/s/", "-1") - text = self.request(url).text + urlkey = "url" + if config.get(("extractor", "exhentai", "download-original"), True): + urlkey = "origurl" + for num, image in enumerate(self.get_images(url), 1): + image.update(data) + name, ext = os.path.splitext(text.filename_from_url(image["url"])) + image["num"] = num + image["name"] = name + image["extension"] = ext[1:] + if "/fullimg.php" in image[urlkey]: + time.sleep( random.uniform(1, 2) ) + yield Message.Url, image[urlkey], image - # extract information - _ , pos = e(text, '
', ' :: ', pos) - orgurl , pos = e(text, 'http://exhentai.org/fullimg.php', '"', pos) - gid , pos = e(text, 'var gid=' , ';', pos) - startkey, pos = e(text, 'var startkey="', '";', pos) - showkey , pos = e(text, 'var showkey="' , '";', pos) + def get_job_metadata(self, page): + title , pos = text.extract(page, '

', '

') + title_jp, pos = text.extract(page, '

', '

', pos) + date , pos = text.extract(page, '>Posted:', '', pos) + language, pos = text.extract(page, '>Language:', '', pos) + size , pos = text.extract(page, '>File Size:', ' ', pos) + url , pos = text.extract(page, 'hentai.org/s/', '"', pos) + return { + "category": info["category"], + "gallery-id": self.gid, + "gallery-token": self.token, + "title": title, + "title-jp": title_jp, + "date": date, + "language": language, + "size": size, + }, "http://exhentai.org/s/" + url - # - if orgurl: url = "http://exhentai.org/fullimg.php" + unescape(orgurl) - yield url, self.name_fmt.format(self.gid, 1, startkey, name) + def get_images(self, url): + time.sleep( random.uniform(3, 6) ) + page = self.request(url).text + data = {} + _ , pos = text.extract(page, '