# -*- coding: utf-8 -*-

# Copyright 2014, 2015 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extract images from galleries at exhentai.org and g.e-hentai.org"""

from .common import Extractor, Message
from .. import config, text
import os.path
import time
import random

# Static description of this extractor module: category name, extractor
# class name, format strings for the target directory and filename, and the
# URL patterns this module handles.
info = {
    "category": "exhentai",
    "extractor": "ExhentaiExtractor",
    "directory": ["{category}", "{gallery-id}"],
    "filename": "{gallery-id}_{num:>04}_{imgkey}_{name}.{extension}",
    "pattern": [
        # groups: (1) site variant, (2) numeric gallery id, (3) 10-digit
        # hexadecimal gallery token
        r"(?:https?://)?(g\.e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})",
    ],
}


class ExhentaiExtractor(Extractor):
    """Extractor for a single gallery identified by a gallery URL"""

    api_url = "http://exhentai.org/api.php"

    def __init__(self, match):
        """Set up URL, gallery identifiers, HTTP headers and cookies

        'match' is a regex match object whose group(0) is the gallery URL
        and whose three groups are unpacked into 'version', 'gid' and
        'token' (presumably a match of one of the patterns in 'info').
        """
        Extractor.__init__(self)
        self.url = match.group(0)
        self.version, self.gid, self.token = match.groups()
        # NOTE(review): 'version' records which site variant matched, but
        # 'api_url', the Referer header and the cookie domain below are fixed
        # to exhentai.org regardless of it -- confirm this is intended.
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0",
            "Accept": "text/html,application/xhtml+xml,"
                      "application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Referer": "http://exhentai.org/",
        })
        # user-configured cookies are installed for the whole site
        cookies = config.get(("extractor", "exhentai", "cookies"), {})
        for key, value in cookies.items():
            self.session.cookies.set(
                key, value, domain=".exhentai.org", path="/")

    def items(self):
        """Yield the message sequence for this gallery

        Emits, in order: Message.Version, Message.Headers, Message.Cookies,
        Message.Directory (with the gallery metadata) and then one
        Message.Url per image returned by self.get_images().
        """
        yield Message.Version, 1
        page = self.request(self.url).text
        data, url = self.get_job_metadata(page)

        # hand out a copy of the session headers with an image-specific
        # Accept value; the session's own headers stay unchanged
        headers = self.session.headers.copy()
        headers["Accept"] = "image/png,image/*;q=0.8,*/*;q=0.5"
        yield Message.Headers, headers
        yield Message.Cookies, self.session.cookies
        yield Message.Directory, data

        # choose which entry of each image dict is used as download URL
        urlkey = "url"
        if config.get(("extractor", "exhentai", "download-original"), True):
            urlkey = "origurl"

        for num, image in enumerate(self.get_images(url), 1):
            image.update(data)
            # name and extension are always derived from the "url" entry,
            # even when "origurl" is the one being yielded
            name, ext = os.path.splitext(text.filename_from_url(image["url"]))
            image["num"] = num
            image["name"] = name
            image["extension"] = ext[1:]
            if "/fullimg.php" in image[urlkey]:
                # wait 1-2 seconds before handing out a fullimg.php URL
                # (presumably to throttle requests for original images)
                time.sleep(random.uniform(1, 2))
            yield Message.Url, image[urlkey], image

    # NOTE(review): in the collapsed source, the ``def`` keyword opening
    # ``get_job_metadata`` was the last token of this line; that method's
    # remaining text is outside this block and has been left untouched.
get_job_metadata(self, page): """Collect metadata for extractor-job""" data = { "category" : info["category"], "gallery-id" : self.gid, "gallery-token": self.token, } data, _ = text.extract_all(page, ( ("title" , '