From 3e8f70188f714b9d2700303e4fcaea0c1dac75b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 6 Aug 2016 13:24:40 +0200 Subject: [PATCH] [hentaibox] add extractor --- README.rst | 8 ++--- gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/hentaibox.py | 54 +++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 4 deletions(-) create mode 100644 gallery_dl/extractor/hentaibox.py diff --git a/README.rst b/README.rst index 47a2c9fe..4c90a8ee 100644 --- a/README.rst +++ b/README.rst @@ -44,15 +44,15 @@ Supported Sites powermanga.org, raw.senmanga.com, thespectrum.net * Hentai: doujinmode.net, exhentai.org, hbrowse.com, hentai2read.com, - hentai-foundry.com, hitomi.la, luscious.net, nhentai.net + hentaibox.net, hitomi.la, luscious.net, nhentai.net * Japanese: - pixiv.net, nijie.info + nijie.info, pixiv.net * Western: - deviantart.com, imgth.com, imgur.com, tumblr.com + deviantart.com, hentai-foundry.com, imgth.com, imgur.com, tumblr.com * Futaba Channel-like: 4chan.org, 8ch.net * Image Hosts: - chronos.to, imagebam.com, imagetwist.com, imgbox.com, imgchili.net, img.yt, + chronos.to, imagebam.com, imagetwist.com, img.yt, imgbox.com, imgchili.net, turboimagehost.com diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 729f6fe4..2d3e08b4 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -25,6 +25,7 @@ modules = [ "gelbooru", "hbrowse", "hentai2read", + "hentaibox", "hentaifoundry", "hitomi", "imagebam", diff --git a/gallery_dl/extractor/hentaibox.py b/gallery_dl/extractor/hentaibox.py new file mode 100644 index 00000000..a89d74a9 --- /dev/null +++ b/gallery_dl/extractor/hentaibox.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+
+"""Extract images from http://www.hentaibox.net/"""
+
+from .common import Extractor, Message
+from .. import text, iso639_1
+
+class HentaiboxChapterExtractor(Extractor):
+    """Extractor for a single manga chapter from hentaibox.net"""
+    category = "hentaibox"
+    subcategory = "chapter"
+    directory_fmt = ["{category}", "{series}", "{title}"]
+    filename_fmt = "{num:>03}.{extension}"
+    pattern = [r"(?:https?://)?(?:www\.)?hentaibox\.net/[^/]+/(\d+)_\d+_([^/&]+)"]
+    test = [("http://www.hentaibox.net/hentai-manga/16_18_Original_Amazon-No-Hiyaku-Amazon-Elixir-Decensored", {
+        "url": "d1a50a9b289d284f178971e01cf312791888e057",
+        "keyword": "294eda384689d4f1178ec952560d0dedd3e38647",
+    })]
+
+    def __init__(self, match):
+        Extractor.__init__(self)
+        self.url = match.group(0)  # the whole text matched by 'pattern'
+        self.count = match.group(1)  # first number in the URL; presumably the page count -- TODO confirm
+
+    def items(self):
+        page = self.request(self.url + "&slideshow=play").text  # metadata and image-urls are both read from this one page
+        data = self.get_job_metadata(page)
+        yield Message.Version, 1
+        yield Message.Directory, data
+        for num, url in enumerate(self.get_image_urls(page), 1):  # 'num' starts at 1
+            data["num"] = num  # the same dict is updated and yielded again for every image
+            data["extension"] = url[url.rfind(".")+1:]  # text after the last "." in the url
+            yield Message.Url, url, data
+
+    def get_job_metadata(self, page):
+        """Collect title, series, language and lang-code metadata from 'page'"""
+        data = text.extract_all(page, (
+            ("title"   , 'content="Read or Download ', ' hentai manga from'),
+            ("series"  , ' the series ', ' with ' + self.count),  # end marker includes self.count
+            ("language", ' translated pages to ', '.'),
+        ), values={"category": self.category, "count": self.count})[0]  # [0]: presumably the dict of extracted values -- verify
+        data["lang"] = iso639_1.language_to_code(data["language"])
+        return data
+
+    @staticmethod
+    def get_image_urls(page):
+        """Yield all image-urls found in 'page'"""
+        yield from text.extract_iter(page, '', '')  # NOTE(review): both delimiters are empty strings -- confirm the intended markers