From 179d62023c3d721a65d8dfaed372489b17200353 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 20 Dec 2016 16:30:25 +0100 Subject: [PATCH] [readcomics] add comic-issue extractor --- README.rst | 2 +- gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/readcomics.py | 55 ++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 gallery_dl/extractor/readcomics.py diff --git a/README.rst b/README.rst index 3f599929..f05367f0 100644 --- a/README.rst +++ b/README.rst @@ -54,7 +54,7 @@ Supported Sites reader.sensescans.com, thespectrum.net, slide.world-three.org, yonkouprod.com * Comic: - kisscomic.us, readcomiconline.to + kisscomic.us, readcomiconline.to, readcomics.tv * Hentai: exhentai.org, hbrowse.com, hentai2read.com, hentaibox.net, hentaihere.com, hitomi.la, luscious.net, nhentai.net diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 610f75a7..ffb991fc 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -52,6 +52,7 @@ modules = [ "pinterest", "powermanga", "readcomiconline", + "readcomics", "rule34", "safebooru", "sankaku", diff --git a/gallery_dl/extractor/readcomics.py b/gallery_dl/extractor/readcomics.py new file mode 100644 index 00000000..e24ccf5e --- /dev/null +++ b/gallery_dl/extractor/readcomics.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract comic issues from http://readcomics.tv/""" + +from .common import Extractor, Message +from .. 
import text  # completes the "from .. " begun on the previous line


class ReadcomicsIssueExtractor(Extractor):
    """Extractor for comic-issues from readcomics.tv"""
    category = "readcomics"
    subcategory = "issue"
    directory_fmt = ["{category}", "{comic}", "{issue:>03}"]
    filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
    # Capture group 1 is the comic's URL slug, group 2 the chapter number.
    pattern = [(r"(?:https?://)?(?:www\.)?readcomics\.(?:tv|net)/"
                r"([^/]+)/chapter-(\d+)")]
    root = "https://readcomics.tv"

    def __init__(self, match):
        """Store the comic slug and chapter number captured by 'pattern'"""
        Extractor.__init__(self)
        self.comic, self.chapter = match.groups()

    def items(self):
        """Yield the version, directory and per-image URL messages"""
        # The "/full" variant of the chapter URL is requested so that a
        # single response can be scanned for every image of the issue.
        url = "{}/{}/chapter-{}/full".format(
            self.root, self.comic, self.chapter)
        page = self.request(url).text
        data = self.get_job_metadata(page)
        imgs = self.get_image_urls(page)
        data["count"] = len(imgs)
        yield Message.Version, 1
        yield Message.Directory, data
        # The loop target writes the 1-based page number straight into the
        # shared metadata dict before it is handed to nameext_from_url().
        for data["page"], url in enumerate(imgs, 1):
            yield Message.Url, url, text.nameext_from_url(url, data)

    def get_job_metadata(self, page):
        """Collect metadata for extractor-job

        The comic name and issue number are taken from the page title,
        i.e. the text between '<title>' and ' - Read '.
        """
        # The start marker has to be the opening title tag: an empty marker
        # would match at offset 0 and drag all markup preceding the title
        # into the comic name.
        # NOTE(review): assumes text.extract() returns a tuple whose first
        # element is the string found between the two markers.
        title = text.extract(page, "<title>", " - Read ")[0]
        # Split off the last whitespace-separated token as the issue.
        info = title.rsplit(maxsplit=1)
        return {
            "comic": info[0],
            # Drop the first character of the issue token (presumably a
            # leading '#' -- confirm against the site's title format).
            "issue": info[1][1:],
            "lang": "en",
            "language": "English",
        }

    @staticmethod
    def get_image_urls(page):
        """Extract list of all image-urls for a comic-issue"""
        # Everything between this attribute prefix and the next double
        # quote is taken as one image URL.
        needle = ('class="chapter_img" style="margin-bottom: '
                  '20px; max-width: 100%;" src="')
        return list(text.extract_iter(page, needle, '"'))