From 9a08f8a097261fb49e10401f46fefa8526da8ad6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 16 Feb 2017 23:42:30 +0100 Subject: [PATCH] improved foolslide-based extractors - this includes dokireader, fallenangels, jaiminisbox, powermanga, sensescans, worldthree, yonkouprod, gomanga, yomanga - added 'chapter_string', 'chapter_id', 'chapter_minor' and 'count' keywords - changed the 'chapter' keyword to always be just a number - changed the default directory format --- gallery_dl/extractor/dokireader.py | 15 +++----- gallery_dl/extractor/fallenangels.py | 15 +++----- gallery_dl/extractor/foolslide.py | 56 +++++++++++++++++----------- gallery_dl/extractor/gomanga.py | 25 +++++++------ gallery_dl/extractor/jaiminisbox.py | 15 +++----- gallery_dl/extractor/powermanga.py | 27 +++----------- gallery_dl/extractor/sensescans.py | 18 +++++---- gallery_dl/extractor/worldthree.py | 31 ++++++++------- gallery_dl/extractor/yomanga.py | 12 ++---- gallery_dl/extractor/yonkouprod.py | 13 ++----- test/test_extractors.py | 2 +- 11 files changed, 103 insertions(+), 126 deletions(-) diff --git a/gallery_dl/extractor/dokireader.py b/gallery_dl/extractor/dokireader.py index 5cbeae23..10811f5f 100644 --- a/gallery_dl/extractor/dokireader.py +++ b/gallery_dl/extractor/dokireader.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016 Mike Fährmann +# Copyright 2016-2017 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,19 +8,14 @@ """Extract manga-chapters from https://kobato.hologfx.com/""" -from .foolslide import FoolslideChapterExtractor +from . import foolslide -class DokireaderChapterExtractor(FoolslideChapterExtractor): +class DokireaderChapterExtractor(foolslide.FoolslideChapterExtractor): """Extractor for manga-chapters from kobato.hologfx.com""" category = "dokireader" - pattern = [(r"(?:https?://)?(kobato\.hologfx\.com/reader/read/" - r"[^/]+/([a-z]{2})/\d+/\d+)")] + pattern = foolslide.chapter_pattern("kobato\.hologfx\.com/reader") test = [(("https://kobato.hologfx.com/reader/read/" "hitoribocchi_no_oo_seikatsu/en/3/34"), { - "keyword": "4ee981ae14c6643f6a03a14c9f2c0d4898202671", + "keyword": "f28811c01b64031671108a4a3d6eea1040816b82", })] - - def __init__(self, match): - url = "https://" + match.group(1) - FoolslideChapterExtractor.__init__(self, url, match.group(2)) diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py index 828f5dbe..29e687db 100644 --- a/gallery_dl/extractor/fallenangels.py +++ b/gallery_dl/extractor/fallenangels.py @@ -8,25 +8,20 @@ """Extract manga-chapters from http://famatg.com/""" -from .foolslide import FoolslideChapterExtractor +from . import foolslide -class FallenangelsChapterExtractor(FoolslideChapterExtractor): +class FallenangelsChapterExtractor(foolslide.FoolslideChapterExtractor): """Extractor for manga-chapters from famatg.com""" category = "fallenangels" - pattern = [(r"(?:https?://)?((?:manga|truyen)\.famatg\.com/read/" - r"[^/]+/([a-z]{2})/\d+/\d+(?:/\d+)?)")] + pattern = foolslide.chapter_pattern(r"(?:manga|truyen)\.famatg\.com") test = [ ("http://manga.famatg.com/read/chronos_ruler/en/0/20/", { "url": "a777f93533674744b74c9b57c7dfa7254f5ddbed", - "keyword": "47ac083cac8a3c0aaf0f6b571a9bfb535217fd31", + "keyword": "76e7130a64d96317e3e4dcd55d770c9f6d9cb71d", }), ("https://truyen.famatg.com/read/madan_no_ou_to_vanadis/vi/0/33/", { "url": "b46bf1ef0537c3ce42bf2b9e4b62ace41c2299cd", - "keyword": "9eb750934f4f712211f5a7063c2206693b7cedf9", + "keyword": "658cdbecd3a1698f5462c1db437b423b6bcf7dd3", }), ] - - def __init__(self, match): - url = "https://" + match.group(1) - FoolslideChapterExtractor.__init__(self, url, match.group(2)) diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index f28e8477..56831f1c 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016 Mike Fährmann +# Copyright 2016-2017 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -11,28 +11,46 @@ from .common import Extractor, Message from .. import text, iso639_1 import json -import re + + +CHAPTER_RE = ( + r"/read/[^/]+" + r"/(?P[a-z]{2})" + r"/(?P\d+)" + r"/(?P\d+)" + r"(?:/(?P\d+))?)" +) + + +def chapter_pattern(domain_re): + return [r"(?:https?://)?(" + domain_re + CHAPTER_RE] class FoolslideChapterExtractor(Extractor): """Base class for chapter extractors on foolslide based sites""" subcategory = "chapter" - directory_fmt = ["{category}", "{manga}", "{chapter:>03} - {title}"] + directory_fmt = ["{category}", "{manga}", "{chapter_string}"] filename_fmt = "{manga}_{chapter:>03}_{page:>03}.{extension}" + scheme = "https" single = True - def __init__(self, url, lang): + def __init__(self, match, url=None): Extractor.__init__(self) - self.url = url - self.lang = lang + self.url = url or self.scheme + "://" + match.group(1) + self.data = match.groupdict(default="") def items(self): page = self.request(self.url, encoding="utf-8", method="post", data={"adult": "true"}).text data = self.get_job_metadata(page) + imgs = self.get_images(page) + + data["count"] = len(imgs) + data["chapter_id"] = imgs[0]["chapter_id"] + yield Message.Version, 1 yield Message.Directory, data - for data["page"], image in enumerate(self.get_images(page), 1): + for data["page"], image in enumerate(imgs, 1): try: url = image["url"] del image["url"] @@ -45,23 +63,19 @@ class FoolslideChapterExtractor(Extractor): def get_job_metadata(self, page): """Collect metadata for extractor-job""" - _ , pos = text.extract(page, '

', '') - manga , pos = text.extract(page, 'title="', '"', pos) - chapter , pos = text.extract(page, '">', '', pos) + _ , pos = text.extract(page, '

', '') + manga , pos = text.extract(page, 'title="', '"', pos) + chapter, pos = text.extract(page, 'title="', '"', pos) + chapter = text.unescape(chapter) parts = chapter.split(":", maxsplit=1) - match = re.match(r"(?:Vol.(\d+) )?(?:Chapter (\d+)$|(.+))", parts[0]) - volume = match.group(1) or "" - chapter = match.group(2) or match.group(3).strip() + title = parts[1].strip() if len(parts) > 1 else "" - return { - "manga": text.unescape(manga), - "chapter": chapter, - "volume": volume, - "lang": self.lang, - "language": iso639_1.code_to_language(self.lang), - "title": text.unescape(parts[1].strip() if len(parts) > 1 else ""), - } + self.data["manga"] = text.unescape(manga) + self.data["title"] = title + self.data["language"] = iso639_1.code_to_language(self.data["lang"]) + self.data["chapter_string"] = chapter + return self.data def get_images(self, page): """Return a list of all images in this chapter""" diff --git a/gallery_dl/extractor/gomanga.py b/gallery_dl/extractor/gomanga.py index 940408bd..dc7e8ba1 100644 --- a/gallery_dl/extractor/gomanga.py +++ b/gallery_dl/extractor/gomanga.py @@ -8,20 +8,21 @@ """Extract manga-chapters from https://gomanga.co/""" -from .foolslide import FoolslideChapterExtractor +from . import foolslide -class GomangaChapterExtractor(FoolslideChapterExtractor): +class GomangaChapterExtractor(foolslide.FoolslideChapterExtractor): """Extractor for manga-chapters from gomanga.co""" category = "gomanga" - pattern = [(r"(?:https?://)?(?:www\.)?(gomanga\.co/reader/read/" - r"[^/]+/([a-z]{2})/\d+/\d+)")] - test = [("https://gomanga.co/reader/read/mata-kata-omou/en/0/1/page/11", { - "url": "5088d75bb44327fc503c85b52b1d6a371b8057f2", - "keyword": "63f4d2cbbcaf3e7b5c48e71c4c4d453d9a399a26", - })] + pattern = foolslide.chapter_pattern(r"(?:www\.)?gomanga\.co/reader") + test = [ + ("https://gomanga.co/reader/read/mata-kata-omou/en/0/1/page/11", { + "url": "5088d75bb44327fc503c85b52b1d6a371b8057f2", + "keyword": "f534cfc4c3aef87cb0439e2a37e2ebee98077e92", + }), + ("https://gomanga.co/reader/read/pastel/en/31/144/", { + "url": "9cc2052fbf36344c573c754c5abe533a14b3e280", + "keyword": "a2ef55d26984c64baf026382f889bb013d01dc4f", + }), + ] single = False - - def __init__(self, match): - url = "https://" + match.group(1) - FoolslideChapterExtractor.__init__(self, url, match.group(2)) diff --git a/gallery_dl/extractor/jaiminisbox.py b/gallery_dl/extractor/jaiminisbox.py index 75fa021c..56f8b603 100644 --- a/gallery_dl/extractor/jaiminisbox.py +++ b/gallery_dl/extractor/jaiminisbox.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016 Mike Fährmann +# Copyright 2016-2017 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,19 +8,14 @@ """Extract manga-chapters from https://jaiminisbox.com/""" -from .foolslide import FoolslideChapterExtractor +from . import foolslide -class JaiminisboxChapterExtractor(FoolslideChapterExtractor): +class JaiminisboxChapterExtractor(foolslide.FoolslideChapterExtractor): """Extractor for manga-chapters from jaiminisbox.com""" category = "jaiminisbox" - pattern = [(r"(?:https?://)?(?:www\.)?(jaiminisbox.com/reader/read/" - r"[^/]+/([a-z]{2})/\d+/\d+)")] + pattern = foolslide.chapter_pattern(r"(?:www\.)?jaiminisbox.com/reader") test = [("https://jaiminisbox.com/reader/read/uratarou/en/0/1/", { "url": "f021de7f31ee3a3f688fdf3e8183aef4226c2b50", - "keyword": "836e94f68b78159cc10d12b72c981c276ff45b3f", + "keyword": "d187df3e3b6dbe09ec163626f6fd7c57133ab163", })] - - def __init__(self, match): - url = "https://" + match.group(1) - FoolslideChapterExtractor.__init__(self, url, match.group(2)) diff --git a/gallery_dl/extractor/powermanga.py b/gallery_dl/extractor/powermanga.py index 54002b8c..52bdbe01 100644 --- a/gallery_dl/extractor/powermanga.py +++ b/gallery_dl/extractor/powermanga.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015, 2016 Mike Fährmann +# Copyright 2015-2017 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,31 +8,14 @@ """Extract manga-chapters from http://powermanga.org/""" -from .foolslide import FoolslideChapterExtractor -from .. import text -import re +from . import foolslide -class PowermangaChapterExtractor(FoolslideChapterExtractor): +class PowermangaChapterExtractor(foolslide.FoolslideChapterExtractor): """Extractor for manga-chapters from powermanga.org""" category = "powermanga" - pattern = [ - (r"(?:https?://)?read(?:er)?\.powermanga\.org/read/" - r"(.+/([a-z]{2})/\d+/\d+)(?:/page)?"), - (r"(?:https?://)?(?:www\.)?(p)owermanga\.org/((?:[^-]+-)+[^-]+/?)"), - ] + pattern = foolslide.chapter_pattern(r"read(?:er)?\.powermanga\.org") test = [("https://read.powermanga.org/read/one_piece/en/0/803/page/1", { "url": "e6179c1565068f99180620281f86bdd25be166b4", - "keyword": "51cabad8995727334e5ca9773c18d709b3868f02", + "keyword": "203ea5d0ef7759f4517316f0678f3592fc27cdbe", })] - - def __init__(self, match): - if match.group(1) == "p": - url = "https://powermanga.org/" + match.group(2) - page = self.request(url).text - pos = page.index("class='small-button smallblack'>Download") - url = text.extract(page, " 1: extractors = [ extr for extr in extractor.extractors()