From b1c4c1e13c7c4664b62f6fb851f3a077d22aec4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 8 Aug 2018 18:08:26 +0200 Subject: [PATCH] [mangadex] fix extraction --- gallery_dl/extractor/mangadex.py | 140 +++++++++++++------------------ 1 file changed, 56 insertions(+), 84 deletions(-) diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 9f985f28..a6bf5e94 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -9,9 +9,8 @@ """Extract manga-chapters and entire manga from https://mangadex.org/""" from .common import ChapterExtractor, MangaExtractor -from .. import text, util, exception +from .. import text, util import json -import re class MangadexExtractor(): @@ -19,6 +18,14 @@ class MangadexExtractor(): category = "mangadex" root = "https://mangadex.org" + # mangadex-to-iso639-1 codes + iso639_map = { + "br": "pt", + "ct": "ca", + "gb": "en", + "vn": "vi", + } + class MangadexChapterExtractor(MangadexExtractor, ChapterExtractor): """Extractor for manga-chapters from mangadex.org""" @@ -26,59 +33,43 @@ class MangadexChapterExtractor(MangadexExtractor, ChapterExtractor): pattern = [r"(?:https?://)?(?:www\.)?mangadex\.(?:org|com)/chapter/(\d+)"] test = [ ("https://mangadex.org/chapter/122094", { - "keyword": "da1262219afe50dfe0098011366468fa507cc3c6", + "keyword": "1fa2ed74f8da89f7b9d403f18d90a6f4df57a55f", "content": "7ab3bef5caccb62b881f8e6e70359d3c7be8137f", }), # oneshot ("https://mangadex.org/chapter/138086", { "count": 64, - "keyword": "1f6fb237a96cdf05b436ae2a37a4436e717bbb30", + "keyword": "f3cd8a938bfe44a8ad22d35c84f92d724bc5d66f", }), - # NotFoundError - ("https://mangadex.org/chapter/1", { - "exception": exception.NotFoundError, - }) ] def __init__(self, match): - url = self.root + "/chapter/" + match.group(1) + url = "{}/api/chapter/{}".format(self.root, match.group(1)) ChapterExtractor.__init__(self, url) self.data = None def get_metadata(self, page): - if "title='Warning'" in page and " does not exist." in page: - raise exception.NotFoundError("chapter") + self.data = data = json.loads(page) + chapter, sep, minor = data["chapter"].partition(".") - info, pos = text.extract(page, '="og:title" content="', ' (') - self.data = data = json.loads( - text.extract(page, 'data-type="chapter">', '<', pos)[0]) - - match = re.match(r"(?:[Vv]ol\. (\d+) )?[Cc]h\. ([^.]+)(\..+)?", info) - if match: - volume, chapter, minor = match.groups() - chapter = chapter.rpartition(" ")[2] - else: - volume = chapter = minor = "" - group = data["other_groups"][str(data["chapter_id"])] + url = "{}/api/manga/{}".format(self.root, data["manga_id"]) + mdata = self.request(url).json() return { - "manga": text.unescape(data["manga_title"]), + "manga": mdata["manga"]["title"], "manga_id": data["manga_id"], - "title": text.unescape(data["chapter_title"]), - "volume": text.parse_int(volume), + "title": text.unescape(data["title"]), + "volume": text.parse_int(data["volume"]), "chapter": text.parse_int(chapter), - "chapter_minor": minor or "", - "chapter_id": data["chapter_id"], - "chapter_string": info, - "group": text.unescape(group["group_name"]), - "lang": util.language_to_code(group["lang_name"]), - "language": group["lang_name"], + "chapter_minor": sep + minor, + "chapter_id": data["id"], + "group": mdata["chapter"][str(data["id"])]["group_name"], + "lang": util.language_to_code(data["lang_name"]), + "language": data["lang_name"], } def get_images(self, _): - base = self.data["server"] + self.data["dataurl"] + "/" - base = text.urljoin(self.root, base) - + base = self.data["server"] + self.data["hash"] + "/" return [ (base + page, None) for page in self.data["page_array"] @@ -87,7 +78,7 @@ class MangadexChapterExtractor(MangadexExtractor, ChapterExtractor): class MangadexMangaExtractor(MangadexExtractor, MangaExtractor): """Extractor for manga from mangadex.org""" - pattern = [r"(?:https?://)?(?:www\.)?(mangadex\.(?:org|com)/manga/\d+)"] + pattern = [r"(?:https?://)?(?:www\.)?mangadex\.(?:org|com)/manga/(\d+)"] test = [ ("https://mangadex.org/manga/2946/souten-no-koumori", { "count": ">= 1", @@ -100,9 +91,7 @@ class MangadexMangaExtractor(MangadexExtractor, MangaExtractor): "chapter_minor": "", "chapter_id": int, "group": str, - "contributor": str, - "date": str, - "views": int, + "date": int, "lang": str, "language": str, }, @@ -111,54 +100,37 @@ class MangadexMangaExtractor(MangadexExtractor, MangaExtractor): "count": ">= 100", }), ] - scheme = "https" - per_page = 100 + reverse = False + + def __init__(self, match): + self.manga_id = match.group(1) + url = "{}/api/manga/{}".format(self.root, self.manga_id) + MangaExtractor.__init__(self, match, url) def chapters(self, page): + data = json.loads(page) + manga = data["manga"] + results = [] - extr = text.extract - num = 1 + for chid, info in data["chapter"].items(): + chapter, sep, minor = info["chapter"].partition(".") + lang = self.iso639_map.get(info["lang_code"], info["lang_code"]) - manga = text.unescape(extr( - page, '"og:title" content="', '"')[0].rpartition(" (")[0]) - manga_id = text.parse_int(extr( - page, '/images/manga/', '.')[0]) + results.append((self.root + "/chapter/" + chid, { + "manga": manga["title"], + "manga_id": text.parse_int(self.manga_id), + "artist": manga["artist"], + "author": manga["author"], + "title": text.unescape(info["title"]), + "volume": text.parse_int(info["volume"]), + "chapter": text.parse_int(chapter), + "chapter_minor": sep + minor, + "chapter_id": text.parse_int(chid), + "group": text.unescape(info["group_name"]), + "date": info["timestamp"], + "lang": lang, + "language": util.code_to_language(lang), + })) - while True: - before = len(results) - - for info in text.extract_iter(page, '", "", pos) - user , pos = extr(info, "", "", pos) - views , pos = extr(info, ">", "<", pos) - date , pos = extr(info, ' datetime="', '"', pos) - - chapter, sep, minor = chapter.partition(".") - chapter = chapter.rpartition(" ")[2] - - results.append((self.root + "/chapter/" + chid, { - "manga": manga, - "manga_id": text.parse_int(manga_id), - "title": text.unescape(title), - "volume": text.parse_int(volume), - "chapter": text.parse_int(chapter), - "chapter_minor": sep + minor, - "chapter_id": text.parse_int(chid), - "group": text.unescape(text.remove_html(group)), - "contributor": text.remove_html(user), - "views": text.parse_int(views), - "date": date, - "lang": util.language_to_code(language), - "language": language, - })) - - if len(results) - before != self.per_page: - return results - - num += 1 - page = self.request("{}/_/chapters/{}/".format(self.url, num)).text + results.sort(key=lambda x: (x[1]["chapter"], x[1]["chapter_minor"])) + return results