From c0530a635c918e427b69baf8e898405bd508083c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 10 May 2015 15:47:44 +0200 Subject: [PATCH] [batoto] a few fixes - removed 'volume' fields from default format strings - fixed parsing by making volumes optional - removed debug output - updated docstring - added a few unescape calls --- gallery_dl/extractor/batoto.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py index a77b9eed..ac363052 100644 --- a/gallery_dl/extractor/batoto.py +++ b/gallery_dl/extractor/batoto.py @@ -10,7 +10,7 @@ from .common import AsynchronousExtractor from .common import Message -from .common import filename_from_url +from .common import filename_from_url, unescape from urllib.parse import unquote import os.path import re @@ -18,9 +18,8 @@ import re info = { "category": "batoto", "extractor": "BatotoExtractor", - "directory": ["{category}", "{manga}", - "v{volume:>02} c{chapter:>03} - {title}"], - "filename": "{manga}_v{volume:>02}_c{chapter:>03}_{page:>03}.{extension}", + "directory": ["{category}", "{manga}", "c{chapter:>03} - {title}"], + "filename": "{manga}_c{chapter:>03}_{page:>03}.{extension}", "pattern": [ r"(?:https?://)?(?:www\.)?bato\.to/read/_/(\d+).*", ], @@ -39,12 +38,11 @@ class BatotoExtractor(AsynchronousExtractor): url = self.url_base + self.chapter_id while url: url, data = self.get_page_metadata(url) - print(data) yield Message.Directory, data yield Message.Url, data["image-url"], data def get_page_metadata(self, page_url): - """Collect metadata for one manga-page""" + """Collect next url and metadata for one manga-page""" page = self.request(page_url).text _ , pos = self.extract(page, 'selected="selected"', '') title, pos = self.extract(page, ': ', '<', pos) @@ -53,20 +51,27 @@ class BatotoExtractor(AsynchronousExtractor): _ , pos = self.extract(page, '
(.+) - vol (\d+) ch (\d+) Page (\d+)", page) - tmatch = re.match(r"(.+) - ([^ ]+)", trans) + mmatch = re.search( + r"(.+) - (?:vol (\d+) )?" + r"ch (\d+)[^ ]+ Page (\d+) | Batoto!", + page + ) + tmatch = re.match( + r"(.+) - ([^ ]+)", + trans + ) filename = unquote(filename_from_url(image)) name, ext = os.path.splitext(filename) return url, { "category": info["category"], "chapter-id": self.chapter_id, - "manga": mmatch.group(1), - "volume": mmatch.group(2), + "manga": unescape(mmatch.group(1)), + "volume": mmatch.group(2) or "", "chapter": mmatch.group(3), "page": mmatch.group(4), "group": tmatch.group(1), "language": tmatch.group(2), - "title": title, + "title": unescape(title), "image-url": image, "name": name, "extension": ext[1:],