[mangafox] cleanup

2021-07-04 03:21:02 +02:00
parent 4763bc1e4e
commit 7f591c78cb
2 changed files with 20 additions and 26 deletions
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -400,7 +400,7 @@ Consider all sites to be NSFW unless otherwise known.
 <tr>
    <td>Manga Fox</td>
    <td>https://fanfox.net/</td>
-    <td>Chapters</td>
+    <td>Chapters, Manga</td>
    <td></td>
 </tr>
 <tr>
--- a/gallery_dl/extractor/mangafox.py
+++ b/gallery_dl/extractor/mangafox.py
@@ -6,21 +6,21 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
-"""Extractors for from https://fanfox.net/"""
+"""Extractors for https://fanfox.net/"""
 from .common import ChapterExtractor, MangaExtractor
 from .. import text
 import re
-BASE_PATTERN = \
+BASE_PATTERN = r"(?:https?://)?(?:www\.|m\.)?(?:fanfox\.net|mangafox\.me)"
    r"(?:https?://)?(?:www\.|m\.)?(?:fanfox\.net|mangafox\.me)"
 class MangafoxChapterExtractor(ChapterExtractor):
-    """Extractor for manga-chapters from fanfox.net"""
+    """Extractor for manga chapters from fanfox.net"""
    category = "mangafox"
-    pattern = (r"(?:https?://)?(?:www\.|m\.)?(?:fanfox\.net|mangafox\.me)"
+    root = "https://m.fanfox.net"
-               r"(/manga/[^/]+/((?:v([^/?#]+)/)?c(\d+)([^/?#]*)))")
+    pattern = BASE_PATTERN + \
        r"(/manga/[^/?#]+/((?:v([^/?#]+)/)?c(\d+)([^/?#]*)))"
    test = (
        ("http://fanfox.net/manga/kidou_keisatsu_patlabor/v05/c006.2/1.html", {
            "keyword": "5661dab258d42d09d98f194f7172fb9851a49766",
@@ -29,7 +29,6 @@ class MangafoxChapterExtractor(ChapterExtractor):
        ("http://mangafox.me/manga/kidou_keisatsu_patlabor/v05/c006.2/"),
        ("http://fanfox.net/manga/black_clover/vTBD/c295/1.html"),
    )
    root = "https://m.fanfox.net"
    def __init__(self, match):
        base, self.cstr, self.volume, self.chapter, self.minor = match.groups()
@@ -71,7 +70,7 @@ class MangafoxMangaExtractor(MangaExtractor):
    category = "mangafox"
    root = "https://m.fanfox.net"
    chapterclass = MangafoxChapterExtractor
-    pattern = BASE_PATTERN + r"(/manga/[^/]+)$"
+    pattern = BASE_PATTERN + r"(/manga/[^/?#]+)/?$"
    test = (
        ("https://fanfox.net/manga/kanojo_mo_kanojo", {
            "pattern": MangafoxChapterExtractor.pattern,
@@ -84,13 +83,18 @@ class MangafoxMangaExtractor(MangaExtractor):
        ("https://m.fanfox.net/manga/sentai_daishikkaku"),
    )
    def __init__(self, match):
        MangaExtractor.__init__(self, match)
    def chapters(self, page):
-        results = []
+        match_info = re.compile(r"Ch (\d+)(\S*)(?: (.*))?").match
-        data = self.parse_page(page, {"lang": "en", "language": "English"})
+        manga, pos = text.extract(page, '<p class="title">', '</p>')
        author, pos = text.extract(page, '<p>Author(s):', '</p>', pos)
        data = {
            "manga"   : text.unescape(manga),
            "author"  : text.remove_html(author),
            "lang"    : "en",
            "language": "English",
        }
        results = []
        pos = page.index('<dd class="chlist">')
        while True:
            url, pos = text.extract(page, '<a href="//', '"', pos)
@@ -99,9 +103,9 @@ class MangafoxMangaExtractor(MangaExtractor):
            info, pos = text.extract(page, '>', '<span', pos)
            date, pos = text.extract(page, 'right">', '</span>', pos)
-            match = re.match(r"Ch (\d+)(\S*)(?: (.*))?", text.unescape(info))
+            match = match_info(text.unescape(info))
            if match:
-                chapter, minor, _ = match.groups()
+                chapter, minor, title = match.groups()
                chapter_minor = minor
            else:
                chapter, _, minor = url[:-7].rpartition("/c")[2].partition(".")
@@ -111,13 +115,3 @@ class MangafoxMangaExtractor(MangaExtractor):
            data["chapter_minor"] = chapter_minor if minor else ""
            data["date"] = date
            results.append(("https://" + url, data.copy()))
    @staticmethod
    def parse_page(page, data):
        """Parse metadata on 'page' and add it to 'data'"""
        text.extract_all(page, (
            ("manga"  , '<p class="title">', '</p>'),
            ("author" , '<p>Author(s):', '</p>'),
        ), values=data)
        data["author"] = text.remove_html(data["author"])
        return data