[mangadex] fix extraction
This commit is contained in:
@@ -9,9 +9,8 @@
|
||||
"""Extract manga-chapters and entire manga from https://mangadex.org/"""
|
||||
|
||||
from .common import ChapterExtractor, MangaExtractor
|
||||
from .. import text, util, exception
|
||||
from .. import text, util
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
class MangadexExtractor():
|
||||
@@ -19,6 +18,14 @@ class MangadexExtractor():
|
||||
category = "mangadex"
|
||||
root = "https://mangadex.org"
|
||||
|
||||
# mangadex-to-iso639-1 codes
|
||||
iso639_map = {
|
||||
"br": "pt",
|
||||
"ct": "ca",
|
||||
"gb": "en",
|
||||
"vn": "vi",
|
||||
}
|
||||
|
||||
|
||||
class MangadexChapterExtractor(MangadexExtractor, ChapterExtractor):
|
||||
"""Extractor for manga-chapters from mangadex.org"""
|
||||
@@ -26,59 +33,43 @@ class MangadexChapterExtractor(MangadexExtractor, ChapterExtractor):
|
||||
pattern = [r"(?:https?://)?(?:www\.)?mangadex\.(?:org|com)/chapter/(\d+)"]
|
||||
test = [
|
||||
("https://mangadex.org/chapter/122094", {
|
||||
"keyword": "da1262219afe50dfe0098011366468fa507cc3c6",
|
||||
"keyword": "1fa2ed74f8da89f7b9d403f18d90a6f4df57a55f",
|
||||
"content": "7ab3bef5caccb62b881f8e6e70359d3c7be8137f",
|
||||
}),
|
||||
# oneshot
|
||||
("https://mangadex.org/chapter/138086", {
|
||||
"count": 64,
|
||||
"keyword": "1f6fb237a96cdf05b436ae2a37a4436e717bbb30",
|
||||
"keyword": "f3cd8a938bfe44a8ad22d35c84f92d724bc5d66f",
|
||||
}),
|
||||
# NotFoundError
|
||||
("https://mangadex.org/chapter/1", {
|
||||
"exception": exception.NotFoundError,
|
||||
})
|
||||
]
|
||||
|
||||
def __init__(self, match):
    """Set up the extractor for the chapter ID captured by `pattern`.

    The chapter is requested from the JSON endpoint
    ``<root>/api/chapter/<id>``. The earlier assignment of the
    ``<root>/chapter/<id>`` page URL was overwritten before use and
    has been dropped.
    """
    url = "{}/api/chapter/{}".format(self.root, match.group(1))
    ChapterExtractor.__init__(self, url)
    # Holds the decoded chapter JSON once get_metadata() has run
    self.data = None
|
||||
|
||||
def get_metadata(self, page):
    """Build the metadata dict for this chapter from its JSON response.

    `page` is the text of the chapter API response. It is decoded with
    json.loads() and kept in ``self.data`` so that get_images() can
    reuse it. A second request to ``<root>/api/manga/<manga_id>``
    supplies the manga title and the group name of this chapter.

    The stale HTML-scraping statements (og:title parsing, the
    'does not exist' warning check, the ``other_groups`` lookup) that
    were interleaved with this code have been removed: they operate on
    an HTML page, not on the JSON this method now receives.
    """
    self.data = data = json.loads(page)
    # "10.5" -> ("10", ".", "5"); without a "." both sep and minor are ""
    chapter, sep, minor = data["chapter"].partition(".")

    url = "{}/api/manga/{}".format(self.root, data["manga_id"])
    mdata = self.request(url).json()

    return {
        "manga": mdata["manga"]["title"],
        "manga_id": data["manga_id"],
        "title": text.unescape(data["title"]),
        "volume": text.parse_int(data["volume"]),
        "chapter": text.parse_int(chapter),
        "chapter_minor": sep + minor,
        "chapter_id": data["id"],
        # The manga response keys its chapters by the chapter ID as a string
        "group": mdata["chapter"][str(data["id"])]["group_name"],
        "lang": util.language_to_code(data["lang_name"]),
        "language": data["lang_name"],
    }
|
||||
|
||||
def get_images(self, _):
|
||||
base = self.data["server"] + self.data["dataurl"] + "/"
|
||||
base = text.urljoin(self.root, base)
|
||||
|
||||
base = self.data["server"] + self.data["hash"] + "/"
|
||||
return [
|
||||
(base + page, None)
|
||||
for page in self.data["page_array"]
|
||||
@@ -87,7 +78,7 @@ class MangadexChapterExtractor(MangadexExtractor, ChapterExtractor):
|
||||
|
||||
class MangadexMangaExtractor(MangadexExtractor, MangaExtractor):
|
||||
"""Extractor for manga from mangadex.org"""
|
||||
pattern = [r"(?:https?://)?(?:www\.)?(mangadex\.(?:org|com)/manga/\d+)"]
|
||||
pattern = [r"(?:https?://)?(?:www\.)?mangadex\.(?:org|com)/manga/(\d+)"]
|
||||
test = [
|
||||
("https://mangadex.org/manga/2946/souten-no-koumori", {
|
||||
"count": ">= 1",
|
||||
@@ -100,9 +91,7 @@ class MangadexMangaExtractor(MangadexExtractor, MangaExtractor):
|
||||
"chapter_minor": "",
|
||||
"chapter_id": int,
|
||||
"group": str,
|
||||
"contributor": str,
|
||||
"date": str,
|
||||
"views": int,
|
||||
"date": int,
|
||||
"lang": str,
|
||||
"language": str,
|
||||
},
|
||||
@@ -111,54 +100,37 @@ class MangadexMangaExtractor(MangadexExtractor, MangaExtractor):
|
||||
"count": ">= 100",
|
||||
}),
|
||||
]
|
||||
scheme = "https"
|
||||
per_page = 100
|
||||
reverse = False
|
||||
|
||||
def __init__(self, match):
    """Store the manga ID from the matched URL and target the manga API.

    The ID (first capture group of `pattern`) is kept as a string in
    ``self.manga_id``; the extractor URL is ``<root>/api/manga/<id>``.
    """
    manga_id = match.group(1)
    self.manga_id = manga_id
    MangaExtractor.__init__(
        self, match, "{}/api/manga/{}".format(self.root, manga_id))
|
||||
|
||||
def chapters(self, page):
    """Return a sorted list of ``(chapter_url, metadata)`` tuples.

    `page` is the text of the manga API response. Its "manga" object
    provides title, artist and author; its "chapter" object maps
    chapter IDs (strings) to per-chapter info.

    The stale HTML pagination loop that was interleaved with this code
    has been removed: it rebound ``manga`` to a string scraped from
    HTML and relied on ``self.per_page``, neither of which fits the
    JSON response handled here.
    """
    data = json.loads(page)
    manga = data["manga"]

    # Loop-invariant values, computed once
    chapter_root = self.root + "/chapter/"
    manga_id = text.parse_int(self.manga_id)
    to_iso639 = self.iso639_map.get

    results = []
    for chid, info in data["chapter"].items():
        # "10.5" -> ("10", ".", "5"); without a "." both sep and minor are ""
        chapter, sep, minor = info["chapter"].partition(".")
        # Translate site-specific language codes; others pass through as-is
        lang = to_iso639(info["lang_code"], info["lang_code"])

        results.append((chapter_root + chid, {
            "manga": manga["title"],
            "manga_id": manga_id,
            "artist": manga["artist"],
            "author": manga["author"],
            "title": text.unescape(info["title"]),
            "volume": text.parse_int(info["volume"]),
            "chapter": text.parse_int(chapter),
            "chapter_minor": sep + minor,
            "chapter_id": text.parse_int(chid),
            "group": text.unescape(info["group_name"]),
            "date": info["timestamp"],
            "lang": lang,
            "language": util.code_to_language(lang),
        }))

    # NOTE(review): "chapter_minor" is a string, so minors within the same
    # chapter compare lexicographically (".10" sorts before ".5").
    results.sort(key=lambda x: (x[1]["chapter"], x[1]["chapter_minor"]))
    return results
|
||||
|
||||
Reference in New Issue
Block a user