implement generic manga-chapter extractor
gallery_dl/extractor/common.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2014-2017 Mike Fährmann
+# Copyright 2014-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -18,7 +18,7 @@ import requests
 import threading
 import http.cookiejar
 from .message import Message
-from .. import config, exception
+from .. import config, text, exception


 class Extractor():
@@ -163,6 +163,47 @@ class AsynchronousExtractor(Extractor):
         put(None)


+class ChapterExtractor(Extractor):
+
+    subcategory = "chapter"
+    directory_fmt = [
+        "{category}", "{manga}",
+        "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"]
+    filename_fmt = (
+        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
+
+    def __init__(self, url):
+        Extractor.__init__(self)
+        self.url = url
+
+    def items(self):
+        page = self.request(self.url).text
+        data = self.get_metadata(page)
+        imgs = self.get_images(page)
+
+        if "count" in data:
+            images = zip(range(1, data["count"]+1), imgs)
+        else:
+            try:
+                data["count"] = len(imgs)
+            except TypeError:
+                pass
+            images = enumerate(imgs, 1)
+
+        yield Message.Version, 1
+        yield Message.Directory, data
+        for data["page"], (url, imgdata) in images:
+            if imgdata:
+                data.update(imgdata)
+            yield Message.Url, url, text.nameext_from_url(url, data)
+
+    def get_metadata(self, page):
+        """Return a dict with general metadata"""
+
+    def get_images(self, page):
+        """Return a list of all (image-url, metadata)-tuples"""
+
+
 class MangaExtractor(Extractor):

     subcategory = "manga"
@@ -176,7 +217,6 @@ class MangaExtractor(Extractor):
         self.url = url or self.scheme + "://" + match.group(1)

     def items(self):
-        self.login()
         page = self.request(self.url).text

         chapters = self.chapters(page)
@@ -187,12 +227,8 @@ class MangaExtractor(Extractor):
         for chapter, data in chapters:
             yield Message.Queue, chapter, data

-    def login(self):
-        """Login and set necessary cookies"""
-
     def chapters(self, page):
-        """Return a list of all (url, metadata)-tuples"""
-        return []
+        """Return a list of all (chapter-url, metadata)-tuples"""


 class SharedConfigExtractor(Extractor):
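With the base class in place, a site extractor only has to supply the chapter URL and the two overrides. Here is a minimal sketch of such a subclass, written as a module inside gallery_dl/extractor/; the class name, example.org, and its markup are invented for illustration, and only the get_metadata()/get_images() contract comes from the code above:

    # Hypothetical ChapterExtractor subclass; the site and markup are
    # invented, the override contract is from the new base class.
    import json

    from .common import ChapterExtractor
    from .. import text


    class ExampleChapterExtractor(ChapterExtractor):
        """Extractor for manga-chapters from example.org"""
        category = "example"
        pattern = [r"(?:https?://)?example\.org/chapters/(\d+)"]
        root = "https://example.org"

        def __init__(self, match):
            self.chapter = match.group(1)
            ChapterExtractor.__init__(
                self, "{}/chapters/{}".format(self.root, self.chapter))

        def get_metadata(self, page):
            # becomes the Message.Directory dict; a "count" key, if set,
            # caps how many images items() takes from get_images()
            manga = text.extract(page, "<h1>", "</h1>")[0]
            return {"manga": manga, "chapter": self.chapter,
                    "chapter_minor": "", "title": "", "volume": 0}

        def get_images(self, page):
            # one (image-url, metadata)-tuple per page; metadata may be None
            pages = text.extract(page, "var pages = ", ";")[0]
            return [(self.root + img["url"], None)
                    for img in json.loads(pages)]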
gallery_dl/extractor/dynastyscans.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,54 +8,36 @@

 """Extract manga-chapters from https://dynasty-scans.com/"""

-from .common import Extractor, Message
+from .common import ChapterExtractor
 from .. import text, util
 import re
 import json


-class DynastyscansChapterExtractor(Extractor):
+class DynastyscansChapterExtractor(ChapterExtractor):
     """Extractor for manga-chapters from dynasty-scans.com"""
     category = "dynastyscans"
-    subcategory = "chapter"
-    directory_fmt = [
-        "{category}", "{manga}", "c{chapter:>03}{chapter_minor}{title:?: //}"]
-    filename_fmt = (
-        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
     pattern = [r"(?:https?://)?(?:www\.)?dynasty-scans\.com/chapters/([^/]+)"]
     test = [
         (("http://dynasty-scans.com/chapters/"
           "hitoribocchi_no_oo_seikatsu_ch33"), {
             "url": "dce64e8c504118f1ab4135c00245ea12413896cb",
-            "keyword": "fb2f470b995df5b301ccede31ed9829a010236db",
+            "keyword": "ec5c56bbd5c97aa521d00f2598bba4663fb8ab9f",
         }),
         (("http://dynasty-scans.com/chapters/"
           "new_game_the_spinoff_special_13"), {
             "url": "dbe5bbb74da2edcfb1832895a484e2a40bc8b538",
-            "keyword": "281bbe0fb74b812ced595619ca5876983490dc0e",
+            "keyword": "1208a102d9a1bb0b0c740a67996d9b26a9357b64",
         }),
     ]
     root = "https://dynasty-scans.com"

     def __init__(self, match):
-        Extractor.__init__(self)
         self.chaptername = match.group(1)
+        url = self.root + "/chapters/" + self.chaptername
+        ChapterExtractor.__init__(self, url)

-    def items(self):
-        page = self.request(self.root + "/chapters/" + self.chaptername,
-                            encoding="utf-8").text
-        data = self.get_job_metadata(page)
-        imgs = self.get_image_data(page)
-        data["count"] = len(imgs)
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for data["page"], img in enumerate(imgs, 1):
-            url = self.root + img["image"]
-            text.nameext_from_url(url, data)
-            data["name"] = img["name"]
-            yield Message.Url, url, data
-
-    def get_job_metadata(self, page):
+    def get_metadata(self, page):
         """Collect metadata for extractor-job"""
         info, pos = text.extract(page, "<h3 id='chapter-title'><b>", "</b>")
         author, pos = text.extract(page, " by ", "</a>", pos)
@@ -82,8 +64,10 @@ class DynastyscansChapterExtractor(Extractor):
             "language": "English",
         }

-    @staticmethod
-    def get_image_data(page):
+    def get_images(self, page):
         """Extract list of all image-urls for a manga chapter"""
         data = text.extract(page, "var pages = ", ";\n")[0]
-        return json.loads(data)
+        return [
+            (self.root + img["image"], None)
+            for img in json.loads(data)
+        ]
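Both rewritten methods lean on gallery-dl's text.extract() helper, which returns a (match, position) pair so consecutive fields can be pulled out in one forward pass over the page. A standalone sketch of that behavior, using a made-up HTML fragment:

    from gallery_dl import text

    page = "<h3 id='chapter-title'><b>Chapter 33</b> by Author</a>"

    # text.extract() returns the substring between the two markers and
    # the index just past the match, so the next call resumes from there
    info, pos = text.extract(page, "<h3 id='chapter-title'><b>", "</b>")
    author, pos = text.extract(page, " by ", "</a>", pos)
    print(info)    # Chapter 33
    print(author)  # Author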
gallery_dl/extractor/fallenangels.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2017 Mike Fährmann
+# Copyright 2017-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,21 +8,16 @@

 """Extract manga-chapters from https://www.fascans.com/"""

-from .common import Extractor, MangaExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util
 import json


-class FallenangelsChapterExtractor(Extractor):
+class FallenangelsChapterExtractor(ChapterExtractor):
     """Extractor for manga-chapters from fascans.com"""
     category = "fallenangels"
-    subcategory = "chapter"
-    directory_fmt = ["{category}", "{manga}",
-                     "c{chapter:>03}{chapter_minor}{title:?: //}"]
-    filename_fmt = (
-        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
-    pattern = [(r"(?:https?://)?(manga|truyen)\.fascans\.com/"
-                r"manga/([^/]+)/(\d+)(\.[^/?&#]+)?")]
+    pattern = [(r"(?:https?://)?(manga|truyen)\.fascans\.com"
+                r"/manga/([^/]+)/(\d+)(\.[^/?&#]+)?")]
     test = [
         ("https://manga.fascans.com/manga/chronos-ruler/20/1", {
             "url": "4604a7914566cc2da0ff789aa178e2d1c8c241e3",
@@ -38,24 +33,12 @@ class FallenangelsChapterExtractor(Extractor):
     ]

     def __init__(self, match):
-        Extractor.__init__(self)
         self.version, self.manga, self.chapter, self.minor = match.groups()
-
-    def items(self):
         url = "https://{}.fascans.com/manga/{}/{}/1".format(
             self.version, self.manga, self.chapter)
-        page = self.request(url).text
-        data = self.get_metadata(page)
-        imgs = self.get_images(page)
-        data["count"] = len(imgs)
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for data["page"], img in enumerate(imgs, 1):
-            url = img["page_image"]
-            yield Message.Url, url, text.nameext_from_url(url, data)
+        ChapterExtractor.__init__(self, url)

     def get_metadata(self, page):
         """Collect metadata for extractor-job"""
         lang = "vi" if self.version == "truyen" else "en"
         data = {
             "chapter": self.chapter,
@@ -70,8 +53,12 @@ class FallenangelsChapterExtractor(Extractor):

     @staticmethod
     def get_images(page):
         """Return a list of all images in this chapter"""
-        return json.loads(text.extract(page, "var pages = ", ";")[0])
+        return [
+            (img["page_image"], None)
+            for img in json.loads(
+                text.extract(page, "var pages = ", ";")[0]
+            )
+        ]


 class FallenangelsMangaExtractor(MangaExtractor):
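The four groups of the (re-wrapped) pattern map directly onto the unpacking in __init__. Running it against the test URL outside the extractor framework shows what ends up in version, manga, chapter, and minor:

    import re

    pattern = (r"(?:https?://)?(manga|truyen)\.fascans\.com"
               r"/manga/([^/]+)/(\d+)(\.[^/?&#]+)?")
    match = re.match(
        pattern, "https://manga.fascans.com/manga/chronos-ruler/20/1")

    version, manga, chapter, minor = match.groups()
    print(version, manga, chapter, minor)  # manga chronos-ruler 20 None

    # the "truyen" subdomain hosts the Vietnamese translations
    lang = "vi" if version == "truyen" else "en"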
gallery_dl/extractor/hbrowse.py

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,13 +8,13 @@

 """Extract images from http://www.hbrowse.com/"""

-from .common import Extractor, MangaExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util
 from urllib.parse import urljoin
 import json


-class HbrowseExtractor(Extractor):
+class HbrowseExtractor():
     """Base class for hbrowse extractors"""
     category = "hbrowse"
     root = "http://www.hbrowse.com"
@@ -64,41 +64,30 @@ class HbrowseMangaExtractor(HbrowseExtractor, MangaExtractor):
         results.append((urljoin(self.root, url), data.copy()))


-class HbrowseChapterExtractor(HbrowseExtractor):
+class HbrowseChapterExtractor(HbrowseExtractor, ChapterExtractor):
     """Extractor for manga-chapters from hbrowse.com"""
     subcategory = "chapter"
     directory_fmt = ["{category}", "{manga_id} {manga}", "c{chapter:>05}"]
     filename_fmt = ("{category}_{manga_id}_{chapter:>05}_"
-                    "{num:>03}.{extension}")
+                    "{page:>03}.{extension}")
     pattern = [r"(?:https?://)?(?:www\.)?hbrowse\.com/(\d+)/c(\d+)"]
     test = [("http://www.hbrowse.com/10363/c00000", {
         "url": "634f4800858913f097bc3b62a8fedaf74b5254bd",
-        "keyword": "730bd33de2a0a0fb4e0b6dcdafedcaeee1060047",
+        "keyword": "f37cafef404696312f5db6ccaaaf72737d309e2d",
         "content": "44578ebbe176c2c27434966aef22945787e2781e",
     })]

     def __init__(self, match):
-        HbrowseExtractor.__init__(self)
         self.gid, self.chapter = match.groups()
         self.path = "/{}/c{}/".format(self.gid, self.chapter)
+        ChapterExtractor.__init__(self, self.root + self.path)

-    def items(self):
-        page = self.request(self.root + self.path).text
-        data = self.get_job_metadata(page)
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for data["num"], url in enumerate(self.get_image_urls(page), 1):
-            yield Message.Url, url, text.nameext_from_url(url, data)
-
-    def get_job_metadata(self, page):
-        """Collect metadata for extractor-job"""
+    def get_metadata(self, page):
         return self.parse_page(page, {
             "manga_id": util.safe_int(self.gid),
             "chapter": util.safe_int(self.chapter)
         })

-    def get_image_urls(self, page):
-        """Yield all image-urls for a 'chapter'"""
+    def get_images(self, page):
         base = self.root + "/data" + self.path
         json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]"
-        return [base + name for name in json.loads(json_data)]
+        return [(base + name, None) for name in json.loads(json_data)]
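Note how HbrowseExtractor loses its Extractor base here: it becomes a plain mixin that only contributes category, root, and parse_page(), while the actual extractor machinery is inherited exactly once through ChapterExtractor (or MangaExtractor for the manga class). A reduced sketch of this cooperation, with the class bodies invented for illustration:

    class Extractor:
        def __init__(self):
            print("Extractor.__init__ runs exactly once")


    class ChapterExtractor(Extractor):
        def __init__(self, url):
            Extractor.__init__(self)
            self.url = url


    class SiteBase():  # plain mixin, deliberately not derived from Extractor
        root = "http://www.example.net"


    class SiteChapter(SiteBase, ChapterExtractor):
        def __init__(self, path):
            ChapterExtractor.__init__(self, self.root + path)


    SiteChapter("/10363/c00000/")  # -> Extractor.__init__ runs exactly once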
gallery_dl/extractor/mangapark.py

@@ -8,12 +8,12 @@

 """Extract manga-chapters and entire manga from https://mangapark.me/"""

-from .common import Extractor, MangaExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util
 from urllib.parse import urljoin


-class MangaparkExtractor(Extractor):
+class MangaparkExtractor():
     """Base class for mangapark extractors"""
     category = "mangapark"
     root = "https://mangapark.me"
@@ -68,14 +68,8 @@ class MangaparkMangaExtractor(MangaparkExtractor, MangaExtractor):
         results.append((self.root + path, data.copy()))


-class MangaparkChapterExtractor(MangaparkExtractor):
+class MangaparkChapterExtractor(MangaparkExtractor, ChapterExtractor):
     """Extractor for manga-chapters from mangapark.me"""
-    subcategory = "chapter"
-    directory_fmt = [
-        "{category}", "{manga}",
-        "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"]
-    filename_fmt = (
-        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
     pattern = [(r"(?:https?://)?(?:www\.)?mangapark\.me(/manga/[^/]+"
                 r"/s\d+(?:/v\d+)?/c\d+[^/]*(?:/e\d+)?)")]
     test = [
@@ -95,20 +89,11 @@ class MangaparkChapterExtractor(MangaparkExtractor):
     ]

     def __init__(self, match):
-        MangaparkExtractor.__init__(self)
         self.path = match.group(1)
+        url = self.root + self.path + "?zoom=2"
+        ChapterExtractor.__init__(self, url)

-    def items(self):
-        page = self.request(self.root + self.path + "?zoom=2").text
-        data = self.get_job_metadata(page)
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for url, image in self.get_images(page):
-            data.update(image)
-            yield Message.Url, url, text.nameext_from_url(url, data)
-
-    def get_job_metadata(self, page):
-        """Collect metadata for extractor-job"""
+    def get_metadata(self, page):
         data = {"lang": "en", "language": "English"}
         self.parse_chapter_path(self.path, data)
         text.extract_all(page, (
@@ -126,7 +111,6 @@ class MangaparkChapterExtractor(MangaparkExtractor):
         return data

     def get_images(self, page):
-        """Collect image-urls, -widths and -heights"""
         pos = 0
         num = 0
         while True:
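Unlike the other sites in this commit, mangapark provides real per-image metadata (widths and heights), so its get_images() fills the second tuple slot instead of passing None; the generic items() then merges each dict into the shared data right before the corresponding Message.Url. Schematically, with invented values:

    # shape of MangaparkChapterExtractor.get_images()'s return value
    imgs = [
        ("https://example.cdn/img-001.jpg", {"width": 740, "height": 1052}),
        ("https://example.cdn/img-002.jpg", {"width": 740, "height": 1052}),
    ]

    data = {"manga": "...", "chapter": 1}  # from get_metadata()
    for data["page"], (url, imgdata) in enumerate(imgs, 1):
        if imgdata:
            data.update(imgdata)  # width/height become part of the keywords
        # yield Message.Url, url, text.nameext_from_url(url, data)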
test/test_extractors.py

@@ -20,7 +20,7 @@ SKIP = {
     # temporary issues
     "chronos",
     "coreimg",
     "luscious",
"yeet",
|
||||
 }