use generic chapter-extractor in more modules
gallery_dl/extractor/common.py
@@ -168,9 +168,9 @@ class ChapterExtractor(Extractor):
     subcategory = "chapter"
     directory_fmt = [
         "{category}", "{manga}",
-        "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"]
+        "{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}"]
     filename_fmt = (
-        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
+        "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}")

     def __init__(self, url):
         Extractor.__init__(self)
@@ -182,7 +182,10 @@ class ChapterExtractor(Extractor):
         imgs = self.get_images(page)

         if "count" in data:
-            images = zip(range(1, data["count"]+1), imgs)
+            images = zip(
+                range(1, data["count"]+1),
+                imgs
+            )
         else:
             try:
                 data["count"] = len(imgs)
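The zip() above is the heart of the refactor: the generic ChapterExtractor pairs whatever get_images() produces, (url, metadata) tuples, with 1-based page numbers, trusting a "count" entry when get_metadata() supplies one. A standalone sketch of that pairing step (simplified; the real items() also emits Message objects and falls back gracefully when get_images() returns a generator without a len()):

    def pair_with_page_numbers(data, imgs):
        """Pair (url, metadata) tuples with 1-based page numbers."""
        if "count" in data:
            return zip(range(1, data["count"] + 1), imgs)
        imgs = list(imgs)          # the real code first tries len(imgs)
        data["count"] = len(imgs)
        return enumerate(imgs, 1)

    chapter = {"manga": "Example", "chapter": 1, "count": 2}
    imgs = iter([("https://example.org/p1.jpg", None),
                 ("https://example.org/p2.jpg", None)])
    for pnum, (url, imgdata) in pair_with_page_numbers(chapter, imgs):
        print(pnum, url, imgdata)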
gallery_dl/extractor/hentai2read.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2017 Mike Fährmann
+# Copyright 2016-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,9 +8,8 @@

 """Extract hentai-manga from https://hentai2read.com/"""

-from .common import MangaExtractor
-from .. import text
-from . import hentaicdn
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, util
 import re
 import json

@@ -53,42 +52,43 @@ class Hentai2readMangaExtractor(MangaExtractor):
         return results


-class Hentai2readChapterExtractor(hentaicdn.HentaicdnChapterExtractor):
+class Hentai2readChapterExtractor(ChapterExtractor):
     """Extractor for a single manga chapter from hentai2read.com"""
     category = "hentai2read"
     pattern = [r"(?:https?://)?(?:www\.)?hentai2read\.com/([^/]+)/(\d+)"]
     test = [("http://hentai2read.com/amazon_elixir/1/", {
         "url": "964b942cf492b3a129d2fe2608abfc475bc99e71",
-        "keyword": "0f6408d462a14bfe58030117dc295b84666843d0",
+        "keyword": "9845105898d28c6a540cffdea60a1a20fab52431",
     })]

     def __init__(self, match):
-        hentaicdn.HentaicdnChapterExtractor.__init__(self)
         url_title, self.chapter = match.groups()
-        self.url = "https://hentai2read.com/{}/{}/".format(
-            url_title, self.chapter
-        )
+        url = "https://hentai2read.com/{}/{}/".format(url_title, self.chapter)
+        ChapterExtractor.__init__(self, url)

-    def get_job_metadata(self, page, images):
-        title = text.extract(page, "<title>", "</title>")[0]
-        chapter_id = text.extract(page, 'data-cid="', '"')[0]
+    def get_metadata(self, page):
+        title, pos = text.extract(page, "<title>", "</title>")
+        manga_id, pos = text.extract(page, 'data-mid="', '"', pos)
+        chapter_id, pos = text.extract(page, 'data-cid="', '"', pos)
         match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.+))? - "
                          r"(\d+): (.+) . Page 1 ", title)
         return {
-            "manga_id": images[0].split("/")[-3],
             "manga": match.group(1),
+            "manga_id": util.safe_int(manga_id),
+            "chapter": util.safe_int(self.chapter),
+            "chapter_id": util.safe_int(chapter_id),
             "type": match.group(2),
-            "chapter_id": chapter_id,
-            "chapter": self.chapter,
             "author": match.group(3),
             "title": match.group(5),
-            "count": len(images),
             "lang": "en",
             "language": "English",
         }

     @staticmethod
-    def get_image_urls(page):
+    def get_images(page):
         """Extract and return a list of all image-urls"""
         images = text.extract(page, "'images' : ", ",\n")[0]
-        return json.loads(images)
+        return [
+            ("https://hentaicdn.com/hentai" + part, None)
+            for part in json.loads(images)
+        ]
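The get_metadata() rewrite above threads a pos offset through successive text.extract() calls. A rough stand-in for that helper, to show the idiom (a simplified sketch, not the library's actual implementation):

    def extract(txt, begin, end, pos=0):
        """Return (substring between begin and end, position after it)."""
        try:
            first = txt.index(begin, pos) + len(begin)
            last = txt.index(end, first)
            return txt[first:last], last + len(end)
        except ValueError:
            return None, pos

    page = '<div data-mid="42" data-cid="7">'
    manga_id, pos = extract(page, 'data-mid="', '"')
    chapter_id, pos = extract(page, 'data-cid="', '"', pos)
    print(manga_id, chapter_id)  # 42 7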
gallery_dl/extractor/hentaicdn.py (deleted)
@@ -1,41 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2016-2017 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Base classes for extractors from sites based on hentaicdn"""
-
-from .common import Extractor, Message
-from .. import text
-import json
-
-
-class HentaicdnChapterExtractor(Extractor):
-    """Base class for extractors for a single manga chapter"""
-    subcategory = "chapter"
-    directory_fmt = ["{category}", "{manga_id} {title}"]
-    filename_fmt = ("{category}_{manga_id}_{chapter:>02}_"
-                    "{num:>03}.{extension}")
-    url = ""
-
-    def items(self):
-        page = self.request(self.url).text
-        images = self.get_image_urls(page)
-        data = self.get_job_metadata(page, images)
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for data["num"], part in enumerate(images, 1):
-            url = "https://hentaicdn.com/hentai" + part
-            yield Message.Url, url, text.nameext_from_url(url, data)
-
-    def get_job_metadata(self, page, images):
-        """Collect metadata for extractor-job"""
-
-    @staticmethod
-    def get_image_urls(page):
-        """Extract and return a list of all image-urls"""
-        images = text.extract(page, "var rff_imageList = ", ";")[0]
-        return json.loads(images)
gallery_dl/extractor/hentaihere.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2017 Mike Fährmann
+# Copyright 2016-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,10 +8,10 @@

 """Extract hentai-manga from https://hentaihere.com/"""

-from .common import MangaExtractor
-from . import hentaicdn
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, util
 import re
 import json


 class HentaihereMangaExtractor(MangaExtractor):
@@ -56,36 +56,42 @@ class HentaihereMangaExtractor(MangaExtractor):
         }))


-class HentaihereChapterExtractor(hentaicdn.HentaicdnChapterExtractor):
+class HentaihereChapterExtractor(ChapterExtractor):
     """Extractor for a single manga chapter from hentaihere.com"""
     category = "hentaihere"
     pattern = [r"(?:https?://)?(?:www\.)?hentaihere\.com/m/S(\d+)/(\d+)"]
     test = [("https://hentaihere.com/m/S13812/1/1/", {
         "url": "964b942cf492b3a129d2fe2608abfc475bc99e71",
-        "keyword": "a07753f655210525a80ff62607261715746f3273",
+        "keyword": "e9382a9be337abce3db2b1132e85751379dc05c5",
     })]

     def __init__(self, match):
-        hentaicdn.HentaicdnChapterExtractor.__init__(self)
-        self.gid, self.chapter = match.groups()
-        self.url = "https://hentaihere.com/m/S{}/{}/1".format(
-            self.gid, self.chapter
-        )
+        self.manga_id, self.chapter = match.groups()
+        url = "https://hentaihere.com/m/S{}/{}/1".format(
+            self.manga_id, self.chapter)
+        ChapterExtractor.__init__(self, url)

-    def get_job_metadata(self, page, images):
+    def get_metadata(self, page):
         title = text.extract(page, "<title>", "</title>")[0]
         chapter_id = text.extract(page, 'report/C', '"')[0]
         pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at "
         match = re.match(pattern, title)
         return {
-            "manga_id": self.gid,
             "manga": match.group(1),
+            "manga_id": util.safe_int(self.manga_id),
+            "chapter": util.safe_int(self.chapter),
+            "chapter_id": util.safe_int(chapter_id),
             "type": match.group(2),
-            "chapter_id": chapter_id,
-            "chapter": self.chapter,
             "title": match.group(3),
             "author": match.group(4),
-            "count": len(images),
             "lang": "en",
             "language": "English",
         }
+
+    @staticmethod
+    def get_images(page):
+        images = text.extract(page, "var rff_imageList = ", ";")[0]
+        return [
+            ("https://hentaicdn.com/hentai" + part, None)
+            for part in json.loads(images)
+        ]
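The title regex above does all of the metadata work for hentaihere. Run against a made-up title string (the sample is an assumption, not taken from the site), it splits out manga, type, chapter title, and author:

    import re

    title = ("Page 1 | Example Manga (Original) - "
             "Chapter 1: Oneshot by Some Artist at hentaihere.com")
    pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at "
    print(re.match(pattern, title).groups())
    # ('Example Manga', 'Original', 'Oneshot', 'Some Artist')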
gallery_dl/extractor/kissmanga.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@

 """Extract manga-chapters and entire manga from http://kissmanga.com/"""

-from .common import Extractor, MangaExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util, cloudflare, aes, exception
 from ..cache import cache
 import re
@@ -21,21 +21,11 @@ IV = [
 ]


-class KissmangaExtractor(Extractor):
+class KissmangaBase():
     """Base class for kissmanga extractors"""
     category = "kissmanga"
-    directory_fmt = [
-        "{category}", "{manga}",
-        "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}"]
-    filename_fmt = (
-        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
     root = "http://kissmanga.com"

-    def __init__(self, match):
-        Extractor.__init__(self)
-        self.url = match.group(0)
-        self.session.headers["Referer"] = self.root
-
     def request(self, url):
         response = cloudflare.request_func(self, url)
         if response.history and "/Message/AreYouHuman?" in response.url:
@@ -72,10 +62,10 @@ class KissmangaExtractor(Extractor):
         return data


-class KissmangaMangaExtractor(KissmangaExtractor, MangaExtractor):
+class KissmangaMangaExtractor(KissmangaBase, MangaExtractor):
     """Extractor for manga from kissmanga.com"""
-    pattern = [r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com/"
-               r"Manga/[^/?&#]+/?$"]
+    pattern = [r"(?i)(?:https?://)?(?:www\.)?(kissmanga\.com"
+               r"/Manga/[^/?&#]+/?)$"]
     test = [
         ("http://kissmanga.com/Manga/Dropout", {
             "url": "992befdd64e178fe5af67de53f8b510860d968ca",
@@ -105,11 +95,10 @@ class KissmangaMangaExtractor(KissmangaExtractor, MangaExtractor):
         return results


-class KissmangaChapterExtractor(KissmangaExtractor):
+class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor):
     """Extractor for manga-chapters from kissmanga.com"""
-    subcategory = "chapter"
-    pattern = [r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com/"
-               r"Manga/[^/?&#]+/[^/?&#]+\?id=\d+"]
+    pattern = [r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com"
+               r"/Manga/[^/?&#]+/[^/?&#]+\?id=\d+"]
     test = [
         ("http://kissmanga.com/Manga/Dropout/Ch-000---Oneshot-?id=145847", {
             "url": "4136bcd1c6cecbca8cc2bc965d54f33ef0a97cc0",
@@ -126,18 +115,11 @@ class KissmangaChapterExtractor(KissmangaExtractor):
         ("http://kissmanga.com/mAnGa/mOnStEr/Monster-79?id=7608", None),
     ]

-    def items(self):
-        page = self.request(self.url).text
-        data = self.get_job_metadata(page)
-        imgs = self.get_image_urls(page)
-        data["count"] = len(imgs)
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for data["page"], url in enumerate(imgs, 1):
-            yield Message.Url, url, text.nameext_from_url(url, data)
+    def __init__(self, match):
+        ChapterExtractor.__init__(self, match.group(0))
+        self.session.headers["Referer"] = self.root

-    def get_job_metadata(self, page):
-        """Collect metadata for extractor-job"""
+    def get_metadata(self, page):
         title = text.extract(page, "<title>", "</title>")[0].strip()
         manga, cinfo = title.split("\n")[1:3]
         data = {
@@ -148,12 +130,11 @@ class KissmangaChapterExtractor(KissmangaExtractor):
         }
         return self.parse_chapter_string(data)

-    def get_image_urls(self, page):
-        """Extract list of all image-urls for a manga chapter"""
+    def get_images(self, page):
         try:
             key = self.build_aes_key(page)
             return [
-                aes.aes_cbc_decrypt_text(data, key, IV)
+                (aes.aes_cbc_decrypt_text(data, key, IV), None)
                 for data in text.extract_iter(
                     page, 'lstImages.push(wrapKA("', '"'
                 )
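KissmangaChapterExtractor now lists KissmangaBase before ChapterExtractor, so the base's Cloudflare-aware request() wins under Python's method resolution order. A toy demonstration of that ordering (stand-in classes, not the real ones):

    class ChapterExtractor:
        def request(self, url):
            return "plain: " + url

    class KissmangaBase:
        def request(self, url):   # stand-in for cloudflare.request_func
            return "cloudflare: " + url

    class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor):
        pass

    print(KissmangaChapterExtractor().request("http://kissmanga.com/x"))
    # cloudflare: http://kissmanga.com/x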
gallery_dl/extractor/mangafox.py
@@ -1,56 +1,39 @@
 # -*- coding: utf-8 -*-

-# Copyright 2017 Mike Fährmann
+# Copyright 2017-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-"""Extract manga-chapters and entire manga from http://www.mangafox.me/"""
+"""Extract manga-chapters and entire manga from http://fanfox.net/"""

-from .common import AsynchronousExtractor, Message
+from .common import ChapterExtractor
 from .. import text, util, exception
 import re


-class MangafoxChapterExtractor(AsynchronousExtractor):
-    """Extractor for manga-chapters from mangafox.me"""
+class MangafoxChapterExtractor(ChapterExtractor):
+    """Extractor for manga-chapters from fanfox.net"""
     category = "mangafox"
     subcategory = "chapter"
     directory_fmt = [
         "{category}", "{manga}",
         "{volume:?v/ />02}c{chapter:>03}{chapter_minor}"]
     filename_fmt = (
         "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
-    pattern = [(r"(?:https?://)?(?:www\.)?(mangafox\.me/manga/"
-                r"[^/]+/(v\d+/)?c\d+[^/]*)")]
-    test = [(("http://mangafox.me/manga/kidou_keisatsu_patlabor/"
-              "v05/c006.2/1.html"), {
-        "keyword": "36b570e9ef11b4748407324fe08bebbe4856e6fd",
-        "content": "5c50c252dcf12ffecf68801f4db8a2167265f66c",
-    })]
+    pattern = [(r"(?:https?://)?(?:www\.)?(?:mangafox\.me|fanfox\.net)"
+                r"(/manga/[^/]+/(?:v\d+/)?c\d+[^/?&#]*)")]
+    test = [
+        ("http://fanfox.net/manga/kidou_keisatsu_patlabor/v05/c006.2/1.html", {
+            "keyword": "36b570e9ef11b4748407324fe08bebbe4856e6fd",
+            "content": "5c50c252dcf12ffecf68801f4db8a2167265f66c",
+        }),
+        ("http://mangafox.me/manga/kidou_keisatsu_patlabor/v05/c006.2/", None),
+    ]
+    root = "http://fanfox.net"

     def __init__(self, match):
-        AsynchronousExtractor.__init__(self)
-        self.url = "http://" + match.group(1)
-
-    def items(self):
-        page = self.request(self.url + "/1.html").text
-        if "Sorry, its licensed, and not available." in page:
-            raise exception.AuthorizationError()
-        data = self.get_metadata(page)
-        urls = zip(
-            range(1, data["count"]+1),
-            self.get_image_urls(page),
-        )
-        yield Message.Version, 1
-        yield Message.Directory, data.copy()
-        for data["page"], url in urls:
-            text.nameext_from_url(url, data)
-            yield Message.Url, url, data.copy()
+        self.urlbase = self.root + match.group(1)
+        ChapterExtractor.__init__(self, self.urlbase + "/1.html")

     def get_metadata(self, page):
         """Collect metadata for extractor-job"""
+        if "Sorry, its licensed, and not available." in page:
+            raise exception.AuthorizationError()
         data = text.extract_all(page, (
             ("manga" , " - Read ", " Manga Scans "),
             ("sid"   , "var sid=", ";"),
@@ -67,14 +50,14 @@ class MangafoxChapterExtractor(AsynchronousExtractor):
             data[key] = util.safe_int(data[key])
         return data

-    def get_image_urls(self, page):
-        """Yield all image-urls for this chapter"""
+    def get_images(self, page):
         pnum = 1
         while True:
             url, pos = text.extract(page, '<img src="', '"')
-            yield url
+            yield url, None
             _  , pos = text.extract(page, '<img src="', '"', pos)
             url, pos = text.extract(page, '<img src="', '"', pos)
-            yield url
+            yield url, None

             pnum += 2
-            page = self.request(self.url + "/{}.html".format(pnum)).text
+            page = self.request("{}/{}.html".format(self.urlbase, pnum)).text
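Both the mangafox and mangahere get_images() generators rely on each HTML page embedding the current image plus a preloaded next one, so they yield two urls per request and step the page number by two. A self-contained sketch with a fake page store in place of self.request():

    PAGES = {
        1: ("https://example.org/001.jpg", "https://example.org/002.jpg"),
        3: ("https://example.org/003.jpg", "https://example.org/004.jpg"),
    }

    def get_images():
        pnum = 1
        while True:
            current, preloaded = PAGES[pnum]  # stand-in for an HTTP request
            yield current, None
            yield preloaded, None
            pnum += 2  # the next request skips the page already preloaded

    # The generic extractor zips against range(1, count+1), which is what
    # terminates this otherwise endless generator:
    for num, (url, _) in zip(range(1, 5), get_images()):
        print(num, url)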
gallery_dl/extractor/mangahere.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@

 """Extract manga-chapters and entire manga from http://www.mangahere.co/"""

-from .common import MangaExtractor, AsynchronousExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util
 from urllib.parse import urljoin
 import re
@@ -60,15 +60,9 @@ class MangahereMangaExtractor(MangaExtractor):
         }))


-class MangahereChapterExtractor(AsynchronousExtractor):
+class MangahereChapterExtractor(ChapterExtractor):
     """Extractor for manga-chapters from mangahere.co"""
     category = "mangahere"
-    subcategory = "chapter"
-    directory_fmt = [
-        "{category}", "{manga}",
-        "{volume:?v/ />02}c{chapter:>03}{chapter_minor}"]
-    filename_fmt = (
-        "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
     pattern = [(r"(?:https?://)?(?:www\.|m\.)?mangahere\.c[co]/manga/"
                 r"([^/]+(?:/v0*(\d+))?/c0*(\d+)(\.\d+)?)")]
     test = [
@@ -82,27 +76,12 @@ class MangahereChapterExtractor(AsynchronousExtractor):
     url_fmt = "http://www.mangahere.cc/manga/{}/{}.html"

     def __init__(self, match):
-        AsynchronousExtractor.__init__(self)
         self.part, self.volume, self.chapter, self.chminor = match.groups()
-
-    def items(self):
         # remove ".html" for the first chapter page to avoid redirects
         url = self.url_fmt.format(self.part, "")[:-5]
+        ChapterExtractor.__init__(self, url)

-        page = self.request(url).text
-        data = self.get_job_metadata(page)
-        urls = zip(
-            range(1, data["count"]+1),
-            self.get_image_urls(page),
-        )
-
-        yield Message.Version, 1
-        yield Message.Directory, data.copy()
-        for data["page"], url in urls:
-            text.nameext_from_url(url, data)
-            yield Message.Url, url, data.copy()
-
-    def get_job_metadata(self, page):
+    def get_metadata(self, page):
         """Collect metadata for extractor-job"""
         manga, pos = text.extract(page, '<title>', '</title>')
         chid , pos = text.extract(page, '.net/store/manga/', '/', pos)
@@ -122,15 +101,16 @@ class MangahereChapterExtractor(AsynchronousExtractor):
             "language": "English",
         }

-    def get_image_urls(self, page):
+    def get_images(self, page):
         """Yield all image-urls for this chapter"""
         pnum = 1
         while True:
             url, pos = text.extract(page, '<img src="', '"')
-            yield url
+            yield url, None
             _  , pos = text.extract(page, '<img src="', '"', pos)
+            _  , pos = text.extract(page, '<img src="', '"', pos)
             url, pos = text.extract(page, '<img src="', '"', pos)
-            yield url
+            yield url, None

             pnum += 2
             page = self.request(self.url_fmt.format(self.part, pnum)).text
gallery_dl/extractor/mangapanda.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -35,5 +35,5 @@ class MangapandaChapterExtractor(MangapandaBase, MangareaderChapterExtractor):
     ]
     test = [("http://www.mangapanda.com/red-storm/2", {
         "url": "4bf4ddf6c50105ec8a37675495ab80c46608275d",
-        "keyword": "367d2694b49cc7cac82d68530d7d467a62453502",
+        "keyword": "32b5e84017c2bf5f122b339ecf40899e41f18cc9",
     })]
gallery_dl/extractor/mangareader.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@

 """Extract manga-chapters and entire manga from http://www.mangareader.net/"""

-from .common import AsynchronousExtractor, MangaExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util


@@ -57,11 +57,8 @@ class MangareaderMangaExtractor(MangareaderBase, MangaExtractor):
         results.append((self.root + url, data.copy()))


-class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor):
+class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
     """Extractor for manga-chapters from mangareader.net"""
-    subcategory = "chapter"
-    directory_fmt = ["{category}", "{manga}", "c{chapter:>03}{title:?: //}"]
-    filename_fmt = "{manga}_c{chapter:>03}_{page:>03}.{extension}"
     pattern = [
         (r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"),
         (r"(?:https?://)?(?:www\.)?mangareader\.net"
@@ -70,26 +67,14 @@ class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor):
     test = [(("http://www.mangareader.net/"
               "karate-shoukoushi-kohinata-minoru/11"), {
         "url": "84ffaab4c027ef9022695c53163c3aeabd07ca58",
-        "keyword": "2038e6a780a0028eee0067985b55debb1d4a6aab",
+        "keyword": "2893cfcd1916859fb498f3345f1929f868fe667f",
     })]

     def __init__(self, match):
-        AsynchronousExtractor.__init__(self)
         self.part, self.url_title, self.chapter = match.groups()
+        ChapterExtractor.__init__(self, self.root + self.part)

-    def items(self):
-        page = self.request(self.root + self.part).text
-        data = self.get_job_metadata(page)
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for data["page"] in range(1, data["count"]+1):
-            next_url, image_url, image_data = self.get_page_metadata(page)
-            image_data.update(data)
-            yield Message.Url, image_url, image_data
-            if next_url:
-                page = self.request(next_url).text
-
-    def get_job_metadata(self, chapter_page):
+    def get_metadata(self, chapter_page):
         """Collect metadata for extractor-job"""
         page = self.request(self.root + self.url_title).text
         data = self.parse_page(page, {
@@ -106,7 +91,16 @@ class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor):
         )
         return data

-    def get_page_metadata(self, page):
+    def get_images(self, page):
+        while True:
+            next_url, image_url, image_data = self.get_image_metadata(page)
+            yield image_url, image_data
+
+            if not next_url:
+                return
+            page = self.request(next_url).text
+
+    def get_image_metadata(self, page):
         """Collect next url, image-url and metadata for one manga-page"""
         extr = text.extract
         width = None
@@ -122,7 +116,7 @@ class MangareaderChapterExtractor(MangareaderBase, AsynchronousExtractor):
         width , pos = extr(page, '<img id="img" width="', '"', pos)
         height, pos = extr(page, ' height="', '"', pos)
         image, pos = extr(page, ' src="', '"', pos)
-        return self.root + url, image, text.nameext_from_url(image, {
+        return self.root + url, image, {
             "width": util.safe_int(width),
             "height": util.safe_int(height),
-        })
+        }
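get_images() above walks a linked chain of pages: each fetched page yields one image (with width/height metadata) plus the url of the next page, and the generator stops when no next link remains. A self-contained sketch with a fake page store in place of self.request() and get_image_metadata():

    PAGES = {
        "/ch/1": ("https://example.org/001.jpg", "/ch/2"),
        "/ch/2": ("https://example.org/002.jpg", None),
    }

    def get_images(path):
        while True:
            image_url, next_url = PAGES[path]  # stand-in for a page request
            yield image_url, None
            if not next_url:
                return
            path = next_url

    for url, _ in get_images("/ch/1"):
        print(url)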
gallery_dl/extractor/mangastream.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,42 +8,23 @@

 """Extract manga-chapters from https://mangastream.com/"""

-from .common import AsynchronousExtractor, Message
+from .common import ChapterExtractor
 from .. import text, util
 from urllib.parse import urljoin


-class MangastreamChapterExtractor(AsynchronousExtractor):
+class MangastreamChapterExtractor(ChapterExtractor):
     """Extractor for manga-chapters from mangastream.com"""
     category = "mangastream"
-    subcategory = "chapter"
-    directory_fmt = ["{category}", "{manga}", "c{chapter}{title:?: //}"]
-    filename_fmt = "{manga}_c{chapter}_{page:>03}.{extension}"
     pattern = [(r"(?:https?://)?(?:www\.)?(?:readms|mangastream)\.(?:com|net)/"
                 r"r(?:ead)?/([^/]*/([^/]+)/(\d+))")]
-    base_url = "https://mangastream.com/r/"
+    base_url = "https://readms.net/r/"

     def __init__(self, match):
-        AsynchronousExtractor.__init__(self)
         self.part, self.chapter, self.ch_id = match.groups()
+        ChapterExtractor.__init__(self, self.base_url + self.part)

-    def items(self):
-        page = self.request(self.base_url + self.part).text
-        data = self.get_job_metadata(page)
-        next_url = None
-        yield Message.Version, 1
-        yield Message.Directory, data.copy()
-        for data["page"] in range(1, data["count"]+1):
-            if next_url:
-                page = self.request(next_url).text
-            next_url, image_url = self.get_page_metadata(page)
-            text.nameext_from_url(image_url, data)
-            next_url = urljoin(self.base_url, next_url)
-            image_url = urljoin(self.base_url, image_url)
-            yield Message.Url, image_url, data.copy()
-
-    def get_job_metadata(self, page):
-        """Collect metadata for extractor-job"""
+    def get_metadata(self, page):
         manga, pos = text.extract(
             page, '<span class="hidden-xs hidden-sm">', "<")
         pos = page.find(self.part, pos)
@@ -59,9 +40,11 @@ class MangastreamChapterExtractor(AsynchronousExtractor):
             "language": "English",
         }

-    @staticmethod
-    def get_page_metadata(page):
-        """Collect next url, image-url and metadata for one manga-page"""
-        nurl, pos = text.extract(page, '<div class="page">\n<a href="', '"')
-        iurl, pos = text.extract(page, '<img id="manga-page" src="', '"', pos)
-        return nurl, iurl
+    def get_images(self, page):
+        while True:
+            next_url, pos = text.extract(
+                page, '<div class="page">\n<a href="', '"')
+            image_url, pos = text.extract(
+                page, '<img id="manga-page" src="', '"', pos)
+            yield urljoin(self.base_url, image_url), None
+            page = self.request(urljoin(self.base_url, next_url)).text
gallery_dl/extractor/powermanga.py
@@ -26,6 +26,6 @@ class PowermangaMangaExtractor(foolslide.FoolslideMangaExtractor):
     category = "powermanga"
     pattern = foolslide.manga_pattern(r"read\.powermanga\.org")
     test = [("https://read.powermanga.org/series/one_piece/", {
-        "url": "3b2037a9ffe30ea0da4e710a40863f0693f21afe",
-        "keyword": "e2a924b0924cba711e78b3585ad24a97dec70006",
+        "url": "e5e9a64c14ca51a170e14c4b711aaa88fdf7a7aa",
+        "keyword": "1245ab2a730f9129001a4589b1d8615a17dc4a7b",
     })]
gallery_dl/extractor/readcomiconline.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2017 Mike Fährmann
+# Copyright 2016-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,29 +8,27 @@

 """Extract comic-issues and entire comics from http://readcomiconline.to/"""

-from . import kissmanga
-from .. import text
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, cloudflare
 import re


-class ReadcomiconlineExtractor(kissmanga.KissmangaExtractor):
+class ReadcomiconlineBase():
     """Base class for readcomiconline extractors"""
     category = "readcomiconline"
     directory_fmt = ["{category}", "{comic}", "{issue:>03}"]
     filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
     root = "http://readcomiconline.to"
+    useragent = "Wget/1.19.2 (linux-gnu)"

-    def __init__(self, match):
-        kissmanga.KissmangaExtractor.__init__(self, match)
-        self.session.headers["User-Agent"] = "Wget/1.19.2 (linux-gnu)"
+    request = cloudflare.request_func


-class ReadcomiconlineComicExtractor(ReadcomiconlineExtractor,
-                                    kissmanga.KissmangaMangaExtractor):
+class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
     """Extractor for comics from readcomiconline.to"""
     subcategory = "comic"
-    pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to/"
-               r"Comic/[^/?&#]+/?$"]
+    pattern = [r"(?i)(?:https?://)?(?:www\.)?(readcomiconline\.to"
+               r"/Comic/[^/?&#]+/?)$"]
     test = [
         ("http://readcomiconline.to/Comic/W-i-t-c-h", {
             "url": "c5a530538a30b176916e30cbe223a93d83cb2691",
@@ -42,6 +40,10 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineExtractor,
         }),
     ]

+    def __init__(self, match):
+        MangaExtractor.__init__(self, match)
+        self.session.headers["User-Agent"] = self.useragent
+
     def chapters(self, page):
         results = []
         comic, pos = text.extract(page, '<div class="heading"><h3>', '<')
@@ -58,19 +60,21 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineExtractor,
         return results


-class ReadcomiconlineIssueExtractor(ReadcomiconlineExtractor,
-                                    kissmanga.KissmangaChapterExtractor):
+class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
     """Extractor for comic-issues from readcomiconline.to"""
     subcategory = "issue"
-    pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to/"
-               r"Comic/[^/?&#]+/[^/?&#]+\?id=\d+"]
+    pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
+               r"/Comic/[^/?&#]+/[^/?&#]+\?id=\d+"]
     test = [("http://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", {
         "url": "a45c77f8fbde66091fe2346d6341f9cf3c6b1bc5",
        "keyword": "dee8a8a44659825afe1d69e1d809a48b03e98c68",
     })]

-    def get_job_metadata(self, page):
-        """Collect metadata for extractor-job"""
+    def __init__(self, match):
+        ChapterExtractor.__init__(self, match.group(0))
+        self.session.headers["User-Agent"] = self.useragent
+
+    def get_metadata(self, page):
         comic, pos = text.extract(page, " - Read\r\n ", "\r\n")
         iinfo, pos = text.extract(page, " ", "\r\n", pos)
         match = re.match(r"(?:Issue )?#(\d+)|(.+)", iinfo)
@@ -82,6 +86,10 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineExtractor,
         }

     @staticmethod
-    def get_image_urls(page):
-        """Extract list of all image-urls for a manga chapter"""
-        return list(text.extract_iter(page, 'lstImages.push("', '"'))
+    def get_images(page):
+        return [
+            (url, None)
+            for url in text.extract_iter(
+                page, 'lstImages.push("', '"'
+            )
+        ]
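get_images() above collects every argument of a lstImages.push(...) call via text.extract_iter(). A simplified stand-in for that helper shows the mechanics (a sketch, not the library's exact implementation):

    def extract_iter(txt, begin, end, pos=0):
        """Yield every substring found between begin and end."""
        while True:
            try:
                first = txt.index(begin, pos) + len(begin)
                last = txt.index(end, first)
            except ValueError:
                return
            pos = last + len(end)
            yield txt[first:last]

    script = ('lstImages.push("https://example.org/a.jpg");'
              'lstImages.push("https://example.org/b.jpg");')
    print([(url, None)
           for url in extract_iter(script, 'lstImages.push("', '"')])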
gallery_dl/extractor/thespectrum.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2017 Mike Fährmann
+# Copyright 2015-2018 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@

 """Extract manga pages from http://www.thespectrum.net/manga_scans/"""

-from .common import MangaExtractor, AsynchronousExtractor, Message
+from .common import ChapterExtractor, MangaExtractor
 from .. import text, util


@@ -33,55 +33,31 @@ class SpectrumnexusMangaExtractor(MangaExtractor):
         return results


-class SpectrumnexusChapterExtractor(AsynchronousExtractor):
+class SpectrumnexusChapterExtractor(ChapterExtractor):
     """Extractor for manga-chapters or -volumes from thespectrum.net"""
     category = "spectrumnexus"
     subcategory = "chapter"
-    directory_fmt = ["{category}", "{manga}", "{identifier}"]
-    filename_fmt = "{manga} {identifier} {page:>03}.{extension}"
-    pattern = [
-        (r"(?:https?://)?(view\.thespectrum\.net/series/"
-         r"[^\.]+\.html)\?ch=(Chapter\+(\d+)|Volume\+(\d+))"),
-        (r"(?:https?://)?(view\.thespectrum\.net/series/"
-         r"[^/]+-chapter-(\d+)\.html)"),
-    ]
+    directory_fmt = ["{category}", "{manga}", "{chapter_string}"]
+    filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}"
+
+    pattern = [r"(?:https?://)?view\.thespectrum\.net/series/"
+               r"([^\.]+\.html)\?ch=(Chapter\+(\d+)|Volume\+(\d+))"]
     test = [(("http://view.thespectrum.net/series/"
               "toriko.html?ch=Chapter+343&page=1"), {
         "url": "c0fc7dc594841217cc622a67edd79f06e9900333",
-        "keyword": "3d0cb57b6b1c2cbecc7aed33f83c24891a4ff53f",
+        "keyword": "a8abe126cbc5fc798148b0b155242a470c1ba9d1",
     })]

     def __init__(self, match):
-        AsynchronousExtractor.__init__(self)
-        self.url = "http://" + match.group(1)
-        self.identifier = match.group(2)
-        self.chapter = match.group(3)
-        self.volume = match.group(4)
+        path, self.chapter_string, self.chapter, self.volume = match.groups()
+        url = "http://view.thespectrum.net/series/{}?ch={}".format(
+            path, self.chapter_string)
+        ChapterExtractor.__init__(self, url)

-    def items(self):
-        params = {
-            "ch": self.identifier,
-            "page": 1,
-        }
-        page = self.request(self.url, params=params).text
-        data = self.get_job_metadata(page)
-        yield Message.Version, 1
-        yield Message.Directory, data.copy()
-        for i in range(1, data["count"]+1):
-            url = self.get_image_url(page)
-            text.nameext_from_url(url, data)
-            data["page"] = i
-            yield Message.Url, url, data.copy()
-            if i < data["count"]:
-                params["page"] += 1
-                page = self.request(self.url, params=params).text
-
-    def get_job_metadata(self, page):
-        """Collect metadata for extractor-job"""
+    def get_metadata(self, page):
         data = {
             "chapter": util.safe_int(self.chapter),
+            "chapter_string": self.chapter_string.replace("+", " "),
             "volume": util.safe_int(self.volume),
-            "identifier": self.identifier.replace("+", " "),
         }
         data = text.extract_all(page, (
             ('manga', '<title>', ' · SPECTRUM NEXUS </title>'),
@@ -90,7 +66,9 @@ class SpectrumnexusChapterExtractor(AsynchronousExtractor):
         data["count"] = util.safe_int(data["count"])
         return data

-    @staticmethod
-    def get_image_url(page):
-        """Extract url of one manga page"""
-        return text.extract(page, '<img id="mainimage" src="', '"')[0]
+    def get_images(self, page):
+        params = {"page": 1}
+        while True:
+            yield text.extract(page, '<img id="mainimage" src="', '"')[0], None
+            params["page"] += 1
+            page = self.request(self.url, params=params).text