[kissmanga][readcomiconline] add 'captcha' option (#279)

to configure how to handle CAPTCHA page redirects:
- either interactively wait for the user to solve the CAPTCHA
- or raise StopExtraction like before
This commit is contained in:
Mike Fährmann
2019-05-27 22:24:48 +02:00
parent e30ada162d
commit 4465a3ea68
4 changed files with 60 additions and 22 deletions

View File

@@ -593,6 +593,18 @@ Description Controls whether to choose the GIF or MP4 version of an animation.
=========== ===== =========== =====
extractor.kissmanga.captcha
---------------------------
=========== =====
Type ``string``
Default ``"stop"``
Description Controls how to handle redirects to CAPTCHA pages.
* ``"stop"``: Stop the current extractor run.
* ``"wait"``: Ask the user to solve the CAPTCHA and wait.
=========== =====
extractor.oauth.browser extractor.oauth.browser
----------------------- -----------------------
=========== ===== =========== =====
@@ -646,6 +658,18 @@ Description Minimum and maximum wait time in seconds between HTTP requests
=========== ===== =========== =====
extractor.readcomiconline.captcha
---------------------------------
=========== =====
Type ``string``
Default ``"stop"``
Description Controls how to handle redirects to CAPTCHA pages.
* ``"stop"``: Stop the current extractor run.
* ``"wait"``: Ask the user to solve the CAPTCHA and wait.
=========== =====
extractor.recursive.blacklist extractor.recursive.blacklist
----------------------------- -----------------------------
=========== ===== =========== =====

View File

@@ -62,6 +62,10 @@
{ {
"mp4": true "mp4": true
}, },
"kissmanga":
{
"captcha": "stop"
},
"nijie": "nijie":
{ {
"username": null, "username": null,
@@ -82,6 +86,10 @@
"wait-min": 3.0, "wait-min": 3.0,
"wait-max": 6.0 "wait-max": 6.0
}, },
"readcomiconline":
{
"captcha": "stop"
},
"recursive": "recursive":
{ {
"blacklist": ["directlink", "oauth", "recursive", "test"] "blacklist": ["directlink", "oauth", "recursive", "test"]

View File

@@ -8,7 +8,7 @@
"""Extract manga-chapters and entire manga from https://kissmanga.com/""" """Extract manga-chapters and entire manga from https://kissmanga.com/"""
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor, Extractor
from .. import text, aes, exception from .. import text, aes, exception
from ..cache import cache from ..cache import cache
import hashlib import hashlib
@@ -16,21 +16,35 @@ import ast
import re import re
class KissmangaBase(): class RedirectMixin():
"""Detect and handle redirects to CAPTCHA pages"""
def request(self, url):
    """Request 'url' and handle redirects to a CAPTCHA page

    A response that was not redirected to an '/AreYouHuman' URL is
    returned as is.  Otherwise the 'captcha' option decides what happens:
    with "wait" the user is asked to solve the CAPTCHA and the request
    is sent again after ENTER was pressed; with any other value
    the current extractor run is stopped via StopExtraction.
    """
    while True:
        resp = Extractor.request(self, url)

        # nothing to do unless we got redirected to the CAPTCHA page
        if not (resp.history and "/AreYouHuman" in resp.url):
            return resp

        if self.config("captcha", "stop") != "wait":
            self.log.error(
                "Redirect to \n%s\nVisit this URL in your browser and "
                "solve the CAPTCHA to continue", resp.url)
            raise exception.StopExtraction()

        self.log.warning(
            "Redirect to \n%s\nVisit this URL in your browser, solve "
            "the CAPTCHA, and press ENTER to continue", resp.url)
        try:
            input()
        except (EOFError, OSError):
            # failing to read from stdin is treated like pressing ENTER;
            # the request is simply retried
            pass
class KissmangaBase(RedirectMixin):
"""Base class for kissmanga extractors""" """Base class for kissmanga extractors"""
category = "kissmanga" category = "kissmanga"
archive_fmt = "{chapter_id}_{page}" archive_fmt = "{chapter_id}_{page}"
root = "https://kissmanga.com" root = "https://kissmanga.com"
def request(self, url):
    """Request 'url'; stop extraction on a redirect to a CAPTCHA page"""
    response = super().request(url)

    # regular response: hand it back to the caller unchanged
    if not (response.history and "/AreYouHuman" in response.url):
        return response

    self.log.error("Redirect to \n%s\n"
                   "Visit this URL in your browser and solve "
                   "the CAPTCHA to continue.", response.url)
    raise exception.StopExtraction()
@staticmethod @staticmethod
def parse_chapter_string(data): def parse_chapter_string(data):
"""Parse 'chapter_string' value contained in 'data'""" """Parse 'chapter_string' value contained in 'data'"""

View File

@@ -9,11 +9,12 @@
"""Extract comic-issues and entire comics from https://readcomiconline.to/""" """Extract comic-issues and entire comics from https://readcomiconline.to/"""
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text, exception from .kissmanga import RedirectMixin
from .. import text
import re import re
class ReadcomiconlineBase(): class ReadcomiconlineBase(RedirectMixin):
"""Base class for readcomiconline extractors""" """Base class for readcomiconline extractors"""
category = "readcomiconline" category = "readcomiconline"
directory_fmt = ("{category}", "{comic}", "{issue:>03}") directory_fmt = ("{category}", "{comic}", "{issue:>03}")
@@ -21,15 +22,6 @@ class ReadcomiconlineBase():
archive_fmt = "{issue_id}_{page}" archive_fmt = "{issue_id}_{page}"
root = "https://readcomiconline.to" root = "https://readcomiconline.to"
def request(self, url):
    """Request 'url'; stop extraction on a redirect to a CAPTCHA page"""
    response = super().request(url)

    # regular response: hand it back to the caller unchanged
    if not (response.history and "/AreYouHuman" in response.url):
        return response

    self.log.error("Redirect to \n%s\n"
                   "Visit this URL in your browser and solve "
                   "the CAPTCHA to continue.", response.url)
    raise exception.StopExtraction()
class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
"""Extractor for comic-issues from readcomiconline.to""" """Extractor for comic-issues from readcomiconline.to"""