automatically detect and bypass cloudflare challenge pages

TODO: cache and re-apply cf_clearance cookies
This commit is contained in:
Mike Fährmann
2019-03-10 15:31:33 +01:00
parent 25aaf55514
commit 6dae6bee37
5 changed files with 55 additions and 57 deletions

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2015-2018 Mike Fährmann # Copyright 2015-2019 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -13,46 +13,52 @@ import time
import operator import operator
import urllib.parse import urllib.parse
from . import text from . import text
from .cache import cache
def request_func(self, *args, **kwargs): def is_challenge(response):
cookies = _cookiecache(self.root) return (response.status_code == 503 and
if cookies: response.headers.get("Server", "").startswith("cloudflare") and
self.session.cookies.update(cookies) b"jschl-answer" in response.content)
response = self.session.get(*args, **kwargs)
if response.status_code == 503:
_cookiecache.invalidate(self.root)
self.log.info("Solving Cloudflare challenge")
response = solve_challenge(self.session, response)
_cookiecache(self.root, self.session.cookies)
return response
def solve_challenge(session, response): def solve_challenge(session, response, kwargs):
"""Solve Cloudflare challenge and get cfclearance cookie"""
parsed = urllib.parse.urlsplit(response.url)
root = parsed.scheme + "://" + parsed.netloc
cf_kwargs = kwargs.copy()
headers = cf_kwargs["headers"] = (
kwargs["headers"].copy() if "headers" in kwargs else {})
params = cf_kwargs["params"] = (
kwargs["params"].copy() if "params" in kwargs else {})
session.headers["Referer"] = response.url
page = response.text page = response.text
params = text.extract_all(page, ( params["pass"] = text.extract(page, 'name="pass" value="', '"')[0]
('jschl_vc', 'name="jschl_vc" value="', '"'), params["jschl_vc"] = text.extract(page, 'name="jschl_vc" value="', '"')[0]
('pass' , 'name="pass" value="', '"'), params["jschl_answer"] = solve_js_challenge(page, parsed.netloc)
))[0] headers["Referer"] = response.url
params["jschl_answer"] = solve_jschl(response.url, page)
time.sleep(4) time.sleep(4)
url = text.urljoin(response.url, "/cdn-cgi/l/chk_jschl")
return session.get(url, params=params) url = root + "/cdn-cgi/l/chk_jschl"
cf_kwargs["allow_redirects"] = False
cf_response = session.request(response.request.method, url, **cf_kwargs)
location = cf_response.headers["Location"]
if location[0] == "/":
location = root + location
return location
def solve_jschl(url, page): def solve_js_challenge(page, netloc):
"""Solve challenge to get 'jschl_answer' value""" """Evaluate JS challenge in 'page' to get 'jschl_answer' value"""
# build variable name # build variable name
# e.g. '...f, wqnVscP={"DERKbJk":+(...' --> wqnVscP.DERKbJk # e.g. '...f, wqnVscP={"DERKbJk":+(...' --> wqnVscP.DERKbJk
data, pos = text.extract_all(page, ( data, pos = text.extract_all(page, (
('var' , ',f, ', '='), ('var' , ',f, ', '='),
('key' , '"', '"'), ('key' , '"' , '"'),
('expr', ':', '}'), ('expr', ':' , '}'),
)) ))
variable = "{}.{}".format(data["var"], data["key"]) variable = "{}.{}".format(data["var"], data["key"])
vlength = len(variable) vlength = len(variable)
@@ -67,19 +73,19 @@ def solve_jschl(url, page):
for expr in expressions.split(";")[1:]: for expr in expressions.split(";")[1:]:
if expr.startswith(variable): if expr.startswith(variable):
# select arithmetic function based on operator (+, -, *) # select arithmetic function based on operator (+/-/*)
func = operator_functions[expr[vlength]] func = OPERATORS[expr[vlength]]
# evaluate the rest of the expression # evaluate the rest of the expression
value = evaluate_expression(expr[vlength+2:]) value = evaluate_expression(expr[vlength+2:])
# combine the expression value with our current solution # combine expression value with our current solution
solution = func(solution, value) solution = func(solution, value)
elif expr.startswith("a.value"): elif expr.startswith("a.value"):
# add length of the hostname, i.e. add 11 for 'example.org' # add length of hostname
solution += len(urllib.parse.urlsplit(url).netloc) solution += len(netloc)
if ".toFixed(" in expr: if ".toFixed(" in expr:
# trim the solution to 10 decimal places # trim solution to 10 decimal places
# and strip trailing zeros # and strip trailing zeros
solution = "{:.10f}".format(solution).rstrip("0") solution = "{:.10f}".format(solution).rstrip("0")
@@ -87,7 +93,7 @@ def solve_jschl(url, page):
def evaluate_expression(expr, split_re=re.compile(r"\(+([^)]*)\)")): def evaluate_expression(expr, split_re=re.compile(r"\(+([^)]*)\)")):
"""Evaluate a Javascript expression for the challenge""" """Evaluate a single Javascript expression for the challenge"""
if "/" in expr: if "/" in expr:
# split the expression into numerator and denominator subexpressions, # split the expression into numerator and denominator subexpressions,
@@ -102,26 +108,21 @@ def evaluate_expression(expr, split_re=re.compile(r"\(+([^)]*)\)")):
result = "" result = ""
for subexpr in split_re.findall(expr): for subexpr in split_re.findall(expr):
result += str(sum( result += str(sum(
expression_values[part] VALUES[part]
for part in subexpr.split("[]") for part in subexpr.split("[]")
)) ))
return int(result) return int(result)
operator_functions = { OPERATORS = {
"+": operator.add, "+": operator.add,
"-": operator.sub, "-": operator.sub,
"*": operator.mul, "*": operator.mul,
} }
expression_values = { VALUES = {
"": 0, "": 0,
"+": 0, "+": 0,
"!+": 1, "!+": 1,
"+!!": 1, "+!!": 1,
} }
@cache(maxage=365*24*60*60, keyarg=0)
def _cookiecache(key, item=None):
return item

View File

@@ -18,7 +18,7 @@ import requests
import threading import threading
import http.cookiejar import http.cookiejar
from .message import Message from .message import Message
from .. import config, text, exception from .. import config, text, exception, cloudflare
class Extractor(): class Extractor():
@@ -86,6 +86,10 @@ class Extractor():
if encoding: if encoding:
response.encoding = encoding response.encoding = encoding
return response return response
if cloudflare.is_challenge(response):
self.log.info("Solving Cloudflare challenge")
url = cloudflare.solve_challenge(session, response, kwargs)
continue
msg = "{}: {} for url: {}".format(code, response.reason, url) msg = "{}: {} for url: {}".format(code, response.reason, url)
if code < 500 and code != 429: if code < 500 and code != 429:

View File

@@ -9,17 +9,12 @@
"""Extract manga-chapters and entire manga from https://kissmanga.com/""" """Extract manga-chapters and entire manga from https://kissmanga.com/"""
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text, cloudflare, aes, exception from .. import text, aes, exception
from ..cache import cache from ..cache import cache
import hashlib import hashlib
import ast import ast
import re import re
IV = [
0xa5, 0xe8, 0xe2, 0xe9, 0xc2, 0x72, 0x1b, 0xe0,
0xa8, 0x4a, 0xd6, 0x60, 0xc4, 0x72, 0xc1, 0xf3
]
class KissmangaBase(): class KissmangaBase():
"""Base class for kissmanga extractors""" """Base class for kissmanga extractors"""
@@ -28,10 +23,10 @@ class KissmangaBase():
root = "https://kissmanga.com" root = "https://kissmanga.com"
def request(self, url): def request(self, url):
response = cloudflare.request_func(self, url) response = super().request(url)
if response.history and "/Message/AreYouHuman?" in response.url: if response.history and "/Message/AreYouHuman?" in response.url:
self.log.error("Requesting too many pages caused a redirect to %s." self.log.error("Requesting too many pages caused a redirect to %s."
" Try visiting this URL in your browser and solving" " Try visiting this URL in your browser and solve"
" the CAPTCHA to continue.", response.url) " the CAPTCHA to continue.", response.url)
raise exception.StopExtraction() raise exception.StopExtraction()
return response return response
@@ -112,8 +107,10 @@ class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor):
self.session.headers["Referer"] = None self.session.headers["Referer"] = None
try: try:
key = self.build_aes_key(page) key = self.build_aes_key(page)
iv = (0xa5, 0xe8, 0xe2, 0xe9, 0xc2, 0x72, 0x1b, 0xe0,
0xa8, 0x4a, 0xd6, 0x60, 0xc4, 0x72, 0xc1, 0xf3)
return [ return [
(aes.aes_cbc_decrypt_text(data, key, IV), None) (aes.aes_cbc_decrypt_text(data, key, iv), None)
for data in text.extract_iter( for data in text.extract_iter(
page, 'lstImages.push(wrapKA("', '"' page, 'lstImages.push(wrapKA("', '"'
) )

View File

@@ -9,7 +9,7 @@
"""Extract manga-chapters and entire manga from https://komikcast.com/""" """Extract manga-chapters and entire manga from https://komikcast.com/"""
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text, cloudflare from .. import text
import re import re
@@ -18,8 +18,6 @@ class KomikcastBase():
category = "komikcast" category = "komikcast"
root = "https://komikcast.com" root = "https://komikcast.com"
request = cloudflare.request_func
@staticmethod @staticmethod
def parse_chapter_string(chapter_string, data=None): def parse_chapter_string(chapter_string, data=None):
"""Parse 'chapter_string' value and add its info to 'data'""" """Parse 'chapter_string' value and add its info to 'data'"""

View File

@@ -9,7 +9,7 @@
"""Extract comic-issues and entire comics from https://readcomiconline.to/""" """Extract comic-issues and entire comics from https://readcomiconline.to/"""
from .common import ChapterExtractor, MangaExtractor from .common import ChapterExtractor, MangaExtractor
from .. import text, cloudflare from .. import text
import re import re
@@ -21,8 +21,6 @@ class ReadcomiconlineBase():
archive_fmt = "{issue_id}_{page}" archive_fmt = "{issue_id}_{page}"
root = "https://readcomiconline.to" root = "https://readcomiconline.to"
request = cloudflare.request_func
class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
"""Extractor for comic-issues from readcomiconline.to""" """Extractor for comic-issues from readcomiconline.to"""