automatically detect and bypass cloudflare challenge pages
TODO: cache and re-apply cfclearance cookies
This commit is contained in:
@@ -1,6 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
# Copyright 2015-2018 Mike Fährmann
|
# Copyright 2015-2019 Mike Fährmann
|
||||||
#
|
#
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License version 2 as
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
@@ -13,46 +13,52 @@ import time
|
|||||||
import operator
|
import operator
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
from . import text
|
from . import text
|
||||||
from .cache import cache
|
|
||||||
|
|
||||||
|
|
||||||
def request_func(self, *args, **kwargs):
|
def is_challenge(response):
|
||||||
cookies = _cookiecache(self.root)
|
return (response.status_code == 503 and
|
||||||
if cookies:
|
response.headers.get("Server", "").startswith("cloudflare") and
|
||||||
self.session.cookies.update(cookies)
|
b"jschl-answer" in response.content)
|
||||||
response = self.session.get(*args, **kwargs)
|
|
||||||
if response.status_code == 503:
|
|
||||||
_cookiecache.invalidate(self.root)
|
|
||||||
self.log.info("Solving Cloudflare challenge")
|
|
||||||
response = solve_challenge(self.session, response)
|
|
||||||
_cookiecache(self.root, self.session.cookies)
|
|
||||||
return response
|
|
||||||
|
|
||||||
|
|
||||||
def solve_challenge(session, response):
|
def solve_challenge(session, response, kwargs):
|
||||||
|
"""Solve Cloudflare challenge and get cfclearance cookie"""
|
||||||
|
parsed = urllib.parse.urlsplit(response.url)
|
||||||
|
root = parsed.scheme + "://" + parsed.netloc
|
||||||
|
|
||||||
|
cf_kwargs = kwargs.copy()
|
||||||
|
headers = cf_kwargs["headers"] = (
|
||||||
|
kwargs["headers"].copy() if "headers" in kwargs else {})
|
||||||
|
params = cf_kwargs["params"] = (
|
||||||
|
kwargs["params"].copy() if "params" in kwargs else {})
|
||||||
|
|
||||||
session.headers["Referer"] = response.url
|
|
||||||
page = response.text
|
page = response.text
|
||||||
params = text.extract_all(page, (
|
params["pass"] = text.extract(page, 'name="pass" value="', '"')[0]
|
||||||
('jschl_vc', 'name="jschl_vc" value="', '"'),
|
params["jschl_vc"] = text.extract(page, 'name="jschl_vc" value="', '"')[0]
|
||||||
('pass' , 'name="pass" value="', '"'),
|
params["jschl_answer"] = solve_js_challenge(page, parsed.netloc)
|
||||||
))[0]
|
headers["Referer"] = response.url
|
||||||
params["jschl_answer"] = solve_jschl(response.url, page)
|
|
||||||
|
|
||||||
time.sleep(4)
|
time.sleep(4)
|
||||||
url = text.urljoin(response.url, "/cdn-cgi/l/chk_jschl")
|
|
||||||
return session.get(url, params=params)
|
url = root + "/cdn-cgi/l/chk_jschl"
|
||||||
|
cf_kwargs["allow_redirects"] = False
|
||||||
|
cf_response = session.request(response.request.method, url, **cf_kwargs)
|
||||||
|
|
||||||
|
location = cf_response.headers["Location"]
|
||||||
|
if location[0] == "/":
|
||||||
|
location = root + location
|
||||||
|
return location
|
||||||
|
|
||||||
|
|
||||||
def solve_jschl(url, page):
|
def solve_js_challenge(page, netloc):
|
||||||
"""Solve challenge to get 'jschl_answer' value"""
|
"""Evaluate JS challenge in 'page' to get 'jschl_answer' value"""
|
||||||
|
|
||||||
# build variable name
|
# build variable name
|
||||||
# e.g. '...f, wqnVscP={"DERKbJk":+(...' --> wqnVscP.DERKbJk
|
# e.g. '...f, wqnVscP={"DERKbJk":+(...' --> wqnVscP.DERKbJk
|
||||||
data, pos = text.extract_all(page, (
|
data, pos = text.extract_all(page, (
|
||||||
('var' , ',f, ', '='),
|
('var' , ',f, ', '='),
|
||||||
('key' , '"', '"'),
|
('key' , '"' , '"'),
|
||||||
('expr', ':', '}'),
|
('expr', ':' , '}'),
|
||||||
))
|
))
|
||||||
variable = "{}.{}".format(data["var"], data["key"])
|
variable = "{}.{}".format(data["var"], data["key"])
|
||||||
vlength = len(variable)
|
vlength = len(variable)
|
||||||
@@ -67,19 +73,19 @@ def solve_jschl(url, page):
|
|||||||
for expr in expressions.split(";")[1:]:
|
for expr in expressions.split(";")[1:]:
|
||||||
|
|
||||||
if expr.startswith(variable):
|
if expr.startswith(variable):
|
||||||
# select arithmetc function based on operator (+, -, *)
|
# select arithmetc function based on operator (+/-/*)
|
||||||
func = operator_functions[expr[vlength]]
|
func = OPERATORS[expr[vlength]]
|
||||||
# evaluate the rest of the expression
|
# evaluate the rest of the expression
|
||||||
value = evaluate_expression(expr[vlength+2:])
|
value = evaluate_expression(expr[vlength+2:])
|
||||||
# combine the expression value with our current solution
|
# combine expression value with our current solution
|
||||||
solution = func(solution, value)
|
solution = func(solution, value)
|
||||||
|
|
||||||
elif expr.startswith("a.value"):
|
elif expr.startswith("a.value"):
|
||||||
# add length of the hostname, i.e. add 11 for 'example.org'
|
# add length of hostname
|
||||||
solution += len(urllib.parse.urlsplit(url).netloc)
|
solution += len(netloc)
|
||||||
|
|
||||||
if ".toFixed(" in expr:
|
if ".toFixed(" in expr:
|
||||||
# trim the solution to 10 decimal places
|
# trim solution to 10 decimal places
|
||||||
# and strip trailing zeros
|
# and strip trailing zeros
|
||||||
solution = "{:.10f}".format(solution).rstrip("0")
|
solution = "{:.10f}".format(solution).rstrip("0")
|
||||||
|
|
||||||
@@ -87,7 +93,7 @@ def solve_jschl(url, page):
|
|||||||
|
|
||||||
|
|
||||||
def evaluate_expression(expr, split_re=re.compile(r"\(+([^)]*)\)")):
|
def evaluate_expression(expr, split_re=re.compile(r"\(+([^)]*)\)")):
|
||||||
"""Evaluate a Javascript expression for the challenge"""
|
"""Evaluate a single Javascript expression for the challenge"""
|
||||||
|
|
||||||
if "/" in expr:
|
if "/" in expr:
|
||||||
# split the expression in numerator and denominator subexpressions,
|
# split the expression in numerator and denominator subexpressions,
|
||||||
@@ -102,26 +108,21 @@ def evaluate_expression(expr, split_re=re.compile(r"\(+([^)]*)\)")):
|
|||||||
result = ""
|
result = ""
|
||||||
for subexpr in split_re.findall(expr):
|
for subexpr in split_re.findall(expr):
|
||||||
result += str(sum(
|
result += str(sum(
|
||||||
expression_values[part]
|
VALUES[part]
|
||||||
for part in subexpr.split("[]")
|
for part in subexpr.split("[]")
|
||||||
))
|
))
|
||||||
return int(result)
|
return int(result)
|
||||||
|
|
||||||
|
|
||||||
operator_functions = {
|
OPERATORS = {
|
||||||
"+": operator.add,
|
"+": operator.add,
|
||||||
"-": operator.sub,
|
"-": operator.sub,
|
||||||
"*": operator.mul,
|
"*": operator.mul,
|
||||||
}
|
}
|
||||||
|
|
||||||
expression_values = {
|
VALUES = {
|
||||||
"": 0,
|
"": 0,
|
||||||
"+": 0,
|
"+": 0,
|
||||||
"!+": 1,
|
"!+": 1,
|
||||||
"+!!": 1,
|
"+!!": 1,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@cache(maxage=365*24*60*60, keyarg=0)
|
|
||||||
def _cookiecache(key, item=None):
|
|
||||||
return item
|
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ import requests
|
|||||||
import threading
|
import threading
|
||||||
import http.cookiejar
|
import http.cookiejar
|
||||||
from .message import Message
|
from .message import Message
|
||||||
from .. import config, text, exception
|
from .. import config, text, exception, cloudflare
|
||||||
|
|
||||||
|
|
||||||
class Extractor():
|
class Extractor():
|
||||||
@@ -86,6 +86,10 @@ class Extractor():
|
|||||||
if encoding:
|
if encoding:
|
||||||
response.encoding = encoding
|
response.encoding = encoding
|
||||||
return response
|
return response
|
||||||
|
if cloudflare.is_challenge(response):
|
||||||
|
self.log.info("Solving Cloudflare challenge")
|
||||||
|
url = cloudflare.solve_challenge(session, response, kwargs)
|
||||||
|
continue
|
||||||
|
|
||||||
msg = "{}: {} for url: {}".format(code, response.reason, url)
|
msg = "{}: {} for url: {}".format(code, response.reason, url)
|
||||||
if code < 500 and code != 429:
|
if code < 500 and code != 429:
|
||||||
|
|||||||
@@ -9,17 +9,12 @@
|
|||||||
"""Extract manga-chapters and entire manga from https://kissmanga.com/"""
|
"""Extract manga-chapters and entire manga from https://kissmanga.com/"""
|
||||||
|
|
||||||
from .common import ChapterExtractor, MangaExtractor
|
from .common import ChapterExtractor, MangaExtractor
|
||||||
from .. import text, cloudflare, aes, exception
|
from .. import text, aes, exception
|
||||||
from ..cache import cache
|
from ..cache import cache
|
||||||
import hashlib
|
import hashlib
|
||||||
import ast
|
import ast
|
||||||
import re
|
import re
|
||||||
|
|
||||||
IV = [
|
|
||||||
0xa5, 0xe8, 0xe2, 0xe9, 0xc2, 0x72, 0x1b, 0xe0,
|
|
||||||
0xa8, 0x4a, 0xd6, 0x60, 0xc4, 0x72, 0xc1, 0xf3
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class KissmangaBase():
|
class KissmangaBase():
|
||||||
"""Base class for kissmanga extractors"""
|
"""Base class for kissmanga extractors"""
|
||||||
@@ -28,10 +23,10 @@ class KissmangaBase():
|
|||||||
root = "https://kissmanga.com"
|
root = "https://kissmanga.com"
|
||||||
|
|
||||||
def request(self, url):
|
def request(self, url):
|
||||||
response = cloudflare.request_func(self, url)
|
response = super().request(url)
|
||||||
if response.history and "/Message/AreYouHuman?" in response.url:
|
if response.history and "/Message/AreYouHuman?" in response.url:
|
||||||
self.log.error("Requesting too many pages caused a redirect to %s."
|
self.log.error("Requesting too many pages caused a redirect to %s."
|
||||||
" Try visiting this URL in your browser and solving"
|
" Try visiting this URL in your browser and solve"
|
||||||
" the CAPTCHA to continue.", response.url)
|
" the CAPTCHA to continue.", response.url)
|
||||||
raise exception.StopExtraction()
|
raise exception.StopExtraction()
|
||||||
return response
|
return response
|
||||||
@@ -112,8 +107,10 @@ class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor):
|
|||||||
self.session.headers["Referer"] = None
|
self.session.headers["Referer"] = None
|
||||||
try:
|
try:
|
||||||
key = self.build_aes_key(page)
|
key = self.build_aes_key(page)
|
||||||
|
iv = (0xa5, 0xe8, 0xe2, 0xe9, 0xc2, 0x72, 0x1b, 0xe0,
|
||||||
|
0xa8, 0x4a, 0xd6, 0x60, 0xc4, 0x72, 0xc1, 0xf3)
|
||||||
return [
|
return [
|
||||||
(aes.aes_cbc_decrypt_text(data, key, IV), None)
|
(aes.aes_cbc_decrypt_text(data, key, iv), None)
|
||||||
for data in text.extract_iter(
|
for data in text.extract_iter(
|
||||||
page, 'lstImages.push(wrapKA("', '"'
|
page, 'lstImages.push(wrapKA("', '"'
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
"""Extract manga-chapters and entire manga from https://komikcast.com/"""
|
"""Extract manga-chapters and entire manga from https://komikcast.com/"""
|
||||||
|
|
||||||
from .common import ChapterExtractor, MangaExtractor
|
from .common import ChapterExtractor, MangaExtractor
|
||||||
from .. import text, cloudflare
|
from .. import text
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
@@ -18,8 +18,6 @@ class KomikcastBase():
|
|||||||
category = "komikcast"
|
category = "komikcast"
|
||||||
root = "https://komikcast.com"
|
root = "https://komikcast.com"
|
||||||
|
|
||||||
request = cloudflare.request_func
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def parse_chapter_string(chapter_string, data=None):
|
def parse_chapter_string(chapter_string, data=None):
|
||||||
"""Parse 'chapter_string' value and add its info to 'data'"""
|
"""Parse 'chapter_string' value and add its info to 'data'"""
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
"""Extract comic-issues and entire comics from https://readcomiconline.to/"""
|
"""Extract comic-issues and entire comics from https://readcomiconline.to/"""
|
||||||
|
|
||||||
from .common import ChapterExtractor, MangaExtractor
|
from .common import ChapterExtractor, MangaExtractor
|
||||||
from .. import text, cloudflare
|
from .. import text
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
@@ -21,8 +21,6 @@ class ReadcomiconlineBase():
|
|||||||
archive_fmt = "{issue_id}_{page}"
|
archive_fmt = "{issue_id}_{page}"
|
||||||
root = "https://readcomiconline.to"
|
root = "https://readcomiconline.to"
|
||||||
|
|
||||||
request = cloudflare.request_func
|
|
||||||
|
|
||||||
|
|
||||||
class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
|
class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
|
||||||
"""Extractor for comic-issues from readcomiconline.to"""
|
"""Extractor for comic-issues from readcomiconline.to"""
|
||||||
|
|||||||
Reference in New Issue
Block a user