From 9ebd29fcc1ed23418f2fe257770825552607d653 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 1 Apr 2019 15:14:59 +0200 Subject: [PATCH] update cloudflare bypass (wip) This commit adds support for the two new JS expressions embedded in the overall challenge code. It does compute the correct 'jschl_answer' value, but the HTTP request to /cdn-cgi/l/chk_jschl to get the 'cf_clearance' cookie always results in a 403 response with a CAPTCHA inside (hence 'wip'). All steps to make this HTTP request indistinguishable from a regular web browser (which passes the test) show no effect. This includes: - using the exact same HTTP headers as a web browser - following the query argument order - trying different wait times --- gallery_dl/cloudflare.py | 56 +++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 12 deletions(-) diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py index 0cb56f83..594c1b6c 100644 --- a/gallery_dl/cloudflare.py +++ b/gallery_dl/cloudflare.py @@ -12,7 +12,7 @@ import re import time import operator import urllib.parse -from . import text +from . 
import text, exception from .cache import memcache @@ -22,6 +22,11 @@ def is_challenge(response): b"jschl-answer" in response.content) +def is_captcha(response): + return (response.status_code == 403 and + b'name="captcha-bypass"' in response.content) + + def solve_challenge(session, response, kwargs): """Solve Cloudflare challenge and get cfclearance cookie""" parsed = urllib.parse.urlsplit(response.url) @@ -35,8 +40,8 @@ def solve_challenge(session, response, kwargs): page = response.text params["s"] = text.extract(page, 'name="s" value="', '"')[0] - params["pass"] = text.extract(page, 'name="pass" value="', '"')[0] params["jschl_vc"] = text.extract(page, 'name="jschl_vc" value="', '"')[0] + params["pass"] = text.extract(page, 'name="pass" value="', '"')[0] params["jschl_answer"] = solve_js_challenge(page, parsed.netloc) headers["Referer"] = response.url @@ -46,7 +51,15 @@ def solve_challenge(session, response, kwargs): cf_kwargs["allow_redirects"] = False cf_response = session.request(response.request.method, url, **cf_kwargs) - location = cf_response.headers["Location"] + location = cf_response.headers.get("Location") + if not location: + import logging + log = logging.getLogger("cloudflare") + rtype = "CAPTCHA" if is_captcha(cf_response) else "Unexpected" + log.error("%s response", rtype) + log.debug("Headers:\n%s", cf_response.headers) + log.debug("Content:\n%s", cf_response.text) + raise exception.StopExtraction() if location[0] == "/": location = root + location @@ -73,7 +86,7 @@ def solve_js_challenge(page, netloc): vlength = len(variable) # evaluate the initial expression - solution = evaluate_expression(data["expr"]) + solution = evaluate_expression(data["expr"], page, netloc) # iterator over all remaining expressions # and combine their values in 'solution' @@ -85,37 +98,55 @@ def solve_js_challenge(page, netloc): # select arithmetc function based on operator (+/-/*) func = OPERATORS[expr[vlength]] # evaluate the rest of the expression - value = 
evaluate_expression(expr[vlength+2:]) + value = evaluate_expression(expr[vlength+2:], page, netloc) # combine expression value with our current solution solution = func(solution, value) elif expr.startswith("a.value"): - # add length of hostname - solution += len(netloc) - + if "t.length)" in expr: + # add length of hostname + solution += len(netloc) if ".toFixed(" in expr: # trim solution to 10 decimal places # and strip trailing zeros solution = "{:.10f}".format(solution).rstrip("0") - return solution -def evaluate_expression(expr, split_re=re.compile(r"\(+([^)]*)\)")): +def evaluate_expression(expr, page, netloc, *, + split_re=re.compile(r"[(+]+([^)]*)\)")): """Evaluate a single Javascript expression for the challenge""" + if expr.startswith("function(p)"): + # get HTML element with ID k and evaluate the expression inside + # 'eval(eval("document.getElementById(k).innerHTML"))' + k, pos = text.extract(page, "k = '", "'") + e, pos = text.extract(page, 'id="'+k+'"', '<') + return evaluate_expression(e.partition(">")[2], page, netloc) + if "/" in expr: # split the expression in numerator and denominator subexpressions, # evaluate them separately, # and return their fraction-result num, _, denom = expr.partition("/") - return evaluate_expression(num) / evaluate_expression(denom) + num = evaluate_expression(num, page, netloc) + denom = evaluate_expression(denom, page, netloc) + return num / denom + + if "function(p)" in expr: + # split initial expression and function code + initial, _, func = expr.partition("function(p)") + # evaluate said expression + initial = evaluate_expression(initial, page, netloc) + # get function argument and use it as index into 'netloc' + index = evaluate_expression(func[func.index("}")+1:], page, netloc) + return initial + ord(netloc[int(index)]) # iterate over all subexpressions, # evaluate them, # and accumulate their values in 'result' result = "" - for subexpr in split_re.findall(expr): + for subexpr in split_re.findall(expr) or 
(expr,): result += str(sum( VALUES[part] for part in subexpr.split("[]") @@ -133,6 +164,7 @@ VALUES = { "": 0, "+": 0, "!+": 1, + "!!": 1, "+!!": 1, }