update cloudflare bypass (wip)

This commit adds support for the two new JS expressions embedded in the overall challenge code. It does compute the correct 'jschl_answer' value, but the HTTP request to /cdn-cgi/l/chk_jschl to get the 'cf_clearance' cookie always results in a 403 response with a CAPTCHA inside (hence 'wip').

All attempts to make this HTTP request indistinguishable from one sent by a regular web browser (which passes the test) have had no effect. These include:
- using the exact same HTTP headers as a web browser
- following the browser's query argument order
- trying different wait times
2019-04-01 15:14:59 +02:00
parent 0f02e85961
commit 9ebd29fcc1
1 changed files with 44 additions and 12 deletions
--- a/gallery_dl/cloudflare.py
+++ b/gallery_dl/cloudflare.py
@@ -12,7 +12,7 @@ import re
 import time
 import operator
 import urllib.parse
-from . import text
+from . import text, exception
 from .cache import memcache


@@ -22,6 +22,11 @@ def is_challenge(response):
            b"jschl-answer" in response.content)


+def is_captcha(response):
+    return (response.status_code == 403 and
+            b'name="captcha-bypass"' in response.content)
+
+
 def solve_challenge(session, response, kwargs):
    """Solve Cloudflare challenge and get cfclearance cookie"""
    parsed = urllib.parse.urlsplit(response.url)
@@ -35,8 +40,8 @@ def solve_challenge(session, response, kwargs):

    page = response.text
    params["s"] = text.extract(page, 'name="s" value="', '"')[0]
-    params["pass"] = text.extract(page, 'name="pass" value="', '"')[0]
    params["jschl_vc"] = text.extract(page, 'name="jschl_vc" value="', '"')[0]
+    params["pass"] = text.extract(page, 'name="pass" value="', '"')[0]
    params["jschl_answer"] = solve_js_challenge(page, parsed.netloc)
    headers["Referer"] = response.url

@@ -46,7 +51,15 @@ def solve_challenge(session, response, kwargs):
    cf_kwargs["allow_redirects"] = False
    cf_response = session.request(response.request.method, url, **cf_kwargs)

-    location = cf_response.headers["Location"]
+    location = cf_response.headers.get("Location")
+    if not location:
+        import logging
+        log = logging.getLogger("cloudflare")
+        rtype = "CAPTCHA" if is_captcha(cf_response) else "Unexpected"
+        log.error("%s response", rtype)
+        log.debug("Headers:\n%s", cf_response.headers)
+        log.debug("Content:\n%s", cf_response.text)
+        raise exception.StopExtraction()
    if location[0] == "/":
        location = root + location

@@ -73,7 +86,7 @@ def solve_js_challenge(page, netloc):
    vlength = len(variable)

    # evaluate the initial expression
-    solution = evaluate_expression(data["expr"])
+    solution = evaluate_expression(data["expr"], page, netloc)

    # iterator over all remaining expressions
    # and combine their values in 'solution'
@@ -85,37 +98,55 @@ def solve_js_challenge(page, netloc):
            # select arithmetc function based on operator (+/-/*)
            func = OPERATORS[expr[vlength]]
            # evaluate the rest of the expression
-            value = evaluate_expression(expr[vlength+2:])
+            value = evaluate_expression(expr[vlength+2:], page, netloc)
            # combine expression value with our current solution
            solution = func(solution, value)

        elif expr.startswith("a.value"):
-            # add length of hostname
-            solution += len(netloc)
-
+            if "t.length)" in expr:
+                # add length of hostname
+                solution += len(netloc)
            if ".toFixed(" in expr:
                # trim solution to 10 decimal places
                # and strip trailing zeros
                solution = "{:.10f}".format(solution).rstrip("0")
-
            return solution


-def evaluate_expression(expr, split_re=re.compile(r"\(+([^)]*)\)")):
+def evaluate_expression(expr, page, netloc, *,
+                        split_re=re.compile(r"[(+]+([^)]*)\)")):
    """Evaluate a single Javascript expression for the challenge"""

+    if expr.startswith("function(p)"):
+        # get HTML element with ID k and evaluate the expression inside
+        # 'eval(eval("document.getElementById(k).innerHTML"))'
+        k, pos = text.extract(page, "k = '", "'")
+        e, pos = text.extract(page, 'id="'+k+'"', '<')
+        return evaluate_expression(e.partition(">")[2], page, netloc)
+
    if "/" in expr:
        # split the expression in numerator and denominator subexpressions,
        # evaluate them separately,
        # and return their fraction-result
        num, _, denom = expr.partition("/")
-        return evaluate_expression(num) / evaluate_expression(denom)
+        num = evaluate_expression(num, page, netloc)
+        denom = evaluate_expression(denom, page, netloc)
+        return num / denom
+
+    if "function(p)" in expr:
+        # split initial expression and function code
+        initial, _, func = expr.partition("function(p)")
+        # evaluate said expression
+        initial = evaluate_expression(initial, page, netloc)
+        # get function argument and use it as index into 'netloc'
+        index = evaluate_expression(func[func.index("}")+1:], page, netloc)
+        return initial + ord(netloc[int(index)])

    # iterate over all subexpressions,
    # evaluate them,
    # and accumulate their values in 'result'
    result = ""
-    for subexpr in split_re.findall(expr):
+    for subexpr in split_re.findall(expr) or (expr,):
        result += str(sum(
            VALUES[part]
            for part in subexpr.split("[]")
@@ -133,6 +164,7 @@ VALUES = {
    "": 0,
    "+": 0,
    "!+": 1,
+    "!!": 1,
    "+!!": 1,
 }