diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py index 7c022c8c..04c96a83 100644 --- a/gallery_dl/cloudflare.py +++ b/gallery_dl/cloudflare.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2018 Mike Fährmann +# Copyright 2015-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -13,46 +13,52 @@ import time import operator import urllib.parse from . import text -from .cache import cache -def request_func(self, *args, **kwargs): - cookies = _cookiecache(self.root) - if cookies: - self.session.cookies.update(cookies) - response = self.session.get(*args, **kwargs) - if response.status_code == 503: - _cookiecache.invalidate(self.root) - self.log.info("Solving Cloudflare challenge") - response = solve_challenge(self.session, response) - _cookiecache(self.root, self.session.cookies) - return response +def is_challenge(response): + return (response.status_code == 503 and + response.headers.get("Server", "").startswith("cloudflare") and + b"jschl-answer" in response.content) -def solve_challenge(session, response): +def solve_challenge(session, response, kwargs): + """Solve Cloudflare challenge and get cfclearance cookie""" + parsed = urllib.parse.urlsplit(response.url) + root = parsed.scheme + "://" + parsed.netloc + + cf_kwargs = kwargs.copy() + headers = cf_kwargs["headers"] = ( + kwargs["headers"].copy() if "headers" in kwargs else {}) + params = cf_kwargs["params"] = ( + kwargs["params"].copy() if "params" in kwargs else {}) - session.headers["Referer"] = response.url page = response.text - params = text.extract_all(page, ( - ('jschl_vc', 'name="jschl_vc" value="', '"'), - ('pass' , 'name="pass" value="', '"'), - ))[0] - params["jschl_answer"] = solve_jschl(response.url, page) + params["pass"] = text.extract(page, 'name="pass" value="', '"')[0] + params["jschl_vc"] = text.extract(page, 'name="jschl_vc" value="', '"')[0] + params["jschl_answer"] = solve_js_challenge(page, parsed.netloc) + headers["Referer"] = response.url time.sleep(4) - url = text.urljoin(response.url, "/cdn-cgi/l/chk_jschl") - return session.get(url, params=params) + + url = root + "/cdn-cgi/l/chk_jschl" + cf_kwargs["allow_redirects"] = False + cf_response = session.request(response.request.method, url, **cf_kwargs) + + location = cf_response.headers["Location"] + if location[0] == "/": + location = root + location + return location -def solve_jschl(url, page): - """Solve challenge to get 'jschl_answer' value""" +def solve_js_challenge(page, netloc): + """Evaluate JS challenge in 'page' to get 'jschl_answer' value""" # build variable name # e.g. '...f, wqnVscP={"DERKbJk":+(...' --> wqnVscP.DERKbJk data, pos = text.extract_all(page, ( ('var' , ',f, ', '='), - ('key' , '"', '"'), - ('expr', ':', '}'), + ('key' , '"' , '"'), + ('expr', ':' , '}'), )) variable = "{}.{}".format(data["var"], data["key"]) vlength = len(variable) @@ -67,19 +73,19 @@ def solve_jschl(url, page): for expr in expressions.split(";")[1:]: if expr.startswith(variable): - # select arithmetc function based on operator (+, -, *) - func = operator_functions[expr[vlength]] + # select arithmetc function based on operator (+/-/*) + func = OPERATORS[expr[vlength]] # evaluate the rest of the expression value = evaluate_expression(expr[vlength+2:]) - # combine the expression value with our current solution + # combine expression value with our current solution solution = func(solution, value) elif expr.startswith("a.value"): - # add length of the hostname, i.e. add 11 for 'example.org' - solution += len(urllib.parse.urlsplit(url).netloc) + # add length of hostname + solution += len(netloc) if ".toFixed(" in expr: - # trim the solution to 10 decimal places + # trim solution to 10 decimal places # and strip trailing zeros solution = "{:.10f}".format(solution).rstrip("0") @@ -87,7 +93,7 @@ def solve_jschl(url, page): def evaluate_expression(expr, split_re=re.compile(r"\(+([^)]*)\)")): - """Evaluate a Javascript expression for the challenge""" + """Evaluate a single Javascript expression for the challenge""" if "/" in expr: # split the expression in numerator and denominator subexpressions, @@ -102,26 +108,21 @@ def evaluate_expression(expr, split_re=re.compile(r"\(+([^)]*)\)")): result = "" for subexpr in split_re.findall(expr): result += str(sum( - expression_values[part] + VALUES[part] for part in subexpr.split("[]") )) return int(result) -operator_functions = { +OPERATORS = { "+": operator.add, "-": operator.sub, "*": operator.mul, } -expression_values = { +VALUES = { "": 0, "+": 0, "!+": 1, "+!!": 1, } - - -@cache(maxage=365*24*60*60, keyarg=0) -def _cookiecache(key, item=None): - return item diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index f600fae0..def2bc9d 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -18,7 +18,7 @@ import requests import threading import http.cookiejar from .message import Message -from .. import config, text, exception +from .. import config, text, exception, cloudflare class Extractor(): @@ -86,6 +86,10 @@ class Extractor(): if encoding: response.encoding = encoding return response + if cloudflare.is_challenge(response): + self.log.info("Solving Cloudflare challenge") + url = cloudflare.solve_challenge(session, response, kwargs) + continue msg = "{}: {} for url: {}".format(code, response.reason, url) if code < 500 and code != 429: diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py index 20b9d96a..cfb53336 100644 --- a/gallery_dl/extractor/kissmanga.py +++ b/gallery_dl/extractor/kissmanga.py @@ -9,17 +9,12 @@ """Extract manga-chapters and entire manga from https://kissmanga.com/""" from .common import ChapterExtractor, MangaExtractor -from .. import text, cloudflare, aes, exception +from .. import text, aes, exception from ..cache import cache import hashlib import ast import re -IV = [ - 0xa5, 0xe8, 0xe2, 0xe9, 0xc2, 0x72, 0x1b, 0xe0, - 0xa8, 0x4a, 0xd6, 0x60, 0xc4, 0x72, 0xc1, 0xf3 -] - class KissmangaBase(): """Base class for kissmanga extractors""" @@ -28,10 +23,10 @@ class KissmangaBase(): root = "https://kissmanga.com" def request(self, url): - response = cloudflare.request_func(self, url) + response = super().request(url) if response.history and "/Message/AreYouHuman?" in response.url: self.log.error("Requesting too many pages caused a redirect to %s." - " Try visiting this URL in your browser and solving" + " Try visiting this URL in your browser and solve" " the CAPTCHA to continue.", response.url) raise exception.StopExtraction() return response @@ -112,8 +107,10 @@ class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor): self.session.headers["Referer"] = None try: key = self.build_aes_key(page) + iv = (0xa5, 0xe8, 0xe2, 0xe9, 0xc2, 0x72, 0x1b, 0xe0, + 0xa8, 0x4a, 0xd6, 0x60, 0xc4, 0x72, 0xc1, 0xf3) return [ - (aes.aes_cbc_decrypt_text(data, key, IV), None) + (aes.aes_cbc_decrypt_text(data, key, iv), None) for data in text.extract_iter( page, 'lstImages.push(wrapKA("', '"' ) diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py index 6e2b40ca..11c85246 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -9,7 +9,7 @@ """Extract manga-chapters and entire manga from https://komikcast.com/""" from .common import ChapterExtractor, MangaExtractor -from .. import text, cloudflare +from .. import text import re @@ -18,8 +18,6 @@ class KomikcastBase(): category = "komikcast" root = "https://komikcast.com" - request = cloudflare.request_func - @staticmethod def parse_chapter_string(chapter_string, data=None): """Parse 'chapter_string' value and add its info to 'data'""" diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index ff2db45b..98bdcddb 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -9,7 +9,7 @@ """Extract comic-issues and entire comics from https://readcomiconline.to/""" from .common import ChapterExtractor, MangaExtractor -from .. import text, cloudflare +from .. import text import re @@ -21,8 +21,6 @@ class ReadcomiconlineBase(): archive_fmt = "{issue_id}_{page}" root = "https://readcomiconline.to" - request = cloudflare.request_func - class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): """Extractor for comic-issues from readcomiconline.to"""