From 40dbea7ed2e207ebaf7db17fc5fc199ff7e2f097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 16 Dec 2016 13:28:36 +0100 Subject: [PATCH] rewrite parts of the cloudflare bypass system --- gallery_dl/cloudflare.py | 40 +++++++++---------- gallery_dl/extractor/kisscomic.py | 51 ++++--------------------- gallery_dl/extractor/kissmanga.py | 24 +++--------- gallery_dl/extractor/readcomiconline.py | 4 +- 4 files changed, 35 insertions(+), 84 deletions(-) diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py index b090d09b..82f922a0 100644 --- a/gallery_dl/cloudflare.py +++ b/gallery_dl/cloudflare.py @@ -14,32 +14,28 @@ import urllib.parse from . import text from .cache import cache -def bypass(url, maxage): - def decorator(func): - solve = cache(maxage=maxage, keyarg=1)(solve_challenge) - def wrap(self, *args): - self.session.cookies = solve(self.session, url) - return func(self, *args) - return wrap - return decorator +def request_func(self, *args): + cookies = _cookiecache(self.root) + if cookies: + self.session.cookies = cookies + response = self.session.get(*args) + if response.status_code != 200: + _cookiecache.invalidate(self.root) + response = solve_challenge(self.session, response) + _cookiecache(self.root, self.session.cookies) + return response -def bypass_ddos_protection(session, url): - """Prepare a requests.session to access 'url' behind Cloudflare protection""" - session.cookies = solve_challenge(session, url) - return session - -def solve_challenge(session, url): - session.headers["Referer"] = url - page = session.get(url).text +def solve_challenge(session, response): + session.headers["Referer"] = response.url + page = response.text params = text.extract_all(page, ( ('jschl_vc', 'name="jschl_vc" value="', '"'), ('pass' , 'name="pass" value="', '"'), ))[0] - params["jschl_answer"] = solve_jschl(url, page) + params["jschl_answer"] = solve_jschl(response.url, page) time.sleep(4) - url = urllib.parse.urljoin(url, "/cdn-cgi/l/chk_jschl") - session.get(url, params=params) - return session.cookies + url = urllib.parse.urljoin(response.url, "/cdn-cgi/l/chk_jschl") + return session.get(url, params=params) def solve_jschl(url, page): """Solve challenge to get 'jschl_answer' value""" @@ -91,3 +87,7 @@ expression_values = { "!+": 1, "+!!": 1, } + +@cache(maxage=365*24*60*60, keyarg=0) +def _cookiecache(key, item=None): + return item diff --git a/gallery_dl/extractor/kisscomic.py b/gallery_dl/extractor/kisscomic.py index 8670d310..42544c63 100644 --- a/gallery_dl/extractor/kisscomic.py +++ b/gallery_dl/extractor/kisscomic.py @@ -8,36 +8,19 @@ """Extract comic-issues and entire comics from http://kisscomic.us/""" -from .common import Extractor, Message -from .. import text, cloudflare, cache -import re +from . import kissmanga +from .. import text -class KisscomicExtractor(Extractor): +class KisscomicExtractor(kissmanga.KissmangaExtractor): """Base class for kisscomic extractors""" category = "kisscomic" directory_fmt = ["{category}", "{comic}", "{issue:>03}"] filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}" - url_base = "http://kisscomic.us" - - def __init__(self, match): - Extractor.__init__(self) - self.url = match.group(0) - self.session.headers["Referer"] = self.url_base - self.cookies = cache.cache(maxage=365*24*60*60, keyarg=0)(_cache_helper) - - def request(self, url, cookies=None): - cookies = self.cookies(self.url_base, cookies) - if cookies: - self.session.cookies = cookies - response = self.session.get(url) - if response.status_code != 200: - self.cookies.invalidate(self.url_base) - cookies = cloudflare.solve_challenge(self.session, self.url_base) - response = self.request(url, cookies) - return response + root = "http://kisscomic.us" -class KisscomicMangaExtractor(KisscomicExtractor): +class KisscomicMangaExtractor(KisscomicExtractor, + kissmanga.KissmangaMangaExtractor): """Extractor for comics from kisscomic.us""" subcategory = "comic" pattern = [r"(?:https?://)?(?:www\.)?kisscomic\.us/comics/[^/]+\.html$"] @@ -45,11 +28,6 @@ class KisscomicMangaExtractor(KisscomicExtractor): "url": "8c180e2ec2492712b089ca091c54909cb0fe3d4a", })] - def items(self): - yield Message.Version, 1 - for chapter in self.get_chapters(): - yield Message.Queue, self.url_base + chapter - def get_chapters(self): """Return a list of all chapter urls""" page = self.request(self.url).text @@ -59,7 +37,8 @@ class KisscomicMangaExtractor(KisscomicExtractor): )) -class KisscomicIssueExtractor(KisscomicExtractor): +class KisscomicIssueExtractor(KisscomicExtractor, + kissmanga.KissmangaChapterExtractor): """Extractor for comic-issues from kisscomic.us""" subcategory = "issue" pattern = [r"(?:https?://)?(?:www\.)?kisscomic\.us/chapters/.+-chapter-\d+\.html"] @@ -68,16 +47,6 @@ class KisscomicIssueExtractor(KisscomicExtractor): "keyword": "a685f92b6989eebf57f8981b1edd6d3de9148ad6", })] - def items(self): - page = self.request(self.url).text - data = self.get_job_metadata(page) - imgs = self.get_image_urls(page) - data["count"] = len(imgs) - yield Message.Version, 1 - yield Message.Directory, data - for data["page"], url in enumerate(imgs, 1): - yield Message.Url, url, text.nameext_from_url(url, data) - def get_job_metadata(self, page): """Collect metadata for extractor-job""" info = text.extract(page, "", " Comic - Read ")[0] @@ -93,7 +62,3 @@ class KisscomicIssueExtractor(KisscomicExtractor): def get_image_urls(page): """Extract list of all image-urls for a manga chapter""" return list(text.extract_iter(page, '<li><img src="', '"')) - - -def _cache_helper(key, item=None): - return item diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py index 711175ef..a653c1a2 100644 --- a/gallery_dl/extractor/kissmanga.py +++ b/gallery_dl/extractor/kissmanga.py @@ -9,7 +9,7 @@ """Extract manga-chapters and entire manga from http://kissmanga.com/""" from .common import Extractor, Message -from .. import text, cloudflare, cache +from .. import text, cloudflare import re class KissmangaExtractor(Extractor): @@ -17,24 +17,14 @@ class KissmangaExtractor(Extractor): category = "kissmanga" directory_fmt = ["{category}", "{manga}", "c{chapter:>03}{chapter-minor} - {title}"] filename_fmt = "{manga}_c{chapter:>03}{chapter-minor}_{page:>03}.{extension}" - url_base = "http://kissmanga.com" + root = "http://kissmanga.com" def __init__(self, match): Extractor.__init__(self) self.url = match.group(0) - self.session.headers["Referer"] = self.url_base - self.cookies = cache.cache(maxage=365*24*60*60, keyarg=0)(_cache_helper) + self.session.headers["Referer"] = self.root - def request(self, url, cookies=None): - cookies = self.cookies(self.url_base, cookies) - if cookies: - self.session.cookies = cookies - response = self.session.get(url) - if response.status_code != 200: - self.cookies.invalidate(self.url_base) - cookies = cloudflare.solve_challenge(self.session, self.url_base) - response = self.request(url, cookies) - return response + request = cloudflare.request_func class KissmangaMangaExtractor(KissmangaExtractor): @@ -48,7 +38,7 @@ class KissmangaMangaExtractor(KissmangaExtractor): def items(self): yield Message.Version, 1 for chapter in self.get_chapters(): - yield Message.Queue, self.url_base + chapter + yield Message.Queue, self.root + chapter def get_chapters(self): """Return a list of all chapter urls""" @@ -104,7 +94,3 @@ class KissmangaChapterExtractor(KissmangaExtractor): def get_image_urls(page): """Extract list of all image-urls for a manga chapter""" return list(text.extract_iter(page, 'lstImages.push("', '"')) - - -def _cache_helper(key, item=None): - return item diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index 8ae7256c..775d9203 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -8,8 +8,8 @@ """Extract comic-issues and entire comics from http://readcomiconline.to/""" -from .. import text from . import kissmanga +from .. import text import re class ReadcomiconlineExtractor(kissmanga.KissmangaExtractor): @@ -17,7 +17,7 @@ class ReadcomiconlineExtractor(kissmanga.KissmangaExtractor): category = "readcomiconline" directory_fmt = ["{category}", "{comic}", "{issue:>03}"] filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}" - url_base = "http://readcomiconline.to" + root = "http://readcomiconline.to" class ReadcomiconlineComicExtractor(ReadcomiconlineExtractor,