Retry Cloudflare challenges instead of aborting extraction

This commit is contained in:
Mike Fährmann
2020-04-24 22:47:27 +02:00
parent 3eab07739f
commit 5d7ca76885
2 changed files with 10 additions and 9 deletions

View File

@@ -13,7 +13,7 @@ import time
import operator import operator
import collections import collections
import urllib.parse import urllib.parse
from . import text, exception from . import text
from .cache import memcache from .cache import memcache
@@ -58,14 +58,13 @@ def solve_challenge(session, response, kwargs):
cookie.name: cookie.value cookie.name: cookie.value
for cookie in cf_response.cookies for cookie in cf_response.cookies
} }
if not cookies: if not cookies:
import logging import logging
log = logging.getLogger("cloudflare") log = logging.getLogger("cloudflare")
rtype = "CAPTCHA" if is_captcha(cf_response) else "Unexpected"
log.error("%s response", rtype)
log.debug("Headers:\n%s", cf_response.headers) log.debug("Headers:\n%s", cf_response.headers)
log.debug("Content:\n%s", cf_response.text) log.debug("Content:\n%s", cf_response.text)
raise exception.StopExtraction() return cf_response, None, None
domain = next(iter(cf_response.cookies)).domain domain = next(iter(cf_response.cookies)).domain
cookies["__cfduid"] = response.cookies.get("__cfduid", "") cookies["__cfduid"] = response.cookies.get("__cfduid", "")

View File

@@ -99,18 +99,20 @@ class Extractor():
return response return response
if notfound and code == 404: if notfound and code == 404:
raise exception.NotFoundError(notfound) raise exception.NotFoundError(notfound)
reason = response.reason
if cloudflare.is_challenge(response): if cloudflare.is_challenge(response):
self.log.info("Solving Cloudflare challenge") self.log.info("Solving Cloudflare challenge")
response, domain, cookies = cloudflare.solve_challenge( response, domain, cookies = cloudflare.solve_challenge(
session, response, kwargs) session, response, kwargs)
if response.status_code >= 400: if cookies:
continue cloudflare.cookies.update(
cloudflare.cookies.update(self.category, (domain, cookies)) self.category, (domain, cookies))
return response return response
if cloudflare.is_captcha(response): if cloudflare.is_captcha(response):
self.log.warning("Cloudflare CAPTCHA") self.log.warning("Cloudflare CAPTCHA")
msg = "'{} {}' for '{}'".format(code, response.reason, url) msg = "'{} {}' for '{}'".format(code, reason, url)
if code < 500 and code != 429 and code != 430: if code < 500 and code != 429 and code != 430:
break break