ensure consistent headers and params ordering

This is necessary to avoid being flagged as a bot and receiving a CAPTCHA
response after solving a Cloudflare challenge.
This commit is contained in:
Mike Fährmann
2019-04-09 10:52:27 +02:00
parent 9af9823067
commit 49a6522c38
2 changed files with 14 additions and 8 deletions

View File

@@ -11,6 +11,7 @@
import re
import time
import operator
import collections
import urllib.parse
from . import text, exception
from .cache import memcache
@@ -32,11 +33,9 @@ def solve_challenge(session, response, kwargs):
parsed = urllib.parse.urlsplit(response.url)
root = parsed.scheme + "://" + parsed.netloc
cf_kwargs = kwargs.copy()
headers = cf_kwargs["headers"] = (
kwargs["headers"].copy() if "headers" in kwargs else {})
params = cf_kwargs["params"] = (
kwargs["params"].copy() if "params" in kwargs else {})
cf_kwargs = {}
headers = cf_kwargs["headers"] = collections.OrderedDict()
params = cf_kwargs["params"] = collections.OrderedDict()
page = response.text
params["s"] = text.extract(page, 'name="s" value="', '"')[0]
@@ -49,7 +48,7 @@ def solve_challenge(session, response, kwargs):
url = root + "/cdn-cgi/l/chk_jschl"
cf_kwargs["allow_redirects"] = False
cf_response = session.request(response.request.method, url, **cf_kwargs)
cf_response = session.request("GET", url, **cf_kwargs)
location = cf_response.headers.get("Location")
if not location:

View File

@@ -127,10 +127,17 @@ class Extractor():
def _init_headers(self):
"""Set additional headers for the 'session' object"""
self.session.headers["Accept-Language"] = "en-US,en;q=0.5"
self.session.headers["User-Agent"] = self.config(
headers = self.session.headers
headers.clear()
headers["User-Agent"] = self.config(
"user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:62.0) "
"Gecko/20100101 Firefox/62.0"))
headers["Accept"] = "*/*"
headers["Accept-Language"] = "en-US,en;q=0.5"
headers["Accept-Encoding"] = "gzip, deflate"
headers["Connection"] = "keep-alive"
headers["Upgrade-Insecure-Requests"] = "1"
def _init_proxies(self):
"""Update the session's proxy map"""