ensure consistent headers and params ordering
This is necessary to avoid being labeled as a bot and receiving a CAPTCHA response after solving a Cloudflare challenge: request headers and query parameters are now always sent in a fixed order.
This commit is contained in:
@@ -11,6 +11,7 @@
|
|||||||
import re
|
import re
|
||||||
import time
|
import time
|
||||||
import operator
|
import operator
|
||||||
|
import collections
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
from . import text, exception
|
from . import text, exception
|
||||||
from .cache import memcache
|
from .cache import memcache
|
||||||
@@ -32,11 +33,9 @@ def solve_challenge(session, response, kwargs):
|
|||||||
parsed = urllib.parse.urlsplit(response.url)
|
parsed = urllib.parse.urlsplit(response.url)
|
||||||
root = parsed.scheme + "://" + parsed.netloc
|
root = parsed.scheme + "://" + parsed.netloc
|
||||||
|
|
||||||
cf_kwargs = kwargs.copy()
|
cf_kwargs = {}
|
||||||
headers = cf_kwargs["headers"] = (
|
headers = cf_kwargs["headers"] = collections.OrderedDict()
|
||||||
kwargs["headers"].copy() if "headers" in kwargs else {})
|
params = cf_kwargs["params"] = collections.OrderedDict()
|
||||||
params = cf_kwargs["params"] = (
|
|
||||||
kwargs["params"].copy() if "params" in kwargs else {})
|
|
||||||
|
|
||||||
page = response.text
|
page = response.text
|
||||||
params["s"] = text.extract(page, 'name="s" value="', '"')[0]
|
params["s"] = text.extract(page, 'name="s" value="', '"')[0]
|
||||||
@@ -49,7 +48,7 @@ def solve_challenge(session, response, kwargs):
|
|||||||
|
|
||||||
url = root + "/cdn-cgi/l/chk_jschl"
|
url = root + "/cdn-cgi/l/chk_jschl"
|
||||||
cf_kwargs["allow_redirects"] = False
|
cf_kwargs["allow_redirects"] = False
|
||||||
cf_response = session.request(response.request.method, url, **cf_kwargs)
|
cf_response = session.request("GET", url, **cf_kwargs)
|
||||||
|
|
||||||
location = cf_response.headers.get("Location")
|
location = cf_response.headers.get("Location")
|
||||||
if not location:
|
if not location:
|
||||||
|
|||||||
@@ -127,10 +127,17 @@ class Extractor():
|
|||||||
|
|
||||||
def _init_headers(self):
|
def _init_headers(self):
|
||||||
"""Set additional headers for the 'session' object"""
|
"""Set additional headers for the 'session' object"""
|
||||||
self.session.headers["Accept-Language"] = "en-US,en;q=0.5"
|
headers = self.session.headers
|
||||||
self.session.headers["User-Agent"] = self.config(
|
headers.clear()
|
||||||
|
|
||||||
|
headers["User-Agent"] = self.config(
|
||||||
"user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:62.0) "
|
"user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:62.0) "
|
||||||
"Gecko/20100101 Firefox/62.0"))
|
"Gecko/20100101 Firefox/62.0"))
|
||||||
|
headers["Accept"] = "*/*"
|
||||||
|
headers["Accept-Language"] = "en-US,en;q=0.5"
|
||||||
|
headers["Accept-Encoding"] = "gzip, deflate"
|
||||||
|
headers["Connection"] = "keep-alive"
|
||||||
|
headers["Upgrade-Insecure-Requests"] = "1"
|
||||||
|
|
||||||
def _init_proxies(self):
|
def _init_proxies(self):
|
||||||
"""Update the session's proxy map"""
|
"""Update the session's proxy map"""
|
||||||
|
|||||||
Reference in New Issue
Block a user