overhaul session initialization
- share adapter & connection pool across sessions with the same
  SSL options, SSL ciphers, and source address
- simplify browser emulation to just a list of headers and ciphers
This commit is contained in:
@@ -1,6 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
# Copyright 2014-2021 Mike Fährmann
|
# Copyright 2014-2022 Mike Fährmann
|
||||||
#
|
#
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License version 2 as
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
@@ -63,11 +63,6 @@ class Extractor():
|
|||||||
if self._retries < 0:
|
if self._retries < 0:
|
||||||
self._retries = float("inf")
|
self._retries = float("inf")
|
||||||
|
|
||||||
self._additional_adapter_options = 0
|
|
||||||
if self.config("disabletls12") or self.disabletls12:
|
|
||||||
self._additional_adapter_options |= ssl.OP_NO_TLSv1_2
|
|
||||||
self.log.info("TLS 1.2 disabled.")
|
|
||||||
|
|
||||||
self._init_session()
|
self._init_session()
|
||||||
self._init_cookies()
|
self._init_cookies()
|
||||||
self._init_proxies()
|
self._init_proxies()
|
||||||
@@ -225,14 +220,7 @@ class Extractor():
|
|||||||
self.session = session = requests.Session()
|
self.session = session = requests.Session()
|
||||||
headers = session.headers
|
headers = session.headers
|
||||||
headers.clear()
|
headers.clear()
|
||||||
|
ssl_options = ssl_ciphers = 0
|
||||||
source_address = self.config("source-address")
|
|
||||||
if source_address:
|
|
||||||
if isinstance(source_address, str):
|
|
||||||
source_address = (source_address, 0)
|
|
||||||
else:
|
|
||||||
source_address = (source_address[0], source_address[1])
|
|
||||||
session.mount("http://", SourceAdapter(source_address))
|
|
||||||
|
|
||||||
browser = self.config("browser") or self.browser
|
browser = self.config("browser") or self.browser
|
||||||
if browser and isinstance(browser, str):
|
if browser and isinstance(browser, str):
|
||||||
@@ -249,14 +237,21 @@ class Extractor():
|
|||||||
platform = "Macintosh; Intel Mac OS X 11.5"
|
platform = "Macintosh; Intel Mac OS X 11.5"
|
||||||
|
|
||||||
if browser == "chrome":
|
if browser == "chrome":
|
||||||
_emulate_browser_chrome(session, platform, source_address,
|
if platform.startswith("Macintosh"):
|
||||||
self._additional_adapter_options)
|
platform = platform.replace(".", "_") + "_2"
|
||||||
else:
|
else:
|
||||||
_emulate_browser_firefox(session, platform, source_address,
|
browser = "firefox"
|
||||||
self._additional_adapter_options)
|
|
||||||
|
for key, value in HTTP_HEADERS[browser]:
|
||||||
|
if value and "{}" in value:
|
||||||
|
headers[key] = value.format(platform)
|
||||||
|
else:
|
||||||
|
headers[key] = value
|
||||||
|
|
||||||
|
ssl_options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
|
||||||
|
ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
|
||||||
|
ssl_ciphers = SSL_CIPHERS[browser]
|
||||||
else:
|
else:
|
||||||
if source_address:
|
|
||||||
session.mount("https://", SourceAdapter(source_address))
|
|
||||||
headers["User-Agent"] = self.config("user-agent", (
|
headers["User-Agent"] = self.config("user-agent", (
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
|
||||||
"rv:91.0) Gecko/20100101 Firefox/91.0"))
|
"rv:91.0) Gecko/20100101 Firefox/91.0"))
|
||||||
@@ -268,12 +263,28 @@ class Extractor():
|
|||||||
if custom_headers:
|
if custom_headers:
|
||||||
headers.update(custom_headers)
|
headers.update(custom_headers)
|
||||||
|
|
||||||
ciphers = self.config("ciphers")
|
custom_ciphers = self.config("ciphers")
|
||||||
if ciphers:
|
if custom_ciphers:
|
||||||
if isinstance(ciphers, list):
|
if isinstance(custom_ciphers, list):
|
||||||
ciphers = ":".join(ciphers)
|
ssl_ciphers = ":".join(custom_ciphers)
|
||||||
session.mount("https://", HTTPSAdapter(ciphers,
|
else:
|
||||||
self._additional_adapter_options))
|
ssl_ciphers = custom_ciphers
|
||||||
|
|
||||||
|
source_address = self.config("source-address")
|
||||||
|
if source_address:
|
||||||
|
if isinstance(source_address, str):
|
||||||
|
source_address = (source_address, 0)
|
||||||
|
else:
|
||||||
|
source_address = (source_address[0], source_address[1])
|
||||||
|
|
||||||
|
if self.config("disabletls12") or self.disabletls12:
|
||||||
|
ssl_options |= ssl.OP_NO_TLSv1_2
|
||||||
|
self.log.debug("TLS 1.2 disabled.")
|
||||||
|
|
||||||
|
adapter = _build_requests_adapter(
|
||||||
|
ssl_options, ssl_ciphers, source_address)
|
||||||
|
session.mount("https://", adapter)
|
||||||
|
session.mount("http://", adapter)
|
||||||
|
|
||||||
def _init_proxies(self):
|
def _init_proxies(self):
|
||||||
"""Update the session's proxy map"""
|
"""Update the session's proxy map"""
|
||||||
@@ -624,30 +635,10 @@ class BaseExtractor(Extractor):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class SourceAdapter(HTTPAdapter):
|
class RequestsAdapter(HTTPAdapter):
|
||||||
|
|
||||||
def __init__(self, source_address):
|
def __init__(self, ssl_context=None, source_address=None):
|
||||||
self.source_address = source_address
|
self.ssl_context = ssl_context
|
||||||
HTTPAdapter.__init__(self)
|
|
||||||
|
|
||||||
def init_poolmanager(self, *args, **kwargs):
|
|
||||||
kwargs["source_address"] = self.source_address
|
|
||||||
return HTTPAdapter.init_poolmanager(self, *args, **kwargs)
|
|
||||||
|
|
||||||
def proxy_manager_for(self, *args, **kwargs):
|
|
||||||
kwargs["source_address"] = self.source_address
|
|
||||||
return HTTPAdapter.proxy_manager_for(self, *args, **kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
class HTTPSAdapter(HTTPAdapter):
|
|
||||||
|
|
||||||
def __init__(self, ciphers, additional_options, source_address=None):
|
|
||||||
context = self.ssl_context = ssl.create_default_context()
|
|
||||||
context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
|
|
||||||
ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
|
|
||||||
context.options |= additional_options
|
|
||||||
context.set_ecdh_curve("prime256v1")
|
|
||||||
context.set_ciphers(ciphers)
|
|
||||||
self.source_address = source_address
|
self.source_address = source_address
|
||||||
HTTPAdapter.__init__(self)
|
HTTPAdapter.__init__(self)
|
||||||
|
|
||||||
@@ -662,20 +653,59 @@ class HTTPSAdapter(HTTPAdapter):
|
|||||||
return HTTPAdapter.proxy_manager_for(self, *args, **kwargs)
|
return HTTPAdapter.proxy_manager_for(self, *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
def _emulate_browser_firefox(session, platform, source_address,
|
def _build_requests_adapter(ssl_options, ssl_ciphers, source_address):
|
||||||
additional_options):
|
key = (ssl_options, ssl_ciphers, source_address)
|
||||||
headers = session.headers
|
try:
|
||||||
headers["User-Agent"] = ("Mozilla/5.0 (" + platform + "; rv:91.0) "
|
return _adapter_cache[key]
|
||||||
"Gecko/20100101 Firefox/91.0")
|
except KeyError:
|
||||||
headers["Accept"] = ("text/html,application/xhtml+xml,"
|
pass
|
||||||
"application/xml;q=0.9,image/webp,*/*;q=0.8")
|
|
||||||
headers["Accept-Language"] = "en-US,en;q=0.5"
|
|
||||||
headers["Accept-Encoding"] = "gzip, deflate"
|
|
||||||
headers["Referer"] = None
|
|
||||||
headers["Upgrade-Insecure-Requests"] = "1"
|
|
||||||
headers["Cookie"] = None
|
|
||||||
|
|
||||||
session.mount("https://", HTTPSAdapter(
|
if ssl_options or ssl_ciphers:
|
||||||
|
ssl_context = ssl.create_default_context()
|
||||||
|
if ssl_options:
|
||||||
|
ssl_context.options |= ssl_options
|
||||||
|
if ssl_ciphers:
|
||||||
|
ssl_context.set_ecdh_curve("prime256v1")
|
||||||
|
ssl_context.set_ciphers(ssl_ciphers)
|
||||||
|
else:
|
||||||
|
ssl_context = None
|
||||||
|
|
||||||
|
adapter = _adapter_cache[key] = RequestsAdapter(
|
||||||
|
ssl_context, source_address)
|
||||||
|
return adapter
|
||||||
|
|
||||||
|
|
||||||
|
_adapter_cache = {}
|
||||||
|
|
||||||
|
|
||||||
|
HTTP_HEADERS = {
|
||||||
|
"firefox": (
|
||||||
|
("User-Agent", "Mozilla/5.0 ({}; rv:91.0) "
|
||||||
|
"Gecko/20100101 Firefox/91.0"),
|
||||||
|
("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
|
||||||
|
"image/avif,*/*;q=0.8"),
|
||||||
|
("Accept-Language", "en-US,en;q=0.5"),
|
||||||
|
("Accept-Encoding", "gzip, deflate"),
|
||||||
|
("Referer", None),
|
||||||
|
("Connection", "keep-alive"),
|
||||||
|
("Upgrade-Insecure-Requests", "1"),
|
||||||
|
("Cookie", None),
|
||||||
|
),
|
||||||
|
"chrome": (
|
||||||
|
("Upgrade-Insecure-Requests", "1"),
|
||||||
|
("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, "
|
||||||
|
"like Gecko) Chrome/92.0.4515.131 Safari/537.36"),
|
||||||
|
("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
|
||||||
|
"image/webp,image/apng,*/*;q=0.8"),
|
||||||
|
("Referer", None),
|
||||||
|
("Accept-Encoding", "gzip, deflate"),
|
||||||
|
("Accept-Language", "en-US,en;q=0.9"),
|
||||||
|
("Cookie", None),
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
SSL_CIPHERS = {
|
||||||
|
"firefox": (
|
||||||
"TLS_AES_128_GCM_SHA256:"
|
"TLS_AES_128_GCM_SHA256:"
|
||||||
"TLS_CHACHA20_POLY1305_SHA256:"
|
"TLS_CHACHA20_POLY1305_SHA256:"
|
||||||
"TLS_AES_256_GCM_SHA384:"
|
"TLS_AES_256_GCM_SHA384:"
|
||||||
@@ -693,30 +723,9 @@ def _emulate_browser_firefox(session, platform, source_address,
|
|||||||
"DHE-RSA-AES256-SHA:"
|
"DHE-RSA-AES256-SHA:"
|
||||||
"AES128-SHA:"
|
"AES128-SHA:"
|
||||||
"AES256-SHA:"
|
"AES256-SHA:"
|
||||||
"DES-CBC3-SHA",
|
"DES-CBC3-SHA"
|
||||||
additional_options,
|
),
|
||||||
source_address
|
"chrome": (
|
||||||
))
|
|
||||||
|
|
||||||
|
|
||||||
def _emulate_browser_chrome(session, platform, source_address,
|
|
||||||
additional_options):
|
|
||||||
if platform.startswith("Macintosh"):
|
|
||||||
platform = platform.replace(".", "_") + "_2"
|
|
||||||
|
|
||||||
headers = session.headers
|
|
||||||
headers["Upgrade-Insecure-Requests"] = "1"
|
|
||||||
headers["User-Agent"] = (
|
|
||||||
"Mozilla/5.0 (" + platform + ") AppleWebKit/537.36 "
|
|
||||||
"(KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36")
|
|
||||||
headers["Accept"] = ("text/html,application/xhtml+xml,application/xml;"
|
|
||||||
"q=0.9,image/webp,image/apng,*/*;q=0.8")
|
|
||||||
headers["Referer"] = None
|
|
||||||
headers["Accept-Encoding"] = "gzip, deflate"
|
|
||||||
headers["Accept-Language"] = "en-US,en;q=0.9"
|
|
||||||
headers["Cookie"] = None
|
|
||||||
|
|
||||||
session.mount("https://", HTTPSAdapter(
|
|
||||||
"TLS_AES_128_GCM_SHA256:"
|
"TLS_AES_128_GCM_SHA256:"
|
||||||
"TLS_AES_256_GCM_SHA384:"
|
"TLS_AES_256_GCM_SHA384:"
|
||||||
"TLS_CHACHA20_POLY1305_SHA256:"
|
"TLS_CHACHA20_POLY1305_SHA256:"
|
||||||
@@ -732,10 +741,9 @@ def _emulate_browser_chrome(session, platform, source_address,
|
|||||||
"AES256-GCM-SHA384:"
|
"AES256-GCM-SHA384:"
|
||||||
"AES128-SHA:"
|
"AES128-SHA:"
|
||||||
"AES256-SHA:"
|
"AES256-SHA:"
|
||||||
"DES-CBC3-SHA",
|
"DES-CBC3-SHA"
|
||||||
additional_options,
|
),
|
||||||
source_address
|
}
|
||||||
))
|
|
||||||
|
|
||||||
|
|
||||||
# Undo automatic pyOpenSSL injection by requests
|
# Undo automatic pyOpenSSL injection by requests
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
# Copyright 2015-2021 Mike Fährmann
|
# Copyright 2015-2022 Mike Fährmann
|
||||||
#
|
#
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License version 2 as
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
@@ -53,9 +53,6 @@ class Job():
|
|||||||
extr.category = pextr.category
|
extr.category = pextr.category
|
||||||
extr.subcategory = pextr.subcategory
|
extr.subcategory = pextr.subcategory
|
||||||
|
|
||||||
# reuse connection adapters
|
|
||||||
extr.session.adapters = pextr.session.adapters
|
|
||||||
|
|
||||||
# user-supplied metadata
|
# user-supplied metadata
|
||||||
kwdict = extr.config("keywords")
|
kwdict = extr.config("keywords")
|
||||||
if kwdict:
|
if kwdict:
|
||||||
|
|||||||
Reference in New Issue
Block a user