[http] Improve User-Agent handling (#474)
- Simplify overriding the User-Agent by replacing the generic (unused) `headers` parameter with a new `user_agent` parameter. - Take the User-Agent into account even in `fetch_javascript_url`: Playwright's user agent is `Playwright/1.12.0` by default anyway. - Make it possible to override the User-Agent with the `release_table` method. - Update splunk to use Firefox's User-Agent. This follows #470.
This commit is contained in:
@@ -16,10 +16,10 @@ from requests_futures.sessions import FuturesSession
|
||||
from urllib3.util import Retry
|
||||
|
||||
# See https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent.
|
||||
USER_AGENT = 'endoflife.date-bot/1.0 (endoflife.date automation; +https://endoflife.date/bot)'
|
||||
ENDOFLIFE_BOT_USER_AGENT = 'endoflife.date-bot/1.0 (endoflife.date automation; +https://endoflife.date/bot)'
|
||||
FIREFOX_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:140.0) Gecko/20100101 Firefox/140.0'
|
||||
|
||||
|
||||
def fetch_urls(urls: list[str], data: any = None, headers: dict[str, str] = None,
|
||||
def fetch_urls(urls: list[str], data: any = None, user_agent: str = ENDOFLIFE_BOT_USER_AGENT,
|
||||
max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> list[Response]:
|
||||
logging.info(f"Fetching {urls}")
|
||||
|
||||
@@ -33,7 +33,7 @@ def fetch_urls(urls: list[str], data: any = None, headers: dict[str, str] = None
|
||||
session.mount('http://', adapter)
|
||||
session.mount('https://', adapter)
|
||||
|
||||
headers = {'User-Agent': USER_AGENT} | ({} if headers is None else headers)
|
||||
headers = {'User-Agent': user_agent}
|
||||
futures = [session.get(url, headers=headers, data=data, timeout=timeout, stream=None) for url in urls]
|
||||
results = [future.result() for future in as_completed(futures)]
|
||||
|
||||
@@ -48,44 +48,47 @@ def fetch_urls(urls: list[str], data: any = None, headers: dict[str, str] = None
|
||||
# We could wait a bit before retrying, but it's not clear if it would help.
|
||||
logging.warning(
|
||||
f"Got ChunkedEncodingError while fetching {urls} ({e}), retrying (remaining retries = {next_max_retries}).")
|
||||
return fetch_urls(urls, data, headers, next_max_retries, backoff_factor, timeout)
|
||||
return fetch_urls(urls, data, user_agent, next_max_retries, backoff_factor, timeout)
|
||||
|
||||
|
||||
def fetch_url(url: str, data: any = None, headers: dict[str, str] = None,
|
||||
def fetch_url(url: str, data: any = None, user_agent: str = ENDOFLIFE_BOT_USER_AGENT,
|
||||
max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Response:
|
||||
return fetch_urls([url], data, headers, max_retries, backoff_factor, timeout)[0]
|
||||
return fetch_urls([url], data, user_agent, max_retries, backoff_factor, timeout)[0]
|
||||
|
||||
def fetch_html(url: str, data: any = None, headers: dict[str, str] = None,
|
||||
def fetch_html(url: str, data: any = None, user_agent: str = ENDOFLIFE_BOT_USER_AGENT,
|
||||
max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30,
|
||||
features: str = "html5lib") -> BeautifulSoup:
|
||||
response = fetch_url(url, data, headers, max_retries, backoff_factor, timeout)
|
||||
response = fetch_url(url, data, user_agent, max_retries, backoff_factor, timeout)
|
||||
return BeautifulSoup(response.text, features=features)
|
||||
|
||||
def fetch_json(url: str, data: any = None, headers: dict[str, str] = None,
|
||||
def fetch_json(url: str, data: any = None, user_agent: str = ENDOFLIFE_BOT_USER_AGENT,
|
||||
max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Document:
|
||||
response = fetch_url(url, data, headers, max_retries, backoff_factor, timeout)
|
||||
response = fetch_url(url, data, user_agent, max_retries, backoff_factor, timeout)
|
||||
return response.json()
|
||||
|
||||
def fetch_yaml(url: str, data: any = None, headers: dict[str, str] = None,
|
||||
def fetch_yaml(url: str, data: any = None, user_agent: str = ENDOFLIFE_BOT_USER_AGENT,
|
||||
max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> any:
|
||||
response = fetch_url(url, data, headers, max_retries, backoff_factor, timeout)
|
||||
response = fetch_url(url, data, user_agent, max_retries, backoff_factor, timeout)
|
||||
return yaml.safe_load(response.text)
|
||||
|
||||
def fetch_xml(url: str, data: any = None, headers: dict[str, str] = None,
|
||||
def fetch_xml(url: str, data: any = None, user_agent: str = ENDOFLIFE_BOT_USER_AGENT,
|
||||
max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Document:
|
||||
response = fetch_url(url, data, headers, max_retries, backoff_factor, timeout)
|
||||
response = fetch_url(url, data, user_agent, max_retries, backoff_factor, timeout)
|
||||
return xml.dom.minidom.parseString(response.text)
|
||||
|
||||
def fetch_markdown(url: str, data: any = None, headers: dict[str, str] = None,
|
||||
def fetch_markdown(url: str, data: any = None, user_agent: str = ENDOFLIFE_BOT_USER_AGENT,
|
||||
max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Wikicode:
|
||||
response = fetch_url(url, data, headers, max_retries, backoff_factor, timeout)
|
||||
response = fetch_url(url, data, user_agent, max_retries, backoff_factor, timeout)
|
||||
return mwparserfromhell.parse(response.text)
|
||||
|
||||
# This requires some setup, see https://playwright.dev/python/docs/intro#installing-playwright.
|
||||
def fetch_javascript_url(url: str, wait_until: str = None, wait_for: str = None, select_wait_for: bool = False) -> str:
|
||||
def fetch_javascript_url(url: str, user_agent: str = ENDOFLIFE_BOT_USER_AGENT, wait_until: str = None, wait_for: str = None, select_wait_for: bool = False) -> str:
|
||||
logging.info(f"Fetching {url} with JavaScript (wait_until = {wait_until}, wait_for = {wait_for}, select_wait_for = {select_wait_for})")
|
||||
with sync_playwright() as p:
|
||||
browser = p.chromium.launch()
|
||||
context = browser.new_context()
|
||||
context.set_extra_http_headers({'User-Agent': user_agent})
|
||||
|
||||
try:
|
||||
page = browser.new_page()
|
||||
page.goto(url, wait_until=wait_until)
|
||||
|
||||
@@ -16,6 +16,8 @@ necessary information. Available configuration options are:
|
||||
- selector (mandatory, no default): A CSS selector used to locate one or more tables in the page.
|
||||
- header_selector (mandatory, default = thead tr): A CSS selector used to locate the table's header row.
|
||||
- rows_selector (mandatory, default = tbody tr): A CSS selector used to locate the table's rows.
|
||||
- user_agent (optional, default = <endoflife.date-bot User-Agent>): A user agent string to use when fetching the page.
|
||||
Unused when render_javascript is true.
|
||||
- render_javascript (optional, default = false): A boolean value indicating whether to render JavaScript on the page.
|
||||
- render_javascript_wait_for (optional, default = None): Wait until the given selector appear on the page. Only use when
|
||||
render_javascript is true.
|
||||
@@ -150,20 +152,21 @@ class Field:
|
||||
|
||||
config = config_from_argv()
|
||||
with ProductData(config.product) as product_data:
|
||||
render_javascript = config.data.get("render_javascript", False)
|
||||
render_javascript_wait_until = config.data.get("render_javascript_wait_until", None)
|
||||
render_javascript_wait_for = config.data.get("render_javascript_wait_for", None)
|
||||
user_agent = config.data.get("user_agent", http.ENDOFLIFE_BOT_USER_AGENT)
|
||||
render_js = config.data.get("render_javascript", False)
|
||||
render_js_wait_until = config.data.get("render_javascript_wait_until", None)
|
||||
render_js_wait_for = config.data.get("render_javascript_wait_for", None)
|
||||
header_row_selector = config.data.get("header_selector", "thead tr")
|
||||
rows_selector = config.data.get("rows_selector", "tbody tr")
|
||||
cells_selector = "td, th"
|
||||
release_cycle_field = Field("releaseCycle", config.data["fields"].pop("releaseCycle"))
|
||||
fields = [Field(name, definition) for name, definition in config.data["fields"].items()]
|
||||
|
||||
if render_javascript:
|
||||
response_text = http.fetch_javascript_url(config.url, wait_until=render_javascript_wait_until,
|
||||
wait_for=render_javascript_wait_for)
|
||||
if render_js:
|
||||
response_text = http.fetch_javascript_url(config.url, user_agent=user_agent, wait_until=render_js_wait_until,
|
||||
wait_for=render_js_wait_for)
|
||||
else:
|
||||
response_text = http.fetch_url(config.url).text
|
||||
response_text = http.fetch_url(config.url, user_agent=user_agent).text
|
||||
soup = BeautifulSoup(response_text, features="html5lib")
|
||||
|
||||
for table in soup.select(config.data["selector"]):
|
||||
|
||||
@@ -41,7 +41,8 @@ with ProductData(config.product) as product_data:
|
||||
# For example, 9.0.5 release notes also contains release notes for 9.0.0 to 9.0.4.
|
||||
latest_minor_versions = get_latest_minor_versions(all_versions)
|
||||
latest_minor_versions_urls = [f"{config.url}/{v}/ReleaseNotes/MeetSplunk" for v in latest_minor_versions]
|
||||
for response in http.fetch_urls(latest_minor_versions_urls):
|
||||
# Oddly using the endoflife.date user agent does not work for 9.0, 9.2 and 9.3.
|
||||
for response in http.fetch_urls(latest_minor_versions_urls, user_agent=http.FIREFOX_USER_AGENT):
|
||||
for (version_str, date_str) in VERSION_DATE_PATTERN.findall(response.text):
|
||||
version_str = f"{version_str}.0" if len(version_str.split(".")) == 2 else version_str # convert x.y to x.y.0
|
||||
date = dates.parse_date(date_str)
|
||||
|
||||
Reference in New Issue
Block a user