diff --git a/releases/splunk.json b/releases/splunk.json index f4aec689..9ac27e7e 100644 --- a/releases/splunk.json +++ b/releases/splunk.json @@ -1,10 +1,26 @@ { "releases": {}, "versions": { + "9.3.5": { + "name": "9.3.5", + "date": "2025-06-05" + }, + "9.2.7": { + "name": "9.2.7", + "date": "2025-06-05" + }, "9.4.2": { "name": "9.4.2", "date": "2025-04-28" }, + "9.3.4": { + "name": "9.3.4", + "date": "2025-04-28" + }, + "9.2.6": { + "name": "9.2.6", + "date": "2025-04-28" + }, "9.1.9": { "name": "9.1.9", "date": "2025-04-28" @@ -13,6 +29,14 @@ "name": "9.4.1", "date": "2025-02-26" }, + "9.3.3": { + "name": "9.3.3", + "date": "2025-02-26" + }, + "9.2.5": { + "name": "9.2.5", + "date": "2025-02-26" + }, "9.1.8": { "name": "9.1.8", "date": "2025-02-26" @@ -21,34 +45,90 @@ "name": "9.4.0", "date": "2024-12-16" }, + "9.3.2": { + "name": "9.3.2", + "date": "2024-11-07" + }, + "9.2.4": { + "name": "9.2.4", + "date": "2024-11-07" + }, "9.1.7": { "name": "9.1.7", "date": "2024-11-07" }, + "9.3.1": { + "name": "9.3.1", + "date": "2024-09-12" + }, + "9.2.3": { + "name": "9.2.3", + "date": "2024-09-12" + }, "9.1.6": { "name": "9.1.6", "date": "2024-09-12" }, + "9.3.0": { + "name": "9.3.0", + "date": "2024-07-24" + }, + "9.2.2": { + "name": "9.2.2", + "date": "2024-07-01" + }, "9.1.5": { "name": "9.1.5", "date": "2024-07-01" }, + "9.0.10": { + "name": "9.0.10", + "date": "2024-07-01" + }, + "9.2.1": { + "name": "9.2.1", + "date": "2024-03-27" + }, "9.1.4": { "name": "9.1.4", "date": "2024-03-27" }, + "9.0.9": { + "name": "9.0.9", + "date": "2024-03-27" + }, + "9.2.0.1": { + "name": "9.2.0.1", + "date": "2024-02-08" + }, + "9.2.0": { + "name": "9.2.0", + "date": "2024-01-31" + }, "9.1.3": { "name": "9.1.3", "date": "2024-01-22" }, + "9.0.8": { + "name": "9.0.8", + "date": "2024-01-22" + }, "9.1.2": { "name": "9.1.2", "date": "2023-11-16" }, + "9.0.7": { + "name": "9.0.7", + "date": "2023-11-16" + }, "9.1.1": { "name": "9.1.1", "date": "2023-08-30" }, + "9.0.6": { + "name": 
"9.0.6", + "date": "2023-08-30" + }, "8.2.12": { "name": "8.2.12", "date": "2023-08-30" @@ -57,6 +137,10 @@ "name": "9.1.0.2", "date": "2023-07-31" }, + "9.0.5.1": { + "name": "9.0.5.1", + "date": "2023-07-31" + }, "8.2.11.2": { "name": "8.2.11.2", "date": "2023-07-31" @@ -69,6 +153,10 @@ "name": "9.1.0", "date": "2023-06-28" }, + "9.0.5": { + "name": "9.0.5", + "date": "2023-06-01" + }, "8.2.11": { "name": "8.2.11", "date": "2023-06-01" @@ -77,6 +165,14 @@ "name": "8.1.14", "date": "2023-06-01" }, + "9.0.4.1": { + "name": "9.0.4.1", + "date": "2023-03-17" + }, + "9.0.4": { + "name": "9.0.4", + "date": "2023-02-14" + }, "8.2.10": { "name": "8.2.10", "date": "2023-02-14" @@ -85,6 +181,14 @@ "name": "8.1.13", "date": "2023-02-14" }, + "9.0.3": { + "name": "9.0.3", + "date": "2022-12-14" + }, + "9.0.2": { + "name": "9.0.2", + "date": "2022-11-01" + }, "8.2.9": { "name": "8.2.9", "date": "2022-11-01" @@ -97,6 +201,10 @@ "name": "8.2.8", "date": "2022-09-07" }, + "9.0.1": { + "name": "9.0.1", + "date": "2022-08-16" + }, "8.2.7.1": { "name": "8.2.7.1", "date": "2022-08-16" @@ -105,6 +213,10 @@ "name": "8.1.11", "date": "2022-08-16" }, + "9.0.0.1": { + "name": "9.0.0.1", + "date": "2022-07-20" + }, "8.2.7": { "name": "8.2.7", "date": "2022-06-30" @@ -117,6 +229,10 @@ "name": "8.1.10.1", "date": "2022-06-30" }, + "9.0.0": { + "name": "9.0.0", + "date": "2022-06-14" + }, "8.1.10": { "name": "8.1.10", "date": "2022-04-14" diff --git a/src/common/http.py b/src/common/http.py index 3e9085e3..50fdfcf9 100644 --- a/src/common/http.py +++ b/src/common/http.py @@ -16,10 +16,10 @@ from requests_futures.sessions import FuturesSession from urllib3.util import Retry # See https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent. 
-USER_AGENT = 'endoflife.date-bot/1.0 (endoflife.date automation; +https://endoflife.date/bot)' +ENDOFLIFE_BOT_USER_AGENT = 'endoflife.date-bot/1.0 (endoflife.date automation; +https://endoflife.date/bot)' +FIREFOX_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:140.0) Gecko/20100101 Firefox/140.0' - -def fetch_urls(urls: list[str], data: any = None, headers: dict[str, str] = None, +def fetch_urls(urls: list[str], data: any = None, user_agent: str = ENDOFLIFE_BOT_USER_AGENT, max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> list[Response]: logging.info(f"Fetching {urls}") @@ -33,7 +33,7 @@ def fetch_urls(urls: list[str], data: any = None, headers: dict[str, str] = None session.mount('http://', adapter) session.mount('https://', adapter) - headers = {'User-Agent': USER_AGENT} | ({} if headers is None else headers) + headers = {'User-Agent': user_agent} futures = [session.get(url, headers=headers, data=data, timeout=timeout, stream=None) for url in urls] results = [future.result() for future in as_completed(futures)] @@ -48,44 +48,47 @@ def fetch_urls(urls: list[str], data: any = None, headers: dict[str, str] = None # We could wait a bit before retrying, but it's not clear if it would help. 
logging.warning( f"Got ChunkedEncodingError while fetching {urls} ({e}), retrying (remaining retries = {next_max_retries}).") - return fetch_urls(urls, data, headers, next_max_retries, backoff_factor, timeout) + return fetch_urls(urls, data, user_agent, next_max_retries, backoff_factor, timeout) -def fetch_url(url: str, data: any = None, headers: dict[str, str] = None, +def fetch_url(url: str, data: any = None, user_agent: str = ENDOFLIFE_BOT_USER_AGENT, max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Response: - return fetch_urls([url], data, headers, max_retries, backoff_factor, timeout)[0] + return fetch_urls([url], data, user_agent, max_retries, backoff_factor, timeout)[0] -def fetch_html(url: str, data: any = None, headers: dict[str, str] = None, +def fetch_html(url: str, data: any = None, user_agent: str = ENDOFLIFE_BOT_USER_AGENT, max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30, features: str = "html5lib") -> BeautifulSoup: - response = fetch_url(url, data, headers, max_retries, backoff_factor, timeout) + response = fetch_url(url, data, user_agent, max_retries, backoff_factor, timeout) return BeautifulSoup(response.text, features=features) -def fetch_json(url: str, data: any = None, headers: dict[str, str] = None, +def fetch_json(url: str, data: any = None, user_agent: str = ENDOFLIFE_BOT_USER_AGENT, max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Document: - response = fetch_url(url, data, headers, max_retries, backoff_factor, timeout) + response = fetch_url(url, data, user_agent, max_retries, backoff_factor, timeout) return response.json() -def fetch_yaml(url: str, data: any = None, headers: dict[str, str] = None, +def fetch_yaml(url: str, data: any = None, user_agent: str = ENDOFLIFE_BOT_USER_AGENT, max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> any: - response = fetch_url(url, data, headers, max_retries, backoff_factor, timeout) + response = 
fetch_url(url, data, user_agent, max_retries, backoff_factor, timeout) return yaml.safe_load(response.text) -def fetch_xml(url: str, data: any = None, headers: dict[str, str] = None, +def fetch_xml(url: str, data: any = None, user_agent: str = ENDOFLIFE_BOT_USER_AGENT, max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Document: - response = fetch_url(url, data, headers, max_retries, backoff_factor, timeout) + response = fetch_url(url, data, user_agent, max_retries, backoff_factor, timeout) return xml.dom.minidom.parseString(response.text) -def fetch_markdown(url: str, data: any = None, headers: dict[str, str] = None, +def fetch_markdown(url: str, data: any = None, user_agent: str = ENDOFLIFE_BOT_USER_AGENT, max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Wikicode: - response = fetch_url(url, data, headers, max_retries, backoff_factor, timeout) + response = fetch_url(url, data, user_agent, max_retries, backoff_factor, timeout) return mwparserfromhell.parse(response.text) # This requires some setup, see https://playwright.dev/python/docs/intro#installing-playwright. -def fetch_javascript_url(url: str, wait_until: str = None, wait_for: str = None, select_wait_for: bool = False) -> str: +def fetch_javascript_url(url: str, user_agent: str = ENDOFLIFE_BOT_USER_AGENT, wait_until: str = None, wait_for: str = None, select_wait_for: bool = False) -> str: logging.info(f"Fetching {url} with JavaScript (wait_until = {wait_until}, wait_for = {wait_for}, select_wait_for = {select_wait_for})") with sync_playwright() as p: browser = p.chromium.launch() + context = browser.new_context() + context.set_extra_http_headers({'User-Agent': user_agent}) + # FIXME(review): this User-Agent is never applied: 'page' below comes from browser.new_page(), which creates its own fresh context; it should be context.new_page() try: page = browser.new_page() page.goto(url, wait_until=wait_until) diff --git a/src/release_table.py b/src/release_table.py index 9643a286..2aac7762 100644 --- a/src/release_table.py +++ b/src/release_table.py @@ -16,6 +16,8 @@ necessary information.
Available configuration options are: - selector (mandatory, no default): A CSS selector used to locate one or more tables in the page. - header_selector (mandatory, default = thead tr): A CSS selector used to locate the table's header row. - rows_selector (mandatory, default = tbody tr): A CSS selector used to locate the table's rows. +- user_agent (optional, default = ENDOFLIFE_BOT_USER_AGENT): A user agent string to use when fetching the page. + Unused when render_javascript is true. - render_javascript (optional, default = false): A boolean value indicating whether to render JavaScript on the page. - render_javascript_wait_for (optional, default = None): Wait until the given selector appear on the page. Only use when render_javascript is true. @@ -150,20 +152,21 @@ class Field: config = config_from_argv() with ProductData(config.product) as product_data: - render_javascript = config.data.get("render_javascript", False) - render_javascript_wait_until = config.data.get("render_javascript_wait_until", None) - render_javascript_wait_for = config.data.get("render_javascript_wait_for", None) + user_agent = config.data.get("user_agent", http.ENDOFLIFE_BOT_USER_AGENT) + render_js = config.data.get("render_javascript", False) + render_js_wait_until = config.data.get("render_javascript_wait_until", None) + render_js_wait_for = config.data.get("render_javascript_wait_for", None) header_row_selector = config.data.get("header_selector", "thead tr") rows_selector = config.data.get("rows_selector", "tbody tr") cells_selector = "td, th" release_cycle_field = Field("releaseCycle", config.data["fields"].pop("releaseCycle")) fields = [Field(name, definition) for name, definition in config.data["fields"].items()] - if render_javascript: - response_text = http.fetch_javascript_url(config.url, wait_until=render_javascript_wait_until, - wait_for=render_javascript_wait_for) + if render_js: + response_text = http.fetch_javascript_url(config.url, user_agent=user_agent, wait_until=render_js_wait_until, +
wait_for=render_js_wait_for) else: - response_text = http.fetch_url(config.url).text + response_text = http.fetch_url(config.url, user_agent=user_agent).text soup = BeautifulSoup(response_text, features="html5lib") for table in soup.select(config.data["selector"]): diff --git a/src/splunk.py b/src/splunk.py index 5210f80d..31b38041 100644 --- a/src/splunk.py +++ b/src/splunk.py @@ -41,7 +41,8 @@ with ProductData(config.product) as product_data: # For example, 9.0.5 release notes also contains release notes for 9.0.0 to 9.0.4. latest_minor_versions = get_latest_minor_versions(all_versions) latest_minor_versions_urls = [f"{config.url}/{v}/ReleaseNotes/MeetSplunk" for v in latest_minor_versions] - for response in http.fetch_urls(latest_minor_versions_urls): + # Oddly using the endoflife.date user agent does not work for 9.0, 9.2 and 9.3. + for response in http.fetch_urls(latest_minor_versions_urls, user_agent=http.FIREFOX_USER_AGENT): for (version_str, date_str) in VERSION_DATE_PATTERN.findall(response.text): version_str = f"{version_str}.0" if len(version_str.split(".")) == 2 else version_str # convert x.y to x.y.0 date = dates.parse_date(date_str)