Improve HTTP handling in scripts (#449)

Provide helper methods that hide the complexity of fetching and parsing HTML, JSON, YAML, XML, and MediaWiki wikitext ("Markdown") responses.
This commit is contained in:
Marc Wrobel
2025-06-28 11:46:04 +02:00
parent fda4967c38
commit 312ce078bb
43 changed files with 103 additions and 137 deletions

View File

@@ -1,6 +1,5 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches EKS versions from AWS docs.
@@ -8,8 +7,8 @@ Now that AWS no longer publishes docs on GitHub, we use the Web Archive to get t
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
html = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for tr in html.select("#main-col-body")[0].findAll("tr"):
cells = tr.findAll("td")
if not cells:

View File

@@ -1,5 +1,4 @@
import logging
import xml.dom.minidom
from common import dates, endoflife, http, releasedata
@@ -7,8 +6,7 @@ from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
rss_response = http.fetch_url(config.url)
rss = xml.dom.minidom.parseString(rss_response.text)
rss = http.fetch_xml(config.url)
for entry in rss.getElementsByTagName("item"):
version_str = entry.getElementsByTagName("title")[0].firstChild.nodeValue

View File

@@ -1,14 +1,12 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
ul = soup.find("h2").find_next("ul")
ul = html.find("h2").find_next("ul")
for li in ul.find_all("li"):
text = li.get_text(strip=True)
match = config.first_match(text)

View File

@@ -1,16 +1,14 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches AWS lambda runtimes with their support / EOL dates from https://docs.aws.amazon.com."""
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for i, table in enumerate(soup.find_all("table")):
for i, table in enumerate(html.find_all("table")):
headers = [th.get_text().strip().lower() for th in table.find("thead").find_all("tr")[0].find_all("th")]
if "identifier" not in headers or "deprecation date" not in headers or "block function update" not in headers:
logging.info(f"table with header '{headers}' does not contain all the expected headers")

View File

@@ -1,4 +1,3 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches versions from repositories managed with cgit, such as the Linux kernel repository.
@@ -6,10 +5,9 @@ Ideally we would want to use the git repository directly, but cgit-managed repos
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url + '/refs/tags')
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url + '/refs/tags')
for table in soup.find_all("table", class_="list"):
for table in html.find_all("table", class_="list"):
for row in table.find_all("tr"):
columns = row.find_all("td")
if len(columns) != 4:

View File

@@ -1,4 +1,3 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
from common.git import Git
@@ -10,9 +9,8 @@ More context on https://github.com/endoflife-date/endoflife.date/pull/4425#discu
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
rn_response = http.fetch_url(config.url)
rn_soup = BeautifulSoup(rn_response.text, features="html5lib")
released_versions = [h2.get('id') for h2 in rn_soup.find_all('h2', id=True) if h2.get('id')]
html = http.fetch_html(config.url)
released_versions = [h2.get('id') for h2 in html.find_all('h2', id=True) if h2.get('id')]
git = Git(config.data.get('repository'))
git.setup(bare=True)

View File

@@ -1,4 +1,3 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, github, http, releasedata
"""Fetch released versions from docs.chef.io and retrieve their date from GitHub.
@@ -9,9 +8,8 @@ More context on https://github.com/endoflife-date/endoflife.date/pull/4425#discu
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
rn_response = http.fetch_url(config.url)
rn_soup = BeautifulSoup(rn_response.text, features="html5lib")
released_versions = [h2.get('id') for h2 in rn_soup.find_all('h2', id=True) if h2.get('id')]
html = http.fetch_html(config.url)
released_versions = [h2.get('id') for h2 in html.find_all('h2', id=True) if h2.get('id')]
for release in github.fetch_releases("inspec/inspec"):
sanitized_version = release.tag_name.replace("v", "")

View File

@@ -1,6 +1,5 @@
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches versions from Adobe ColdFusion release notes on helpx.adobe.com.
@@ -24,10 +23,9 @@ FIXED_VERSIONS = {
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
changelog = http.fetch_url(config.url)
changelog_soup = BeautifulSoup(changelog.text, features="html5lib")
html = http.fetch_html(config.url)
for p in changelog_soup.findAll("div", class_="text"):
for p in html.findAll("div", class_="text"):
version_and_date_str = p.get_text().strip().replace('\xa0', ' ')
for (date_str, version_str) in VERSION_AND_DATE_PATTERN.findall(version_and_date_str):
date = dates.parse_date(date_str)

View File

@@ -1,6 +1,12 @@
import logging
import xml.dom.minidom
from concurrent.futures import as_completed
from xml.dom.minidom import Document
import mwparserfromhell
import yaml
from bs4 import BeautifulSoup
from mwparserfromhell.wikicode import Wikicode
from playwright.sync_api import sync_playwright
from requests import Response
from requests.adapters import HTTPAdapter
@@ -47,6 +53,31 @@ def fetch_url(url: str, data: any = None, headers: dict[str, str] = None,
max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Response:
return fetch_urls([url], data, headers, max_retries, backoff_factor, timeout)[0]
def fetch_html(url: str, data: any = None, headers: dict[str, str] = None,
               max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30,
               features: str = "html5lib") -> BeautifulSoup:
    """Fetch *url* and return its body parsed into a BeautifulSoup tree.

    Network parameters are forwarded unchanged to fetch_url; *features*
    selects the BeautifulSoup parser (html5lib by default).
    """
    markup = fetch_url(url, data, headers, max_retries, backoff_factor, timeout).text
    return BeautifulSoup(markup, features=features)
def fetch_json(url: str, data: any = None, headers: dict[str, str] = None,
               max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> any:
    """Fetch *url* and return its body decoded as JSON.

    Network parameters are forwarded unchanged to fetch_url. The result is
    whatever requests' Response.json() produces (dict, list, str, ...) —
    hence the `any` annotation, matching fetch_yaml; the previous `Document`
    annotation was an XML type and incorrect for JSON.
    """
    response = fetch_url(url, data, headers, max_retries, backoff_factor, timeout)
    return response.json()
def fetch_yaml(url: str, data: any = None, headers: dict[str, str] = None,
               max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> any:
    """Fetch *url* and return its body parsed as YAML (via yaml.safe_load).

    Network parameters are forwarded unchanged to fetch_url.
    """
    body = fetch_url(url, data, headers, max_retries, backoff_factor, timeout).text
    return yaml.safe_load(body)
def fetch_xml(url: str, data: any = None, headers: dict[str, str] = None,
              max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Document:
    """Fetch *url* and return its body parsed into a minidom Document.

    Network parameters are forwarded unchanged to fetch_url.
    """
    text = fetch_url(url, data, headers, max_retries, backoff_factor, timeout).text
    return xml.dom.minidom.parseString(text)
def fetch_markdown(url: str, data: any = None, headers: dict[str, str] = None,
                   max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Wikicode:
    """Fetch *url* and parse its body with mwparserfromhell into Wikicode.

    Network parameters are forwarded unchanged to fetch_url.
    NOTE(review): despite the name, mwparserfromhell parses MediaWiki
    wikitext, not Markdown — confirm the intended naming with callers
    before any rename.
    """
    body = fetch_url(url, data, headers, max_retries, backoff_factor, timeout).text
    return mwparserfromhell.parse(body)
# This requires some setup, see https://playwright.dev/python/docs/intro#installing-playwright.
def fetch_javascript_url(url: str, click_selector: str = None, wait_until: str = None) -> str:

View File

@@ -18,10 +18,9 @@ MANUAL_VERSIONS = {
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
main = http.fetch_url(f"{config.url}/current/install/install-intro.html")
main_soup = BeautifulSoup(main.text, features="html5lib")
html = http.fetch_html(f"{config.url}/current/install/install-intro.html")
minor_versions = [options.attrs["value"] for options in main_soup.find(class_="version_list").find_all("option")]
minor_versions = [options.attrs["value"] for options in html.find(class_="version_list").find_all("option")]
minor_version_urls = [f"{config.url}/{minor}/release-notes/relnotes.html" for minor in minor_versions]
for minor_version in http.fetch_urls(minor_version_urls):

View File

@@ -1,12 +1,10 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(f"https://distrowatch.com/index.php?distribution={config.url}")
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(f"https://distrowatch.com/index.php?distribution={config.url}")
for table in soup.select("td.News1>table.News"):
for table in html.select("td.News1>table.News"):
headline = table.select_one("td.NewsHeadline a[href]").get_text().strip()
versions_match = config.first_match(headline)
if not versions_match:

View File

@@ -5,7 +5,7 @@ from common import dates, endoflife, http, releasedata
Unfortunately images creation date cannot be retrieved, so we had to use the tag_last_pushed field instead."""
def fetch_releases(p: releasedata.ProductData, c: endoflife.AutoConfig, url: str) -> None:
data = http.fetch_url(url).json()
data = http.fetch_json(url)
for result in data["results"]:
version_str = result["name"]

View File

@@ -1,6 +1,5 @@
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
# https://regex101.com/r/zPxBqT/1
@@ -15,10 +14,9 @@ URL_BY_PRODUCT = {
for config in endoflife.list_configs_from_argv(): # noqa: B007 multiple JSON produced for historical reasons
for product_name, url in URL_BY_PRODUCT.items():
with releasedata.ProductData(product_name) as product_data:
relnotes = http.fetch_url(url)
relnotes_soup = BeautifulSoup(relnotes.text, features="html5lib")
html = http.fetch_html(url)
for section in relnotes_soup.find_all('section', class_='releases'):
for section in html.find_all('section', class_='releases'):
for h2 in section.find_all('h2'): # h2 contains the date
date = dates.parse_date(h2.get('data-text'))

View File

@@ -1,12 +1,10 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
html = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
table_selector = config.data.get("table_selector", "#previous-releases + table").strip()
date_column = config.data.get("date_column", "Date").strip().lower()
versions_column = config.data.get("versions_column").strip().lower()

View File

@@ -1,6 +1,5 @@
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
CYCLE_PATTERN = re.compile(r"^(\d+\.\d+)/$")
@@ -9,10 +8,9 @@ DATE_AND_VERSION_PATTERN = re.compile(r"^(\d{4})/(\d{2})/(\d{2})\s+:\s+(\d+\.\d+
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
# First, get all minor releases from the download page
download = http.fetch_url(config.url)
download_soup = BeautifulSoup(download.text, features="html5lib")
download_html = http.fetch_html(config.url)
minor_versions = []
for link in download_soup.select("a"):
for link in download_html.select("a"):
minor_version_match = CYCLE_PATTERN.match(link.attrs["href"])
if not minor_version_match:
continue

View File

@@ -1,12 +1,10 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
page = http.fetch_url(config.url)
page_soup = BeautifulSoup(page.text, features="html5lib")
html = http.fetch_html(config.url)
for release_table in page_soup.find("div", class_="ibm-container-body").find_all("table", class_="ibm-data-table ibm-grid"):
for release_table in html.find("div", class_="ibm-container-body").find_all("table", class_="ibm-data-table ibm-grid"):
for row in release_table.find_all("tr")[1:]: # for all rows except the header
cells = row.find_all("td")
version = cells[0].text.strip("AIX ").replace(' TL', '.')

View File

@@ -1,6 +1,5 @@
import logging
import yaml
from common import dates, endoflife, http, releasedata
"""Fetch version data for Kuma from https://raw.githubusercontent.com/kumahq/kuma/master/versions.yml.
@@ -12,8 +11,7 @@ EOL_FIELD = 'endOfLifeDate'
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
yml_response = http.fetch_url(config.url)
versions_data = yaml.safe_load(yml_response.text)
versions_data = http.fetch_yaml(config.url)
# Iterate through the versions and their associated dates
for version_info in versions_data:

View File

@@ -1,16 +1,14 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches LibreOffice versions from https://downloadarchive.documentfoundation.org/libreoffice/old/"""
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for table in soup.find_all("table"):
for table in html.find_all("table"):
for row in table.find_all("tr")[1:]:
cells = row.find_all("td")
if len(cells) < 4:

View File

@@ -1,5 +1,4 @@
import re
import xml.dom.minidom
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
@@ -11,8 +10,7 @@ ANNOUNCEMENT_PATTERN = re.compile(r"includes\s+the\s+following\s+changes", re.IG
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
rss = xml.dom.minidom.parseString(response.text)
rss = http.fetch_xml(config.url)
for item in rss.getElementsByTagName("entry"):
content = item.getElementsByTagName("content")[0].firstChild.nodeValue

View File

@@ -1,6 +1,5 @@
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches Lua releases from lua.org."""
@@ -10,9 +9,8 @@ VERSION_PATTERN = re.compile(r"(?P<version>\d+\.\d+\.\d+),\s*released\s*on\s*(?P
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
page = http.fetch_url(config.url)
soup = BeautifulSoup(page.text, 'html.parser')
page_text = soup.text # HTML is broken, no way to parse it with beautifulsoup
html = http.fetch_html(config.url, features = 'html.parser')
page_text = html.text # HTML is broken, no way to parse it with beautifulsoup
for release_match in RELEASED_AT_PATTERN.finditer(page_text):
release = release_match.group('release')

View File

@@ -9,7 +9,7 @@ for config in endoflife.list_configs_from_argv():
while True:
url = f"https://search.maven.org/solrsearch/select?q=g:{group_id}+AND+a:{artifact_id}&core=gav&wt=json&start={start}&rows=100"
data = http.fetch_url(url).json()
data = http.fetch_json(url)
for row in data["response"]["docs"]:
version_match = config.first_match(row["v"])

View File

@@ -1,16 +1,14 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches NetBSD versions and EOL information from https://www.netbsd.org/."""
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for row in soup.select('table tbody tr'):
for row in html.select('table tbody tr'):
cells = [cell.get_text(strip=True) for cell in row.select('td')]
version = cells[0]

View File

@@ -2,7 +2,7 @@ from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
data = http.fetch_url(f"https://registry.npmjs.org/{config.url}").json()
data = http.fetch_json(f"https://registry.npmjs.org/{config.url}")
for version_str in data["versions"]:
version_match = config.first_match(version_str)
if version_match:

View File

@@ -4,8 +4,8 @@ from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
url = f"https://portal.nutanix.com/api/v1/eol/find?type={config.url}"
data = http.fetch_url(url).json()
data = http.fetch_json(f"https://portal.nutanix.com/api/v1/eol/find?type={config.url}")
for version_data in data["contents"]:
release_name = '.'.join(version_data["version"].split(".")[:2])

View File

@@ -4,7 +4,7 @@ from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
versions = http.fetch_url(config.url).json()
versions = http.fetch_json(config.url)
for version in versions:
name = version['version']

View File

@@ -1,4 +1,3 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches versions from Plesk's change log.
@@ -8,10 +7,9 @@ there is no entry for GA of version 18.0.18 and older."""
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for release in soup.find_all("div", class_="changelog-entry--obsidian"):
for release in html.find_all("div", class_="changelog-entry--obsidian"):
version = release.h2.text.strip()
if not version.startswith('Plesk Obsidian 18'):
continue

View File

@@ -2,7 +2,7 @@ from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
data = http.fetch_url(f"https://pypi.org/pypi/{config.url}/json").json()
data = http.fetch_json(f"https://pypi.org/pypi/{config.url}/json")
for version_str in data["releases"]:
version_match = config.first_match(version_str)

View File

@@ -1,6 +1,5 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches Amazon RDS versions from the version management pages on AWS docs.
@@ -11,10 +10,9 @@ in the third column (usually named 'RDS release date').
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for table in soup.find_all("table"):
for table in html.find_all("table"):
for row in table.find_all("tr"):
columns = row.find_all("td")
if len(columns) <= 3:

View File

@@ -1,16 +1,14 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches RedHat JBoss EAP version data for JBoss 7"""
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for h4 in soup.find_all("h4"):
for h4 in html.find_all("h4"):
title = h4.get_text(strip=True)
if not title.startswith("7."):
continue

View File

@@ -1,5 +1,4 @@
import re
from xml.dom.minidom import parseString
from common import dates, endoflife, http, releasedata
@@ -7,9 +6,8 @@ from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
xml = http.fetch_xml(config.url)
xml = parseString(response.text)
versioning = xml.getElementsByTagName("metadata")[0].getElementsByTagName("versioning")[0]
latest_str = versioning.getElementsByTagName("latest")[0].firstChild.nodeValue

View File

@@ -1,6 +1,5 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches Satellite versions from access.redhat.com.
@@ -9,10 +8,9 @@ A few of the older versions, such as 'Satellite 6.1 GA Release (Build 6.1.1)', w
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for table in soup.findAll("tbody"):
for table in html.findAll("tbody"):
for tr in table.findAll("tr"):
td_list = tr.findAll("td")

View File

@@ -22,7 +22,7 @@ for config in endoflife.list_configs_from_argv():
name = urllib.parse.quote(config.url)
mapping = Mapping(config.data["fields"])
data = http.fetch_url('https://access.redhat.com/product-life-cycles/api/v1/products?name=' + name).json()
data = http.fetch_json('https://access.redhat.com/product-life-cycles/api/v1/products?name=' + name)
for version in data["data"][0]["versions"]:
version_name = version["name"]

View File

@@ -1,6 +1,5 @@
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
# https://regex101.com/r/877ibq/1
@@ -8,10 +7,9 @@ VERSION_PATTERN = re.compile(r"RHEL (?P<major>\d)(\. ?(?P<minor>\d+))?(( Update
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for tr in soup.findAll("tr"):
for tr in html.findAll("tr"):
td_list = tr.findAll("td")
if len(td_list) == 0:
continue

View File

@@ -1,14 +1,12 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for tr in soup.findAll("tr"):
for tr in html.findAll("tr"):
td_list = tr.findAll("td")
if len(td_list) == 0:
continue

View File

@@ -2,7 +2,6 @@ import logging
import re
from datetime import date, datetime, time, timezone
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Detect new models and aggregate EOL data for Samsung Mobile devices.
@@ -27,12 +26,11 @@ for config in endoflife.list_configs_from_argv():
release.set_eol(eol)
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
sections = config.data.get("sections", {})
for update_cadence, title in sections.items():
models_list = soup.find(string=lambda text, search=title: search in text if text else False).find_next("ul")
models_list = html.find(string=lambda text, search=title: search in text if text else False).find_next("ul")
for item in models_list.find_all("li"):
models = item.text.replace("Enterprise Models:", "")

View File

@@ -1,14 +1,12 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
products_table = soup.find("tbody", id="productSupportLifecycle")
products_table = html.find("tbody", id="productSupportLifecycle")
sles_header_rows = products_table.find_all("tr", class_="row", attrs={"data-productfilter": "SUSE Linux Enterprise Server"})
# Extract rows' IDs to find related sub-rows with details (normally hidden until a user expands a section)

View File

@@ -1,6 +1,5 @@
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
VERSION_DATE_PATTERN = re.compile(r"Splunk Enterprise (?P<version>\d+\.\d+(?:\.\d+)*) was (?:first )?released on (?P<date>\w+\s\d\d?,\s\d{4})\.", re.MULTILINE)
@@ -32,10 +31,9 @@ def get_latest_minor_versions(versions: list[str]) -> list[str]:
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
main = http.fetch_url(config.url)
soup = BeautifulSoup(main.text, features="html5lib")
html = http.fetch_html(config.url)
all_versions = [option.attrs['value'] for option in soup.select("select#version-select > option")]
all_versions = [option.attrs['value'] for option in html.select("select#version-select > option")]
all_versions = [v for v in all_versions if v != "DataMonitoringAppPreview"]
# Latest minor release notes contains release notes for all previous minor versions.

View File

@@ -2,7 +2,7 @@ from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
data = http.fetch_url(config.url).json()
data = http.fetch_json(config.url)
for v in data:
if v['type'] == 'development':
continue

View File

@@ -1,4 +1,3 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches the Unity LTS releases from the Unity website. Non-LTS releases are not listed there, so this automation
@@ -19,10 +18,9 @@ The script will need to be updated if someday those conditions are not met."""
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for release in soup.find_all('div', class_='component-releases-item__show__inner-header'):
for release in html.find_all('div', class_='component-releases-item__show__inner-header'):
version = release.find('h4').find('span').text
date = dates.parse_datetime(release.find('time').attrs['datetime'])
product_data.declare_version(version, date)

View File

@@ -1,14 +1,12 @@
import re
import mwparserfromhell
from common import dates, endoflife, http, releasedata
DATE_PATTERN = re.compile(r"\d{4}-\d{2}-\d{2}")
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
wikicode = mwparserfromhell.parse(response.text)
wikicode = http.fetch_markdown(config.url)
for tr in wikicode.ifilter_tags(matches=lambda node: node.tag == "tr"):
items = tr.contents.filter_tags(matches=lambda node: node.tag == "td")

View File

@@ -1,7 +1,6 @@
import logging
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches Veeam products versions from https://www.veeam.com.
@@ -12,12 +11,11 @@ such as `https://www.veeam.com/kb2680`.
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
version_column = config.data.get("version_column", "Build Number").lower()
date_column = config.data.get("date_column", "Release Date").lower()
for table in soup.find_all("table"):
for table in html.find_all("table"):
headers = [header.get_text().strip().lower() for header in table.find("tr").find_all("td")]
if version_column not in headers or date_column not in headers:
logging.warning("Skipping table with headers %s as it does not contains '%s' or '%s'",

View File

@@ -1,7 +1,6 @@
import logging
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches releases from VirtualBox download page."""
@@ -10,10 +9,10 @@ EOL_REGEX = re.compile(r"^\(no longer supported, support ended (?P<value>\d{4}/\
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for li in soup.select_one("#DownloadVirtualBoxOldBuilds + ul").find_all("li"):
for li in html.select_one("#DownloadVirtualBoxOldBuilds + ul").find_all("li"):
li_text = li.find("a").text.strip()
release_match = config.first_match(li_text)

View File

@@ -1,12 +1,10 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for table in soup.find_all("table"):
for table in html.find_all("table"):
headers = [th.get_text().strip().lower() for th in table.find_all("th")]
if "version" not in headers or "release date" not in headers:
continue