Improve HTTP handling in scripts (#449)

Provide helper methods that hide the complexity of fetching and parsing HTML, JSON, YAML, XML, and MediaWiki wikitext ("Markdown") responses.
This commit is contained in:
Marc Wrobel
2025-06-28 11:46:04 +02:00
parent fda4967c38
commit 312ce078bb
43 changed files with 103 additions and 137 deletions

View File

@@ -1,6 +1,5 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches EKS versions from AWS docs.
@@ -8,8 +7,8 @@ Now that AWS no longer publishes docs on GitHub, we use the Web Archive to get t
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
html = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for tr in html.select("#main-col-body")[0].findAll("tr"):
cells = tr.findAll("td")
if not cells:

View File

@@ -1,5 +1,4 @@
import logging
import xml.dom.minidom
from common import dates, endoflife, http, releasedata
@@ -7,8 +6,7 @@ from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
rss_response = http.fetch_url(config.url)
rss = xml.dom.minidom.parseString(rss_response.text)
rss = http.fetch_xml(config.url)
for entry in rss.getElementsByTagName("item"):
version_str = entry.getElementsByTagName("title")[0].firstChild.nodeValue

View File

@@ -1,14 +1,12 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
ul = soup.find("h2").find_next("ul")
ul = html.find("h2").find_next("ul")
for li in ul.find_all("li"):
text = li.get_text(strip=True)
match = config.first_match(text)

View File

@@ -1,16 +1,14 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches AWS lambda runtimes with their support / EOL dates from https://docs.aws.amazon.com."""
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for i, table in enumerate(soup.find_all("table")):
for i, table in enumerate(html.find_all("table")):
headers = [th.get_text().strip().lower() for th in table.find("thead").find_all("tr")[0].find_all("th")]
if "identifier" not in headers or "deprecation date" not in headers or "block function update" not in headers:
logging.info(f"table with header '{headers}' does not contain all the expected headers")

View File

@@ -1,4 +1,3 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches versions from repositories managed with cgit, such as the Linux kernel repository.
@@ -6,10 +5,9 @@ Ideally we would want to use the git repository directly, but cgit-managed repos
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url + '/refs/tags')
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url + '/refs/tags')
for table in soup.find_all("table", class_="list"):
for table in html.find_all("table", class_="list"):
for row in table.find_all("tr"):
columns = row.find_all("td")
if len(columns) != 4:

View File

@@ -1,4 +1,3 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
from common.git import Git
@@ -10,9 +9,8 @@ More context on https://github.com/endoflife-date/endoflife.date/pull/4425#discu
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
rn_response = http.fetch_url(config.url)
rn_soup = BeautifulSoup(rn_response.text, features="html5lib")
released_versions = [h2.get('id') for h2 in rn_soup.find_all('h2', id=True) if h2.get('id')]
html = http.fetch_html(config.url)
released_versions = [h2.get('id') for h2 in html.find_all('h2', id=True) if h2.get('id')]
git = Git(config.data.get('repository'))
git.setup(bare=True)

View File

@@ -1,4 +1,3 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, github, http, releasedata
"""Fetch released versions from docs.chef.io and retrieve their date from GitHub.
@@ -9,9 +8,8 @@ More context on https://github.com/endoflife-date/endoflife.date/pull/4425#discu
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
rn_response = http.fetch_url(config.url)
rn_soup = BeautifulSoup(rn_response.text, features="html5lib")
released_versions = [h2.get('id') for h2 in rn_soup.find_all('h2', id=True) if h2.get('id')]
html = http.fetch_html(config.url)
released_versions = [h2.get('id') for h2 in html.find_all('h2', id=True) if h2.get('id')]
for release in github.fetch_releases("inspec/inspec"):
sanitized_version = release.tag_name.replace("v", "")

View File

@@ -1,6 +1,5 @@
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches versions from Adobe ColdFusion release notes on helpx.adobe.com.
@@ -24,10 +23,9 @@ FIXED_VERSIONS = {
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
changelog = http.fetch_url(config.url)
changelog_soup = BeautifulSoup(changelog.text, features="html5lib")
html = http.fetch_html(config.url)
for p in changelog_soup.findAll("div", class_="text"):
for p in html.findAll("div", class_="text"):
version_and_date_str = p.get_text().strip().replace('\xa0', ' ')
for (date_str, version_str) in VERSION_AND_DATE_PATTERN.findall(version_and_date_str):
date = dates.parse_date(date_str)

View File

@@ -1,6 +1,12 @@
import logging
import xml.dom.minidom
from concurrent.futures import as_completed
from xml.dom.minidom import Document
import mwparserfromhell
import yaml
from bs4 import BeautifulSoup
from mwparserfromhell.wikicode import Wikicode
from playwright.sync_api import sync_playwright
from requests import Response
from requests.adapters import HTTPAdapter
@@ -47,6 +53,31 @@ def fetch_url(url: str, data: any = None, headers: dict[str, str] = None,
max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Response:
return fetch_urls([url], data, headers, max_retries, backoff_factor, timeout)[0]
def fetch_html(url: str, data: any = None, headers: dict[str, str] = None,
               max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30,
               features: str = "html5lib") -> BeautifulSoup:
    """Fetch *url* and return its body parsed into a BeautifulSoup tree.

    Network parameters are forwarded unchanged to fetch_url; *features*
    selects the BeautifulSoup parser (html5lib by default).
    """
    markup = fetch_url(url, data, headers, max_retries, backoff_factor, timeout).text
    return BeautifulSoup(markup, features=features)
def fetch_json(url: str, data: any = None, headers: dict[str, str] = None,
               max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> any:
    """Fetch *url* and return its body decoded as JSON.

    Network parameters are forwarded unchanged to fetch_url. The result is
    whatever requests' Response.json() produces (dict, list, str, ...) —
    hence the `any` annotation, matching fetch_yaml; the previous `Document`
    annotation was an XML type and incorrect for JSON.
    """
    response = fetch_url(url, data, headers, max_retries, backoff_factor, timeout)
    return response.json()
def fetch_yaml(url: str, data: any = None, headers: dict[str, str] = None,
               max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> any:
    """Fetch *url* and return its body parsed as YAML (via yaml.safe_load).

    Network parameters are forwarded unchanged to fetch_url.
    """
    body = fetch_url(url, data, headers, max_retries, backoff_factor, timeout).text
    return yaml.safe_load(body)
def fetch_xml(url: str, data: any = None, headers: dict[str, str] = None,
              max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Document:
    """Fetch *url* and return its body parsed into a minidom Document.

    Network parameters are forwarded unchanged to fetch_url.
    """
    text = fetch_url(url, data, headers, max_retries, backoff_factor, timeout).text
    return xml.dom.minidom.parseString(text)
def fetch_markdown(url: str, data: any = None, headers: dict[str, str] = None,
                   max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Wikicode:
    """Fetch *url* and parse its body with mwparserfromhell into Wikicode.

    Network parameters are forwarded unchanged to fetch_url.
    NOTE(review): despite the name, mwparserfromhell parses MediaWiki
    wikitext, not Markdown — confirm the intended naming with callers
    before any rename.
    """
    body = fetch_url(url, data, headers, max_retries, backoff_factor, timeout).text
    return mwparserfromhell.parse(body)
# This requires some setup, see https://playwright.dev/python/docs/intro#installing-playwright.
def fetch_javascript_url(url: str, click_selector: str = None, wait_until: str = None) -> str:

View File

@@ -18,10 +18,9 @@ MANUAL_VERSIONS = {
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
main = http.fetch_url(f"{config.url}/current/install/install-intro.html")
main_soup = BeautifulSoup(main.text, features="html5lib")
html = http.fetch_html(f"{config.url}/current/install/install-intro.html")
minor_versions = [options.attrs["value"] for options in main_soup.find(class_="version_list").find_all("option")]
minor_versions = [options.attrs["value"] for options in html.find(class_="version_list").find_all("option")]
minor_version_urls = [f"{config.url}/{minor}/release-notes/relnotes.html" for minor in minor_versions]
for minor_version in http.fetch_urls(minor_version_urls):

View File

@@ -1,12 +1,10 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(f"https://distrowatch.com/index.php?distribution={config.url}")
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(f"https://distrowatch.com/index.php?distribution={config.url}")
for table in soup.select("td.News1>table.News"):
for table in html.select("td.News1>table.News"):
headline = table.select_one("td.NewsHeadline a[href]").get_text().strip()
versions_match = config.first_match(headline)
if not versions_match:

View File

@@ -5,7 +5,7 @@ from common import dates, endoflife, http, releasedata
Unfortunately images creation date cannot be retrieved, so we had to use the tag_last_pushed field instead."""
def fetch_releases(p: releasedata.ProductData, c: endoflife.AutoConfig, url: str) -> None:
data = http.fetch_url(url).json()
data = http.fetch_json(url)
for result in data["results"]:
version_str = result["name"]

View File

@@ -1,6 +1,5 @@
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
# https://regex101.com/r/zPxBqT/1
@@ -15,10 +14,9 @@ URL_BY_PRODUCT = {
for config in endoflife.list_configs_from_argv(): # noqa: B007 multiple JSON produced for historical reasons
for product_name, url in URL_BY_PRODUCT.items():
with releasedata.ProductData(product_name) as product_data:
relnotes = http.fetch_url(url)
relnotes_soup = BeautifulSoup(relnotes.text, features="html5lib")
html = http.fetch_html(url)
for section in relnotes_soup.find_all('section', class_='releases'):
for section in html.find_all('section', class_='releases'):
for h2 in section.find_all('h2'): # h2 contains the date
date = dates.parse_date(h2.get('data-text'))

View File

@@ -1,12 +1,10 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
html = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
table_selector = config.data.get("table_selector", "#previous-releases + table").strip()
date_column = config.data.get("date_column", "Date").strip().lower()
versions_column = config.data.get("versions_column").strip().lower()

View File

@@ -1,6 +1,5 @@
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
CYCLE_PATTERN = re.compile(r"^(\d+\.\d+)/$")
@@ -9,10 +8,9 @@ DATE_AND_VERSION_PATTERN = re.compile(r"^(\d{4})/(\d{2})/(\d{2})\s+:\s+(\d+\.\d+
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
# First, get all minor releases from the download page
download = http.fetch_url(config.url)
download_soup = BeautifulSoup(download.text, features="html5lib")
download_html = http.fetch_html(config.url)
minor_versions = []
for link in download_soup.select("a"):
for link in download_html.select("a"):
minor_version_match = CYCLE_PATTERN.match(link.attrs["href"])
if not minor_version_match:
continue

View File

@@ -1,12 +1,10 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
page = http.fetch_url(config.url)
page_soup = BeautifulSoup(page.text, features="html5lib")
html = http.fetch_html(config.url)
for release_table in page_soup.find("div", class_="ibm-container-body").find_all("table", class_="ibm-data-table ibm-grid"):
for release_table in html.find("div", class_="ibm-container-body").find_all("table", class_="ibm-data-table ibm-grid"):
for row in release_table.find_all("tr")[1:]: # for all rows except the header
cells = row.find_all("td")
version = cells[0].text.strip("AIX ").replace(' TL', '.')

View File

@@ -1,6 +1,5 @@
import logging
import yaml
from common import dates, endoflife, http, releasedata
"""Fetch version data for Kuma from https://raw.githubusercontent.com/kumahq/kuma/master/versions.yml.
@@ -12,8 +11,7 @@ EOL_FIELD = 'endOfLifeDate'
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
yml_response = http.fetch_url(config.url)
versions_data = yaml.safe_load(yml_response.text)
versions_data = http.fetch_yaml(config.url)
# Iterate through the versions and their associated dates
for version_info in versions_data:

View File

@@ -1,16 +1,14 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches LibreOffice versions from https://downloadarchive.documentfoundation.org/libreoffice/old/"""
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for table in soup.find_all("table"):
for table in html.find_all("table"):
for row in table.find_all("tr")[1:]:
cells = row.find_all("td")
if len(cells) < 4:

View File

@@ -1,5 +1,4 @@
import re
import xml.dom.minidom
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
@@ -11,8 +10,7 @@ ANNOUNCEMENT_PATTERN = re.compile(r"includes\s+the\s+following\s+changes", re.IG
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
rss = xml.dom.minidom.parseString(response.text)
rss = http.fetch_xml(config.url)
for item in rss.getElementsByTagName("entry"):
content = item.getElementsByTagName("content")[0].firstChild.nodeValue

View File

@@ -1,6 +1,5 @@
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches Lua releases from lua.org."""
@@ -10,9 +9,8 @@ VERSION_PATTERN = re.compile(r"(?P<version>\d+\.\d+\.\d+),\s*released\s*on\s*(?P
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
page = http.fetch_url(config.url)
soup = BeautifulSoup(page.text, 'html.parser')
page_text = soup.text # HTML is broken, no way to parse it with beautifulsoup
html = http.fetch_html(config.url, features = 'html.parser')
page_text = html.text # HTML is broken, no way to parse it with beautifulsoup
for release_match in RELEASED_AT_PATTERN.finditer(page_text):
release = release_match.group('release')

View File

@@ -9,7 +9,7 @@ for config in endoflife.list_configs_from_argv():
while True:
url = f"https://search.maven.org/solrsearch/select?q=g:{group_id}+AND+a:{artifact_id}&core=gav&wt=json&start={start}&rows=100"
data = http.fetch_url(url).json()
data = http.fetch_json(url)
for row in data["response"]["docs"]:
version_match = config.first_match(row["v"])

View File

@@ -1,16 +1,14 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches NetBSD versions and EOL information from https://www.netbsd.org/."""
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for row in soup.select('table tbody tr'):
for row in html.select('table tbody tr'):
cells = [cell.get_text(strip=True) for cell in row.select('td')]
version = cells[0]

View File

@@ -2,7 +2,7 @@ from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
data = http.fetch_url(f"https://registry.npmjs.org/{config.url}").json()
data = http.fetch_json(f"https://registry.npmjs.org/{config.url}")
for version_str in data["versions"]:
version_match = config.first_match(version_str)
if version_match:

View File

@@ -4,8 +4,8 @@ from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
url = f"https://portal.nutanix.com/api/v1/eol/find?type={config.url}"
data = http.fetch_url(url).json()
data = http.fetch_json(f"https://portal.nutanix.com/api/v1/eol/find?type={config.url}")
for version_data in data["contents"]:
release_name = '.'.join(version_data["version"].split(".")[:2])

View File

@@ -4,7 +4,7 @@ from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
versions = http.fetch_url(config.url).json()
versions = http.fetch_json(config.url)
for version in versions:
name = version['version']

View File

@@ -1,4 +1,3 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches versions from Plesk's change log.
@@ -8,10 +7,9 @@ there is no entry for GA of version 18.0.18 and older."""
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for release in soup.find_all("div", class_="changelog-entry--obsidian"):
for release in html.find_all("div", class_="changelog-entry--obsidian"):
version = release.h2.text.strip()
if not version.startswith('Plesk Obsidian 18'):
continue

View File

@@ -2,7 +2,7 @@ from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
data = http.fetch_url(f"https://pypi.org/pypi/{config.url}/json").json()
data = http.fetch_json(f"https://pypi.org/pypi/{config.url}/json")
for version_str in data["releases"]:
version_match = config.first_match(version_str)

View File

@@ -1,6 +1,5 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches Amazon RDS versions from the version management pages on AWS docs.
@@ -11,10 +10,9 @@ in the third column (usually named 'RDS release date').
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for table in soup.find_all("table"):
for table in html.find_all("table"):
for row in table.find_all("tr"):
columns = row.find_all("td")
if len(columns) <= 3:

View File

@@ -1,16 +1,14 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches RedHat JBoss EAP version data for JBoss 7"""
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for h4 in soup.find_all("h4"):
for h4 in html.find_all("h4"):
title = h4.get_text(strip=True)
if not title.startswith("7."):
continue

View File

@@ -1,5 +1,4 @@
import re
from xml.dom.minidom import parseString
from common import dates, endoflife, http, releasedata
@@ -7,9 +6,8 @@ from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
xml = http.fetch_xml(config.url)
xml = parseString(response.text)
versioning = xml.getElementsByTagName("metadata")[0].getElementsByTagName("versioning")[0]
latest_str = versioning.getElementsByTagName("latest")[0].firstChild.nodeValue

View File

@@ -1,6 +1,5 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches Satellite versions from access.redhat.com.
@@ -9,10 +8,9 @@ A few of the older versions, such as 'Satellite 6.1 GA Release (Build 6.1.1)', w
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for table in soup.findAll("tbody"):
for table in html.findAll("tbody"):
for tr in table.findAll("tr"):
td_list = tr.findAll("td")

View File

@@ -22,7 +22,7 @@ for config in endoflife.list_configs_from_argv():
name = urllib.parse.quote(config.url)
mapping = Mapping(config.data["fields"])
data = http.fetch_url('https://access.redhat.com/product-life-cycles/api/v1/products?name=' + name).json()
data = http.fetch_json('https://access.redhat.com/product-life-cycles/api/v1/products?name=' + name)
for version in data["data"][0]["versions"]:
version_name = version["name"]

View File

@@ -1,6 +1,5 @@
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
# https://regex101.com/r/877ibq/1
@@ -8,10 +7,9 @@ VERSION_PATTERN = re.compile(r"RHEL (?P<major>\d)(\. ?(?P<minor>\d+))?(( Update
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for tr in soup.findAll("tr"):
for tr in html.findAll("tr"):
td_list = tr.findAll("td")
if len(td_list) == 0:
continue

View File

@@ -1,14 +1,12 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for tr in soup.findAll("tr"):
for tr in html.findAll("tr"):
td_list = tr.findAll("td")
if len(td_list) == 0:
continue

View File

@@ -2,7 +2,6 @@ import logging
import re
from datetime import date, datetime, time, timezone
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Detect new models and aggregate EOL data for Samsung Mobile devices.
@@ -27,12 +26,11 @@ for config in endoflife.list_configs_from_argv():
release.set_eol(eol)
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
sections = config.data.get("sections", {})
for update_cadence, title in sections.items():
models_list = soup.find(string=lambda text, search=title: search in text if text else False).find_next("ul")
models_list = html.find(string=lambda text, search=title: search in text if text else False).find_next("ul")
for item in models_list.find_all("li"):
models = item.text.replace("Enterprise Models:", "")

View File

@@ -1,14 +1,12 @@
import logging
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
products_table = soup.find("tbody", id="productSupportLifecycle")
products_table = html.find("tbody", id="productSupportLifecycle")
sles_header_rows = products_table.find_all("tr", class_="row", attrs={"data-productfilter": "SUSE Linux Enterprise Server"})
# Extract rows' IDs to find related sub-rows with details (normally hidden until a user expands a section)

View File

@@ -1,6 +1,5 @@
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
VERSION_DATE_PATTERN = re.compile(r"Splunk Enterprise (?P<version>\d+\.\d+(?:\.\d+)*) was (?:first )?released on (?P<date>\w+\s\d\d?,\s\d{4})\.", re.MULTILINE)
@@ -32,10 +31,9 @@ def get_latest_minor_versions(versions: list[str]) -> list[str]:
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
main = http.fetch_url(config.url)
soup = BeautifulSoup(main.text, features="html5lib")
html = http.fetch_html(config.url)
all_versions = [option.attrs['value'] for option in soup.select("select#version-select > option")]
all_versions = [option.attrs['value'] for option in html.select("select#version-select > option")]
all_versions = [v for v in all_versions if v != "DataMonitoringAppPreview"]
# Latest minor release notes contains release notes for all previous minor versions.

View File

@@ -2,7 +2,7 @@ from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
data = http.fetch_url(config.url).json()
data = http.fetch_json(config.url)
for v in data:
if v['type'] == 'development':
continue

View File

@@ -1,4 +1,3 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches the Unity LTS releases from the Unity website. Non-LTS releases are not listed there, so this automation
@@ -19,10 +18,9 @@ The script will need to be updated if someday those conditions are not met."""
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for release in soup.find_all('div', class_='component-releases-item__show__inner-header'):
for release in html.find_all('div', class_='component-releases-item__show__inner-header'):
version = release.find('h4').find('span').text
date = dates.parse_datetime(release.find('time').attrs['datetime'])
product_data.declare_version(version, date)

View File

@@ -1,14 +1,12 @@
import re
import mwparserfromhell
from common import dates, endoflife, http, releasedata
DATE_PATTERN = re.compile(r"\d{4}-\d{2}-\d{2}")
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
wikicode = mwparserfromhell.parse(response.text)
wikicode = http.fetch_markdown(config.url)
for tr in wikicode.ifilter_tags(matches=lambda node: node.tag == "tr"):
items = tr.contents.filter_tags(matches=lambda node: node.tag == "td")

View File

@@ -1,7 +1,6 @@
import logging
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches Veeam products versions from https://www.veeam.com.
@@ -12,12 +11,11 @@ such as `https://www.veeam.com/kb2680`.
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
version_column = config.data.get("version_column", "Build Number").lower()
date_column = config.data.get("date_column", "Release Date").lower()
for table in soup.find_all("table"):
for table in html.find_all("table"):
headers = [header.get_text().strip().lower() for header in table.find("tr").find_all("td")]
if version_column not in headers or date_column not in headers:
logging.warning("Skipping table with headers %s as it does not contains '%s' or '%s'",

View File

@@ -1,7 +1,6 @@
import logging
import re
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
"""Fetches releases from VirtualBox download page."""
@@ -10,10 +9,10 @@ EOL_REGEX = re.compile(r"^\(no longer supported, support ended (?P<value>\d{4}/\
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for li in soup.select_one("#DownloadVirtualBoxOldBuilds + ul").find_all("li"):
for li in html.select_one("#DownloadVirtualBoxOldBuilds + ul").find_all("li"):
li_text = li.find("a").text.strip()
release_match = config.first_match(li_text)

View File

@@ -1,12 +1,10 @@
from bs4 import BeautifulSoup
from common import dates, endoflife, http, releasedata
for config in endoflife.list_configs_from_argv():
with releasedata.ProductData(config.product) as product_data:
response = http.fetch_url(config.url)
soup = BeautifulSoup(response.text, features="html5lib")
html = http.fetch_html(config.url)
for table in soup.find_all("table"):
for table in html.find_all("table"):
headers = [th.get_text().strip().lower() for th in table.find_all("th")]
if "version" not in headers or "release date" not in headers:
continue