diff --git a/src/common/dates.py b/src/common/dates.py
index 17c71176..16a2c708 100644
--- a/src/common/dates.py
+++ b/src/common/dates.py
@@ -25,6 +25,7 @@ def parse_month_year_date(text: str, formats: list[str] = frozenset([
     "%B %Y",  # January 2020
     "%b %Y",  # Jan 2020
     "%Y-%m",  # 2020-01
+    "%Y/%m",  # 2020/01
     "%m-%Y",  # 01-2020
     "%m/%Y",  # 01/2020
 ])) -> datetime:
diff --git a/src/virtualbox.py b/src/virtualbox.py
new file mode 100644
index 00000000..0e8a4ab6
--- /dev/null
+++ b/src/virtualbox.py
@@ -0,0 +1,48 @@
+"""Fetches releases from VirtualBox download page."""
+
+import logging
+import re
+
+from bs4 import BeautifulSoup
+from common import dates, http, releasedata
+
+# Matches the link text of a release entry, e.g. "VirtualBox 6.1"; the version is captured as "value".
+RELEASE_REGEX = re.compile(r"^VirtualBox (?P<value>\d+\.\d+)$")
+# Matches the lower-cased note "(no longer supported, support ended YYYY/MM)"; the date is captured as "value".
+EOL_REGEX = re.compile(r"^\(no longer supported, support ended (?P<value>\d{4}/\d{2})\)$")
+
+with releasedata.ProductData("virtualbox") as product_data:
+    response = http.fetch_url("https://www.virtualbox.org/wiki/Download_Old_Builds")
+    soup = BeautifulSoup(response.text, features="html5lib")
+
+    for li in soup.select_one("#DownloadVirtualBoxOldBuilds + ul").find_all("li"):
+        # find() returns None when the tag is absent; skip such entries instead of crashing.
+        link = li.find("a")
+        if link is None:
+            logging.info(f"Skipping '{li.text.strip()}': no link found")
+            continue
+        li_text = link.text.strip()
+
+        release_match = RELEASE_REGEX.match(li_text)
+        if not release_match:
+            logging.info(f"Skipping '{li_text}': does not match {RELEASE_REGEX}")
+            continue
+
+        release_name = release_match.group("value")
+        release = product_data.get_release(release_name)
+
+        # An entry without an <em> note has no end-of-support date to record.
+        em = li.find("em")
+        if em is None:
+            logging.info(f"Ignoring '{li_text}': no end-of-support note found")
+            continue
+
+        # Lower-cased because EOL_REGEX is written in lower case.
+        eol_text = em.text.lower().strip()
+        eol_match = EOL_REGEX.match(eol_text)
+        if not eol_match:
+            logging.info(f"Ignoring '{eol_text}': does not match {EOL_REGEX}")
+            continue
+
+        eol_date_str = eol_match.group("value")
+        eol_date = dates.parse_month_year_date(eol_date_str)
+        release.set_eol(eol_date)