diff --git a/src/common/endoflife.py b/src/common/endoflife.py
index 8fab08bb..ae246ff8 100644
--- a/src/common/endoflife.py
+++ b/src/common/endoflife.py
@@ -18,19 +18,20 @@
 PRODUCTS_PATH = Path(os.environ.get("PRODUCTS_PATH", "website/products"))
 
 
 class AutoConfig:
-    def __init__(self, product: str, config: dict) -> None:
+    def __init__(self, product: str, data: dict) -> None:
         self.product = product
-        self.method = next(key for key in config if key not in ("template", "regex", "regex_exclude"))
-        self.url = config[self.method]
-        self.version_template = Template(config.get("template", DEFAULT_VERSION_TEMPLATE))
+        self.data = data
+        self.method = next(key for key in data if key not in ("template", "regex", "regex_exclude"))
+        self.url = data[self.method]
+        self.version_template = Template(data.get("template", DEFAULT_VERSION_TEMPLATE))
         self.script = f"{self.url}.py" if self.method == "custom" else f"{self.method}.py"
 
-        regexes_include = config.get("regex", DEFAULT_VERSION_REGEX)
+        regexes_include = data.get("regex", DEFAULT_VERSION_REGEX)
         regexes_include = regexes_include if isinstance(regexes_include, list) else [regexes_include]
         self.include_version_patterns = [re.compile(r) for r in regexes_include]
 
-        regexes_exclude = config.get("regex_exclude", [])
+        regexes_exclude = data.get("regex_exclude", [])
         regexes_exclude = regexes_exclude if isinstance(regexes_exclude, list) else [regexes_exclude]
         self.exclude_version_patterns = [re.compile(r) for r in regexes_exclude]
 
diff --git a/src/common/releasedata.py b/src/common/releasedata.py
index 3df359a7..a512697c 100644
--- a/src/common/releasedata.py
+++ b/src/common/releasedata.py
@@ -28,12 +28,18 @@ class ProductRelease:
     def name(self) -> str:
         return self.data["name"]
 
+    def set_release_date(self, new_value: datetime) -> None:
+        self.set_field("releaseDate", new_value)
+
     def set_support(self, new_value: datetime | bool) -> None:
         self.set_field("support", new_value)
 
     def set_eol(self, new_value: datetime | bool) -> None:
         self.set_field("eol", new_value)
 
+    def set_extended_support(self, new_value: datetime | bool) -> None:
+        self.set_field("extendedSupport", new_value)
+
     def set_field(self, field: str, new_value: any) -> None:
         new_value = new_value.strftime("%Y-%m-%d") if isinstance(new_value, datetime) else new_value
         old_value = self.data.get(field, None)
diff --git a/src/release_table.py b/src/release_table.py
new file mode 100644
index 00000000..21fa38c6
--- /dev/null
+++ b/src/release_table.py
@@ -0,0 +1,66 @@
+"""Fetch release-level data from an HTML table in a web page.
+
+This script works based on a definition provided in the product's frontmatter to locate the table and extract the
+necessary information. Available configuration options are:
+
+- selector: A CSS selector used to locate one or more tables in the page.
+- headers_selector: A CSS selector used to locate the table's headers (column names).
+- rows_selector: A CSS selector used to locate the table's rows.
+- mapping: A dictionary that maps release fields to the table's column names. Column names are matched
+  case-insensitively; it must contain a 'releaseCycle' entry, which identifies the release each row belongs to.
+
+Supported CSS selectors are defined by BeautifulSoup and documented on its website. For more information, see
+https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=selector#css-selectors.
+
+Column data types are auto-detected. The currently supported types are 'date' (parsed using the dates module) and
+string.
+"""
+
+import sys
+
+from bs4 import BeautifulSoup
+from common import dates, endoflife, http, releasedata
+
+METHOD = "release_table"
+
+p_filter = sys.argv[1] if len(sys.argv) > 1 else None
+m_filter = sys.argv[2] if len(sys.argv) > 2 else None
+for config in endoflife.list_configs(p_filter, METHOD, m_filter):
+    with releasedata.ProductData(config.product) as product_data:
+        response = http.fetch_url(config.url)
+        soup = BeautifulSoup(response.text, features="html5lib")
+
+        for table in soup.select(config.data["selector"]):
+            headers = [th.get_text().strip().lower() for th in table.select(config.data["headers_selector"])]
+
+            # Resolve each mapped field to its column position once per table.
+            index_by_target = {}
+            for target, column in config.data["mapping"].items():
+                column_name = str(column).lower()
+                try:
+                    index_by_target[target] = headers.index(column_name)
+                except ValueError as e:
+                    # A bare "x is not in list" gives no hint about which product or table is misconfigured.
+                    msg = f"{config.product}: column '{column_name}' (for '{target}') not found in {headers}"
+                    raise ValueError(msg) from e
+
+            # Rows with fewer cells than the right-most mapped column requires are skipped.
+            min_column_count = max(index_by_target.values()) + 1
+            for row in table.select(config.data["rows_selector"]):
+                cells = row.find_all("td")
+                if len(cells) < min_column_count:
+                    continue
+
+                release_cycle = cells[index_by_target["releaseCycle"]].get_text().strip()
+                release = product_data.get_release(release_cycle)
+                # NOTE(review): 'releaseCycle' itself is also written back as a field below - confirm this is intended.
+                for target, index in index_by_target.items():
+                    value_str = cells[index].get_text().strip()
+
+                    # Type auto-detection: anything that does not parse as a date is kept as a plain string.
+                    try:
+                        value = dates.parse_date(value_str)
+                    except ValueError:
+                        value = value_str
+
+                    release.set_field(target, value)