Patch summary (free-text preamble; patch tools skip everything before the first "diff --git" line).

  * src/common/dates.py: parse_datetime() drops the replace() calls for "th, ", "st, ", "rd, ",
    the "Augu "/"augu " repair and "*" from its text clean-up chain.
  * src/release_table.py: adds a Field class (column lookup, type, regex / regex_exclude, template)
    and makes the script read config.data["fields"] instead of config.data["mapping"].

NOTE(review): this patch was rebuilt from a whitespace-collapsed copy. Line breaks and indentation
were reconstructed from Python syntax and the hunk line counts, and the "<value>" group names in
DEFAULT_REGEX / DEFAULT_RELEASE_REGEX were restored (they are required by the "{{value}}" template
rendered from match.groupdict()). Verify against the target tree before applying; if context
whitespace differs, "git apply --ignore-whitespace" may be needed.

diff --git a/src/common/dates.py b/src/common/dates.py
index 7ba54371..db0a42f6 100644
--- a/src/common/dates.py
+++ b/src/common/dates.py
@@ -43,16 +43,10 @@ def parse_datetime(text: str, formats: list[str] = frozenset([
     # so that we don't have to deal with some special cases in formats
     text = (
         text.strip()
-        .replace("th, ", " ")  # November 10th, 2015 -> November 10, 2015
-        .replace("st, ", " ")  # March 31st, 2015 -> March 31, 2015
-        .replace("Augu ", "August ")  # 17 Augu 2023 -> 17 August 2023 - revert after st replacement
-        .replace("augu ", "August ")  # 17 Augu 2023 -> 17 august 2023 - revert after st replacement
-        .replace("rd, ", " ")  # March 3rd, 2015 -> March 3, 2015
         .replace(", ", " ")  # November 10, 2015 -> November 10 2015
         .replace(". ", " ")  # November 10. 2015 -> November 10 2015
         .replace("(", "")  # (November 10 2015) -> November 10 2015)
         .replace(")", "")  # (November 10 2015) -> (November 10 2015
-        .replace("*", "")  # November 10 2015* -> November 10 2015
     )
     for fmt in formats:
         try:
diff --git a/src/release_table.py b/src/release_table.py
index b2641b58..fb854c04 100644
--- a/src/release_table.py
+++ b/src/release_table.py
@@ -1,7 +1,10 @@
+import re
 import sys
+from datetime import datetime
 
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, PageElement
 from common import dates, endoflife, http, releasedata
+from liquid import Template
 
 """Fetch release-level data from an HTML table in a web page.
 
@@ -27,6 +30,72 @@ string."""
 
 METHOD = "release_table"
 
+
+class Field:
+    SUPPORTED_TYPES = ["date", "string"]
+    DATE_FIELDS = ["releaseDate", "support", "eol", "extendedSupport"]
+    DEFAULT_REGEX = r"^(?P<value>.+)$"
+    DEFAULT_TEMPLATE = "{{value}}"
+    DEFAULT_RELEASE_REGEX = r"^v?(?P<value>\d+(\.\d+)?)$"
+
+    def __init__(self, name: str, definition: str | dict, columns: list[str]) -> None:
+        if isinstance(definition, str):
+            definition = {"column": definition}
+
+        self.name = name
+        if self.name == "releaseCycle":
+            definition["type"] = "string"
+            definition["regex"] = definition.get("regex", [self.DEFAULT_RELEASE_REGEX])
+            definition["template"] = definition.get("template", self.DEFAULT_TEMPLATE)
+
+        self.column = definition["column"].lower()
+        if self.column not in columns:
+            msg = f"column {self.column} not found in {columns}"
+            raise ValueError(msg)
+        self.column_index = columns.index(self.column)
+
+        self.type = definition.get("type", "string")
+        if self.name in self.DATE_FIELDS:
+            self.type = "date"  # override type for known date fields
+        elif self.type not in self.SUPPORTED_TYPES:
+            msg = f"unsupported type: {self.type} for field {self.name}"
+            raise ValueError(msg)
+
+        regex = definition.get("regex", [self.DEFAULT_REGEX])
+        regex = regex if isinstance(regex, list) else [regex]
+        self.include_version_patterns = [re.compile(r, re.MULTILINE) for r in regex]
+
+        exclude_regex = definition.get("regex_exclude", [])
+        exclude_regex = exclude_regex if isinstance(exclude_regex, list) else [exclude_regex]
+        self.exclude_version_patterns = [re.compile(r, re.MULTILINE) for r in exclude_regex]
+
+        self.template = Template(definition.get("template", self.DEFAULT_TEMPLATE)) \
+            if "template" in definition or regex else None
+
+    def extract_from(self, cells: list[PageElement]) -> str | datetime | None:
+        raw_value = cells[self.column_index].get_text(strip=True)
+
+        for exclude_pattern in self.exclude_version_patterns:
+            if exclude_pattern.match(raw_value):
+                return None
+
+        for include_pattern in self.include_version_patterns:
+            match = include_pattern.match(raw_value)
+            if not match:
+                continue
+
+            str_value = self.template.render(**match.groupdict()) if self.template else raw_value
+            if self.type == "date":
+                return dates.parse_date(str_value)
+            return str_value
+
+        if self.name == "releaseCycle":
+            return None  # skipping entire rows is allowed
+
+        msg = f"{raw_value} is not matching any regex in {self.include_version_patterns}"
+        raise ValueError(msg)
+
+
 p_filter = sys.argv[1] if len(sys.argv) > 1 else None
 m_filter = sys.argv[2] if len(sys.argv) > 2 else None
 for config in endoflife.list_configs(p_filter, METHOD, m_filter):
@@ -39,31 +108,20 @@ for config in endoflife.list_configs(p_filter, METHOD, m_filter):
             message = f"No table found for {config.product} with selector {config.data['selector']}"
             raise ValueError(message)
 
-        index_by_target = {}
         headers = [th.get_text().strip().lower() for th in table.select(config.data["headers_selector"])]
-        for target, column in config.data["mapping"].items():
-            index_by_target[target] = headers.index(str(column).lower())
+        release_cycle_field = Field("releaseCycle", config.data["fields"].pop("releaseCycle"), headers)
+        fields = [Field(name, definition, headers) for name, definition in config.data["fields"].items()]
+        min_column_count = max([f.column_index for f in fields] + [release_cycle_field.column_index]) + 1
 
-        min_column_count = max(index_by_target.values()) + 1
-        release_cycle_index = index_by_target.pop("releaseCycle")
         for row in table.select(config.data["rows_selector"]):
-            cells = row.findAll("td")
-            if len(cells) < min_column_count:
+            row_cells = row.findAll("td")
+            if len(row_cells) < min_column_count:
                 continue
 
-            release_cycle = cells[release_cycle_index].get_text().strip()
-            release_cycle_match = config.first_match(release_cycle)
-            if not release_cycle_match:
+            release_cycle = release_cycle_field.extract_from(row_cells)
+            if not release_cycle:
                 continue
 
-            release = product_data.get_release(config.render(release_cycle_match))
-            release.set_field("releaseCycle", release.name())
-            for target, index in index_by_target.items():
-                value_str = cells[index].get_text().strip()
-
-                try:
-                    value = dates.parse_date(value_str)
-                except ValueError:
-                    value = value_str
-
-                release.set_field(target, value)
+            release = product_data.get_release(release_cycle)
+            for field in fields:
+                release.set_field(field.name, field.extract_from(row_cells))