diff --git a/src/common/dates.py b/src/common/dates.py index 69da0bfb..16109ce9 100644 --- a/src/common/dates.py +++ b/src/common/dates.py @@ -56,6 +56,8 @@ def parse_datetime(text: str, formats: list[str] = frozenset([ "%Y/%m/%d %H:%M:%S", # 2023/05/01 08:32:34 "%a %d %b %Y %H:%M:%S %Z", # Wed 01 Jan 2020 00:00:00 GMT "%a %d %b %Y %H:%M:%S %z", # Wed 01 Jan 2020 00:00:00 -0400 + "%a %b %d %H:%M:%S %Z %Y", # Wed Jan 01 00:00:00 UTC 2020 + "%a %b %d %H:%M:%S %z %Y", # Wed Jan 01 00:00:00 -0400 2020 "%Y%m%d%H%M%S", # 20230501083234 ]), to_utc: bool = True) -> datetime: """Parse a given text representing a datetime using a list of formats, diff --git a/src/release_table.py b/src/release_table.py index 140f377d..6c1fbfc9 100644 --- a/src/release_table.py +++ b/src/release_table.py @@ -13,9 +13,9 @@ from liquid import Template This script works based on a definition provided in the product's frontmatter to locate the table and extract the necessary information. Available configuration options are: -- selector (mandatory, no default): A CSS selector used to locate one or more tables in the page. -- header_selector (mandatory, default = thead tr): A CSS selector used to locate the table's header row. -- rows_selector (mandatory, default = tbody tr): A CSS selector used to locate the table's rows. +- selector (optional, default = table): A CSS selector used to locate one or more tables in the page. +- header_selector (optional, default = thead tr): A CSS selector used to locate the table's header row. +- rows_selector (optional, default = tbody tr): A CSS selector used to locate the table's rows. - user_agent (optional, default = ): A user agent string to use when fetching the page. Unused when render_javascript is true. - render_javascript (optional, default = false): A boolean value indicating whether to render JavaScript on the page. @@ -61,7 +61,6 @@ fields: Supported CSS selectors are defined by BeautifulSoup and documented on its website. 
For more information, see https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=selector#css-selectors.""" -METHOD = "release_table" SUPPORTED_TYPES = ["date", "string", "range", "identifier"] STRING_TYPES = ["string", "identifier"] STRING_FIELDS = ["releaseCycle", "releaseLabel"] @@ -160,15 +159,19 @@ class Field: config = config_from_argv() with ProductData(config.product) as product_data: user_agent: str = config.data.get("user_agent", http.ENDOFLIFE_BOT_USER_AGENT) + render_js: bool = config.data.get("render_javascript", False) render_js_wait_until: str | None = config.data.get("render_javascript_wait_until", None) render_js_wait_for: str | None = config.data.get("render_javascript_wait_for", None) render_js_click_selector: str | None = config.data.get("render_javascript_click_selector", None) header_row_selector: str = config.data.get("header_selector", "thead tr") - remove_if_undefined_field: str | None = config.data.get("remove_if_undefined", None) + + table_selector: str = config.data.get("selector", "table") rows_selector: str = config.data.get("rows_selector", "tbody tr") cells_selector: str = "td, th" + remove_if_undefined_field: str | None = config.data.get("remove_if_undefined", None) + release_cycle_field = Field("releaseCycle", config.data["fields"].pop("releaseCycle")) fields = [Field(name, definition) for name, definition in config.data["fields"].items()] @@ -179,7 +182,7 @@ with ProductData(config.product) as product_data: response_text = http.fetch_url(config.url, user_agent=user_agent).text soup = BeautifulSoup(response_text, features="html5lib") - for table in soup.select(config.data["selector"]): + for table in soup.select(table_selector): header_row = table.select_one(header_row_selector) if not header_row: logging.info(f"skipping table with attributes {table.attrs}: no header row found") diff --git a/src/version_table.py b/src/version_table.py new file mode 100644 index 00000000..7c60a606 --- /dev/null +++ 
b/src/version_table.py
@@ -0,0 +1,79 @@
+import logging
+
+from common import dates, http
+from common.releasedata import ProductData, config_from_argv
+
+"""Fetch version-level data from an HTML table in a web page.
+
+This script works based on a definition provided in the product's frontmatter to locate the table and extract the
+necessary information. Available configuration options are:
+
+- selector (optional, default = table): A CSS selector used to locate one or more tables in the page.
+- header_selector (optional, default = thead tr): A CSS selector used to locate the table's header row.
+- rows_selector (optional, default = tbody tr): A CSS selector used to locate the table's rows.
+- name_column (mandatory): The name of the column containing the version names.
+- date_column (mandatory): The name of the column containing the version dates.
+
+Column names are matched against the header cells' text, ignoring case and surrounding whitespace. Tables without a
+header row or without both columns are skipped; rows that are too short, whose version name is filtered out, or whose
+processing raises a ValueError are skipped individually.
+
+Supported CSS selectors are defined by BeautifulSoup and documented on its website. For more information, see
+https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=selector#css-selectors.
+"""
+
+config = config_from_argv()
+with ProductData(config.product) as product_data:
+    table_selector: str = config.data.get("selector", "table")
+    header_row_selector: str = config.data.get("header_selector", "thead tr")
+    rows_selector: str = config.data.get("rows_selector", "tbody tr")
+    cells_selector: str = "td, th"
+
+    # Normalized the same way as the header cells below, so the lookup ignores case and surrounding whitespace.
+    version_name_column = config.data["name_column"].strip().lower()
+    version_date_column = config.data["date_column"].strip().lower()
+
+    html = http.fetch_html(config.url)
+
+    for table in html.select(table_selector):
+        header_row = table.select_one(header_row_selector)
+        if not header_row:
+            logging.info(f"skipping table with attributes {table.attrs}: no header row found")
+            continue
+
+        headers = [th.get_text().strip().lower() for th in header_row.select(cells_selector)]
+        logging.info(f"processing table with headers {headers}")
+
+        # list.index raises ValueError when a configured column is absent: this is not the table we want.
+        try:
+            version_name_index = headers.index(version_name_column)
+            version_date_index = headers.index(version_date_column)
+        except ValueError as e:
+            logging.info(f"skipping table with headers {headers}: {e}")
+            continue
+
+        # A row must be long enough to contain both the name and the date cells.
+        min_columns_count = max(version_name_index, version_date_index) + 1
+
+        for row in table.select(rows_selector):
+            cells = [cell.get_text().strip() for cell in row.select(cells_selector)]
+            if len(cells) < min_columns_count:
+                logging.debug(f"skipping row {cells}: not enough columns")
+                continue
+
+            raw_version_name = cells[version_name_index]
+            version_match = config.first_match(raw_version_name)
+            if not version_match:
+                logging.debug(f"skipping row {cells}: invalid version '{raw_version_name}', "
+                              f"should match one of {config.include_version_patterns} "
+                              f"and not match all of {config.exclude_version_patterns}")
+                continue
+
+            # Handle errors per row: one bad cell (presumably an unparsable date - confirm that
+            # dates.parse_datetime raises ValueError) must not discard the remaining rows of the table.
+            try:
+                version_name = config.render(version_match)
+                version_date = dates.parse_datetime(cells[version_date_index])
+                product_data.declare_version(version_name, version_date)
+            except ValueError as e:
+                logging.info(f"skipping row {cells}: {e}")