From 356269d692b406f6df754d5126bfb69cd89b1886 Mon Sep 17 00:00:00 2001
From: Marc Wrobel <marc.wrobel@gmail.com>
Date: Sat, 17 Feb 2024 11:23:07 +0100
Subject: [PATCH] [release-table] Support parsing multiple tables (#312)

It has been observed that in many cases, such as for amazon-neptune or amazon-corretto, data is split in two tables: supported and unsupported. Those table usually have similar column names, so by allowing mutiple tables the configuration is simplified.

Also fix script's documentation and improve logging.
---
 src/release_table.py | 110 ++++++++++++++++++++++---------------------
 1 file changed, 57 insertions(+), 53 deletions(-)
diff --git a/src/release_table.py b/src/release_table.py
index 4b47cd35..536e034e 100644
--- a/src/release_table.py
+++ b/src/release_table.py
@@ -1,8 +1,9 @@
+import logging
 import re
 import sys
 from datetime import datetime
 
-from bs4 import BeautifulSoup, PageElement
+from bs4 import BeautifulSoup
 from common import dates, endoflife, http, releasedata
 from liquid import Template
 
@@ -11,58 +12,53 @@ from liquid import Template
 This script works based on a definition provided in the product's frontmatter to locate the table and extract the
 necessary information. Available configuration options are:
 
-- regex: A regular expression used to match release based on their names (aka releaseCycle).
-  Releases not matching this expression are ignored. Default value is defined in endoflife.py (DEFAULT_VERSION_REGEX).
-- regex_exclude: A regular expression used to exclude matching releases based on their names  (aka releaseCycle).
-  Releases matching this expression are ignored, even if they match the above regex. This is empty by default.
-- template: A liquid template used to render the release name. The template is rendered using the matched groups from
-  the regex. Default value is defined in endoflife.py (DEFAULT_VERSION_TEMPLATE).
 - selector: A CSS selector used to locate one or more tables in the page.
 - headers_selector: A CSS selector used to locate the table's headers (column names).
 - rows_selector: A CSS selector used to locate the table's rows.
-- mapping: A dictionary that maps release fields to the table's columns names. All identifiers are case-insensitive.
+- fields: A dictionary that maps release fields to the table's columns. Field definition include:
+    - column (mandatory): The name of the column in the table. This is case-insensitive.
+    - type (mandatory, default = string): The type of the field. Supported types are listed in SUPPORTED_TYPES. If the
+      field is one of the known date fields (DATE_FIELDS), the type is automatically set to 'date' if not provided.
+    - regex (mandatory, default = [DEFAULT_REGEX]): A regular expression, or a list of regular expressions, used to
+      validate allowed values. Note that default value for the releaseCycle field is not DEFAULT_REGEX, but
+      DEFAULT_RELEASE_REGEX.
+    - regex_exclude (mandatory, default = []): A regular expression, or a list of regular expressions, used to exclude
+      values even if they match any regular expression in 'regex'.
+    - template (mandatory, default = DEFAULT_TEMPLATE): A liquid template used to clean up the value using the matched
+      groups from a 'regex'.
 
 Supported CSS selectors are defined by BeautifulSoup and documented on its website. For more information, see
-https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=selector#css-selectors.
-
-Column data types are auto-detected. The currently supported types are 'date' (parsed using the dates module) and
-string."""
+https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=selector#css-selectors."""
 
 METHOD = "release_table"
+SUPPORTED_TYPES = ["date", "month_year_date", "string"]
+DATE_TYPES = ["date", "month_year_date"]
+DATE_FIELDS = ["releaseDate", "support", "eol", "extendedSupport"]
+DEFAULT_REGEX = r"^(?P<value>.+)$"
+DEFAULT_TEMPLATE = "{{value}}"
+DEFAULT_RELEASE_REGEX = r"^v?(?P<value>\d+(\.\d+)*)$"
 
 
 class Field:
-    SUPPORTED_TYPES = ["date", "month_year_date", "string"]
-    DATE_TYPES = ["date", "month_year_date"]
-    DATE_FIELDS = ["releaseDate", "support", "eol", "extendedSupport"]
-    DEFAULT_REGEX = r"^(?P<value>.+)$"
-    DEFAULT_TEMPLATE = "{{value}}"
-    DEFAULT_RELEASE_REGEX = r"^v?(?P<value>\d+(\.\d+)?)$"
-
-    def __init__(self, name: str, definition: str | dict, columns: list[str]) -> None:
+    def __init__(self, name: str, definition: str | dict) -> None:
         if isinstance(definition, str):
             definition = {"column": definition}
 
         self.name = name
         if self.name == "releaseCycle":
             definition["type"] = "string"
-            definition["regex"] = definition.get("regex", [self.DEFAULT_RELEASE_REGEX])
-            definition["template"] = definition.get("template", self.DEFAULT_TEMPLATE)
+            definition["regex"] = definition.get("regex", [DEFAULT_RELEASE_REGEX])
+            definition["template"] = definition.get("template", DEFAULT_TEMPLATE)
 
         self.column = definition["column"].lower()
-        if self.column not in columns:
-            msg = f"column {self.column} not found in {columns}"
-            raise ValueError(msg)
-        self.column_index = columns.index(self.column)
-
         self.type = definition.get("type", "string")
-        if self.name in self.DATE_FIELDS and self.type not in self.DATE_TYPES:
+        if self.name in DATE_FIELDS and self.type not in DATE_TYPES:
             self.type = "date"  # override type for known date fields
-        elif self.type not in self.SUPPORTED_TYPES:
+        elif self.type not in SUPPORTED_TYPES:
             msg = f"unsupported type: {self.type} for field {self.name}"
             raise ValueError(msg)
 
-        regex = definition.get("regex", [self.DEFAULT_REGEX])
+        regex = definition.get("regex", [DEFAULT_REGEX])
         regex = regex if isinstance(regex, list) else [regex]
         self.include_version_patterns = [re.compile(r, re.MULTILINE) for r in regex]
 
@@ -70,12 +66,10 @@ class Field:
         exclude_regex = exclude_regex if isinstance(exclude_regex, list) else [exclude_regex]
         self.exclude_version_patterns = [re.compile(r, re.MULTILINE) for r in exclude_regex]
 
-        self.template = Template(definition.get("template", self.DEFAULT_TEMPLATE)) \
+        self.template = Template(definition.get("template", DEFAULT_TEMPLATE)) \
             if "template" in definition or regex else None
 
-    def extract_from(self, cells: list[PageElement]) -> str | datetime | None:
-        raw_value = cells[self.column_index].get_text(strip=True)
-
+    def extract_from(self, raw_value: str) -> str | datetime | None:
         for exclude_pattern in self.exclude_version_patterns:
             if exclude_pattern.match(raw_value):
                 return None
@@ -95,9 +89,12 @@ class Field:
         if self.name == "releaseCycle":
             return None  # skipping entire rows is allowed
 
-        msg = f"{raw_value} is not matching any regex in {self.include_version_patterns}"
+        msg = f"field {self}'s value {raw_value} does not matching any regex in {self.include_version_patterns}"
         raise ValueError(msg)
 
+    def __repr__(self) -> str:
+        return f"{self.name}({self.column})"
+
 
 p_filter = sys.argv[1] if len(sys.argv) > 1 else None
 m_filter = sys.argv[2] if len(sys.argv) > 2 else None
@@ -105,26 +102,33 @@ for config in endoflife.list_configs(p_filter, METHOD, m_filter):
     with releasedata.ProductData(config.product) as product_data:
         response = http.fetch_url(config.url)
         soup = BeautifulSoup(response.text, features="html5lib")
-        table = soup.select_one(config.data["selector"])
 
-        if not table:
-            message = f"No table found for {config.product} with selector {config.data['selector']}"
-            raise ValueError(message)
+        release_cycle_field = Field("releaseCycle", config.data["fields"].pop("releaseCycle"))
+        fields = [Field(name, definition) for name, definition in config.data["fields"].items()]
+        for table in soup.select(config.data["selector"]):
+            headers = [th.get_text().strip().lower() for th in table.select(config.data["headers_selector"])]
 
-        headers = [th.get_text().strip().lower() for th in table.select(config.data["headers_selector"])]
-        release_cycle_field = Field("releaseCycle", config.data["fields"].pop("releaseCycle"), headers)
-        fields = [Field(name, definition, headers) for name, definition in config.data["fields"].items()]
-        min_column_count = max([f.column_index for f in fields] + [release_cycle_field.column_index]) + 1
+            try:
+                fields_index = {"releaseCycle": headers.index(release_cycle_field.column)}
+                for field in fields:
+                    fields_index[field.name] = headers.index(field.column)
+                min_column_count = max(fields_index.values()) + 1
 
-        for row in table.select(config.data["rows_selector"]):
-            row_cells = row.findAll("td")
-            if len(row_cells) < min_column_count:
-                continue
+                for row in table.select(config.data["rows_selector"]):
+                    cells = [cell.get_text().strip() for cell in row.findAll("td")]
+                    if len(cells) < min_column_count:
+                        logging.info(f"skipping row {cells}: not enough columns")
+                        continue
 
-            release_cycle = release_cycle_field.extract_from(row_cells)
-            if not release_cycle:
-                continue
+                    raw_release_cycle = cells[fields_index[release_cycle_field.name]]
+                    release_cycle = release_cycle_field.extract_from(raw_release_cycle)
+                    if not release_cycle:
+                        logging.info(f"skipping row {cells}: invalid release cycle '{raw_release_cycle}'")
+                        continue
 
-            release = product_data.get_release(release_cycle)
-            for field in fields:
-                release.set_field(field.name, field.extract_from(row_cells))
+                    release = product_data.get_release(release_cycle)
+                    for field in fields:
+                        raw_field = cells[fields_index[field.name]]
+                        release.set_field(field.name, field.extract_from(raw_field))
+            except ValueError as e:
+                logging.info(f"skipping table with headers {headers}: {e}")