[version_table] Add auto method (#514)

Similar to release_table, but for versions.
This commit is contained in:
Marc Wrobel
2025-09-10 17:39:28 +02:00
parent 91788dcca3
commit 20b0c8d1ff
3 changed files with 77 additions and 6 deletions

View File

@@ -56,6 +56,8 @@ def parse_datetime(text: str, formats: list[str] = frozenset([
"%Y/%m/%d %H:%M:%S", # 2023/05/01 08:32:34 "%Y/%m/%d %H:%M:%S", # 2023/05/01 08:32:34
"%a %d %b %Y %H:%M:%S %Z", # Wed 01 Jan 2020 00:00:00 GMT "%a %d %b %Y %H:%M:%S %Z", # Wed 01 Jan 2020 00:00:00 GMT
"%a %d %b %Y %H:%M:%S %z", # Wed 01 Jan 2020 00:00:00 -0400 "%a %d %b %Y %H:%M:%S %z", # Wed 01 Jan 2020 00:00:00 -0400
"%a %b %d %H:%M:%S %Z %Y", # Wed Jan 01 00:00:00 UTC 2020
"%a %b %d %H:%M:%S %z %Y", # Wed Jan 01 00:00:00 -0400 2020
"%Y%m%d%H%M%S", # 20230501083234 "%Y%m%d%H%M%S", # 20230501083234
]), to_utc: bool = True) -> datetime: ]), to_utc: bool = True) -> datetime:
"""Parse a given text representing a datetime using a list of formats, """Parse a given text representing a datetime using a list of formats,

View File

@@ -13,9 +13,9 @@ from liquid import Template
This script works based on a definition provided in the product's frontmatter to locate the table and extract the This script works based on a definition provided in the product's frontmatter to locate the table and extract the
necessary information. Available configuration options are: necessary information. Available configuration options are:
- selector (mandatory, no default): A CSS selector used to locate one or more tables in the page. - selector (optional, default = table): A CSS selector used to locate one or more tables in the page.
- header_selector (mandatory, default = thead tr): A CSS selector used to locate the table's header row. - header_selector (optional, default = thead tr): A CSS selector used to locate the table's header row.
- rows_selector (mandatory, default = tbody tr): A CSS selector used to locate the table's rows. - rows_selector (optional, default = tbody tr): A CSS selector used to locate the table's rows.
- user_agent (optional, default = <endoflife.date-bot User-Agent>): A user agent string to use when fetching the page. - user_agent (optional, default = <endoflife.date-bot User-Agent>): A user agent string to use when fetching the page.
Unused when render_javascript is true. Unused when render_javascript is true.
- render_javascript (optional, default = false): A boolean value indicating whether to render JavaScript on the page. - render_javascript (optional, default = false): A boolean value indicating whether to render JavaScript on the page.
@@ -61,7 +61,6 @@ fields:
Supported CSS selectors are defined by BeautifulSoup and documented on its website. For more information, see Supported CSS selectors are defined by BeautifulSoup and documented on its website. For more information, see
https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=selector#css-selectors.""" https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=selector#css-selectors."""
METHOD = "release_table"
SUPPORTED_TYPES = ["date", "string", "range", "identifier"] SUPPORTED_TYPES = ["date", "string", "range", "identifier"]
STRING_TYPES = ["string", "identifier"] STRING_TYPES = ["string", "identifier"]
STRING_FIELDS = ["releaseCycle", "releaseLabel"] STRING_FIELDS = ["releaseCycle", "releaseLabel"]
@@ -160,15 +159,19 @@ class Field:
config = config_from_argv() config = config_from_argv()
with ProductData(config.product) as product_data: with ProductData(config.product) as product_data:
user_agent: str = config.data.get("user_agent", http.ENDOFLIFE_BOT_USER_AGENT) user_agent: str = config.data.get("user_agent", http.ENDOFLIFE_BOT_USER_AGENT)
render_js: bool = config.data.get("render_javascript", False) render_js: bool = config.data.get("render_javascript", False)
render_js_wait_until: str | None = config.data.get("render_javascript_wait_until", None) render_js_wait_until: str | None = config.data.get("render_javascript_wait_until", None)
render_js_wait_for: str | None = config.data.get("render_javascript_wait_for", None) render_js_wait_for: str | None = config.data.get("render_javascript_wait_for", None)
render_js_click_selector: str | None = config.data.get("render_javascript_click_selector", None) render_js_click_selector: str | None = config.data.get("render_javascript_click_selector", None)
header_row_selector: str = config.data.get("header_selector", "thead tr") header_row_selector: str = config.data.get("header_selector", "thead tr")
remove_if_undefined_field: str | None = config.data.get("remove_if_undefined", None)
table_selector: str = config.data.get("selector", "table")
rows_selector: str = config.data.get("rows_selector", "tbody tr") rows_selector: str = config.data.get("rows_selector", "tbody tr")
cells_selector: str = "td, th" cells_selector: str = "td, th"
remove_if_undefined_field: str | None = config.data.get("remove_if_undefined", None)
release_cycle_field = Field("releaseCycle", config.data["fields"].pop("releaseCycle")) release_cycle_field = Field("releaseCycle", config.data["fields"].pop("releaseCycle"))
fields = [Field(name, definition) for name, definition in config.data["fields"].items()] fields = [Field(name, definition) for name, definition in config.data["fields"].items()]
@@ -179,7 +182,7 @@ with ProductData(config.product) as product_data:
response_text = http.fetch_url(config.url, user_agent=user_agent).text response_text = http.fetch_url(config.url, user_agent=user_agent).text
soup = BeautifulSoup(response_text, features="html5lib") soup = BeautifulSoup(response_text, features="html5lib")
for table in soup.select(config.data["selector"]): for table in soup.select(table_selector):
header_row = table.select_one(header_row_selector) header_row = table.select_one(header_row_selector)
if not header_row: if not header_row:
logging.info(f"skipping table with attributes {table.attrs}: no header row found") logging.info(f"skipping table with attributes {table.attrs}: no header row found")

66
src/version_table.py Normal file
View File

@@ -0,0 +1,66 @@
import logging
from common import dates, http
from common.releasedata import ProductData, config_from_argv
"""Fetch version-level data from an HTML table in a web page.
This script works based on a definition provided in the product's frontmatter to locate the table and extract the
necessary information. Available configuration options are:
- selector (optional, default = table): A CSS selector used to locate one or more tables in the page.
- header_selector (optional, default = thead tr): A CSS selector used to locate the table's header row.
- rows_selector (optional, default = tbody tr): A CSS selector used to locate the table's rows.
- name_column (mandatory): The name of the column containing the version names.
- date_column (mandatory): The name of the column containing the version dates.
Supported CSS selectors are defined by BeautifulSoup and documented on its website. For more information, see
https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=selector#css-selectors.
"""
# Script body: read the product configuration, fetch the configured page and declare one version per
# matching table row, using the "name" and "date" columns named in the configuration.
config = config_from_argv()
with ProductData(config.product) as product_data:
    # Optional selectors, defaulting to plain <table> / <thead> / <tbody> markup.
    table_selector: str = config.data.get("selector", "table")
    header_row_selector: str = config.data.get("header_selector", "thead tr")
    rows_selector: str = config.data.get("rows_selector", "tbody tr")
    cells_selector: str = "td, th"
    # Mandatory column names (KeyError if absent); normalized like the scraped headers below
    # (stripped and lowercased) so the lookup is case- and whitespace-insensitive.
    version_name_column = config.data["name_column"].strip().lower()
    version_date_column = config.data["date_column"].strip().lower()

    html = http.fetch_html(config.url)

    for table in html.select(table_selector):
        header_row = table.select_one(header_row_selector)
        if not header_row:
            logging.info(f"skipping table with attributes {table.attrs}: no header row found")
            continue

        headers = [th.get_text().strip().lower() for th in header_row.select(cells_selector)]
        logging.info(f"processing table with headers {headers}")

        try:
            # list.index raises ValueError when a configured column is missing from this table;
            # the outer handler then skips the whole table.
            version_name_index = headers.index(version_name_column)
            version_date_index = headers.index(version_date_column)
            # A row needs at least this many cells for both indexes to be valid.
            min_columns_count = max(version_name_index, version_date_index) + 1

            for row in table.select(rows_selector):
                cells = [cell.get_text().strip() for cell in row.select(cells_selector)]
                if len(cells) < min_columns_count:
                    logging.debug(f"skipping row {cells}: not enough columns")
                    continue

                raw_version_name = cells[version_name_index]
                version_match = config.first_match(raw_version_name)
                if not version_match:
                    logging.debug(f"skipping row {cells}: invalid version '{raw_version_name}', "
                                  f"should match one of {config.include_version_patterns} "
                                  f"and not match all of {config.exclude_version_patterns}")
                    continue

                version_name = config.render(version_match)
                try:
                    version_date = dates.parse_datetime(cells[version_date_index])
                except ValueError as e:
                    # NOTE(review): presumably parse_datetime raises ValueError when no known format
                    # matches (e.g. a "TBD" cell) - confirm. Skip only this row so that one bad date
                    # does not silently drop every remaining row of the table.
                    logging.info(f"skipping row {cells}: {e}")
                    continue

                product_data.declare_version(version_name, version_date)

        except ValueError as e:
            logging.info(f"skipping table with headers {headers}: {e}")