[version_table] Add auto method (#514)
Similar to release_table, but for versions.
This commit is contained in:
@@ -56,6 +56,8 @@ def parse_datetime(text: str, formats: list[str] = frozenset([
|
|||||||
"%Y/%m/%d %H:%M:%S", # 2023/05/01 08:32:34
|
"%Y/%m/%d %H:%M:%S", # 2023/05/01 08:32:34
|
||||||
"%a %d %b %Y %H:%M:%S %Z", # Wed 01 Jan 2020 00:00:00 GMT
|
"%a %d %b %Y %H:%M:%S %Z", # Wed 01 Jan 2020 00:00:00 GMT
|
||||||
"%a %d %b %Y %H:%M:%S %z", # Wed 01 Jan 2020 00:00:00 -0400
|
"%a %d %b %Y %H:%M:%S %z", # Wed 01 Jan 2020 00:00:00 -0400
|
||||||
|
"%a %b %d %H:%M:%S %Z %Y", # Wed Jan 01 00:00:00 UTC 2020
|
||||||
|
"%a %b %d %H:%M:%S %z %Y", # Wed Jan 01 00:00:00 -0400 2020
|
||||||
"%Y%m%d%H%M%S", # 20230501083234
|
"%Y%m%d%H%M%S", # 20230501083234
|
||||||
]), to_utc: bool = True) -> datetime:
|
]), to_utc: bool = True) -> datetime:
|
||||||
"""Parse a given text representing a datetime using a list of formats,
|
"""Parse a given text representing a datetime using a list of formats,
|
||||||
|
|||||||
@@ -13,9 +13,9 @@ from liquid import Template
|
|||||||
This script works based on a definition provided in the product's frontmatter to locate the table and extract the
|
This script works based on a definition provided in the product's frontmatter to locate the table and extract the
|
||||||
necessary information. Available configuration options are:
|
necessary information. Available configuration options are:
|
||||||
|
|
||||||
- selector (mandatory, no default): A CSS selector used to locate one or more tables in the page.
|
- selector (optional, default = table): A CSS selector used to locate one or more tables in the page.
|
||||||
- header_selector (mandatory, default = thead tr): A CSS selector used to locate the table's header row.
|
- header_selector (optional, default = thead tr): A CSS selector used to locate the table's header row.
|
||||||
- rows_selector (mandatory, default = tbody tr): A CSS selector used to locate the table's rows.
|
- rows_selector (optional, default = tbody tr): A CSS selector used to locate the table's rows.
|
||||||
- user_agent (optional, default = <endoflife.date-bot User-Agent>): A user agent string to use when fetching the page.
|
- user_agent (optional, default = <endoflife.date-bot User-Agent>): A user agent string to use when fetching the page.
|
||||||
Unused when render_javascript is true.
|
Unused when render_javascript is true.
|
||||||
- render_javascript (optional, default = false): A boolean value indicating whether to render JavaScript on the page.
|
- render_javascript (optional, default = false): A boolean value indicating whether to render JavaScript on the page.
|
||||||
@@ -61,7 +61,6 @@ fields:
|
|||||||
Supported CSS selectors are defined by BeautifulSoup and documented on its website. For more information, see
|
Supported CSS selectors are defined by BeautifulSoup and documented on its website. For more information, see
|
||||||
https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=selector#css-selectors."""
|
https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=selector#css-selectors."""
|
||||||
|
|
||||||
METHOD = "release_table"
|
|
||||||
SUPPORTED_TYPES = ["date", "string", "range", "identifier"]
|
SUPPORTED_TYPES = ["date", "string", "range", "identifier"]
|
||||||
STRING_TYPES = ["string", "identifier"]
|
STRING_TYPES = ["string", "identifier"]
|
||||||
STRING_FIELDS = ["releaseCycle", "releaseLabel"]
|
STRING_FIELDS = ["releaseCycle", "releaseLabel"]
|
||||||
@@ -160,15 +159,19 @@ class Field:
|
|||||||
config = config_from_argv()
|
config = config_from_argv()
|
||||||
with ProductData(config.product) as product_data:
|
with ProductData(config.product) as product_data:
|
||||||
user_agent: str = config.data.get("user_agent", http.ENDOFLIFE_BOT_USER_AGENT)
|
user_agent: str = config.data.get("user_agent", http.ENDOFLIFE_BOT_USER_AGENT)
|
||||||
|
|
||||||
render_js: bool = config.data.get("render_javascript", False)
|
render_js: bool = config.data.get("render_javascript", False)
|
||||||
render_js_wait_until: str | None = config.data.get("render_javascript_wait_until", None)
|
render_js_wait_until: str | None = config.data.get("render_javascript_wait_until", None)
|
||||||
render_js_wait_for: str | None = config.data.get("render_javascript_wait_for", None)
|
render_js_wait_for: str | None = config.data.get("render_javascript_wait_for", None)
|
||||||
render_js_click_selector: str | None = config.data.get("render_javascript_click_selector", None)
|
render_js_click_selector: str | None = config.data.get("render_javascript_click_selector", None)
|
||||||
header_row_selector: str = config.data.get("header_selector", "thead tr")
|
header_row_selector: str = config.data.get("header_selector", "thead tr")
|
||||||
remove_if_undefined_field: str | None = config.data.get("remove_if_undefined", None)
|
|
||||||
|
table_selector: str = config.data.get("selector", "table")
|
||||||
rows_selector: str = config.data.get("rows_selector", "tbody tr")
|
rows_selector: str = config.data.get("rows_selector", "tbody tr")
|
||||||
cells_selector: str = "td, th"
|
cells_selector: str = "td, th"
|
||||||
|
|
||||||
|
remove_if_undefined_field: str | None = config.data.get("remove_if_undefined", None)
|
||||||
|
|
||||||
release_cycle_field = Field("releaseCycle", config.data["fields"].pop("releaseCycle"))
|
release_cycle_field = Field("releaseCycle", config.data["fields"].pop("releaseCycle"))
|
||||||
fields = [Field(name, definition) for name, definition in config.data["fields"].items()]
|
fields = [Field(name, definition) for name, definition in config.data["fields"].items()]
|
||||||
|
|
||||||
@@ -179,7 +182,7 @@ with ProductData(config.product) as product_data:
|
|||||||
response_text = http.fetch_url(config.url, user_agent=user_agent).text
|
response_text = http.fetch_url(config.url, user_agent=user_agent).text
|
||||||
soup = BeautifulSoup(response_text, features="html5lib")
|
soup = BeautifulSoup(response_text, features="html5lib")
|
||||||
|
|
||||||
for table in soup.select(config.data["selector"]):
|
for table in soup.select(table_selector):
|
||||||
header_row = table.select_one(header_row_selector)
|
header_row = table.select_one(header_row_selector)
|
||||||
if not header_row:
|
if not header_row:
|
||||||
logging.info(f"skipping table with attributes {table.attrs}: no header row found")
|
logging.info(f"skipping table with attributes {table.attrs}: no header row found")
|
||||||
|
|||||||
66
src/version_table.py
Normal file
66
src/version_table.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
"""Fetch version-level data from an HTML table in a web page.

This script works based on a definition provided in the product's frontmatter to locate the table and extract the
necessary information. Available configuration options are:

- selector (optional, default = table): A CSS selector used to locate one or more tables in the page.
- header_selector (optional, default = thead tr): A CSS selector used to locate the table's header row.
- rows_selector (optional, default = tbody tr): A CSS selector used to locate the table's rows.
- name_column (mandatory): The name of the column containing the version names.
- date_column (mandatory): The name of the column containing the version dates.

Column names are matched against the text of the header cells after stripping surrounding whitespace and lowercasing
both sides.

Supported CSS selectors are defined by BeautifulSoup and documented on its website. For more information, see
https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=selector#css-selectors.
"""

import logging

from common import dates, http
from common.releasedata import ProductData, config_from_argv

config = config_from_argv()
with ProductData(config.product) as product_data:
    table_selector: str = config.data.get("selector", "table")
    header_row_selector: str = config.data.get("header_selector", "thead tr")
    rows_selector: str = config.data.get("rows_selector", "tbody tr")
    cells_selector: str = "td, th"

    # Normalized the same way as the header cells below so that the lookup ignores case and padding.
    version_name_column = config.data["name_column"].strip().lower()
    version_date_column = config.data["date_column"].strip().lower()

    html = http.fetch_html(config.url)

    for table in html.select(table_selector):
        header_row = table.select_one(header_row_selector)
        if not header_row:
            logging.info(f"skipping table with attributes {table.attrs}: no header row found")
            continue

        headers = [th.get_text().strip().lower() for th in header_row.select(cells_selector)]
        logging.info(f"processing table with headers {headers}")

        try:
            # list.index raises ValueError when a configured column is missing from this table.
            version_name_index = headers.index(version_name_column)
            version_date_index = headers.index(version_date_column)
            # A row must have at least this many cells for both indexes to be valid.
            min_columns_count = max(version_name_index, version_date_index) + 1

            for row in table.select(rows_selector):
                cells = [cell.get_text().strip() for cell in row.select(cells_selector)]
                if len(cells) < min_columns_count:
                    logging.debug(f"skipping row {cells}: not enough columns")
                    continue

                raw_version_name = cells[version_name_index]
                version_match = config.first_match(raw_version_name)
                if not version_match:
                    logging.debug(f"skipping row {cells}: invalid version '{raw_version_name}', "
                                  f"should match one of {config.include_version_patterns} "
                                  f"and not match all of {config.exclude_version_patterns}")
                    continue

                version_name = config.render(version_match)
                version_date = dates.parse_datetime(cells[version_date_index])
                product_data.declare_version(version_name, version_date)

        except ValueError as e:
            # NOTE(review): this also catches any ValueError raised while processing a row (presumably
            # including an unparseable date from dates.parse_datetime), in which case the remaining rows
            # of the table are not processed - confirm this is intended.
            logging.info(f"skipping table with headers {headers}: {e}")
|
||||||
Reference in New Issue
Block a user