227 lines
12 KiB
Python
227 lines
12 KiB
Python
import logging
|
|
import re
|
|
from datetime import datetime
|
|
from re import Match
|
|
|
|
from bs4 import BeautifulSoup
|
|
from common import dates, endoflife, http
|
|
from common.releasedata import ProductData, config_from_argv
|
|
from liquid import Template
|
|
|
|
"""Fetch release-level data from an HTML table in a web page.
|
|
|
|
This script works based on a definition provided in the product's frontmatter to locate the table and extract the
|
|
necessary information. Available configuration options are:
|
|
|
|
- selector (optional, default = table): A CSS selector used to locate one or more tables in the page.
|
|
- header_selector (optional, default = thead tr): A CSS selector used to locate the table's header row.
|
|
- rows_selector (optional, default = tbody tr): A CSS selector used to locate the table's rows.
|
|
- user_agent (optional, default = <endoflife.date-bot User-Agent>): A user agent string to use when fetching the page.
|
|
Unused when render_javascript is true.
|
|
- render_javascript (optional, default = false): A boolean value indicating whether to render JavaScript on the page.
|
|
- render_javascript_headless (optional, default = true): Indicates whether to run the browser in headless mode.
|
|
- render_javascript_wait_for (optional, default = None): Wait until the given selector appears on the page. Only use when
|
|
render_javascript is true.
|
|
- render_javascript_wait_until (optional, default = None): Argument to pass to Playwright, one of "commit",
|
|
"domcontentloaded", "load", or "networkidle". Only use when render_javascript is true and if the script fails without it.
|
|
- remove_if_undefined (optional, default = None): Ignore rows where the given field is undefined. This is useful for
|
|
example when the table contains a row for a future release with no release date yet.
|
|
- fields: A dictionary that maps release fields to the table's columns. Field definitions include:
|
|
- column (mandatory): The name or index (starts at 1) of the column in the table.
|
|
- type (optional, default = string): The type of the field. Supported types are:
|
|
- string: The raw string value.
|
|
- identifier: A transformation of the raw string value so that it can be used as an identifier. The transformation
|
|
consists of putting the string in lower case, replacing spaces with dashes, and removing all
|
|
characters that are not alphanumeric, dashes, dots, plus signs, or underscores.
|
|
- date : A full or year-month date (supported patterns available in common.dates).
|
|
- range : Convert a comma-separated list of values into a range, only keeping the first and last value.
|
|
For example, "1.0, 1.1, 1.2" becomes "1.0 - 1.2".
|
|
If the field is one of the known date fields, the type is automatically set to 'date' if not provided.
|
|
- regex (optional, default = [DEFAULT_REGEX]): A regular expression, or a list of regular expressions, used to
|
|
validate allowed values. Note that the default value for the releaseCycle field is not DEFAULT_REGEX, but
|
|
DEFAULT_RELEASE_REGEX.
|
|
- regex_exclude (optional, default = []): A regular expression, or a list of regular expressions, used to exclude
|
|
values even if they match any regular expression in 'regex'.
|
|
- template (optional, default = DEFAULT_TEMPLATE): A liquid template used to clean up the value using the matched
|
|
groups from a 'regex'.
|
|
|
|
Note that defining the column attribute directly instead of its full definition is allowed when
|
|
the column name or index is the only attribute. For example, this:
|
|
```
|
|
fields:
|
|
releaseCycle:
|
|
column: "End of life"
|
|
```
|
|
|
|
can be replaced with this:
|
|
```
|
|
fields:
|
|
releaseCycle: "End of life"
|
|
```
|
|
|
|
Supported CSS selectors are defined by BeautifulSoup and documented on its website. For more information, see
|
|
https://beautiful-soup-4.readthedocs.io/en/latest/index.html?highlight=selector#css-selectors."""
|
|
|
|
# --- Field types -------------------------------------------------------------
# Every value accepted for a field's "type" attribute.
SUPPORTED_TYPES = [
    "date",
    "string",
    "range",
    "identifier",
]
# Types considered acceptable for the fields listed in STRING_FIELDS.
STRING_TYPES = [
    "string",
    "identifier",
]
# Types considered acceptable for the fields listed in DATE_FIELDS.
DATE_TYPES = [
    "date",
]

# --- Well-known field names --------------------------------------------------
# Fields whose type is forced to "string" unless it is already one of STRING_TYPES.
STRING_FIELDS = [
    "releaseCycle",
    "releaseLabel",
]
# Fields whose type is forced to "date" unless it is already one of DATE_TYPES.
DATE_FIELDS = [
    "releaseDate",
    "lts",
    "eoas",
    "eol",
    "eoes",
    "latestReleaseDate",
]

# --- Value matching and rendering --------------------------------------------
# Fallback include pattern: any non-empty value, captured in the "value" group.
DEFAULT_REGEX = r"^(?P<value>.+)$"
# Fallback include pattern for releaseCycle: dotted numbers with an optional leading "v".
DEFAULT_RELEASE_REGEX = r"^v?(?P<value>\d+(\.\d+)*)$"
# Fallback liquid template: render the "value" group unchanged.
DEFAULT_TEMPLATE = "{{value}}"
# Separator used to split the comma-separated list of a "range" field.
RANGE_LIST_SEPARATOR_PATTERN = re.compile(r"\s*,\s*")
|
|
|
|
|
|
class Field:
|
|
def __init__(self, name: str, definition: str | dict) -> None:
|
|
# Directly specifying the column name or index instead of its full definition is allowed.
|
|
# In this case we must convert it to a full definition.
|
|
if isinstance(definition, (str, int)):
|
|
definition = {"column": definition}
|
|
|
|
self.name = name
|
|
if self.name == "releaseCycle":
|
|
definition["type"] = "string" if "type" not in definition else definition["type"]
|
|
definition["regex"] = definition.get("regex", [DEFAULT_RELEASE_REGEX])
|
|
definition["template"] = definition.get("template", DEFAULT_TEMPLATE)
|
|
|
|
self.is_index = isinstance(definition["column"], int)
|
|
if self.is_index:
|
|
self.column = definition["column"] - 1 # convert to 0-based index
|
|
else:
|
|
self.column = definition["column"].lower()
|
|
|
|
self.type = definition.get("type", "string")
|
|
if self.name in DATE_FIELDS and self.type not in DATE_TYPES:
|
|
self.type = "date" # override type for known date fields
|
|
elif self.name in STRING_FIELDS and self.type not in STRING_TYPES:
|
|
self.type = "string" # override type for known string fields
|
|
elif self.type not in SUPPORTED_TYPES:
|
|
msg = f"unsupported type: {self.type} for field {self.name}"
|
|
raise ValueError(msg)
|
|
|
|
regex = definition.get("regex", [DEFAULT_REGEX])
|
|
regex = regex if isinstance(regex, list) else [regex]
|
|
self.include_version_patterns = [re.compile(r, re.MULTILINE) for r in regex]
|
|
|
|
exclude_regex = definition.get("regex_exclude", [])
|
|
exclude_regex = exclude_regex if isinstance(exclude_regex, list) else [exclude_regex]
|
|
self.exclude_version_patterns = [re.compile(r, re.MULTILINE) for r in exclude_regex]
|
|
|
|
self.template = Template(definition.get("template", DEFAULT_TEMPLATE)) \
|
|
if "template" in definition or regex else None
|
|
|
|
def extract_from(self, raw_value: str) -> str | datetime | None:
|
|
for exclude_pattern in self.exclude_version_patterns:
|
|
if exclude_pattern.match(raw_value):
|
|
logging.debug(f"Excluding '{raw_value}': matches exclude pattern {exclude_pattern}")
|
|
return None
|
|
|
|
logging.debug(f"'{raw_value}' does not match exclude pattern {exclude_pattern}")
|
|
|
|
for include_pattern in self.include_version_patterns:
|
|
match = include_pattern.match(raw_value)
|
|
if not match:
|
|
logging.debug(f"'{raw_value}' does not match include pattern {include_pattern}")
|
|
continue
|
|
|
|
logging.debug(f"Processing '{raw_value}': matches include pattern {include_pattern}")
|
|
return self.__process_value(match, raw_value)
|
|
|
|
if self.name == "releaseCycle":
|
|
return None # skipping entire rows is allowed
|
|
|
|
msg = f"field {self}'s value '{raw_value}' does not match any regex in {self.include_version_patterns}"
|
|
raise ValueError(msg)
|
|
|
|
def __process_value(self, match: Match[str], raw_value: str) -> str | datetime:
|
|
str_value = self.template.render(**match.groupdict()) if self.template else raw_value
|
|
|
|
if self.type == "date":
|
|
return dates.parse_date_or_month_year_date(str_value)
|
|
|
|
if self.type == "range":
|
|
items = RANGE_LIST_SEPARATOR_PATTERN.split(str_value)
|
|
return f"{items[0]} - {items[-1]}" if len(items) > 1 else str_value
|
|
|
|
if self.type == "identifier":
|
|
return endoflife.to_identifier(str_value)
|
|
|
|
return str_value
|
|
|
|
def __repr__(self) -> str:
|
|
return f"{self.name}({self.column})"
|
|
|
|
|
|
# Script entry point: read the configuration, fetch the page, then turn every matching table row into a release.
config = config_from_argv()
with ProductData(config.product) as product_data:
    user_agent: str = config.data.get("user_agent", http.ENDOFLIFE_BOT_USER_AGENT)

    render_js: bool = config.data.get("render_javascript", False)
    render_js_wait_until: str | None = config.data.get("render_javascript_wait_until", None)
    render_js_wait_for: str | None = config.data.get("render_javascript_wait_for", None)
    render_js_click_selector: str | None = config.data.get("render_javascript_click_selector", None)
    # NOTE(review): the option is documented as a boolean defaulting to true, but None is forwarded when it is
    # not configured. Confirm that http.fetch_javascript_url treats None as "headless".
    render_js_headless: bool | None = config.data.get("render_javascript_headless", None)

    table_selector: str = config.data.get("selector", "table")
    header_row_selector: str = config.data.get("header_selector", "thead tr")
    rows_selector: str = config.data.get("rows_selector", "tbody tr")
    cells_selector: str = "td, th"

    remove_if_undefined_field: str | None = config.data.get("remove_if_undefined", None)

    # releaseCycle is handled separately from the other fields: it names the release and decides whether a row is kept.
    release_cycle_field = Field("releaseCycle", config.data["fields"].pop("releaseCycle"))
    fields = [Field(name, definition) for name, definition in config.data["fields"].items()]

    if render_js:
        response_text = http.fetch_javascript_url(config.url, user_agent=user_agent, headless=render_js_headless,
                                                  wait_until=render_js_wait_until, wait_for=render_js_wait_for,
                                                  click_selector=render_js_click_selector)
    else:
        response_text = http.fetch_url(config.url, user_agent=user_agent).text
    soup = BeautifulSoup(response_text, features="html5lib")

    for table in soup.select(table_selector):
        header_row = table.select_one(header_row_selector)
        if not header_row:
            logging.info(f"skipping table with attributes {table.attrs}: no header row found")
            continue

        headers = [th.get_text().strip().lower() for th in header_row.select(cells_selector)]
        logging.info(f"processing table with headers {headers}")

        try:
            # Resolve each field to a cell position. A column given by index is used as-is, a column given by
            # name is looked up in the headers (ValueError when absent, which skips the whole table below).
            # releaseCycle is resolved like any other field so that it can also be configured by index.
            fields_index = {
                field.name: field.column if field.is_index else headers.index(field.column)
                for field in (release_cycle_field, *fields)
            }
            min_column_count = max(fields_index.values()) + 1

            for row in table.select(rows_selector):
                cells = [cell.get_text().strip() for cell in row.select(cells_selector)]
                if len(cells) < min_column_count:
                    logging.debug(f"skipping row {cells}: not enough columns")
                    continue

                raw_release_name = cells[fields_index[release_cycle_field.name]]
                release_name = release_cycle_field.extract_from(raw_release_name)
                if not release_name:
                    logging.debug(f"skipping row {cells}: invalid release cycle '{raw_release_name}', "
                                  f"should match one of {release_cycle_field.include_version_patterns} "
                                  f"and not match all of {release_cycle_field.exclude_version_patterns}")
                    continue

                release = product_data.get_release(release_name)
                for field in fields:
                    raw_field = cells[fields_index[field.name]]
                    try:
                        release.set_field(field.name, field.extract_from(raw_field))
                    except ValueError as e:
                        # A single invalid cell only skips that field, not the whole row.
                        logging.debug(f"skipping cell {raw_field} for {release}: {e}")

                if remove_if_undefined_field and not release.get_field(remove_if_undefined_field):
                    product_data.remove_release(release_name, f"{remove_if_undefined_field} is not defined")

        except ValueError as e:
            logging.info(f"skipping table with headers {headers}: {e}")
|