[release_table] Improve script (#305)
- Add strict typing to the fields. This makes the script fail if a column does not have the expected type (for example because of a change in the HTML page). - Support regex and templating for all fields (not only the releaseCycle). This makes it possible to extract only the necessary information without having to do some sort of 'magic' cleanup (the replacements in dates have been reverted). - Do not inject 'releaseCycle' into the JSON anymore (the name is already there).
This commit is contained in:
@@ -43,16 +43,10 @@ def parse_datetime(text: str, formats: list[str] = frozenset([
|
||||
# so that we don't have to deal with some special cases in formats
|
||||
text = (
|
||||
text.strip()
|
||||
.replace("th, ", " ") # November 10th, 2015 -> November 10, 2015
|
||||
.replace("st, ", " ") # March 31st, 2015 -> March 31, 2015
|
||||
.replace("Augu ", "August ") # 17 Augu 2023 -> 17 August 2023 - revert after st replacement
|
||||
.replace("augu ", "August ") # 17 Augu 2023 -> 17 august 2023 - revert after st replacement
|
||||
.replace("rd, ", " ") # March 3rd, 2015 -> March 3, 2015
|
||||
.replace(", ", " ") # November 10, 2015 -> November 10 2015
|
||||
.replace(". ", " ") # November 10. 2015 -> November 10 2015
|
||||
.replace("(", "") # (November 10 2015) -> November 10 2015)
|
||||
.replace(")", "") # (November 10 2015) -> (November 10 2015
|
||||
.replace("*", "") # November 10 2015* -> November 10 2015
|
||||
)
|
||||
for fmt in formats:
|
||||
try:
|
||||
|
||||
@@ -1,7 +1,10 @@
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4 import BeautifulSoup, PageElement
|
||||
from common import dates, endoflife, http, releasedata
|
||||
from liquid import Template
|
||||
|
||||
"""Fetch release-level data from an HTML table in a web page.
|
||||
|
||||
@@ -27,6 +30,72 @@ string."""
|
||||
|
||||
METHOD = "release_table"
|
||||
|
||||
|
||||
class Field:
|
||||
SUPPORTED_TYPES = ["date", "string"]
|
||||
DATE_FIELDS = ["releaseDate", "support", "eol", "extendedSupport"]
|
||||
DEFAULT_REGEX = r"^(?P<value>.+)$"
|
||||
DEFAULT_TEMPLATE = "{{value}}"
|
||||
DEFAULT_RELEASE_REGEX = r"^v?(?P<value>\d+(\.\d+)?)$"
|
||||
|
||||
def __init__(self, name: str, definition: str | dict, columns: list[str]) -> None:
|
||||
if isinstance(definition, str):
|
||||
definition = {"column": definition}
|
||||
|
||||
self.name = name
|
||||
if self.name == "releaseCycle":
|
||||
definition["type"] = "string"
|
||||
definition["regex"] = definition.get("regex", [self.DEFAULT_RELEASE_REGEX])
|
||||
definition["template"] = definition.get("template", self.DEFAULT_TEMPLATE)
|
||||
|
||||
self.column = definition["column"].lower()
|
||||
if self.column not in columns:
|
||||
msg = f"column {self.column} not found in {columns}"
|
||||
raise ValueError(msg)
|
||||
self.column_index = columns.index(self.column)
|
||||
|
||||
self.type = definition.get("type", "string")
|
||||
if self.name in self.DATE_FIELDS:
|
||||
self.type = "date" # override type for known date fields
|
||||
elif self.type not in self.SUPPORTED_TYPES:
|
||||
msg = f"unsupported type: {self.type} for field {self.name}"
|
||||
raise ValueError(msg)
|
||||
|
||||
regex = definition.get("regex", [self.DEFAULT_REGEX])
|
||||
regex = regex if isinstance(regex, list) else [regex]
|
||||
self.include_version_patterns = [re.compile(r, re.MULTILINE) for r in regex]
|
||||
|
||||
exclude_regex = definition.get("regex_exclude", [])
|
||||
exclude_regex = exclude_regex if isinstance(exclude_regex, list) else [exclude_regex]
|
||||
self.exclude_version_patterns = [re.compile(r, re.MULTILINE) for r in exclude_regex]
|
||||
|
||||
self.template = Template(definition.get("template", self.DEFAULT_TEMPLATE)) \
|
||||
if "template" in definition or regex else None
|
||||
|
||||
def extract_from(self, cells: list[PageElement]) -> str | datetime | None:
|
||||
raw_value = cells[self.column_index].get_text(strip=True)
|
||||
|
||||
for exclude_pattern in self.exclude_version_patterns:
|
||||
if exclude_pattern.match(raw_value):
|
||||
return None
|
||||
|
||||
for include_pattern in self.include_version_patterns:
|
||||
match = include_pattern.match(raw_value)
|
||||
if not match:
|
||||
continue
|
||||
|
||||
str_value = self.template.render(**match.groupdict()) if self.template else raw_value
|
||||
if self.type == "date":
|
||||
return dates.parse_date(str_value)
|
||||
return str_value
|
||||
|
||||
if self.name == "releaseCycle":
|
||||
return None # skipping entire rows is allowed
|
||||
|
||||
msg = f"{raw_value} is not matching any regex in {self.include_version_patterns}"
|
||||
raise ValueError(msg)
|
||||
|
||||
|
||||
p_filter = sys.argv[1] if len(sys.argv) > 1 else None
|
||||
m_filter = sys.argv[2] if len(sys.argv) > 2 else None
|
||||
for config in endoflife.list_configs(p_filter, METHOD, m_filter):
|
||||
@@ -39,31 +108,20 @@ for config in endoflife.list_configs(p_filter, METHOD, m_filter):
|
||||
message = f"No table found for {config.product} with selector {config.data['selector']}"
|
||||
raise ValueError(message)
|
||||
|
||||
index_by_target = {}
|
||||
headers = [th.get_text().strip().lower() for th in table.select(config.data["headers_selector"])]
|
||||
for target, column in config.data["mapping"].items():
|
||||
index_by_target[target] = headers.index(str(column).lower())
|
||||
release_cycle_field = Field("releaseCycle", config.data["fields"].pop("releaseCycle"), headers)
|
||||
fields = [Field(name, definition, headers) for name, definition in config.data["fields"].items()]
|
||||
min_column_count = max([f.column_index for f in fields] + [release_cycle_field.column_index]) + 1
|
||||
|
||||
min_column_count = max(index_by_target.values()) + 1
|
||||
release_cycle_index = index_by_target.pop("releaseCycle")
|
||||
for row in table.select(config.data["rows_selector"]):
|
||||
cells = row.findAll("td")
|
||||
if len(cells) < min_column_count:
|
||||
row_cells = row.findAll("td")
|
||||
if len(row_cells) < min_column_count:
|
||||
continue
|
||||
|
||||
release_cycle = cells[release_cycle_index].get_text().strip()
|
||||
release_cycle_match = config.first_match(release_cycle)
|
||||
if not release_cycle_match:
|
||||
release_cycle = release_cycle_field.extract_from(row_cells)
|
||||
if not release_cycle:
|
||||
continue
|
||||
|
||||
release = product_data.get_release(config.render(release_cycle_match))
|
||||
release.set_field("releaseCycle", release.name())
|
||||
for target, index in index_by_target.items():
|
||||
value_str = cells[index].get_text().strip()
|
||||
|
||||
try:
|
||||
value = dates.parse_date(value_str)
|
||||
except ValueError:
|
||||
value = value_str
|
||||
|
||||
release.set_field(target, value)
|
||||
release = product_data.get_release(release_cycle)
|
||||
for field in fields:
|
||||
release.set_field(field.name, field.extract_from(row_cells))
|
||||
|
||||
Reference in New Issue
Block a user