diff --git a/src/common/http.py b/src/common/http.py index 742b3f26..4dd277aa 100644 --- a/src/common/http.py +++ b/src/common/http.py @@ -48,13 +48,16 @@ def fetch_url(url: str, data: any = None, headers: dict[str, str] = None, # This requires some setup, see https://playwright.dev/python/docs/intro#installing-playwright. -def fetch_javascript_url(url: str) -> str: +def fetch_javascript_url(url: str, click_selector: str = None) -> str: logging.info(f"Fetching {url}") with sync_playwright() as p: browser = p.chromium.launch() try: page = browser.new_page() page.goto(url, wait_until='networkidle') + if click_selector: + page.click(click_selector) + logging.info(f"Clicked on {click_selector}") logging.info(f"Fetched {url}") return page.content() finally: diff --git a/src/release_table.py b/src/release_table.py index a6dc4891..15811fae 100644 --- a/src/release_table.py +++ b/src/release_table.py @@ -18,6 +18,8 @@ necessary information. Available configuration options are: - header_selector (mandatory, default = thead tr): A CSS selector used to locate the table's header row. - rows_selector (mandatory, default = tbody tr): A CSS selector used to locate the table's rows. - render_javascript (optional, default = false): A boolean value indicating whether to render JavaScript on the page. +- render_javascript_click_selector (optional, default = None): A Playwright selector used to click on an element after + the JavaScript rendering. - ignore_empty_releases (optional, default = false): A boolean value indicating whether to ignore releases with no fields except the name. - fields: A dictionary that maps release fields to the table's columns. 
Field definition include: @@ -122,6 +124,7 @@ m_filter = sys.argv[2] if len(sys.argv) > 2 else None for config in endoflife.list_configs(p_filter, METHOD, m_filter): with releasedata.ProductData(config.product) as product_data: render_javascript = config.data.get("render_javascript", False) + render_javascript_click_selector = config.data.get("render_javascript_click_selector", None) ignore_empty_releases = config.data.get("ignore_empty_releases", False) header_row_selector = config.data.get("header_selector", "thead tr") rows_selector = config.data.get("rows_selector", "tbody tr") @@ -129,7 +132,10 @@ for config in endoflife.list_configs(p_filter, METHOD, m_filter): release_cycle_field = Field("releaseCycle", config.data["fields"].pop("releaseCycle")) fields = [Field(name, definition) for name, definition in config.data["fields"].items()] - response_text = http.fetch_javascript_url(config.url) if render_javascript else http.fetch_url(config.url).text + if render_javascript: + response_text = http.fetch_javascript_url(config.url, click_selector=render_javascript_click_selector) + else: + response_text = http.fetch_url(config.url).text soup = BeautifulSoup(response_text, features="html5lib") for table in soup.select(config.data["selector"]): @@ -139,6 +145,7 @@ for config in endoflife.list_configs(p_filter, METHOD, m_filter): continue headers = [th.get_text().strip().lower() for th in header_row.select(cells_selector)] + logging.info(f"processing table with headers {headers}") try: fields_index = {"releaseCycle": headers.index(release_cycle_field.column)}