diff --git a/src/artifactory.py b/src/artifactory.py index d4343efd..3002c983 100644 --- a/src/artifactory.py +++ b/src/artifactory.py @@ -7,7 +7,8 @@ needed to render the page.""" config = config_from_argv() with ProductData(config.product) as product_data: - content = http.fetch_javascript_url(config.url, wait_until = 'networkidle') + # Oddly, the full page content does not contain the versions, so the wait_for element must be used directly. + content = http.fetch_javascript_url(config.url, wait_for='div.informaltable', select_wait_for=True) soup = BeautifulSoup(content, 'html.parser') for row in soup.select('.informaltable tbody tr'): diff --git a/src/common/http.py b/src/common/http.py index 08084e33..ea493af7 100644 --- a/src/common/http.py +++ b/src/common/http.py @@ -80,17 +80,25 @@ def fetch_markdown(url: str, data: any = None, headers: dict[str, str] = None, return mwparserfromhell.parse(response.text) # This requires some setup, see https://playwright.dev/python/docs/intro#installing-playwright. 
-def fetch_javascript_url(url: str, click_selector: str = None, wait_until: str = None) -> str: - logging.info(f"Fetching {url} with JavaScript (click_selector = {click_selector}, wait_until = {wait_until})") +def fetch_javascript_url(url: str, wait_until: str = None, wait_for: str = None, select_wait_for: bool = False) -> str: + logging.info(f"Fetching {url} with JavaScript (wait_until = {wait_until}, wait_for = {wait_for}, select_wait_for = {select_wait_for})") with sync_playwright() as p: browser = p.chromium.launch() try: page = browser.new_page() page.goto(url, wait_until=wait_until) - if click_selector: - logging.info(f"Clicked on {click_selector}") - page.click(click_selector) logging.info(f"Fetched {url}") + + if wait_for: + logging.info(f"Waiting for element with selector {wait_for}") + element = page.wait_for_selector(selector=wait_for) + + if element: + logging.debug(f"Found element with selector {wait_for} on {url}") + return element.inner_html() if select_wait_for else page.content() + + logging.warning(f"No element found with selector {wait_for} on {url}, returning full page content") + return page.content() finally: browser.close() diff --git a/src/release_table.py b/src/release_table.py index 01aac434..1762ce89 100644 --- a/src/release_table.py +++ b/src/release_table.py @@ -17,8 +17,8 @@ necessary information. Available configuration options are: - header_selector (mandatory, default = thead tr): A CSS selector used to locate the table's header row. - rows_selector (mandatory, default = tbody tr): A CSS selector used to locate the table's rows. - render_javascript (optional, default = false): A boolean value indicating whether to render JavaScript on the page. -- render_javascript_click_selector (optional, default = None): A playwright selector used to click on an element after - the JavaScript rendering. Only use when render_javascript is true. 
+- render_javascript_wait_for (optional, default = None): Wait until the given selector appears on the page. Only use when + render_javascript is true. - render_javascript_wait_until (optional, default = None): Argument to pass to Playwright, one of "commit", "domcontentloaded", "load", or "networkidle". Only use when render_javascript is true and if the script fails without it. - ignore_empty_releases (optional, default = false): A boolean value indicating whether to ignore releases with no @@ -154,8 +154,8 @@ class Field: config = config_from_argv() with ProductData(config.product) as product_data: render_javascript = config.data.get("render_javascript", False) - render_javascript_click_selector = config.data.get("render_javascript_click_selector", None) render_javascript_wait_until = config.data.get("render_javascript_wait_until", None) + render_javascript_wait_for = config.data.get("render_javascript_wait_for", None) ignore_empty_releases = config.data.get("ignore_empty_releases", False) header_row_selector = config.data.get("header_selector", "thead tr") rows_selector = config.data.get("rows_selector", "tbody tr") @@ -164,8 +164,8 @@ with ProductData(config.product) as product_data: fields = [Field(name, definition) for name, definition in config.data["fields"].items()] if render_javascript: - response_text = http.fetch_javascript_url(config.url, click_selector=render_javascript_click_selector, - wait_until=render_javascript_wait_until) + response_text = http.fetch_javascript_url(config.url, wait_until=render_javascript_wait_until, + wait_for=render_javascript_wait_for) else: response_text = http.fetch_url(config.url).text soup = BeautifulSoup(response_text, features="html5lib")