From 9cf243a10e8d907046e91cbc17cc465f6c3e4684 Mon Sep 17 00:00:00 2001 From: Marc Wrobel Date: Fri, 16 Feb 2024 22:51:21 +0100 Subject: [PATCH] Fix scripts requiring rendering pages with javascript (#310) Replace requests_html with playwright, as requests_html is [not maintained anymore](https://pypi.org/project/requests-html/) and scripts using it, such as artifactory.py, started to fail. --- .github/workflows/update.yml | 1 - requirements.txt | 4 ++-- src/artifactory.py | 21 +++++++++++++++++++++ src/artifactory.py.disabled | 24 ------------------------ src/common/http.py | 15 +++++++++++++++ src/confluence.py | 14 +++++++------- src/jira.py | 14 +++++++------- src/oracle-jdk.py | 14 +++++++------- update.py | 10 ++++++++++ 9 files changed, 69 insertions(+), 48 deletions(-) create mode 100644 src/artifactory.py delete mode 100644 src/artifactory.py.disabled diff --git a/.github/workflows/update.yml b/.github/workflows/update.yml index cbe91fb4..7021efb1 100644 --- a/.github/workflows/update.yml +++ b/.github/workflows/update.yml @@ -65,7 +65,6 @@ jobs: - name: Update release data id: update_data env: - PYPPETEER_HOME: /home/runner/.cache/pyppeteer # Add chromium downloaded by pyppeteer to the cache. 
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} continue-on-error: true # commit even if the data was not fully updated run: python update.py diff --git a/requirements.txt b/requirements.txt index 6dee180e..ad1e7322 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,13 @@ -beautifulsoup4==4.12.3 # used by a lot of script to parse html +beautifulsoup4==4.12.3 # used by a lot of scripts to parse html deepdiff==6.7.1 # used in update.py html5lib==1.1 # used in conjunction with beautifulsoup4 mwparserfromhell==0.6.6 # used in unrealircd.py packaging==23.2 # used in latest.py +playwright==1.41.2 # used in http.py to render html pages that need javascript pre-commit==3.5.0 # used to check code before commit python-frontmatter==1.1.0 # used in endoflife.py to parse products YAML frontmatters python-liquid==1.10.2 # used in endoflife.py to render version templates requests==2.31.0 # used in http.py to make HTTP requests simpler -requests-html==0.10.0 # used by a few scripts to parse html that needs javascript to be rendered requests-futures==1.0.1 # used in http.py to be able to make async HTTP requests ruamel.yaml==0.18.5 # used in latest.py ruamel.yaml.clib==0.2.8 # used in latest.py diff --git a/src/artifactory.py b/src/artifactory.py new file mode 100644 index 00000000..77ca3766 --- /dev/null +++ b/src/artifactory.py @@ -0,0 +1,21 @@ +from bs4 import BeautifulSoup +from common import dates, http, releasedata + +"""Fetches Artifactory versions from https://jfrog.com, using playwright because JavaScript is +needed to render the page.""" + +with releasedata.ProductData("artifactory") as product_data: + content = http.fetch_javascript_url('https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life') + soup = BeautifulSoup(content, 'html.parser') + + for row in soup.select('.informaltable tbody tr'): + cells = row.select("td") + if len(cells) >= 2: + version = cells[0].text.strip() + if version: + date_str = cells[1].text.strip().replace("_", "-").replace("Sept-", 
"Sep-") + product_data.declare_version(version, dates.parse_date(date_str)) + + # 7.29.9 release date is wrong on https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life. + # Sent a mail to jfrog-help-center-feedback@jfrog.com to fix it, but in the meantime... + product_data.declare_version('7.29.9', dates.date(2022, 1, 11)) diff --git a/src/artifactory.py.disabled b/src/artifactory.py.disabled deleted file mode 100644 index be93088c..00000000 --- a/src/artifactory.py.disabled +++ /dev/null @@ -1,24 +0,0 @@ - -from common import dates, releasedata -from requests_html import HTMLSession - -"""Fetches Artifactory versions from https://jfrog.com, using requests_html because JavaScript is -needed to render the page.""" - -product = releasedata.Product("artifactory") -r = HTMLSession().get("https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life") -r.html.render(sleep=2, scrolldown=5) - -for row in r.html.find('.informaltable tbody tr'): - cells = row.find("td") - if len(cells) >= 2: - version = cells[0].text.strip() - if version: - date_str = cells[1].text.strip().replace("_", "-").replace("Sept-", "Sep-") - product.declare_version(version, dates.parse_date(date_str)) - -# 7.29.9 release date is wrong on https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life. -# Sent a mail to jfrog-help-center-feedback@jfrog.com to fix it, but in the meantime... 
-product.replace_version('7.29.9', dates.date(2022, 1, 11)) - -product.write() diff --git a/src/common/http.py b/src/common/http.py index 33f9fdfc..400c55f4 100644 --- a/src/common/http.py +++ b/src/common/http.py @@ -1,6 +1,7 @@ import logging from concurrent.futures import as_completed +from playwright.sync_api import sync_playwright from requests import Response from requests.adapters import HTTPAdapter from requests.exceptions import ChunkedEncodingError @@ -41,3 +42,17 @@ def fetch_urls(urls: list[str], data: any = None, headers: dict[str, str] = None def fetch_url(url: str, data: any = None, headers: dict[str, str] = None, max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Response: return fetch_urls([url], data, headers, max_retries, backoff_factor, timeout)[0] + + +# This requires some setup, see https://playwright.dev/python/docs/intro#installing-playwright. +def fetch_javascript_url(url: str) -> str: + logging.info(f"Fetching {url}") + with sync_playwright() as p: + browser = p.chromium.launch() + try: + page = browser.new_page() + page.goto(url, wait_until='networkidle') + logging.info(f"Fetched {url}") + return page.content() + finally: + browser.close() diff --git a/src/confluence.py b/src/confluence.py index 4da9f6da..2f147b80 100644 --- a/src/confluence.py +++ b/src/confluence.py @@ -1,15 +1,15 @@ -from common import dates, releasedata -from requests_html import HTMLSession +from bs4 import BeautifulSoup +from common import dates, http, releasedata """Fetches Confluence versions from www.atlassian.com. 
Note that requests_html is used because JavaScript is needed to render the page.""" with releasedata.ProductData("confluence") as product_data: - r = HTMLSession().get("https://www.atlassian.com/software/confluence/download-archives") - r.html.render(sleep=1, scrolldown=3) + content = http.fetch_javascript_url("https://www.atlassian.com/software/confluence/download-archives") + soup = BeautifulSoup(content, 'html.parser') - for version_block in r.html.find('.versions-list'): - version = version_block.find('a.product-versions', first=True).attrs['data-version'] - date = dates.parse_date(version_block.find('.release-date', first=True).text) + for version_block in soup.select('.versions-list'): + version = version_block.select_one('a.product-versions').attrs['data-version'] + date = dates.parse_date(version_block.select_one('.release-date').text) product_data.declare_version(version, date) diff --git a/src/jira.py b/src/jira.py index 60b626ea..e61f3813 100644 --- a/src/jira.py +++ b/src/jira.py @@ -1,15 +1,15 @@ -from common import dates, releasedata -from requests_html import HTMLSession +from bs4 import BeautifulSoup +from common import dates, http, releasedata """Fetches Jira versions from www.atlassian.com. 
Note that requests_html is used because JavaScript is needed to render the page.""" with releasedata.ProductData("jira") as product_data: - r = HTMLSession().get("https://www.atlassian.com/software/jira/update") - r.html.render(sleep=1, scrolldown=3) + content = http.fetch_javascript_url("https://www.atlassian.com/software/jira/update") + soup = BeautifulSoup(content, 'html.parser') - for version_block in r.html.find('.versions-list'): - version = version_block.find('a.product-versions', first=True).attrs['data-version'] - date = dates.parse_date(version_block.find('.release-date', first=True).text) + for version_block in soup.select('.versions-list'): + version = version_block.select_one('a.product-versions').attrs['data-version'] + date = dates.parse_date(version_block.select_one('.release-date').text) product_data.declare_version(version, date) diff --git a/src/oracle-jdk.py b/src/oracle-jdk.py index c4f176c7..5ff36ce2 100644 --- a/src/oracle-jdk.py +++ b/src/oracle-jdk.py @@ -1,20 +1,20 @@ -from common import dates, releasedata -from requests_html import HTMLSession +from bs4 import BeautifulSoup +from common import dates, http, releasedata """Fetch Java versions from https://www.java.com/releases/. 
This script is using requests-html because the page needs JavaScript to render correctly.""" with releasedata.ProductData("oracle-jdk") as product_data: - r = HTMLSession().get('https://www.java.com/releases/') - r.html.render(sleep=1, scrolldown=3) + content = http.fetch_javascript_url('https://www.java.com/releases/') + soup = BeautifulSoup(content, 'html.parser') previous_date = None - for row in r.html.find('#released tr'): - version_cell = row.find('td.anchor', first=True) + for row in soup.select('#released tr'): + version_cell = row.select_one('td.anchor') if version_cell: version = version_cell.attrs['id'] - date_str = row.find('td')[1].text + date_str = row.select('td')[1].text date = dates.parse_date(date_str) if date_str else previous_date product_data.declare_version(version, date) previous_date = date diff --git a/update.py b/update.py index dc2c1759..8abf8f24 100644 --- a/update.py +++ b/update.py @@ -58,6 +58,13 @@ class ScriptExecutionSummary: return not all(self.success_by_product.values()) +def install_playwright() -> None: + with GitHubGroup("Install Playwright"): + logging.info("Installing Playwright") + subprocess.run('playwright install chromium', timeout=120, check=True, shell=True) + logging.info("Playwright installed") + + def __delete_data(product: ProductFrontmatter) -> None: release_data_path = DATA_DIR / f"{product.name}.json" if not release_data_path.exists() or product.is_auto_update_cumulative(): @@ -154,7 +161,10 @@ def generate_commit_message(old_content: dict[Path, dict], new_content: dict[Pat logging.basicConfig(format="%(message)s", level=logging.INFO) p_filter = sys.argv[1] if len(sys.argv) > 1 else None + + with GitHubStepSummary() as step_summary: + install_playwright() some_script_failed = run_scripts(step_summary, p_filter) updated_products = get_updated_products()