From 9cf243a10e8d907046e91cbc17cc465f6c3e4684 Mon Sep 17 00:00:00 2001
From: Marc Wrobel <marc.wrobel@gmail.com>
Date: Fri, 16 Feb 2024 22:51:21 +0100
Subject: [PATCH] Fix scripts requiring rendering pages with javascript (#310)

Replace requests_html with playwright, as requests_html is [not maintained anymore](https://pypi.org/project/requests-html/) and scripts using it, such as artifactory.py, started to fail.
---
 .github/workflows/update.yml |  1 -
 requirements.txt             |  4 ++--
 src/artifactory.py           | 21 +++++++++++++++++++++
 src/artifactory.py.disabled  | 24 ------------------------
 src/common/http.py           | 15 +++++++++++++++
 src/confluence.py            | 14 +++++++-------
 src/jira.py                  | 14 +++++++-------
 src/oracle-jdk.py            | 14 +++++++-------
 update.py                    | 10 ++++++++++
 9 files changed, 69 insertions(+), 48 deletions(-)
 create mode 100644 src/artifactory.py
 delete mode 100644 src/artifactory.py.disabled

diff --git a/.github/workflows/update.yml b/.github/workflows/update.yml
index cbe91fb4..7021efb1 100644
--- a/.github/workflows/update.yml
+++ b/.github/workflows/update.yml
@@ -65,7 +65,6 @@ jobs:
       - name: Update release data
         id: update_data
         env:
-          PYPPETEER_HOME: /home/runner/.cache/pyppeteer # Add chromium downloaded by pyppeteer to the cache.
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
         continue-on-error: true # commit even if the data was not fully updated
         run: python update.py
diff --git a/requirements.txt b/requirements.txt
index 6dee180e..ad1e7322 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,13 @@
-beautifulsoup4==4.12.3 # used by a lot of script to parse html
+beautifulsoup4==4.12.3 # used by a lot of scripts to parse html
 deepdiff==6.7.1 # used in update.py
 html5lib==1.1 # used in conjunction with beautifulsoup4
 mwparserfromhell==0.6.6 # used in unrealircd.py
 packaging==23.2 # used in latest.py
+playwright==1.41.2 # used by a few scripts to render html pages that need javascript
 pre-commit==3.5.0 # used to check code before commit
 python-frontmatter==1.1.0 # used in endoflife.py to parse products YAML frontmatters
 python-liquid==1.10.2 # used in endoflife.py to render version templates
 requests==2.31.0 # used in http.py to make HTTP requests simpler
-requests-html==0.10.0 # used by a few scripts to parse html that needs javascript to be rendered
 requests-futures==1.0.1 # used in http.py to be able to make async HTTP requests
 ruamel.yaml==0.18.5 # used in latest.py
 ruamel.yaml.clib==0.2.8 # used in latest.py
diff --git a/src/artifactory.py b/src/artifactory.py
new file mode 100644
index 00000000..77ca3766
--- /dev/null
+++ b/src/artifactory.py
@@ -0,0 +1,21 @@
+from bs4 import BeautifulSoup
+from common import dates, http, releasedata
+
+"""Fetches Artifactory versions from https://jfrog.com, using Playwright because JavaScript is
+needed to render the page."""
+
+with releasedata.ProductData("artifactory") as product_data:
+    content = http.fetch_javascript_url('https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life')
+    soup = BeautifulSoup(content, 'html.parser')
+
+    for row in soup.select('.informaltable tbody tr'):
+        cells = row.select("td")
+        if len(cells) >= 2:
+            version = cells[0].text.strip()
+            if version:
+                date_str = cells[1].text.strip().replace("_", "-").replace("Sept-", "Sep-")
+                product_data.declare_version(version, dates.parse_date(date_str))
+
+    # 7.29.9 release date is wrong on https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life.
+    # Sent a mail to jfrog-help-center-feedback@jfrog.com to fix it, but in the meantime...
+    product_data.declare_version('7.29.9', dates.date(2022, 1, 11))
diff --git a/src/artifactory.py.disabled b/src/artifactory.py.disabled
deleted file mode 100644
index be93088c..00000000
--- a/src/artifactory.py.disabled
+++ /dev/null
@@ -1,24 +0,0 @@
-
-from common import dates, releasedata
-from requests_html import HTMLSession
-
-"""Fetches Artifactory versions from https://jfrog.com, using requests_html because JavaScript is
-needed to render the page."""
-
-product = releasedata.Product("artifactory")
-r = HTMLSession().get("https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life")
-r.html.render(sleep=2, scrolldown=5)
-
-for row in r.html.find('.informaltable tbody tr'):
-    cells = row.find("td")
-    if len(cells) >= 2:
-        version = cells[0].text.strip()
-        if version:
-            date_str = cells[1].text.strip().replace("_", "-").replace("Sept-", "Sep-")
-            product.declare_version(version, dates.parse_date(date_str))
-
-# 7.29.9 release date is wrong on https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life.
-# Sent a mail to jfrog-help-center-feedback@jfrog.com to fix it, but in the meantime...
-product.replace_version('7.29.9', dates.date(2022, 1, 11))
-
-product.write()
diff --git a/src/common/http.py b/src/common/http.py
index 33f9fdfc..400c55f4 100644
--- a/src/common/http.py
+++ b/src/common/http.py
@@ -1,6 +1,7 @@
 import logging
 from concurrent.futures import as_completed
 
+from playwright.sync_api import sync_playwright
 from requests import Response
 from requests.adapters import HTTPAdapter
 from requests.exceptions import ChunkedEncodingError
@@ -41,3 +42,17 @@ def fetch_urls(urls: list[str], data: any = None, headers: dict[str, str] = None
 def fetch_url(url: str, data: any = None, headers: dict[str, str] = None,
               max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Response:
     return fetch_urls([url], data, headers, max_retries, backoff_factor, timeout)[0]
+
+
+# This requires some setup, see https://playwright.dev/python/docs/intro#installing-playwright.
+def fetch_javascript_url(url: str) -> str:
+    logging.info(f"Fetching {url}")
+    with sync_playwright() as p:
+        browser = p.chromium.launch()
+        try:
+            page = browser.new_page()
+            page.goto(url, wait_until='networkidle')
+            logging.info(f"Fetched {url}")
+            return page.content()
+        finally:
+            browser.close()
diff --git a/src/confluence.py b/src/confluence.py
index 4da9f6da..2f147b80 100644
--- a/src/confluence.py
+++ b/src/confluence.py
@@ -1,15 +1,15 @@
-from common import dates, releasedata
-from requests_html import HTMLSession
+from bs4 import BeautifulSoup
+from common import dates, http, releasedata
 
 """Fetches Confluence versions from www.atlassian.com.
 
 Note that requests_html is used because JavaScript is needed to render the page."""
 
 with releasedata.ProductData("confluence") as product_data:
-    r = HTMLSession().get("https://www.atlassian.com/software/confluence/download-archives")
-    r.html.render(sleep=1, scrolldown=3)
+    content = http.fetch_javascript_url("https://www.atlassian.com/software/confluence/download-archives")
+    soup = BeautifulSoup(content, 'html.parser')
 
-    for version_block in r.html.find('.versions-list'):
-        version = version_block.find('a.product-versions', first=True).attrs['data-version']
-        date = dates.parse_date(version_block.find('.release-date', first=True).text)
+    for version_block in soup.select('.versions-list'):
+        version = version_block.select_one('a.product-versions').attrs['data-version']
+        date = dates.parse_date(version_block.select_one('.release-date').text)
         product_data.declare_version(version, date)
diff --git a/src/jira.py b/src/jira.py
index 60b626ea..e61f3813 100644
--- a/src/jira.py
+++ b/src/jira.py
@@ -1,15 +1,15 @@
-from common import dates, releasedata
-from requests_html import HTMLSession
+from bs4 import BeautifulSoup
+from common import dates, http, releasedata
 
 """Fetches Jira versions from www.atlassian.com.
 
 Note that requests_html is used because JavaScript is needed to render the page."""
 
 with releasedata.ProductData("jira") as product_data:
-    r = HTMLSession().get("https://www.atlassian.com/software/jira/update")
-    r.html.render(sleep=1, scrolldown=3)
+    content = http.fetch_javascript_url("https://www.atlassian.com/software/jira/update")
+    soup = BeautifulSoup(content, 'html.parser')
 
-    for version_block in r.html.find('.versions-list'):
-        version = version_block.find('a.product-versions', first=True).attrs['data-version']
-        date = dates.parse_date(version_block.find('.release-date', first=True).text)
+    for version_block in soup.select('.versions-list'):
+        version = version_block.select_one('a.product-versions').attrs['data-version']
+        date = dates.parse_date(version_block.select_one('.release-date').text)
         product_data.declare_version(version, date)
diff --git a/src/oracle-jdk.py b/src/oracle-jdk.py
index c4f176c7..5ff36ce2 100644
--- a/src/oracle-jdk.py
+++ b/src/oracle-jdk.py
@@ -1,20 +1,20 @@
-from common import dates, releasedata
-from requests_html import HTMLSession
+from bs4 import BeautifulSoup
+from common import dates, http, releasedata
 
 """Fetch Java versions from https://www.java.com/releases/.
 
 This script is using requests-html because the page needs JavaScript to render correctly."""
 
 with releasedata.ProductData("oracle-jdk") as product_data:
-    r = HTMLSession().get('https://www.java.com/releases/')
-    r.html.render(sleep=1, scrolldown=3)
+    content = http.fetch_javascript_url('https://www.java.com/releases/')
+    soup = BeautifulSoup(content, 'html.parser')
 
     previous_date = None
-    for row in r.html.find('#released tr'):
-        version_cell = row.find('td.anchor', first=True)
+    for row in soup.select('#released tr'):
+        version_cell = row.select_one('td.anchor')
         if version_cell:
             version = version_cell.attrs['id']
-            date_str = row.find('td')[1].text
+            date_str = row.select('td')[1].text
             date = dates.parse_date(date_str) if date_str else previous_date
             product_data.declare_version(version, date)
             previous_date = date
diff --git a/update.py b/update.py
index dc2c1759..8abf8f24 100644
--- a/update.py
+++ b/update.py
@@ -58,6 +58,13 @@ class ScriptExecutionSummary:
         return not all(self.success_by_product.values())
 
 
+def install_playwright() -> None:
+    with GitHubGroup("Install Playwright"):
+        logging.info("Installing Playwright")
+        subprocess.run('playwright install chromium', timeout=120, check=True, shell=True)
+        logging.info("Playwright installed")
+
+
 def __delete_data(product: ProductFrontmatter) -> None:
     release_data_path = DATA_DIR / f"{product.name}.json"
     if not release_data_path.exists() or product.is_auto_update_cumulative():
@@ -154,7 +161,10 @@ def generate_commit_message(old_content: dict[Path, dict], new_content: dict[Pat
 
 logging.basicConfig(format="%(message)s", level=logging.INFO)
 p_filter = sys.argv[1] if len(sys.argv) > 1 else None
+
+
 with GitHubStepSummary() as step_summary:
+    install_playwright()
     some_script_failed = run_scripts(step_summary, p_filter)
     updated_products = get_updated_products()