Fix scripts requiring rendering pages with javascript (#310)
Replace requests_html with playwright, as requests_html is [not maintained anymore](https://pypi.org/project/requests-html/) and scripts using it, such as artifactory.py, started to fail.
This commit is contained in:
1
.github/workflows/update.yml
vendored
1
.github/workflows/update.yml
vendored
@@ -65,7 +65,6 @@ jobs:
|
|||||||
- name: Update release data
|
- name: Update release data
|
||||||
id: update_data
|
id: update_data
|
||||||
env:
|
env:
|
||||||
PYPPETEER_HOME: /home/runner/.cache/pyppeteer # Add chromium downloaded by pyppeteer to the cache.
|
|
||||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
continue-on-error: true # commit even if the data was not fully updated
|
continue-on-error: true # commit even if the data was not fully updated
|
||||||
run: python update.py
|
run: python update.py
|
||||||
|
|||||||
@@ -1,13 +1,13 @@
|
|||||||
beautifulsoup4==4.12.3 # used by a lot of script to parse html
|
beautifulsoup4==4.12.3 # used by a lot of scripts to parse html
|
||||||
deepdiff==6.7.1 # used in update.py
|
deepdiff==6.7.1 # used in update.py
|
||||||
html5lib==1.1 # used in conjunction with beautifulsoup4
|
html5lib==1.1 # used in conjunction with beautifulsoup4
|
||||||
mwparserfromhell==0.6.6 # used in unrealircd.py
|
mwparserfromhell==0.6.6 # used in unrealircd.py
|
||||||
packaging==23.2 # used in latest.py
|
packaging==23.2 # used in latest.py
|
||||||
|
playwright==1.41.2 # used by a few scripts to fetch html pages that need javascript to be rendered
|
||||||
pre-commit==3.5.0 # used to check code before commit
|
pre-commit==3.5.0 # used to check code before commit
|
||||||
python-frontmatter==1.1.0 # used in endoflife.py to parse products YAML frontmatters
|
python-frontmatter==1.1.0 # used in endoflife.py to parse products YAML frontmatters
|
||||||
python-liquid==1.10.2 # used in endoflife.py to render version templates
|
python-liquid==1.10.2 # used in endoflife.py to render version templates
|
||||||
requests==2.31.0 # used in http.py to make HTTP requests simpler
|
requests==2.31.0 # used in http.py to make HTTP requests simpler
|
||||||
requests-html==0.10.0 # used by a few scripts to parse html that needs javascript to be rendered
|
|
||||||
requests-futures==1.0.1 # used in http.py to be able to make async HTTP requests
|
requests-futures==1.0.1 # used in http.py to be able to make async HTTP requests
|
||||||
ruamel.yaml==0.18.5 # used in latest.py
|
ruamel.yaml==0.18.5 # used in latest.py
|
||||||
ruamel.yaml.clib==0.2.8 # used in latest.py
|
ruamel.yaml.clib==0.2.8 # used in latest.py
|
||||||
|
|||||||
21
src/artifactory.py
Normal file
21
src/artifactory.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
from bs4 import BeautifulSoup
from common import dates, http, releasedata

"""Fetches Artifactory versions from https://jfrog.com.

The page is fetched with http.fetch_javascript_url (instead of a plain HTTP request) because
JavaScript is needed to render the page."""

with releasedata.ProductData("artifactory") as product_data:
    content = http.fetch_javascript_url('https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life')
    soup = BeautifulSoup(content, 'html.parser')

    for row in soup.select('.informaltable tbody tr'):
        cells = row.select("td")
        # Only rows with at least a version cell and a date cell are usable.
        if len(cells) < 2:
            continue

        version = cells[0].text.strip()
        if not version:
            continue

        # Rewrite "_" to "-" and "Sept-" to "Sep-" before handing the text to dates.parse_date
        # (presumably the page uses both spellings; verify against the live table).
        date_str = cells[1].text.strip().replace("_", "-").replace("Sept-", "Sep-")
        product_data.declare_version(version, dates.parse_date(date_str))

    # 7.29.9 release date is wrong on https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life.
    # Sent a mail to jfrog-help-center-feedback@jfrog.com to fix it, but in the meantime...
    # NOTE(review): 7.29.9 may already have been declared by the loop above; confirm that
    # declare_version overrides the previously declared date.
    product_data.declare_version('7.29.9', dates.date(2022, 1, 11))
|
||||||
@@ -1,24 +0,0 @@
|
|||||||
|
|
||||||
from common import dates, releasedata
|
|
||||||
from requests_html import HTMLSession
|
|
||||||
|
|
||||||
"""Fetches Artifactory versions from https://jfrog.com, using requests_html because JavaScript is
|
|
||||||
needed to render the page."""
|
|
||||||
|
|
||||||
product = releasedata.Product("artifactory")
|
|
||||||
r = HTMLSession().get("https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life")
|
|
||||||
r.html.render(sleep=2, scrolldown=5)
|
|
||||||
|
|
||||||
for row in r.html.find('.informaltable tbody tr'):
|
|
||||||
cells = row.find("td")
|
|
||||||
if len(cells) >= 2:
|
|
||||||
version = cells[0].text.strip()
|
|
||||||
if version:
|
|
||||||
date_str = cells[1].text.strip().replace("_", "-").replace("Sept-", "Sep-")
|
|
||||||
product.declare_version(version, dates.parse_date(date_str))
|
|
||||||
|
|
||||||
# 7.29.9 release date is wrong on https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life.
|
|
||||||
# Sent a mail to jfrog-help-center-feedback@jfrog.com to fix it, but in the meantime...
|
|
||||||
product.replace_version('7.29.9', dates.date(2022, 1, 11))
|
|
||||||
|
|
||||||
product.write()
|
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
import logging
|
import logging
|
||||||
from concurrent.futures import as_completed
|
from concurrent.futures import as_completed
|
||||||
|
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
from requests import Response
|
from requests import Response
|
||||||
from requests.adapters import HTTPAdapter
|
from requests.adapters import HTTPAdapter
|
||||||
from requests.exceptions import ChunkedEncodingError
|
from requests.exceptions import ChunkedEncodingError
|
||||||
@@ -41,3 +42,17 @@ def fetch_urls(urls: list[str], data: any = None, headers: dict[str, str] = None
|
|||||||
def fetch_url(url: str, data: any = None, headers: dict[str, str] = None,
|
def fetch_url(url: str, data: any = None, headers: dict[str, str] = None,
|
||||||
max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Response:
|
max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Response:
|
||||||
return fetch_urls([url], data, headers, max_retries, backoff_factor, timeout)[0]
|
return fetch_urls([url], data, headers, max_retries, backoff_factor, timeout)[0]
|
||||||
|
|
||||||
|
|
||||||
|
# This requires some setup, see https://playwright.dev/python/docs/intro#installing-playwright.
|
||||||
|
def fetch_javascript_url(url: str) -> str:
    """Return the HTML of ``url`` as rendered by a Playwright-driven Chromium.

    The page is loaded with ``wait_until='networkidle'`` and its content is read afterwards.
    The browser is closed whether or not loading succeeds.

    :param url: address of the page to load.
    :return: the page content reported by Playwright once the page has loaded.
    """
    logging.info(f"Fetching {url}")
    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch()
        try:
            tab = chromium.new_page()
            tab.goto(url, wait_until='networkidle')
            logging.info(f"Fetched {url}")
            rendered_html = tab.content()
        finally:
            # Always release the browser process, even when navigation raised.
            chromium.close()
        return rendered_html
|
||||||
|
|||||||
@@ -1,15 +1,15 @@
|
|||||||
from common import dates, releasedata
|
from bs4 import BeautifulSoup
|
||||||
from requests_html import HTMLSession
|
from common import dates, http, releasedata
|
||||||
|
|
||||||
"""Fetches Confluence versions from www.atlassian.com.
|
"""Fetches Confluence versions from www.atlassian.com.
|
||||||
|
|
||||||
Note that requests_html is used because JavaScript is needed to render the page."""
|
Note that http.fetch_javascript_url is used because JavaScript is needed to render the page."""
|
||||||
|
|
||||||
with releasedata.ProductData("confluence") as product_data:
|
with releasedata.ProductData("confluence") as product_data:
|
||||||
r = HTMLSession().get("https://www.atlassian.com/software/confluence/download-archives")
|
content = http.fetch_javascript_url("https://www.atlassian.com/software/confluence/download-archives")
|
||||||
r.html.render(sleep=1, scrolldown=3)
|
soup = BeautifulSoup(content, 'html.parser')
|
||||||
|
|
||||||
for version_block in r.html.find('.versions-list'):
|
for version_block in soup.select('.versions-list'):
|
||||||
version = version_block.find('a.product-versions', first=True).attrs['data-version']
|
version = version_block.select_one('a.product-versions').attrs['data-version']
|
||||||
date = dates.parse_date(version_block.find('.release-date', first=True).text)
|
date = dates.parse_date(version_block.select_one('.release-date').text)
|
||||||
product_data.declare_version(version, date)
|
product_data.declare_version(version, date)
|
||||||
|
|||||||
14
src/jira.py
14
src/jira.py
@@ -1,15 +1,15 @@
|
|||||||
from common import dates, releasedata
|
from bs4 import BeautifulSoup
|
||||||
from requests_html import HTMLSession
|
from common import dates, http, releasedata
|
||||||
|
|
||||||
"""Fetches Jira versions from www.atlassian.com.
|
"""Fetches Jira versions from www.atlassian.com.
|
||||||
|
|
||||||
Note that requests_html is used because JavaScript is needed to render the page."""
|
Note that http.fetch_javascript_url is used because JavaScript is needed to render the page."""
|
||||||
|
|
||||||
with releasedata.ProductData("jira") as product_data:
|
with releasedata.ProductData("jira") as product_data:
|
||||||
r = HTMLSession().get("https://www.atlassian.com/software/jira/update")
|
content = http.fetch_javascript_url("https://www.atlassian.com/software/jira/update")
|
||||||
r.html.render(sleep=1, scrolldown=3)
|
soup = BeautifulSoup(content, 'html.parser')
|
||||||
|
|
||||||
for version_block in r.html.find('.versions-list'):
|
for version_block in soup.select('.versions-list'):
|
||||||
version = version_block.find('a.product-versions', first=True).attrs['data-version']
|
version = version_block.select_one('a.product-versions').attrs['data-version']
|
||||||
date = dates.parse_date(version_block.find('.release-date', first=True).text)
|
date = dates.parse_date(version_block.select_one('.release-date').text)
|
||||||
product_data.declare_version(version, date)
|
product_data.declare_version(version, date)
|
||||||
|
|||||||
@@ -1,20 +1,20 @@
|
|||||||
from common import dates, releasedata
|
from bs4 import BeautifulSoup
|
||||||
from requests_html import HTMLSession
|
from common import dates, http, releasedata
|
||||||
|
|
||||||
"""Fetch Java versions from https://www.java.com/releases/.
|
"""Fetch Java versions from https://www.java.com/releases/.
|
||||||
|
|
||||||
This script is using requests-html because the page needs JavaScript to render correctly."""
|
This script is using http.fetch_javascript_url because the page needs JavaScript to render correctly."""
|
||||||
|
|
||||||
with releasedata.ProductData("oracle-jdk") as product_data:
|
with releasedata.ProductData("oracle-jdk") as product_data:
|
||||||
r = HTMLSession().get('https://www.java.com/releases/')
|
content = http.fetch_javascript_url('https://www.java.com/releases/')
|
||||||
r.html.render(sleep=1, scrolldown=3)
|
soup = BeautifulSoup(content, 'html.parser')
|
||||||
|
|
||||||
previous_date = None
|
previous_date = None
|
||||||
for row in r.html.find('#released tr'):
|
for row in soup.select('#released tr'):
|
||||||
version_cell = row.find('td.anchor', first=True)
|
version_cell = row.select_one('td.anchor')
|
||||||
if version_cell:
|
if version_cell:
|
||||||
version = version_cell.attrs['id']
|
version = version_cell.attrs['id']
|
||||||
date_str = row.find('td')[1].text
|
date_str = row.select('td')[1].text
|
||||||
date = dates.parse_date(date_str) if date_str else previous_date
|
date = dates.parse_date(date_str) if date_str else previous_date
|
||||||
product_data.declare_version(version, date)
|
product_data.declare_version(version, date)
|
||||||
previous_date = date
|
previous_date = date
|
||||||
|
|||||||
10
update.py
10
update.py
@@ -58,6 +58,13 @@ class ScriptExecutionSummary:
|
|||||||
return not all(self.success_by_product.values())
|
return not all(self.success_by_product.values())
|
||||||
|
|
||||||
|
|
||||||
|
def install_playwright() -> None:
    """Run ``playwright install chromium`` inside an "Install Playwright" GitHub log group.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero status (``check=True``).
        subprocess.TimeoutExpired: if the command has not finished after 120 seconds.
    """
    with GitHubGroup("Install Playwright"):
        logging.info("Installing Playwright")
        # The command is constant, so pass it as an argument list instead of going through a
        # shell: nothing needs shell parsing, and the timeout then applies to the playwright
        # process itself rather than to an intermediate shell.
        subprocess.run(['playwright', 'install', 'chromium'], timeout=120, check=True)
        logging.info("Playwright installed")
|
||||||
|
|
||||||
|
|
||||||
def __delete_data(product: ProductFrontmatter) -> None:
|
def __delete_data(product: ProductFrontmatter) -> None:
|
||||||
release_data_path = DATA_DIR / f"{product.name}.json"
|
release_data_path = DATA_DIR / f"{product.name}.json"
|
||||||
if not release_data_path.exists() or product.is_auto_update_cumulative():
|
if not release_data_path.exists() or product.is_auto_update_cumulative():
|
||||||
@@ -154,7 +161,10 @@ def generate_commit_message(old_content: dict[Path, dict], new_content: dict[Pat
|
|||||||
|
|
||||||
logging.basicConfig(format="%(message)s", level=logging.INFO)
|
logging.basicConfig(format="%(message)s", level=logging.INFO)
|
||||||
p_filter = sys.argv[1] if len(sys.argv) > 1 else None
|
p_filter = sys.argv[1] if len(sys.argv) > 1 else None
|
||||||
|
|
||||||
|
|
||||||
with GitHubStepSummary() as step_summary:
|
with GitHubStepSummary() as step_summary:
|
||||||
|
install_playwright()
|
||||||
some_script_failed = run_scripts(step_summary, p_filter)
|
some_script_failed = run_scripts(step_summary, p_filter)
|
||||||
updated_products = get_updated_products()
|
updated_products = get_updated_products()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user