Fix scripts requiring rendering pages with javascript (#310)

Replace requests_html with playwright, as requests_html is [not maintained anymore](https://pypi.org/project/requests-html/) and scripts using it, such as artifactory.py, started to fail.
This commit is contained in:
Marc Wrobel
2024-02-16 22:51:21 +01:00
parent 1175756d11
commit 9cf243a10e
9 changed files with 69 additions and 48 deletions

21
src/artifactory.py Normal file
View File

@@ -0,0 +1,21 @@
from bs4 import BeautifulSoup
from common import dates, http, releasedata
"""Fetches Artifactory versions from https://jfrog.com, using requests_html because JavaScript is
needed to render the page."""
with releasedata.ProductData("artifactory") as product_data:
    # The page is fetched through http.fetch_javascript_url and parsed with BeautifulSoup.
    page_html = http.fetch_javascript_url('https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life')
    document = BeautifulSoup(page_html, 'html.parser')

    for table_row in document.select('.informaltable tbody tr'):
        columns = table_row.select("td")
        if len(columns) < 2:
            continue  # fewer than two cells: no version/date pair to read

        version = columns[0].text.strip()
        if not version:
            continue  # first cell is blank: nothing to declare

        # Rewrite "_" to "-" and "Sept-" to "Sep-" before handing the text to dates.parse_date
        # (presumably to match the formats it accepts - confirm against common/dates.py).
        raw_date = columns[1].text.strip()
        normalized_date = raw_date.replace("_", "-").replace("Sept-", "Sep-")
        product_data.declare_version(version, dates.parse_date(normalized_date))

    # 7.29.9 release date is wrong on https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life.
    # Sent a mail to jfrog-help-center-feedback@jfrog.com to fix it, but in the meantime...
    product_data.declare_version('7.29.9', dates.date(2022, 1, 11))

View File

@@ -1,24 +0,0 @@
from common import dates, releasedata
from requests_html import HTMLSession
"""Fetches Artifactory versions from https://jfrog.com, using requests_html because JavaScript is
needed to render the page."""
product = releasedata.Product("artifactory")

# Fetch the page, then let requests_html render it (JavaScript is needed to build the table).
session = HTMLSession()
response = session.get("https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life")
response.html.render(sleep=2, scrolldown=5)

for table_row in response.html.find('.informaltable tbody tr'):
    columns = table_row.find("td")
    if len(columns) < 2:
        continue  # fewer than two cells: no version/date pair to read

    version = columns[0].text.strip()
    if not version:
        continue  # first cell is blank: nothing to declare

    # Rewrite "_" to "-" and "Sept-" to "Sep-" before handing the text to dates.parse_date.
    raw_date = columns[1].text.strip()
    normalized_date = raw_date.replace("_", "-").replace("Sept-", "Sep-")
    product.declare_version(version, dates.parse_date(normalized_date))

# 7.29.9 release date is wrong on https://jfrog.com/help/r/jfrog-release-information/artifactory-end-of-life.
# Sent a mail to jfrog-help-center-feedback@jfrog.com to fix it, but in the meantime...
product.replace_version('7.29.9', dates.date(2022, 1, 11))

product.write()

View File

@@ -1,6 +1,7 @@
import logging
from concurrent.futures import as_completed
from playwright.sync_api import sync_playwright
from requests import Response
from requests.adapters import HTTPAdapter
from requests.exceptions import ChunkedEncodingError
@@ -41,3 +42,17 @@ def fetch_urls(urls: list[str], data: any = None, headers: dict[str, str] = None
def fetch_url(url: str, data: any = None, headers: dict[str, str] = None,
              max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> Response:
    """Fetch a single URL and return its response.

    Thin wrapper around ``fetch_urls``: ``url`` is wrapped in a one-element list, every
    other argument is forwarded unchanged, and the first element of the result is returned.
    """
    responses = fetch_urls([url], data, headers, max_retries, backoff_factor, timeout)
    return responses[0]
# Playwright must be set up first, see https://playwright.dev/python/docs/intro#installing-playwright.
def fetch_javascript_url(url: str) -> str:
    """Load ``url`` in a Chromium browser driven by Playwright and return the page content.

    Navigation waits until the 'networkidle' state before the content is read. The browser
    is closed in a ``finally`` clause, so it is released even when loading the page fails.

    :param url: address of the page to load.
    :return: the value of ``page.content()`` for the loaded page.
    """
    logging.info(f"Fetching {url}")
    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch()
        try:
            tab = chromium.new_page()
            tab.goto(url, wait_until='networkidle')
            logging.info(f"Fetched {url}")
            return tab.content()
        finally:
            chromium.close()

View File

@@ -1,15 +1,15 @@
from common import dates, releasedata
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from common import dates, http, releasedata
"""Fetches Confluence versions from www.atlassian.com.
Note that http.fetch_javascript_url is used because JavaScript is needed to render the page."""
with releasedata.ProductData("confluence") as product_data:
r = HTMLSession().get("https://www.atlassian.com/software/confluence/download-archives")
r.html.render(sleep=1, scrolldown=3)
content = http.fetch_javascript_url("https://www.atlassian.com/software/confluence/download-archives")
soup = BeautifulSoup(content, 'html.parser')
for version_block in r.html.find('.versions-list'):
version = version_block.find('a.product-versions', first=True).attrs['data-version']
date = dates.parse_date(version_block.find('.release-date', first=True).text)
for version_block in soup.select('.versions-list'):
version = version_block.select_one('a.product-versions').attrs['data-version']
date = dates.parse_date(version_block.select_one('.release-date').text)
product_data.declare_version(version, date)

View File

@@ -1,15 +1,15 @@
from common import dates, releasedata
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from common import dates, http, releasedata
"""Fetches Jira versions from www.atlassian.com.
Note that http.fetch_javascript_url is used because JavaScript is needed to render the page."""
with releasedata.ProductData("jira") as product_data:
r = HTMLSession().get("https://www.atlassian.com/software/jira/update")
r.html.render(sleep=1, scrolldown=3)
content = http.fetch_javascript_url("https://www.atlassian.com/software/jira/update")
soup = BeautifulSoup(content, 'html.parser')
for version_block in r.html.find('.versions-list'):
version = version_block.find('a.product-versions', first=True).attrs['data-version']
date = dates.parse_date(version_block.find('.release-date', first=True).text)
for version_block in soup.select('.versions-list'):
version = version_block.select_one('a.product-versions').attrs['data-version']
date = dates.parse_date(version_block.select_one('.release-date').text)
product_data.declare_version(version, date)

View File

@@ -1,20 +1,20 @@
from common import dates, releasedata
from requests_html import HTMLSession
from bs4 import BeautifulSoup
from common import dates, http, releasedata
"""Fetch Java versions from https://www.java.com/releases/.
This script uses http.fetch_javascript_url because the page needs JavaScript to render correctly."""
with releasedata.ProductData("oracle-jdk") as product_data:
r = HTMLSession().get('https://www.java.com/releases/')
r.html.render(sleep=1, scrolldown=3)
content = http.fetch_javascript_url('https://www.java.com/releases/')
soup = BeautifulSoup(content, 'html.parser')
previous_date = None
for row in r.html.find('#released tr'):
version_cell = row.find('td.anchor', first=True)
for row in soup.select('#released tr'):
version_cell = row.select_one('td.anchor')
if version_cell:
version = version_cell.attrs['id']
date_str = row.find('td')[1].text
date_str = row.select('td')[1].text
date = dates.parse_date(date_str) if date_str else previous_date
product_data.declare_version(version, date)
previous_date = date