[firefox] Simplify script

- use requests_futures instead of managing concurrency manually,
- drop retrieval of versions < 10.0 to keep the parsing simple.
This commit is contained in:
Marc Wrobel
2023-11-25 11:20:56 +01:00
parent f0324372d9
commit deb5d9f1a2
3 changed files with 42 additions and 219 deletions

View File

@@ -463,89 +463,10 @@
"10.0.5": "2012-06-05",
"12.0": "2012-04-24",
"10.0.4": "2012-04-24",
"3.6.28": "2012-03-13",
"11.0": "2012-03-13",
"10.0.3": "2012-03-13",
"3.6.27": "2012-02-17",
"10.0.2": "2012-02-16",
"10.0.1": "2012-02-10",
"3.6.26": "2012-01-31",
"10.0": "2012-01-31",
"9.0.1": "2011-12-21",
"9.0": "2011-12-20",
"3.6.25": "2011-12-20",
"8.0.1": "2011-11-21",
"8.0": "2011-11-08",
"3.6.24": "2011-11-08",
"7.0.1": "2011-09-29",
"7.0": "2011-09-27",
"3.6.23": "2011-09-27",
"6.0.2": "2011-09-06",
"3.6.22": "2011-09-06",
"6.0.1": "2011-08-30",
"3.6.21": "2011-08-30",
"6.0": "2011-08-16",
"3.6.20": "2011-08-16",
"5.0.1": "2011-07-11",
"3.6.19": "2011-07-11",
"5.0": "2011-06-21",
"3.6.18": "2011-06-21",
"4.0.1": "2011-04-28",
"3.6.17": "2011-04-28",
"3.5.19": "2011-04-28",
"4.0": "2011-03-22",
"3.6.16": "2011-03-22",
"3.5.18": "2011-03-22",
"3.6.15": "2011-03-04",
"3.6.14": "2011-03-01",
"3.5.17": "2011-03-01",
"3.6.13": "2010-12-09",
"3.5.16": "2010-12-09",
"3.6.12": "2010-10-27",
"3.5.15": "2010-10-27",
"3.6.11": "2010-10-19",
"3.5.14": "2010-10-19",
"3.6.10": "2010-09-15",
"3.5.13": "2010-09-15",
"3.6.9": "2010-09-07",
"3.5.12": "2010-09-07",
"3.6.8": "2010-07-23",
"3.6.7": "2010-07-20",
"3.5.11": "2010-07-20",
"3.6.6": "2010-06-26",
"3.6.4": "2010-06-22",
"3.5.10": "2010-06-22",
"3.6.3": "2010-04-01",
"3.5.9": "2010-03-30",
"3.0.19": "2010-03-30",
"3.6.2": "2010-03-22",
"3.5.8": "2010-02-17",
"3.0.18": "2010-02-17",
"3.6": "2010-01-21",
"3.5.7": "2010-01-05",
"3.0.17": "2010-01-05",
"3.5.6": "2009-12-15",
"3.0.16": "2009-12-15",
"3.5.5": "2009-11-05",
"3.5.4": "2009-10-27",
"3.0.15": "2009-10-27",
"3.5.3": "2009-09-09",
"3.0.14": "2009-09-09",
"3.5.2": "2009-08-03",
"3.0.13": "2009-08-03",
"3.0.12": "2009-07-21",
"3.5.1": "2009-07-16",
"3.5": "2009-06-30",
"3.0.11": "2009-06-11",
"3.0.10": "2009-04-27",
"3.0.9": "2009-04-21",
"3.0.8": "2009-03-27",
"3.0.7": "2009-03-04",
"3.0.6": "2009-02-03",
"3.0.5": "2008-12-16",
"3.0.4": "2008-11-12",
"3.0.3": "2008-09-26",
"3.0.2": "2008-09-23",
"3.0.1": "2008-07-16",
"3.0": "2008-06-17"
"9.0": "2012-01-31",
"10.0": "2012-01-31"
}

View File

@@ -15,4 +15,5 @@ typing_extensions==4.8.0
webencodings==0.5.1
requests==2.31.0
requests-html==0.10.0
requests-futures==1.0.1
regex==2023.10.3

View File

@@ -1,11 +1,13 @@
import concurrent.futures
import re
import requests
from urllib.error import HTTPError
import urllib.parse
from bs4 import BeautifulSoup
from common import endoflife
from datetime import datetime
from typing import Tuple
from urllib3.util import Retry
from requests.adapters import HTTPAdapter
from requests_futures.sessions import FuturesSession
from requests.exceptions import ChunkedEncodingError
from concurrent.futures import as_completed
"""Fetch Firefox versions with their dates from https://www.mozilla.org/"""
@@ -16,148 +18,47 @@ DATE_REGEX = r"(January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|July|J
VERSION_REGEX = r"\d+(\.\d+)*"
class UnsupportedPageError(Exception):
    """Raised when a firefox release page is not supported."""
class InvalidPageVariantError(Exception):
    """Raised when an invalid variant is passed to get_version_and_date."""
class UnpublishedReleaseError(Exception):
    """Raised when a page is not yet published, but linked."""
def format_date(text: str) -> str:
    """Normalize a Mozilla release date like 'July 11, 2002' to '2002-07-11'.

    Accepts both abbreviated ('Jul') and full ('July') month names, strips a
    stray closing parenthesis sometimes left in the scraped text, and removes
    ordinal suffixes ('11th' -> '11', restoring the pre-rewrite behavior that
    was dropped). Returns "" when no known format matches, which callers treat
    as "no date found".
    """
    # Clean up scraping artifacts before attempting to parse.
    text = text.replace(')', '')
    text = re.sub(r'(\d)(st|nd|rd|th)', r'\1', text)
    for fmt in ("%b %d, %Y", "%B %d, %Y"):
        try:
            return datetime.strptime(text, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return ""
def get_version_and_date_variant_1(soup: BeautifulSoup) -> Tuple[str, str]:
    """Extract (version, date) from the newest page layout (Firefox >= 28.0, usually)."""
    # Newer pages expose dedicated elements for both fields.
    version_text = soup.find("div", class_="c-release-version").get_text()
    raw_date = soup.find("p", class_="c-release-date").get_text()
    return version_text, format_date(raw_date)
def get_version_and_date_variant_2(soup: BeautifulSoup) -> Tuple[str, str]:
    """Extract (version, date) from the intermediate page layout (Firefox >= 10.0, usually)."""
    # The version number only appears in the skip-navigation link's href.
    nav_href = soup.select('div#nav-access a')[0].get("href")
    version_match = re.search(VERSION_REGEX, nav_href)
    if version_match is None:
        raise InvalidPageVariantError("Unable to find version")

    # The date is buried in the <small> text under the first heading.
    heading_info = soup.find("h2").find("small").text
    date_match = re.search(DATE_REGEX, heading_info)
    if date_match is None:
        raise InvalidPageVariantError("Unable to find date")

    return version_match.group(), format_date(date_match.group())
def get_version_and_date_variant_3(soup: BeautifulSoup) -> Tuple[str, str]:
    """Extract (version, date) from the oldest page layout (Firefox >= 3.0, usually)."""
    # Both fields live in a single emphasized summary sentence.
    summary = soup.select('div#main-feature p em')[0].get_text()

    version_match = re.search(VERSION_REGEX, summary)
    if version_match is None:
        raise InvalidPageVariantError("Unable to find version")

    date_match = re.search(DATE_REGEX, summary)
    if date_match is None:
        raise InvalidPageVariantError("Unable to find date")

    return version_match.group(), format_date(date_match.group())
def get_version_and_date(release_page: str, release_version: str) -> Tuple[str, str]:
    """Get version and date from the given release page.

    Tries each known page-layout parser in turn until one succeeds.

    Raises:
        UnsupportedPageError: version < 3.0, or no parser matched the page.
        UnpublishedReleaseError: the page is linked but returns a 404.
    """
    major = int(release_version.split(".")[0])

    # firefox release pages for versions < 3.0 don't include release dates, so
    # we can't match these versions for now.
    # example: https://www.mozilla.org/en-US/firefox/2.0/releasenotes/
    if major < 3:
        raise UnsupportedPageError(f"Unsupported release page: {release_page}")

    try:
        soup = make_bs_request(release_page)
    except HTTPError as e:
        if e.code == 404:
            # Chain the original error so the 404 context is preserved.
            raise UnpublishedReleaseError(
                f"The release page is not yet published, got a 404: {release_page}"
            ) from e
        raise  # bare raise keeps the original traceback (raise e would reset it)

    # Firefox release pages come in 3 different variants. Unfortunately, there
    # is no consistent way to determine which variant a page is (say, by
    # version number), so we have to try each variant until we find one that
    # works.
    for parser in (
        get_version_and_date_variant_1,
        get_version_and_date_variant_2,
        get_version_and_date_variant_3,
    ):
        try:
            return parser(soup)
        except (InvalidPageVariantError, AttributeError, IndexError):
            continue

    raise UnsupportedPageError(f"Unable to find version and date from {release_page}")
def make_bs_request(url: str) -> BeautifulSoup:
    """Download *url* and return its content parsed with html5lib."""
    html = endoflife.fetch_url(url)
    return BeautifulSoup(html, features="html5lib")
def fetch_releases():
    """Scrape every release page linked from the main releases list.

    Returns a dict mapping version strings to ISO release dates. Pages that
    cannot be parsed, or are linked but not yet published, are reported and
    skipped.
    """
    listing = make_bs_request(URL)
    release_lists = listing.find_all("ol", class_="c-release-list")
    links = release_lists[0].find_all("a")

    results = {}
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Remember each future's href so failures can be reported by page.
        pending = {}
        for link in links:
            page_url = requests.compat.urljoin(URL, link.get("href"))
            future = pool.submit(get_version_and_date, page_url, link.get_text())
            pending[future] = link.get("href")

        for done in concurrent.futures.as_completed(pending):
            try:
                version, date = done.result()
            except (UnsupportedPageError, UnpublishedReleaseError):
                print(f"Unsupported release page: {pending[done]}")
            else:
                print(f"{version}: {date}")
                results[version] = date

    return results
print(f"::group::{PRODUCT}")
versions = {}

# Collect the release-page URLs from the main releases listing.
response = endoflife.fetch_url(URL)
ff_releases = BeautifulSoup(response, features="html5lib").find_all("ol", class_="c-release-list")
ff_urls = [urllib.parse.urljoin(URL, p.get("href")) for p in ff_releases[0].find_all("a")]

session = FuturesSession()
session.mount('https://', HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.2)))

# Map each future back to its URL so a failure can be reported accurately:
# if future.result() raises, no response object exists for that request, so
# the previous code's `response.request.url` referenced a stale/wrong object.
future_to_url = {session.get(url, timeout=30): url for url in ff_urls}

for future in as_completed(future_to_url):
    url = future_to_url[future]
    try:
        page = future.result()
        soup = BeautifulSoup(page.text, features="html5lib")
        version = page.request.url.split("/")[-3]
        if soup.find("div", class_="c-release-version"):
            # Newer layout: dedicated version/date elements.
            date = format_date(soup.find("p", class_="c-release-date").get_text())
            versions[version] = date
            print(f"{version}: {date}")
        else:
            # Older layout: the date is the last 3 words of a "First offered"
            # note (single lookup; the old code ran the same find() twice).
            element = soup.find("small", string=re.compile("^.?First offered"))
            if element:
                date = format_date(' '.join(element.get_text().split(" ")[-3:]))
                versions[version] = date
                print(f"{version}: {date}")
            # we don't get version <= 10.0, not a big deal
    except ChunkedEncodingError:
        # This may happen sometimes and will be ignored to not make the script fail,
        # see https://stackoverflow.com/a/71899731/374236.
        print(f"Error fetching {url}: ChunkedEncodingError")

endoflife.write_releases(PRODUCT, versions)
print("::endgroup::")