[firefox] Simplify script
- Use requests_futures instead of managing concurrency manually.
- Drop retrieval of versions < 10.0 to keep the parsing simple.
This commit is contained in:
@@ -463,89 +463,10 @@
|
||||
"10.0.5": "2012-06-05",
|
||||
"12.0": "2012-04-24",
|
||||
"10.0.4": "2012-04-24",
|
||||
"3.6.28": "2012-03-13",
|
||||
"11.0": "2012-03-13",
|
||||
"10.0.3": "2012-03-13",
|
||||
"3.6.27": "2012-02-17",
|
||||
"10.0.2": "2012-02-16",
|
||||
"10.0.1": "2012-02-10",
|
||||
"3.6.26": "2012-01-31",
|
||||
"10.0": "2012-01-31",
|
||||
"9.0.1": "2011-12-21",
|
||||
"9.0": "2011-12-20",
|
||||
"3.6.25": "2011-12-20",
|
||||
"8.0.1": "2011-11-21",
|
||||
"8.0": "2011-11-08",
|
||||
"3.6.24": "2011-11-08",
|
||||
"7.0.1": "2011-09-29",
|
||||
"7.0": "2011-09-27",
|
||||
"3.6.23": "2011-09-27",
|
||||
"6.0.2": "2011-09-06",
|
||||
"3.6.22": "2011-09-06",
|
||||
"6.0.1": "2011-08-30",
|
||||
"3.6.21": "2011-08-30",
|
||||
"6.0": "2011-08-16",
|
||||
"3.6.20": "2011-08-16",
|
||||
"5.0.1": "2011-07-11",
|
||||
"3.6.19": "2011-07-11",
|
||||
"5.0": "2011-06-21",
|
||||
"3.6.18": "2011-06-21",
|
||||
"4.0.1": "2011-04-28",
|
||||
"3.6.17": "2011-04-28",
|
||||
"3.5.19": "2011-04-28",
|
||||
"4.0": "2011-03-22",
|
||||
"3.6.16": "2011-03-22",
|
||||
"3.5.18": "2011-03-22",
|
||||
"3.6.15": "2011-03-04",
|
||||
"3.6.14": "2011-03-01",
|
||||
"3.5.17": "2011-03-01",
|
||||
"3.6.13": "2010-12-09",
|
||||
"3.5.16": "2010-12-09",
|
||||
"3.6.12": "2010-10-27",
|
||||
"3.5.15": "2010-10-27",
|
||||
"3.6.11": "2010-10-19",
|
||||
"3.5.14": "2010-10-19",
|
||||
"3.6.10": "2010-09-15",
|
||||
"3.5.13": "2010-09-15",
|
||||
"3.6.9": "2010-09-07",
|
||||
"3.5.12": "2010-09-07",
|
||||
"3.6.8": "2010-07-23",
|
||||
"3.6.7": "2010-07-20",
|
||||
"3.5.11": "2010-07-20",
|
||||
"3.6.6": "2010-06-26",
|
||||
"3.6.4": "2010-06-22",
|
||||
"3.5.10": "2010-06-22",
|
||||
"3.6.3": "2010-04-01",
|
||||
"3.5.9": "2010-03-30",
|
||||
"3.0.19": "2010-03-30",
|
||||
"3.6.2": "2010-03-22",
|
||||
"3.5.8": "2010-02-17",
|
||||
"3.0.18": "2010-02-17",
|
||||
"3.6": "2010-01-21",
|
||||
"3.5.7": "2010-01-05",
|
||||
"3.0.17": "2010-01-05",
|
||||
"3.5.6": "2009-12-15",
|
||||
"3.0.16": "2009-12-15",
|
||||
"3.5.5": "2009-11-05",
|
||||
"3.5.4": "2009-10-27",
|
||||
"3.0.15": "2009-10-27",
|
||||
"3.5.3": "2009-09-09",
|
||||
"3.0.14": "2009-09-09",
|
||||
"3.5.2": "2009-08-03",
|
||||
"3.0.13": "2009-08-03",
|
||||
"3.0.12": "2009-07-21",
|
||||
"3.5.1": "2009-07-16",
|
||||
"3.5": "2009-06-30",
|
||||
"3.0.11": "2009-06-11",
|
||||
"3.0.10": "2009-04-27",
|
||||
"3.0.9": "2009-04-21",
|
||||
"3.0.8": "2009-03-27",
|
||||
"3.0.7": "2009-03-04",
|
||||
"3.0.6": "2009-02-03",
|
||||
"3.0.5": "2008-12-16",
|
||||
"3.0.4": "2008-11-12",
|
||||
"3.0.3": "2008-09-26",
|
||||
"3.0.2": "2008-09-23",
|
||||
"3.0.1": "2008-07-16",
|
||||
"3.0": "2008-06-17"
|
||||
"9.0": "2012-01-31",
|
||||
"10.0": "2012-01-31"
|
||||
}
|
||||
@@ -15,4 +15,5 @@ typing_extensions==4.8.0
|
||||
webencodings==0.5.1
|
||||
requests==2.31.0
|
||||
requests-html==0.10.0
|
||||
requests-futures==1.0.1
|
||||
regex==2023.10.3
|
||||
|
||||
177
src/firefox.py
177
src/firefox.py
@@ -1,11 +1,13 @@
|
||||
import concurrent.futures
|
||||
import re
|
||||
import requests
|
||||
from urllib.error import HTTPError
|
||||
import urllib.parse
|
||||
from bs4 import BeautifulSoup
|
||||
from common import endoflife
|
||||
from datetime import datetime
|
||||
from typing import Tuple
|
||||
from urllib3.util import Retry
|
||||
from requests.adapters import HTTPAdapter
|
||||
from requests_futures.sessions import FuturesSession
|
||||
from requests.exceptions import ChunkedEncodingError
|
||||
from concurrent.futures import as_completed
|
||||
|
||||
"""Fetch Firefox versions with their dates from https://www.mozilla.org/"""
|
||||
|
||||
@@ -16,148 +18,47 @@ DATE_REGEX = r"(January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|July|J
|
||||
VERSION_REGEX = r"\d+(\.\d+)*"
|
||||
|
||||
|
||||
class UnsupportedPageError(Exception):
    """Raised when a Firefox release page is not supported."""
|
||||
|
||||
|
||||
class InvalidPageVariantError(Exception):
    """Raised when a page does not match the variant a parser expects."""
|
||||
|
||||
class UnpublishedReleaseError(Exception):
    """Raised when a page is linked but not yet published."""
|
||||
|
||||
def format_date(text: str) -> str:
    """Convert an English date such as "July 11, 2002" to "2002-07-11".

    Both abbreviated ("Jan") and full ("January") month names are accepted.
    Before parsing, closing parentheses are removed, ordinal suffixes that
    follow a digit ("1st", "2nd", "3rd", "4th") are dropped, and surrounding
    whitespace is trimmed.

    Args:
        text: the raw date text.

    Returns:
        The date formatted as YYYY-MM-DD, or an empty string when the text
        matches none of the supported formats.
    """
    cleaned = text.replace(")", "")
    cleaned = re.sub(r"(\d)(st|nd|rd|th)", r"\1", cleaned).strip()

    for date_format in ("%b %d, %Y", "%B %d, %Y"):
        try:
            return datetime.strptime(cleaned, date_format).strftime("%Y-%m-%d")
        except ValueError:
            # Not this format; try the next one.
            continue
    return ""
|
||||
|
||||
|
||||
def get_version_and_date_variant_1(soup: BeautifulSoup) -> Tuple[str, str]:
    """Version matching for Firefox versions >= 28.0 (usually).

    Reads the version from the "c-release-version" div and the date from the
    "c-release-date" paragraph.

    Returns:
        A (version, date) tuple; the date has been passed through format_date.

    Raises:
        AttributeError: when either element is missing (find() returned None).
    """
    version = soup.find("div", class_="c-release-version").get_text()
    unformatted_date = soup.find("p", class_="c-release-date").get_text()
    return version, format_date(unformatted_date)
|
||||
|
||||
|
||||
def get_version_and_date_variant_2(soup: BeautifulSoup) -> Tuple[str, str]:
    """Version matching for Firefox versions >= 10.0 (usually).

    The version is extracted from the href of the first link under
    "div#nav-access"; the date is searched for in the text of the <small>
    element nested in the first <h2>.

    Returns:
        A (version, date) tuple; the date has been passed through format_date.

    Raises:
        InvalidPageVariantError: when no version or no date can be matched.
        AttributeError, IndexError: when the expected elements are missing.
    """
    release_info = soup.find("h2").find("small").text

    # A link without an href yields None, which re.search rejects with a
    # TypeError; coerce it to "" so the failure is reported as a variant mismatch.
    href = soup.select('div#nav-access a')[0].get("href") or ""
    version_match = re.search(VERSION_REGEX, href)
    if version_match is None:
        raise InvalidPageVariantError("Unable to find version")

    date_match = re.search(DATE_REGEX, release_info)
    if date_match is None:
        raise InvalidPageVariantError("Unable to find date")

    return version_match.group(), format_date(date_match.group())
|
||||
|
||||
|
||||
def get_version_and_date_variant_3(soup: BeautifulSoup) -> Tuple[str, str]:
    """Version matching for Firefox versions >= 3.0 (usually).

    Both the version and the date are searched for in the text of the first
    element matching "div#main-feature p em".

    Returns:
        A (version, date) tuple; the date has been passed through format_date.

    Raises:
        InvalidPageVariantError: when no version or no date can be matched.
        IndexError: when the expected element is missing.
    """
    release_info = soup.select('div#main-feature p em')[0].get_text()

    version_match = re.search(VERSION_REGEX, release_info)
    if version_match is None:
        raise InvalidPageVariantError("Unable to find version")

    date_match = re.search(DATE_REGEX, release_info)
    if date_match is None:
        raise InvalidPageVariantError("Unable to find date")

    return version_match.group(), format_date(date_match.group())
|
||||
|
||||
|
||||
def get_version_and_date(release_page: str, release_version: str) -> Tuple[str, str]:
    """Get version and date from the given release page.

    Args:
        release_page: URL of the release notes page.
        release_version: version label of the release; only its first
            dot-separated component is inspected, as an integer.

    Returns:
        The (version, date) tuple produced by the first variant parser that
        accepts the page.

    Raises:
        UnsupportedPageError: for major versions below 3, or when no variant
            parser accepts the page.
        UnpublishedReleaseError: when fetching the page fails with a 404.
        HTTPError: any other HTTP error is re-raised unchanged.
    """
    major = int(release_version.split(".")[0])

    # Firefox release pages for versions <3.0 don't include release dates, so
    # we can't match these versions for now.
    # example: https://www.mozilla.org/en-US/firefox/2.0/releasenotes/
    if major < 3:
        raise UnsupportedPageError(f"Unsupported release page: {release_page}")

    try:
        soup = make_bs_request(release_page)
    except HTTPError as e:
        if e.code == 404:
            raise UnpublishedReleaseError(
                f"The release page is not yet published, got a 404: {release_page}"
            ) from e
        raise

    # Firefox release pages come in 3 different variants. Unfortunately, there
    # is no consistent way to determine which variant a page is (say, by
    # version number), so each variant is tried until one works.
    variant_parsers = (
        get_version_and_date_variant_1,
        get_version_and_date_variant_2,
        get_version_and_date_variant_3,
    )
    for parse in variant_parsers:
        try:
            return parse(soup)
        except (InvalidPageVariantError, AttributeError, IndexError):
            # The page does not have this variant's structure; try the next.
            continue

    raise UnsupportedPageError(f"Unable to find version and date from {release_page}")
|
||||
|
||||
|
||||
def make_bs_request(url: str) -> BeautifulSoup:
    """Fetch `url` with endoflife.fetch_url and parse the result with html5lib."""
    response = endoflife.fetch_url(url)
    return BeautifulSoup(response, features="html5lib")
|
||||
|
||||
|
||||
def fetch_releases():
    """Collect the release date of every release linked from the index page.

    Every link in the first "c-release-list" <ol> of URL is resolved against
    URL and handed to get_version_and_date on a thread pool. Pages that raise
    UnsupportedPageError or UnpublishedReleaseError are reported and skipped.

    Returns:
        A dict mapping version to release date.
    """
    releases = {}
    soup = make_bs_request(URL)
    ff_releases = soup.find_all("ol", class_="c-release-list")

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Remember each link's href so failures can be reported by page.
        future_to_url = {
            executor.submit(
                get_version_and_date,
                requests.compat.urljoin(URL, link.get("href")),
                link.get_text(),
            ): link.get("href")
            for link in ff_releases[0].find_all("a")
        }

        for future in concurrent.futures.as_completed(future_to_url):
            try:
                version, date = future.result()
            except (UnsupportedPageError, UnpublishedReleaseError):
                print(f"Unsupported release page: {future_to_url[future]}")
            else:
                print(f"{version}: {date}")
                releases[version] = date

    return releases
|
||||
|
||||
|
||||
print(f"::group::{PRODUCT}")
versions = {}

# Collect the absolute URL of every release notes page linked from the index.
index_page = endoflife.fetch_url(URL)
ff_releases = BeautifulSoup(index_page, features="html5lib").find_all("ol", class_="c-release-list")
ff_urls = [urllib.parse.urljoin(URL, p.get("href")) for p in ff_releases[0].find_all("a")]

session = FuturesSession()
session.mount('https://', HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.2)))

# Map each future to the URL it was created for: when a request fails there
# is no response object to read the URL from.
future_to_url = {session.get(url, timeout=30): url for url in ff_urls}
first_offered = re.compile("^.?First offered")

for future in as_completed(future_to_url):
    try:
        response = future.result()
    except ChunkedEncodingError:
        # This may happen sometimes and will be ignored to not make the script fail,
        # see https://stackoverflow.com/a/71899731/374236.
        print(f"Error fetching {future_to_url[future]}: ChunkedEncodingError")
        continue

    soup = BeautifulSoup(response.text, features="html5lib")
    # The version is the third-from-last path segment of the response's
    # request URL, e.g. ".../firefox/<version>/releasenotes/".
    version = response.request.url.split("/")[-3]

    if soup.find("div", class_="c-release-version"):
        date = format_date(soup.find("p", class_="c-release-date").get_text())
    else:
        element = soup.find("small", string=first_offered)
        if element is None:
            # we don't get version <= 10.0, not a big deal
            continue
        date = format_date(' '.join(element.get_text().split(" ")[-3:]))  # get last 3 words

    versions[version] = date
    print(f"{version}: {date}")

endoflife.write_releases(PRODUCT, versions)
print("::endgroup::")
|
||||
|
||||
Reference in New Issue
Block a user