Files
endoflife-date-release-data/src/firefox.py
2023-01-08 14:36:21 +01:00

165 lines
5.9 KiB
Python

import json
from typing import Tuple
from datetime import datetime
import re
import requests
import urllib.request
from bs4 import BeautifulSoup
import concurrent.futures
"""Fetch Firefox versions with their dates from https://www.mozilla.org/en-US/firefox/releases/"""
# Index page listing every Firefox release; individual release pages are linked from it.
URL = "https://www.mozilla.org/en-US/firefox/releases/"
# Product key; the output file is written to releases/<PRODUCT>.json (see main()).
PRODUCT = "firefox"
# Matches human-readable dates such as "July 11th, 2002": a month name (full or
# abbreviated), a 1-2 digit day with optional ordinal suffix, and a 4-digit year.
DATE_REGEX = r"(January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|July|Jul|August|Aug|September|Sept|October|Oct|November|Nov|December|Dec)\s+\d{1,2}(st|nd|rd|th)?,\s+\d{4}"
# Matches dotted version strings such as "3.6" or "102.0.1".
VERSION_REGEX = r"\d+(\.\d+)*"
class UnsupportedReleasePageError(Exception):
    """Raised when a firefox release page is not supported."""
class InvalidPageVariantError(Exception):
    """Raised when an invalid variant is passed to get_version_and_date."""
def format_date(unformatted_date: str) -> str:
    """Format a human-readable date such as "July 11, 2002" as "2002-07-11".

    Handles full month names ("%B"), 3-letter abbreviations ("%b"), the
    non-standard "Sept" abbreviation, and ordinal day suffixes ("11th").
    Returns an empty string when the input matches no known format.
    """
    # Strip ordinal suffixes ("11th" -> "11") so strptime can parse the day.
    date = re.sub(r'(\d)(st|nd|rd|th)', r'\1', unformatted_date)
    # Bug fix: strptime's %b accepts only the 3-letter abbreviation "Sep",
    # but DATE_REGEX (and Mozilla's pages) also use "Sept" — normalize it.
    # \b keeps "September" (handled by %B) untouched.
    date = re.sub(r'^Sept\b', 'Sep', date)
    for fmt in ("%b %d, %Y", "%B %d, %Y"):
        try:
            return datetime.strptime(date, fmt).strftime("%Y-%m-%d")
        except ValueError:
            # Wrong format for this pattern — try the next one.
            pass
    return ""
def get_version_and_date_varant_1(soup: BeautifulSoup) -> Tuple[str, str]:
    """Version matching for firefox versions >= 28.0 (usually).

    Reads the dedicated version/date elements of the modern page layout.
    Raises AttributeError (handled by the caller) when either element is
    missing, i.e. the page is a different variant.
    """
    version_node = soup.find("div", class_="c-release-version")
    date_node = soup.find("p", class_="c-release-date")
    return (version_node.get_text(), format_date(date_node.get_text()))
def get_version_and_date_variant_2(soup: BeautifulSoup) -> Tuple[str, str]:
    """Version matching for firefox versions >= 10.0 (usually).

    The version is embedded in the first nav-access link's href and the
    date in the <h2><small> heading text.

    Raises:
        InvalidPageVariantError: version or date not found on the page.
    """
    heading_info = soup.find("h2").find("small").text
    nav_href = soup.select('div#nav-access a')[0].get("href")
    found_version = re.search(VERSION_REGEX, nav_href)
    if not found_version:
        raise InvalidPageVariantError("Unable to find version")
    found_date = re.search(DATE_REGEX, heading_info)
    if not found_date:
        raise InvalidPageVariantError("Unable to find date")
    return (found_version.group(), format_date(found_date.group()))
def get_version_and_date_variant_3(soup: BeautifulSoup) -> Tuple[str, str]:
    """Version matching for firefox versions >= 3.0 (usually).

    Both version and date are embedded in a single <em> blurb inside
    div#main-feature on the oldest supported page layout.

    Raises:
        InvalidPageVariantError: version or date not found in the blurb.
    """
    blurb = soup.select('div#main-feature p em')[0].get_text()
    found_version = re.search(VERSION_REGEX, blurb)
    if not found_version:
        raise InvalidPageVariantError("Unable to find version")
    found_date = re.search(DATE_REGEX, blurb)
    if not found_date:
        raise InvalidPageVariantError("Unable to find date")
    return (found_version.group(), format_date(found_date.group()))
def get_version_and_date(release_page: str, release_version: str) -> Tuple[str, str]:
    """Get version and date from the given release page.

    Raises:
        UnsupportedReleasePageError: page predates 3.0 or matches no variant.
    """
    # Firefox release pages for versions < 3.0 don't include release dates,
    # so these versions can't be matched for now.
    # Example: https://www.mozilla.org/en-US/firefox/2.0/releasenotes/
    if int(release_version.split(".")[0]) < 3:
        raise UnsupportedReleasePageError("Unsupported release page: %s" % release_page)
    soup = make_bs_request(release_page)
    # Release pages come in 3 layout variants and there is no reliable way
    # (e.g. by version number) to know which one a page uses up front, so
    # each parser is tried in turn until one succeeds.
    parsers = (
        get_version_and_date_varant_1,
        get_version_and_date_variant_2,
        get_version_and_date_variant_3,
    )
    for parser in parsers:
        try:
            return parser(soup)
        except (InvalidPageVariantError, AttributeError, IndexError):
            # Wrong variant for this page — try the next parser.
            continue
    raise UnsupportedReleasePageError("Unable to find version and date for %s" % release_page)
def make_bs_request(url: str) -> BeautifulSoup:
    """Make a request to the given url and return a BeautifulSoup object.

    Retries up to 5 times on timeout (requests to www.mozilla.org often
    time out); re-raises the last timeout error when all attempts fail.
    """
    headers = {"user-agent": "mozilla"}
    last_exception = None
    attempt = 0
    while attempt < 5:
        try:
            request = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(request, timeout=5) as response:
                return BeautifulSoup(response.read(), features="html5lib")
        except TimeoutError as exc:
            last_exception = exc
            print(f"Request to {url} timed out, retrying ({attempt})...")
            attempt += 1
    raise last_exception
def fetch_releases():
    """Scrape the release index and return a {version: iso_date} dict.

    Release pages are fetched concurrently; pages that can't be parsed
    (UnsupportedReleasePageError) are logged and skipped.
    """
    releases = {}
    index_soup = make_bs_request(URL)
    release_lists = index_soup.find_all("ol", class_="c-release-list")
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Map each in-flight future back to its page href for error reporting.
        pending = {}
        for link in release_lists[0].find_all("a"):
            href = link.get("href")
            future = executor.submit(
                get_version_and_date,
                requests.compat.urljoin(URL, href),
                link.get_text(),
            )
            pending[future] = href
        for done in concurrent.futures.as_completed(pending):
            try:
                version, date = done.result()
            except UnsupportedReleasePageError:
                print("Unsupported release page: %s" % pending[done])
            else:
                print("%s: %s" % (version, date))
                releases[version] = date
    return releases
def main():
    """Fetch all Firefox releases and write them to releases/firefox.json."""
    print(f"::group::{PRODUCT}")
    releases = fetch_releases()
    # Sort by date then version, both descending (newest first).
    ordered = dict(
        sorted(releases.items(), key=lambda item: (item[1], item[0]), reverse=True)
    )
    with open(f"releases/{PRODUCT}.json", "w") as output:
        output.write(json.dumps(ordered, indent=2))
    print("::endgroup::")


if __name__ == '__main__':
    main()