From dc3f4e06530ce489a7042c26c4c1ade88d3c79af Mon Sep 17 00:00:00 2001 From: Marc Wrobel Date: Sun, 4 Feb 2024 18:05:18 +0100 Subject: [PATCH] [firefox][unity] Add support for cumulative updates and use it in a few scripts (#295) Generic support for cumulative updates has been added to speed up execution time of some scripts that were very long (in comparison with the vast majority of products), usually because they were involving a lot of HTTP requests. This feature was developed particularly for the firefox.py and unity.py scripts, which were often very long to execute (a minute or more according to GHA summaries). Those scripts have been updated to make use of this new feature. --- src/aws-lambda.py | 3 +-- src/common/releasedata.py | 47 ++++++++++++++++++++++---------------- src/firefox.py | 48 +++++++++++++++------------------ src/unity.py | 38 ++++++++++++++++--------------- 4 files changed, 67 insertions(+), 69 deletions(-) diff --git a/src/aws-lambda.py b/src/aws-lambda.py index 0aad7b03..4c75d617 100644 --- a/src/aws-lambda.py +++ b/src/aws-lambda.py @@ -13,7 +13,6 @@ them though. 
Note that this would also be unnecessary if it was possible to disa release dates updates in the latest.py script.""" with releasedata.ProductData("aws-lambda") as product_data: - old_product_data = releasedata.ProductData.from_file(product_data.name) product_frontmatter = endoflife.ProductFrontmatter(product_data.name) response = http.fetch_url("https://docs.aws.amazon.com/lambda/latest/dg/lambda-runtimes.html") soup = BeautifulSoup(response.text, features="html5lib") @@ -30,7 +29,7 @@ with releasedata.ProductData("aws-lambda") as product_data: date = product_frontmatter.get_release_date(identifier) # use the product releaseDate if available if date is None: - date = old_product_data.get_version(identifier).date() # else use the previously found date + date = product_data.get_previous_version(identifier).date() # else use the previously found date if date is None: date = dates.today() # else use today's date diff --git a/src/common/releasedata.py b/src/common/releasedata.py index 83018fa1..993aecbf 100644 --- a/src/common/releasedata.py +++ b/src/common/releasedata.py @@ -15,13 +15,14 @@ VERSIONS_PATH = Path(os.environ.get("VERSIONS_PATH", "releases")) class ProductUpdateError(Exception): """Custom exceptions raised when unexpected errors occur during product updates.""" + class ProductVersion: - def __init__(self, product: "ProductData", data: dict) -> None: - self.product = str(product) + def __init__(self, product: str, data: dict) -> None: + self.product = product self.data = data @staticmethod - def of(product: "ProductData", name: str, date: datetime) -> "ProductVersion": + def of(product: str, name: str, date: datetime) -> "ProductVersion": return ProductVersion(product, { "name": name, "date": date.strftime("%Y-%m-%d"), @@ -36,18 +37,38 @@ class ProductVersion: def replace_date(self, date: datetime) -> None: self.data["date"] = date.strftime("%Y-%m-%d") + def copy(self) -> "ProductVersion": + return ProductVersion(self.product, self.data.copy()) + def 
__repr__(self) -> str: return f"{self.product}#{self.name()} ({self.date()})" class ProductData: - def __init__(self, name: str) -> None: + def __init__(self, name: str, cumulative_update: bool = False) -> None: self.name: str = name + self.cumulative_update: bool = cumulative_update self.versions_path: Path = VERSIONS_PATH / f"{name}.json" self.versions: dict[str, ProductVersion] = {} + self.previous_versions: dict[str, ProductVersion] = {} def __enter__(self) -> "ProductData": logging.info(f"::group::{self}") + + if self.versions_path.is_file(): + with self.versions_path.open() as f: + for json_version in json.load(f)["versions"].values(): + version = ProductVersion(self.name, json_version) + self.previous_versions[version.name()] = version + logging.info(f"loaded previous versions data for {self} from {self.versions_path}") + else: + logging.info(f"no previous versions data found for {self} at {self.versions_path}") + + if self.cumulative_update: + logging.info(f"cumulative update is enabled for {self}, will reuse previous versions data") + for name, version in self.previous_versions.items(): + self.versions[name] = version.copy() + return self def __exit__(self, exc_type: Optional[Type[BaseException]], exc_value: Optional[BaseException], @@ -68,24 +89,12 @@ class ProductData: finally: logging.info("::endgroup::") - @staticmethod - def from_file(name: str) -> "ProductData": - product = ProductData(name) - - if product.versions_path.is_file(): - with product.versions_path.open() as f: - for json_version in json.load(f)["versions"].values(): - version = ProductVersion(product, json_version) - product.versions[version.name()] = version - logging.info(f"loaded versions data for {product} from {product.versions_path}") - else: - logging.warning(f"no versions data found for {product} at {product.versions_path}") - - return product - def get_version(self, version: str) -> ProductVersion: return self.versions[version] if version in self.versions else None + def 
get_previous_version(self, version: str) -> ProductVersion: + return self.previous_versions[version] if version in self.previous_versions else None + def declare_version(self, version: str, date: datetime) -> None: if version in self.versions and self.versions[version].date() != date: logging.info(f"overwriting {version} ({self.get_version(version).date()} -> {date}) for {self}") diff --git a/src/firefox.py b/src/firefox.py index 76618be0..5a338f6c 100644 --- a/src/firefox.py +++ b/src/firefox.py @@ -1,45 +1,33 @@ -import re import urllib.parse -from itertools import islice from bs4 import BeautifulSoup from common import dates, http, releasedata """Fetch Firefox versions with their dates from https://www.mozilla.org/. -Versions lower than 10.0 are ignored because too difficult to parse.""" +This script is cumulative: previously found versions are kept, and eventually updated if needed. It only considers the +first MAX_VERSIONS_COUNT versions on Firefox release page because: +- it is too long to fetch them all (at least a minute usually), +- this generates too many requests to the mozilla.org servers, +- and anyway oldest versions are never updated. +Note that it was assumed that: +- the script is ran regularly enough to keep the versions up to date (once a day or week looks enough), +- the versions are listed in descending order on the page, +- new versions are always added inside in the last MAX_VERSIONS_COUNT versions. -# Will be replaced by itertools.batched in Python 3.12+. -# See https://docs.python.org/3/library/itertools.html#itertools.batched. 
-def batched(iterable: iter, n: int) -> iter: - if n < 1: - msg = 'n must be at least one' - raise ValueError(msg) - it = iter(iterable) - while batch := tuple(islice(it, n)): - yield batch +The script will need to be updated if someday those conditions are not met.""" +MAX_VERSIONS_LIMIT = 50 -with releasedata.ProductData("firefox") as product_data: +with releasedata.ProductData("firefox", cumulative_update=True) as product_data: releases_page = http.fetch_url("https://www.mozilla.org/en-US/firefox/releases/") releases_soup = BeautifulSoup(releases_page.text, features="html5lib") releases_list = releases_soup.find_all("ol", class_="c-release-list") + release_notes_urls = [urllib.parse.urljoin(releases_page.url, p.get("href")) for p in releases_list[0].find_all("a")] - - for batch_release_notes_urls in batched(release_notes_urls, 20): - for release_notes in http.fetch_urls(batch_release_notes_urls): - version = release_notes.url.split("/")[-3] - - release_notes_soup = BeautifulSoup(release_notes.text, features="html5lib") - date_elt = release_notes_soup.find(class_="c-release-date") - if date_elt: - date = dates.parse_date(date_elt.get_text()) - product_data.declare_version(version, date) - continue - - date_elt = release_notes_soup.find("small", string=re.compile("^.?First offered")) - if date_elt: - date = dates.parse_date(' '.join(date_elt.get_text().split(" ")[-3:])) # get last 3 words - product_data.declare_version(version, date) - # versions < 10.0 are ignored + for release_notes in http.fetch_urls(release_notes_urls[:MAX_VERSIONS_LIMIT]): + version = release_notes.url.split("/")[-3] + release_notes_soup = BeautifulSoup(release_notes.text, features="html5lib") + date_str = release_notes_soup.find(class_="c-release-date").get_text() # note: only works for versions > 25 + product_data.declare_version(version, dates.parse_date(date_str)) diff --git a/src/unity.py b/src/unity.py index 704344b9..2aaf7e38 100644 --- a/src/unity.py +++ b/src/unity.py @@ -1,25 +1,27 
@@ from bs4 import BeautifulSoup from common import dates, http, releasedata -# Fetches the Unity LTS releases from the Unity website. Non-LTS releases are not listed there, -# so this automation is only partial. -# -# This script iterates over all pages of the Unity LTS releases page, which is paginated. -# It keeps fetching the next page until there is no next page link. +"""Fetches the Unity LTS releases from the Unity website. Non-LTS releases are not listed there, so this automation +is only partial. -BASE_URL = "https://unity.com/releases/editor/qa/lts-releases" +This script is cumulative, only the first page is fetched (e.g. the first ten versions). This is because: +- it is too long to fetch all (at least 30s, usually more than a minute), +- this generates too many requests to the unity.com servers, +- fetching multiple pages in parallel is raising a lot of errors and makes the overall process slower (this was tested + during https://github.com/endoflife-date/release-data/pull/194), +- and anyway oldest versions are never updated. -next_page_url = BASE_URL -with releasedata.ProductData("unity") as product_data: - # Do not try to fetch multiple pages in parallel: it is raising a lot of errors and make the overall process slower. - while next_page_url: - response = http.fetch_url(next_page_url) - soup = BeautifulSoup(response.text, features="html5lib") +Note that it was assumed that: +- the script is ran regularly enough to keep the versions up to date (once a day or week looks enough), +- there is never more than 10 new LTS versions at a time. 
- for release in soup.find_all('div', class_='component-releases-item__show__inner-header'): - version = release.find('h4').find('span').text - date = dates.parse_datetime(release.find('time').attrs['datetime']) - product_data.declare_version(version, date) +The script will need to be updated if someday those conditions are not met.""" - next_link = soup.find('a', {"rel": "next"}) - next_page_url = BASE_URL + next_link.attrs['href'] if next_link else None +with releasedata.ProductData("unity", cumulative_update=True) as product_data: + response = http.fetch_url("https://unity.com/releases/editor/qa/lts-releases") + soup = BeautifulSoup(response.text, features="html5lib") + + for release in soup.find_all('div', class_='component-releases-item__show__inner-header'): + version = release.find('h4').find('span').text + date = dates.parse_datetime(release.find('time').attrs['datetime']) + product_data.declare_version(version, date)