[apple] Adapt script for single-product usage (#317)

Make the Apple script compatible with the way update.py now works, which is product-oriented: the script is called once for each product. To minimize the impact of these repeated calls, responses are now cached to avoid rate limiting by support.apple.com. Version patterns have also been moved to each product's auto configuration to make future changes simpler.
2024-02-21 00:01:25 +01:00
parent b11f01bc62
commit 2d5145444b
4 changed files with 20 additions and 39 deletions
--- a/src/apple.py
+++ b/src/apple.py
@@ -1,12 +1,11 @@
 import logging
 import re
+import sys

 from bs4 import BeautifulSoup
-from common import dates, http, releasedata
+from common import dates, endoflife, http, releasedata

-"""Fetches and parses version and release date information from Apple's support website for macOS,
-iOS, iPadOS, and watchOS. While all URLs are fetched once for performance reasons, the actual
-parsing for each product is done in a separate loop for having easier-to-read logs."""
+"""Fetches and parses version and release date information from Apple's support website."""

 URLS = [
    "https://support.apple.com/en-us/HT201222",  # latest
@@ -22,38 +21,16 @@ URLS = [
    "http://web.archive.org/web/20230204234533_/https://support.apple.com/en-us/HT1263",  # 2005-2007
 ]

-# If you are changing these, please use
-# https://gist.githubusercontent.com/captn3m0/e7cb1f4fc3c07a5da0296ebda2b33e15/raw/5747e42ad611ec9ffdb7a2d1c0e3946bb87ab6d7/apple.txt
-# as your corpus to validate your changes
-VERSION_PATTERNS = {
-    "macos": [
-        # This covers Sierra and beyond
-        re.compile(r"^macOS[\D]+(?P<version>\d+(?:\.\d+)*)", re.MULTILINE),
-        # This covers Mavericks - El Capitan
-        re.compile(r"OS\s+X\s[\w\s]+\sv?(?P<version>\d+(?:\.\d+)+)", re.MULTILINE),
-        # This covers even older versions (OS X)
-        re.compile(r"^Mac\s+OS\s+X\s[\w\s]+\sv?(?P<version>\d{2}(?:\.\d+)+)", re.MULTILINE),
-    ],
-    "ios": [
-        re.compile(r"iOS\s+(?P<version>\d+)", re.MULTILINE),
-        re.compile(r"iOS\s+(?P<version>\d+(?:\.\d+)+)", re.MULTILINE),
-        re.compile(r"iPhone\s+v?(?P<version>\d+(?:\.\d+)+)", re.MULTILINE),
-    ],
-    "ipados": [
-        re.compile(r"iPadOS\s+(?P<version>\d+)", re.MULTILINE),
-        re.compile(r"iPadOS\s+(?P<version>\d+(?:\.\d+)+)", re.MULTILINE),
-    ],
-    "watchos": [
-        re.compile(r"watchOS\s+(?P<version>\d+)", re.MULTILINE),
-        re.compile(r"watchOS\s+(?P<version>\d+(?:\.\d+)+)", re.MULTILINE),
-    ],
-}
-
 DATE_PATTERN = re.compile(r"\b\d+\s[A-Za-z]+\s\d+\b")
+METHOD = 'apple'
+
+p_filter = sys.argv[1] if len(sys.argv) > 1 else None
+m_filter = sys.argv[2] if len(sys.argv) > 2 else None
+for config in endoflife.list_configs(p_filter, METHOD, m_filter):
+    with releasedata.ProductData(config.product) as product_data:
+        # URLs are cached to avoid rate limiting by support.apple.com.
+        soups = [BeautifulSoup(response.text, features="html5lib") for response in http.fetch_urls(URLS, cache=True)]

-soups = [BeautifulSoup(response.text, features="html5lib") for response in http.fetch_urls(URLS)]
-for product_name in VERSION_PATTERNS:
-    with releasedata.ProductData(product_name) as product_data:
        for soup in soups:
            versions_table = soup.find(id="tableWraper")
            versions_table = versions_table if versions_table else soup.find('table', class_="gb-table")
@@ -70,7 +47,7 @@ for product_name in VERSION_PATTERNS:

                date_str = date_match.group(0).replace("Sept ", "Sep ")
                date = dates.parse_date(date_str)
-                for version_pattern in VERSION_PATTERNS[product_data.name]:
+                for version_pattern in config.include_version_patterns:
                    for version_str in version_pattern.findall(version_text):
                        version = product_data.get_version(version_str)
                        if not version or version.date() > date:
--- a/src/common/endoflife.py
+++ b/src/common/endoflife.py
@@ -29,11 +29,11 @@ class AutoConfig:

        regexes_include = data.get("regex", DEFAULT_VERSION_REGEX)
        regexes_include = regexes_include if isinstance(regexes_include, list) else [regexes_include]
-        self.include_version_patterns = [re.compile(r) for r in regexes_include]
+        self.include_version_patterns = [re.compile(r, re.MULTILINE) for r in regexes_include]

        regexes_exclude = data.get("regex_exclude", [])
        regexes_exclude = regexes_exclude if isinstance(regexes_exclude, list) else [regexes_exclude]
-        self.exclude_version_patterns = [re.compile(r) for r in regexes_exclude]
+        self.exclude_version_patterns = [re.compile(r, re.MULTILINE) for r in regexes_exclude]

    def first_match(self, version: str) -> re.Match | None:
        for exclude_pattern in self.exclude_version_patterns:
--- a/src/common/http.py
+++ b/src/common/http.py
@@ -5,6 +5,7 @@ from playwright.sync_api import sync_playwright
 from requests import Response
 from requests.adapters import HTTPAdapter
 from requests.exceptions import ChunkedEncodingError
+from requests_cache import CachedSession
 from requests_futures.sessions import FuturesSession
 from urllib3.util import Retry

@@ -13,11 +14,13 @@ USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/1


 def fetch_urls(urls: list[str], data: any = None, headers: dict[str, str] = None,
-               max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> list[Response]:
+               max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30,
+               cache: bool = False) -> list[Response]:
    logging.info(f"Fetching {urls}")

    try:
-        with FuturesSession() as session:
+        underlying_session = CachedSession('/tmp/http_cache', backend='filesystem') if cache else None
+        with FuturesSession(session=underlying_session) as session:
            adapter = HTTPAdapter(max_retries=Retry(total=max_retries, backoff_factor=backoff_factor))
            session.mount('http://', adapter)
            session.mount('https://', adapter)