From 2d5145444bf00081acd1e42369e0f1b4a514cc8f Mon Sep 17 00:00:00 2001 From: Marc Wrobel Date: Wed, 21 Feb 2024 00:01:25 +0100 Subject: [PATCH] [apple] Adapt script for single-product usage (#317) Make the Apple script compatible with the way update.py now works, which is 'product' oriented, meaning the script will be called once for each product. To minimize the impacts the responses are now cached to avoid rate-limiting by support.apple.com. Version patterns have also been moved to product's auto configuration to make future changes simpler. --- requirements.txt | 1 + src/apple.py | 47 +++++++++++------------------------------ src/common/endoflife.py | 4 ++-- src/common/http.py | 7 ++++-- 4 files changed, 20 insertions(+), 39 deletions(-) diff --git a/requirements.txt b/requirements.txt index ad1e7322..7f5a125c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ python-frontmatter==1.1.0 # used in endoflife.py to parse products YAML frontmat python-liquid==1.10.2 # used in endoflife.py to render version templates requests==2.31.0 # used in http.py to make HTTP requests simpler requests-futures==1.0.1 # used in http.py to be able to make async HTTP requests +requests-cache==1.2.0 # used in http.py to be able to cache HTTP requests ruamel.yaml==0.18.5 # used in latest.py ruamel.yaml.clib==0.2.8 # used in latest.py soupsieve==2.5 # used in conjunction with beautifulsoup4 diff --git a/src/apple.py b/src/apple.py index dfeffed6..8faf91f0 100644 --- a/src/apple.py +++ b/src/apple.py @@ -1,12 +1,11 @@ import logging import re +import sys from bs4 import BeautifulSoup -from common import dates, http, releasedata +from common import dates, endoflife, http, releasedata -"""Fetches and parses version and release date information from Apple's support website for macOS, -iOS, iPadOS, and watchOS. 
While all URLs are fetched once for performance reasons, the actual -parsing for each product is done in a separate loop for having easier-to-read logs.""" +"""Fetches and parses version and release date information from Apple's support website.""" URLS = [ "https://support.apple.com/en-us/HT201222", # latest @@ -22,38 +21,16 @@ URLS = [ "http://web.archive.org/web/20230204234533_/https://support.apple.com/en-us/HT1263", # 2005-2007 ] -# If you are changing these, please use -# https://gist.githubusercontent.com/captn3m0/e7cb1f4fc3c07a5da0296ebda2b33e15/raw/5747e42ad611ec9ffdb7a2d1c0e3946bb87ab6d7/apple.txt -# as your corpus to validate your changes -VERSION_PATTERNS = { - "macos": [ - # This covers Sierra and beyond - re.compile(r"^macOS[\D]+(?P<version>\d+(?:\.\d+)*)", re.MULTILINE), - # This covers Mavericks - El Capitan - re.compile(r"OS\s+X\s[\w\s]+\sv?(?P<version>\d+(?:\.\d+)+)", re.MULTILINE), - # This covers even older versions (OS X) - re.compile(r"^Mac\s+OS\s+X\s[\w\s]+\sv?(?P<version>\d{2}(?:\.\d+)+)", re.MULTILINE), - ], - "ios": [ - re.compile(r"iOS\s+(?P<version>\d+)", re.MULTILINE), - re.compile(r"iOS\s+(?P<version>\d+(?:\.\d+)+)", re.MULTILINE), - re.compile(r"iPhone\s+v?(?P<version>\d+(?:\.\d+)+)", re.MULTILINE), - ], - "ipados": [ - re.compile(r"iPadOS\s+(?P<version>\d+)", re.MULTILINE), - re.compile(r"iPadOS\s+(?P<version>\d+(?:\.\d+)+)", re.MULTILINE), - ], - "watchos": [ - re.compile(r"watchOS\s+(?P<version>\d+)", re.MULTILINE), - re.compile(r"watchOS\s+(?P<version>\d+(?:\.\d+)+)", re.MULTILINE), - ], -} - DATE_PATTERN = re.compile(r"\b\d+\s[A-Za-z]+\s\d+\b") +METHOD = 'apple' + +p_filter = sys.argv[1] if len(sys.argv) > 1 else None +m_filter = sys.argv[2] if len(sys.argv) > 2 else None +for config in endoflife.list_configs(p_filter, METHOD, m_filter): + with releasedata.ProductData(config.product) as product_data: + # URLs are cached to avoid rate limiting by support.apple.com. 
+ soups = [BeautifulSoup(response.text, features="html5lib") for response in http.fetch_urls(URLS, cache=True)] -soups = [BeautifulSoup(response.text, features="html5lib") for response in http.fetch_urls(URLS)] -for product_name in VERSION_PATTERNS: - with releasedata.ProductData(product_name) as product_data: for soup in soups: versions_table = soup.find(id="tableWraper") versions_table = versions_table if versions_table else soup.find('table', class_="gb-table") @@ -70,7 +47,7 @@ for product_name in VERSION_PATTERNS: date_str = date_match.group(0).replace("Sept ", "Sep ") date = dates.parse_date(date_str) - for version_pattern in VERSION_PATTERNS[product_data.name]: + for version_pattern in config.include_version_patterns: for version_str in version_pattern.findall(version_text): version = product_data.get_version(version_str) if not version or version.date() > date: diff --git a/src/common/endoflife.py b/src/common/endoflife.py index ae246ff8..307a3956 100644 --- a/src/common/endoflife.py +++ b/src/common/endoflife.py @@ -29,11 +29,11 @@ class AutoConfig: regexes_include = data.get("regex", DEFAULT_VERSION_REGEX) regexes_include = regexes_include if isinstance(regexes_include, list) else [regexes_include] - self.include_version_patterns = [re.compile(r) for r in regexes_include] + self.include_version_patterns = [re.compile(r, re.MULTILINE) for r in regexes_include] regexes_exclude = data.get("regex_exclude", []) regexes_exclude = regexes_exclude if isinstance(regexes_exclude, list) else [regexes_exclude] - self.exclude_version_patterns = [re.compile(r) for r in regexes_exclude] + self.exclude_version_patterns = [re.compile(r, re.MULTILINE) for r in regexes_exclude] def first_match(self, version: str) -> re.Match | None: for exclude_pattern in self.exclude_version_patterns: diff --git a/src/common/http.py b/src/common/http.py index 400c55f4..742b3f26 100644 --- a/src/common/http.py +++ b/src/common/http.py @@ -5,6 +5,7 @@ from playwright.sync_api import 
sync_playwright from requests import Response from requests.adapters import HTTPAdapter from requests.exceptions import ChunkedEncodingError +from requests_cache import CachedSession from requests_futures.sessions import FuturesSession from urllib3.util import Retry @@ -13,11 +14,13 @@ USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/1 def fetch_urls(urls: list[str], data: any = None, headers: dict[str, str] = None, - max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> list[Response]: + max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30, + cache: bool = False) -> list[Response]: logging.info(f"Fetching {urls}") try: - with FuturesSession() as session: + underlying_session = CachedSession('/tmp/http_cache', backend='filesystem') if cache else None + with FuturesSession(session=underlying_session) as session: adapter = HTTPAdapter(max_retries=Retry(total=max_retries, backoff_factor=backoff_factor)) session.mount('http://', adapter) session.mount('https://', adapter)