diff --git a/requirements.txt b/requirements.txt index ad1e7322..7f5a125c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ python-frontmatter==1.1.0 # used in endoflife.py to parse products YAML frontmat python-liquid==1.10.2 # used in endoflife.py to render version templates requests==2.31.0 # used in http.py to make HTTP requests simpler requests-futures==1.0.1 # used in http.py to be able to make async HTTP requests +requests-cache==1.2.0 # used in http.py to be able to cache HTTP requests ruamel.yaml==0.18.5 # used in latest.py ruamel.yaml.clib==0.2.8 # used in latest.py soupsieve==2.5 # used in conjunction with beautifulsoup4 diff --git a/src/apple.py b/src/apple.py index dfeffed6..8faf91f0 100644 --- a/src/apple.py +++ b/src/apple.py @@ -1,12 +1,11 @@ import logging import re +import sys from bs4 import BeautifulSoup -from common import dates, http, releasedata +from common import dates, endoflife, http, releasedata -"""Fetches and parses version and release date information from Apple's support website for macOS, -iOS, iPadOS, and watchOS. While all URLs are fetched once for performance reasons, the actual -parsing for each product is done in a separate loop for having easier-to-read logs.""" +"""Fetches and parses version and release date information from Apple's support website.""" URLS = [ "https://support.apple.com/en-us/HT201222", # latest @@ -22,38 +21,16 @@ URLS = [ "http://web.archive.org/web/20230204234533_/https://support.apple.com/en-us/HT1263", # 2005-2007 ] -# If you are changing these, please use -# https://gist.githubusercontent.com/captn3m0/e7cb1f4fc3c07a5da0296ebda2b33e15/raw/5747e42ad611ec9ffdb7a2d1c0e3946bb87ab6d7/apple.txt -# as your corpus to validate your changes -VERSION_PATTERNS = { - "macos": [ - # This covers Sierra and beyond - re.compile(r"^macOS[\D]+(?P\d+(?:\.\d+)*)", re.MULTILINE), - # This covers Mavericks - El Capitan - re.compile(r"OS\s+X\s[\w\s]+\sv?(?P\d+(?:\.\d+)+)", re.MULTILINE), - # This covers even older versions (OS X) - re.compile(r"^Mac\s+OS\s+X\s[\w\s]+\sv?(?P\d{2}(?:\.\d+)+)", re.MULTILINE), - ], - "ios": [ - re.compile(r"iOS\s+(?P\d+)", re.MULTILINE), - re.compile(r"iOS\s+(?P\d+(?:\.\d+)+)", re.MULTILINE), - re.compile(r"iPhone\s+v?(?P\d+(?:\.\d+)+)", re.MULTILINE), - ], - "ipados": [ - re.compile(r"iPadOS\s+(?P\d+)", re.MULTILINE), - re.compile(r"iPadOS\s+(?P\d+(?:\.\d+)+)", re.MULTILINE), - ], - "watchos": [ - re.compile(r"watchOS\s+(?P\d+)", re.MULTILINE), - re.compile(r"watchOS\s+(?P\d+(?:\.\d+)+)", re.MULTILINE), - ], -} - DATE_PATTERN = re.compile(r"\b\d+\s[A-Za-z]+\s\d+\b") +METHOD = 'apple' + +p_filter = sys.argv[1] if len(sys.argv) > 1 else None +m_filter = sys.argv[2] if len(sys.argv) > 2 else None +for config in endoflife.list_configs(p_filter, METHOD, m_filter): + with releasedata.ProductData(config.product) as product_data: + # URLs are cached to avoid rate limiting by support.apple.com. + soups = [BeautifulSoup(response.text, features="html5lib") for response in http.fetch_urls(URLS, cache=True)] -soups = [BeautifulSoup(response.text, features="html5lib") for response in http.fetch_urls(URLS)] -for product_name in VERSION_PATTERNS: - with releasedata.ProductData(product_name) as product_data: for soup in soups: versions_table = soup.find(id="tableWraper") versions_table = versions_table if versions_table else soup.find('table', class_="gb-table") @@ -70,7 +47,7 @@ for product_name in VERSION_PATTERNS: date_str = date_match.group(0).replace("Sept ", "Sep ") date = dates.parse_date(date_str) - for version_pattern in VERSION_PATTERNS[product_data.name]: + for version_pattern in config.include_version_patterns: for version_str in version_pattern.findall(version_text): version = product_data.get_version(version_str) if not version or version.date() > date: diff --git a/src/common/endoflife.py b/src/common/endoflife.py index ae246ff8..307a3956 100644 --- a/src/common/endoflife.py +++ b/src/common/endoflife.py @@ -29,11 +29,11 @@ class AutoConfig: regexes_include = data.get("regex", DEFAULT_VERSION_REGEX) regexes_include = regexes_include if isinstance(regexes_include, list) else [regexes_include] - self.include_version_patterns = [re.compile(r) for r in regexes_include] + self.include_version_patterns = [re.compile(r, re.MULTILINE) for r in regexes_include] regexes_exclude = data.get("regex_exclude", []) regexes_exclude = regexes_exclude if isinstance(regexes_exclude, list) else [regexes_exclude] - self.exclude_version_patterns = [re.compile(r) for r in regexes_exclude] + self.exclude_version_patterns = [re.compile(r, re.MULTILINE) for r in regexes_exclude] def first_match(self, version: str) -> re.Match | None: for exclude_pattern in self.exclude_version_patterns: diff --git a/src/common/http.py b/src/common/http.py index 400c55f4..742b3f26 100644 --- a/src/common/http.py +++ b/src/common/http.py @@ -5,6 +5,7 @@ from playwright.sync_api import sync_playwright from requests import Response from requests.adapters import HTTPAdapter from requests.exceptions import ChunkedEncodingError +from requests_cache import CachedSession from requests_futures.sessions import FuturesSession from urllib3.util import Retry @@ -13,11 +14,13 @@ USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/1 def fetch_urls(urls: list[str], data: any = None, headers: dict[str, str] = None, - max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30) -> list[Response]: + max_retries: int = 10, backoff_factor: float = 0.5, timeout: int = 30, + cache: bool = False) -> list[Response]: logging.info(f"Fetching {urls}") try: - with FuturesSession() as session: + underlying_session = CachedSession('/tmp/http_cache', backend='filesystem') if cache else None + with FuturesSession(session=underlying_session) as session: adapter = HTTPAdapter(max_retries=Retry(total=max_retries, backoff_factor=backoff_factor)) session.mount('http://', adapter) session.mount('https://', adapter)