diff --git a/src/common/endoflife.py b/src/common/endoflife.py index 42453611..3b40f05c 100644 --- a/src/common/endoflife.py +++ b/src/common/endoflife.py @@ -1,10 +1,12 @@ +import http.client import json import frontmatter -from requests import Session -from requests.adapters import HTTPAdapter -from urllib3.util import Retry +from concurrent.futures import as_completed from glob import glob from os import path +from requests.adapters import HTTPAdapter +from requests_futures.sessions import FuturesSession +from urllib3.util import Retry # See https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent. USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0' @@ -41,12 +43,20 @@ def list_products(method, products_filter=None, pathname="website/products"): # Keep the default timeout high enough to avoid errors with web.archive.org. -def fetch_url(url, retry_count=5, timeout=30, data=None, headers=None): - headers = {'User-Agent': USER_AGENT} | {} if headers is None else headers - with Session() as s: - s.mount('https://', HTTPAdapter(max_retries=Retry(total=retry_count, backoff_factor=0.2))) - r = s.get(url, headers=headers, data=data, timeout=timeout) - return r.text +def fetch_urls(urls, data=None, headers=None, max_retries=5, backoff_factor=0.5, timeout=30): + adapter = HTTPAdapter(max_retries=Retry(total=max_retries, backoff_factor=backoff_factor)) + session = FuturesSession() + session.mount('http://', adapter) + session.mount('https://', adapter) + + headers = {'User-Agent': USER_AGENT} | ({} if headers is None else headers) + futures = [session.get(url, headers=headers, data=data, timeout=timeout) for url in urls] + + return [future.result() for future in as_completed(futures)] + + +def fetch_url(url, data=None, headers=None, max_retries=5, backoff_factor=0.5, timeout=30): + return fetch_urls([url], data, headers, max_retries, backoff_factor, timeout)[0].text def write_releases(product, releases, pathname="releases"): @@ 
-55,3 +65,19 @@ def write_releases(product, releases, pathname="releases"): # sort by date then version (desc) sorted(releases.items(), key=lambda x: (x[1], x[0]), reverse=True) ), indent=2)) + + +def patch_http_response_read(func): + def inner(*args): + try: + return func(*args) + except http.client.IncompleteRead as e: + return e.partial + return inner + + +# This patches HTTPResponse.read to prevent ChunkedEncodingError when fetching some websites, such as +# Mozilla's website. According to https://stackoverflow.com/a/44511691/374236, this is an issue at +# server side that cannot be avoided: most servers transmit all data, but due to implementation errors +# they wrongly close the session. +http.client.HTTPResponse.read = patch_http_response_read(http.client.HTTPResponse.read) diff --git a/src/cos.py b/src/cos.py index b1e92dfc..2a5e6a40 100644 --- a/src/cos.py +++ b/src/cos.py @@ -10,16 +10,16 @@ REGEX = r"^(cos-\d+-\d+-\d+-\d+)" def fetch_all_milestones(): url = "https://cloud.google.com/container-optimized-os/docs/release-notes/" # Retry as Google Docs often returns SSL errors. - response = endoflife.fetch_url(url, retry_count=10) + response = endoflife.fetch_url(url) soup = BeautifulSoup(response, features="html5lib") - milestones = soup.find_all('td', text=re.compile(r'COS \d+ LTS')) + milestones = soup.find_all('td', string=re.compile(r'COS \d+ LTS')) return [m.text.split(' ')[1] for m in milestones] def fetch_milestone(channel): url = f"https://cloud.google.com/container-optimized-os/docs/release-notes/m{channel}" # Retry as Google Docs often returns SSL errors.
- response = endoflife.fetch_url(url, retry_count=10) + response = endoflife.fetch_url(url) return BeautifulSoup(response, features="html5lib") diff --git a/src/firefox.py b/src/firefox.py index e1e78a41..2fae4851 100644 --- a/src/firefox.py +++ b/src/firefox.py @@ -14,9 +14,6 @@ from concurrent.futures import as_completed URL = "https://www.mozilla.org/en-US/firefox/releases/" PRODUCT = "firefox" -DATE_REGEX = r"(January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|July|Jul|August|Aug|September|Sept|October|Oct|November|Nov|December|Dec)\s+\d{1,2}(st|nd|rd|th)?,\s+\d{4}" -VERSION_REGEX = r"\d+(\.\d+)*" - def format_date(text: str) -> str: text = text.replace(')', '') @@ -34,31 +31,22 @@ versions = {} response = endoflife.fetch_url(URL) ff_releases = BeautifulSoup(response, features="html5lib").find_all("ol", class_="c-release-list") -ff_urls = [urllib.parse.urljoin(URL, p.get("href")) for p in ff_releases[0].find_all("a")] +urls = [urllib.parse.urljoin(URL, p.get("href")) for p in ff_releases[0].find_all("a")] -session = FuturesSession() -session.mount('https://', HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.2))) -futures = [session.get(url, timeout=30) for url in ff_urls] -for future in as_completed(futures): - try: - response = future.result() - soup = BeautifulSoup(response.text, features="html5lib") +for response in endoflife.fetch_urls(urls): + soup = BeautifulSoup(response.text, features="html5lib") - version = response.request.url.split("/")[-3] - if soup.find("div", class_="c-release-version"): - date = format_date(soup.find("p", class_="c-release-date").get_text()) - versions[version] = date - print(f"{version}: {date}") - elif soup.find("small", string=re.compile("^.?First offered")): - element = soup.find("small", string=re.compile("^.?First offered")) - date = format_date(' '.join(element.get_text().split(" ")[-3:])) # get last 3 words - versions[version] = date - print(f"{version}: {date}") - # we don't get version <= 10.0, not a big 
deal - except ChunkedEncodingError: - # This may happen sometimes and will be ignored to not make the script fail, - # see https://stackoverflow.com/a/71899731/374236. - print(f"Error fetching {response.request.url}: ChunkedEncodingError") + version = response.request.url.split("/")[-3] + if soup.find("div", class_="c-release-version"): + date = format_date(soup.find("p", class_="c-release-date").get_text()) + versions[version] = date + print(f"{version}: {date}") + elif soup.find("small", string=re.compile("^.?First offered")): + element = soup.find("small", string=re.compile("^.?First offered")) + date = format_date(' '.join(element.get_text().split(" ")[-3:])) # get last 3 words + versions[version] = date + print(f"{version}: {date}") + # we don't get version <= 10.0, not a big deal endoflife.write_releases(PRODUCT, versions) print("::endgroup::")