Support parallel URL fetching via endoflife.fetch_urls (#184)

Move the parallel URL fetching from firefox.py to endoflife.py to make available parallel URL fetching for all scripts.

Also a a fix found on https://stackoverflow.com/a/44511691/374236 to avoid ChunkedEncodingError.
This commit is contained in:
Marc Wrobel
2023-11-26 11:34:18 +01:00
committed by GitHub
parent 3424d04ce0
commit ca689dc373
3 changed files with 52 additions and 38 deletions

View File

@@ -14,9 +14,6 @@ from concurrent.futures import as_completed
URL = "https://www.mozilla.org/en-US/firefox/releases/"
PRODUCT = "firefox"
DATE_REGEX = r"(January|Jan|February|Feb|March|Mar|April|Apr|May|June|Jun|July|Jul|August|Aug|September|Sept|October|Oct|November|Nov|December|Dec)\s+\d{1,2}(st|nd|rd|th)?,\s+\d{4}"
VERSION_REGEX = r"\d+(\.\d+)*"
def format_date(text: str) -> str:
text = text.replace(')', '')
@@ -34,31 +31,22 @@ versions = {}
response = endoflife.fetch_url(URL)
ff_releases = BeautifulSoup(response, features="html5lib").find_all("ol", class_="c-release-list")
ff_urls = [urllib.parse.urljoin(URL, p.get("href")) for p in ff_releases[0].find_all("a")]
urls = [urllib.parse.urljoin(URL, p.get("href")) for p in ff_releases[0].find_all("a")]
session = FuturesSession()
session.mount('https://', HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.2)))
futures = [session.get(url, timeout=30) for url in ff_urls]
for future in as_completed(futures):
try:
response = future.result()
soup = BeautifulSoup(response.text, features="html5lib")
for response in endoflife.fetch_urls(urls):
soup = BeautifulSoup(response.text, features="html5lib")
version = response.request.url.split("/")[-3]
if soup.find("div", class_="c-release-version"):
date = format_date(soup.find("p", class_="c-release-date").get_text())
versions[version] = date
print(f"{version}: {date}")
elif soup.find("small", string=re.compile("^.?First offered")):
element = soup.find("small", string=re.compile("^.?First offered"))
date = format_date(' '.join(element.get_text().split(" ")[-3:])) # get last 3 words
versions[version] = date
print(f"{version}: {date}")
# we don't get version <= 10.0, not a big deal
except ChunkedEncodingError:
# This may happen sometimes and will be ignored to not make the script fail,
# see https://stackoverflow.com/a/71899731/374236.
print(f"Error fetching {response.request.url}: ChunkedEncodingError")
version = response.request.url.split("/")[-3]
if soup.find("div", class_="c-release-version"):
date = format_date(soup.find("p", class_="c-release-date").get_text())
versions[version] = date
print(f"{version}: {date}")
elif soup.find("small", string=re.compile("^.?First offered")):
element = soup.find("small", string=re.compile("^.?First offered"))
date = format_date(' '.join(element.get_text().split(" ")[-3:])) # get last 3 words
versions[version] = date
print(f"{version}: {date}")
# we don't get version <= 10.0, not a big deal
endoflife.write_releases(PRODUCT, versions)
print("::endgroup::")