From f59390815c7431346dd2602adf75a338317cf357 Mon Sep 17 00:00:00 2001 From: Marc Wrobel Date: Sun, 26 Nov 2023 13:23:17 +0100 Subject: [PATCH] Try to fix fetch_urls when ChunkedEncodingError occurs (#188) Intermittent ChunkedEncodingErrors occur while fetching URLs. This change tries to fix them by retrying. According to https://stackoverflow.com/a/44511691/374236, most servers transmit all data, but that's not what was observed. For future reference the traceback was: ``` During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/home/runner/work/release-data/release-data/src/firefox.py", line 36, in <module> for response in endoflife.fetch_urls(urls): ^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/runner/work/release-data/release-data/src/common/endoflife.py", line 55, in fetch_urls return [future.result() for future in as_completed(futures)] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/home/runner/work/release-data/release-data/src/common/endoflife.py", line 55, in <listcomp> return [future.result() for future in as_completed(futures)] ^^^^^^^^^^^^^^^ File "/opt/hostedtoolcache/Python/3.11.6/x64/lib/python3.11/concurrent/futures/_base.py", line 449, in result return self.__get_result() ^^^^^^^^^^^^^^^^^^^ File "/opt/hostedtoolcache/Python/3.11.6/x64/lib/python3.11/concurrent/futures/_base.py", line 401, in __get_result raise self._exception File "/opt/hostedtoolcache/Python/3.11.6/x64/lib/python3.11/concurrent/futures/thread.py", line 58, in run result = self.fn(*self.args, **self.kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/hostedtoolcache/Python/3.11.6/x64/lib/python3.11/site-packages/requests/sessions.py", line 589, in request resp = self.send(prep, **send_kwargs) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/hostedtoolcache/Python/3.11.6/x64/lib/python3.11/site-packages/requests/sessions.py", line 747, in send r.content File "/opt/hostedtoolcache/Python/3.11.6/x64/lib/python3.11/site-packages/requests/models.py", 
line 899, in content self._content = b"".join(self.iter_content(CONTENT_CHUNK_SIZE)) or b"" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/opt/hostedtoolcache/Python/3.11.6/x64/lib/python3.11/site-packages/requests/models.py", line 818, in generate raise ChunkedEncodingError(e) requests.exceptions.ChunkedEncodingError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read)) ``` --- src/common/endoflife.py | 45 ++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/src/common/endoflife.py b/src/common/endoflife.py index 3b40f05c..68a15ae0 100644 --- a/src/common/endoflife.py +++ b/src/common/endoflife.py @@ -1,10 +1,11 @@ -import http.client import json import frontmatter from concurrent.futures import as_completed from glob import glob from os import path +from requests import Response from requests.adapters import HTTPAdapter +from requests.exceptions import ChunkedEncodingError from requests_futures.sessions import FuturesSession from urllib3.util import Retry @@ -12,14 +13,14 @@ from urllib3.util import Retry USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101 Firefox/102.0' -def load_product(product_name, pathname="website/products"): +def load_product(product_name, pathname="website/products") -> frontmatter.Post: """Load the product's file frontmatter. """ with open(f"{pathname}/{product_name}.md", "r") as f: return frontmatter.load(f) -def list_products(method, products_filter=None, pathname="website/products"): +def list_products(method, products_filter=None, pathname="website/products") -> dict[str, list[dict]]: """Return a list of products that are using the same given update method. """ products_with_method = {} @@ -43,7 +44,7 @@ def list_products(method, products_filter=None, pathname="website/products"): # Keep the default timeout high enough to avoid errors with web.archive.org. 
-def fetch_urls(urls, data=None, headers=None, max_retries=5, backoff_factor=0.5, timeout=30): +def fetch_urls(urls, data=None, headers=None, max_retries=10, backoff_factor=0.5, timeout=30) -> list[Response]: adapter = HTTPAdapter(max_retries=Retry(total=max_retries, backoff_factor=backoff_factor)) session = FuturesSession() session.mount('http://', adapter) @@ -52,32 +53,30 @@ def fetch_urls(urls, data=None, headers=None, max_retries=5, backoff_factor=0.5, headers = {'User-Agent': USER_AGENT} | ({} if headers is None else headers) futures = [session.get(url, headers=headers, data=data, timeout=timeout) for url in urls] - return [future.result() for future in as_completed(futures)] + return [result_or_retry(future) for future in as_completed(futures)] -def fetch_url(url, data=None, headers=None, max_retries=5, backoff_factor=0.5, timeout=30): +def result_or_retry(future) -> Response: + """Return the future's result or retry the request if there is an error. + This may lead to an infinite loop, but let's try it for now. + """ + try: + return future.result() + except ChunkedEncodingError as e: + # Intermittent ChunkedEncodingErrors occurs while fetching URLs. This change try to fix it by retrying. + # According to https://stackoverflow.com/a/44511691/374236, most servers transmit all data, but that's not + # what was observed. 
+ print(f"Got ChunkedEncodingError while fetching {e.request.url}, retrying...") + return fetch_urls([e.request.url], e.request.body, e.request.headers)[0] + + +def fetch_url(url, data=None, headers=None, max_retries=5, backoff_factor=0.5, timeout=30) -> str: return fetch_urls([url], data, headers, max_retries, backoff_factor, timeout)[0].text -def write_releases(product, releases, pathname="releases"): +def write_releases(product, releases, pathname="releases") -> None: with open(f"{pathname}/{product}.json", "w") as f: f.write(json.dumps(dict( # sort by date then version (desc) sorted(releases.items(), key=lambda x: (x[1], x[0]), reverse=True) ), indent=2)) - - -def patch_http_response_read(func): - def inner(*args): - try: - return func(*args) - except http.client.IncompleteRead as e: - return e.partial - return inner - - -# This patch HTTPResponse to prevent ChunkedEncodingError when fetching some websites, such as -# Mozilla's website. According to https://stackoverflow.com/a/44511691/374236, this is an issue at -# server side that cannot be avoided: most servers transmit all data, but due implementation errors -# they wrongly close session. -http.client.HTTPResponse.read = patch_http_response_read(http.client.HTTPResponse.read)