Improve retry mechanism (#194)

When a ChunkedEncodingError occurs, the request and response are not set, and there is no way to get the URL that caused the error.
With this change all URLs are retried. The max_retries parameter is decreased each time so that we do not get stuck in an infinite loop.

I also considered waiting before retrying, but for now I don't see any benefit to it.

Relates to #188.
This commit is contained in:
Marc Wrobel
2023-11-26 19:00:32 +01:00
committed by GitHub
parent 37683f9677
commit 1e65a048b0
2 changed files with 18 additions and 23 deletions

View File

@@ -45,30 +45,23 @@ def list_products(method, products_filter=None, pathname="website/products") ->
# Keep the default timeout high enough to avoid errors with web.archive.org.
def fetch_urls(urls, data=None, headers=None, max_retries=10, backoff_factor=0.5, timeout=30) -> list[Response]:
    """Fetch all *urls* concurrently and return their responses.

    Args:
        urls: iterable of URLs to fetch.
        data: optional request body forwarded to every GET.
        headers: optional extra headers, merged over the default User-Agent.
        max_retries: transport-level retry budget (urllib3 Retry) and, reused,
            the recursion budget for ChunkedEncodingError retries.
        backoff_factor: urllib3 Retry backoff factor between transport retries.
        timeout: per-request timeout in seconds.

    Raises:
        ChunkedEncodingError: when the retry budget is exhausted.

    Intermittent ChunkedEncodingErrors occur while fetching URLs. The exception
    carries neither the request nor the response, so the failing URL cannot be
    singled out — the whole batch is retried instead, with a decremented budget.
    See https://github.com/psf/requests/issues/4771#issue-354077499.
    """
    try:
        with FuturesSession() as session:
            adapter = HTTPAdapter(max_retries=Retry(total=max_retries, backoff_factor=backoff_factor))
            session.mount('http://', adapter)
            session.mount('https://', adapter)
            headers = {'User-Agent': USER_AGENT} | ({} if headers is None else headers)
            futures = [session.get(url, headers=headers, data=data, timeout=timeout, stream=None) for url in urls]
            return [future.result() for future in as_completed(futures)]
    except ChunkedEncodingError as e:
        next_max_retries = max_retries - 1
        # "<= 0" (not "== 0") so a caller passing max_retries <= 0 cannot make
        # the budget skip zero and recurse forever.
        if next_max_retries <= 0:
            raise e  # So that the function does not get stuck in an infinite loop.
        # We could wait a bit before retrying, but it's not clear if it would help.
        print(f"Got ChunkedEncodingError while fetching {urls} ({e}), retrying (remaining retries = {next_max_retries}).")
        return fetch_urls(urls, data, headers, next_max_retries, backoff_factor, timeout)
def fetch_url(url, data=None, headers=None, max_retries=5, backoff_factor=0.5, timeout=30) -> str:

View File

@@ -33,6 +33,8 @@ print(f"::group::{PRODUCT}")
# Accumulate every release found across the paginated listing.
all_versions = {}
next_page_url = URL
# Do not try to fetch multiple pages in parallel: it raises a lot of
# ChunkedEncodingErrors and makes the overall process slower.
while next_page_url:
    # fetch_releases fills all_versions in place and returns the next page's
    # URL, or a falsy value when there are no more pages.
    next_page_url = fetch_releases(all_versions, next_page_url)