add 'extractor.retry-codes' option (#3313)

do not retry 429 and 430 by default
This commit is contained in:
Mike Fährmann
2023-01-14 17:16:18 +01:00
parent e30e8aeef7
commit c881548a27
3 changed files with 34 additions and 11 deletions

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2014-2022 Mike Fährmann
# Copyright 2014-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -53,6 +53,7 @@ class Extractor():
self._parentdir = ""
self._write_pages = self.config("write-pages", False)
self._retry_codes = self.config("retry-codes")
self._retries = self.config("retries", 4)
self._timeout = self.config("timeout", 30)
self._verify = self.config("verify", True)
@@ -64,6 +65,8 @@ class Extractor():
if self._retries < 0:
self._retries = float("inf")
if not self._retry_codes:
self._retry_codes = ()
self._init_session()
self._init_cookies()
@@ -103,12 +106,15 @@ class Extractor():
values[:0] = config.accumulate((self.subcategory,), key, conf=conf)
return values
def request(self, url, *, method="GET", session=None, retries=None,
encoding=None, fatal=True, notfound=None, **kwargs):
def request(self, url, *, method="GET", session=None,
retries=None, retry_codes=None, encoding=None,
fatal=True, notfound=None, **kwargs):
if session is None:
session = self.session
if retries is None:
retries = self._retries
if retry_codes is None:
retry_codes = self._retry_codes
if "proxies" not in kwargs:
kwargs["proxies"] = self._proxies
if "timeout" not in kwargs:
@@ -153,12 +159,12 @@ class Extractor():
code in (403, 503):
content = response.content
if b"_cf_chl_opt" in content or b"jschl-answer" in content:
self.log.warning("Cloudflare IUAM challenge")
self.log.warning("Cloudflare challenge")
break
if b'name="captcha-bypass"' in content:
self.log.warning("Cloudflare CAPTCHA")
break
if code < 500 and code != 429 and code != 430:
if code not in retry_codes and code < 500:
break
finally: