decouple extractor initialization

Introduce an 'initialize()' function that does the actual init
(session, cookies, config options) and can be called separately from
the constructor __init__().

This allows, for example, adjusting config access inside a Job
before most of it has already happened when calling 'extractor.find()'.
This commit is contained in:
Mike Fährmann
2023-07-25 20:09:44 +02:00
parent f0203b7559
commit a383eca7f6
71 changed files with 314 additions and 193 deletions

View File

@@ -22,18 +22,21 @@ class ReactorExtractor(BaseExtractor):
# Constructor: derives self.root / self.path from the matched URL and, when the
# category is the generic "reactor", replaces it with a name taken from the domain.
# NOTE(review): this hunk looks like a rendered diff whose +/- markers and
# indentation were stripped, so old and new lines appear side by side -- confirm
# against the actual commit before treating it as runnable code.
def __init__(self, match):
BaseExtractor.__init__(self, match)
# pass the full matched URL through text.ensure_http_scheme with "http://"
# (presumably adds the scheme when it is missing -- verify against the helper)
url = text.ensure_http_scheme(match.group(0), "http://")
# index of the first "/" at or after offset 10; str.index raises ValueError if none
pos = url.index("/", 10)
# NOTE(review): the next three lines are presumably the removed (pre-change)
# side of the diff -- the root/path split is repeated right after them and the
# same Referer/gif statements reappear in _init() below. TODO confirm.
self.root, self.path = url[:pos], url[pos:]
self.session.headers["Referer"] = self.root
self.gif = self.config("gif", False)
# root is everything before that "/", path is the remainder (starting with "/")
self.root = url[:pos]
self.path = url[pos:]
if self.category == "reactor":
# set category based on domain name
netloc = urllib.parse.urlsplit(self.root).netloc
# keep everything before the last "." (e.g. "joyreactor.cc" -> "joyreactor")
self.category = netloc.rpartition(".")[0]
# Second-stage setup kept out of __init__: touches self.session and self.config.
# NOTE(review): presumably invoked by the 'initialize()' step described in the
# commit message rather than by the constructor -- the caller is not visible here.
def _init(self):
# send the site root (self.root) as the Referer header on this session
self.session.headers["Referer"] = self.root
# read the "gif" config option, defaulting to False when unset
self.gif = self.config("gif", False)
def items(self):
data = self.metadata()
yield Message.Directory, data