decouple extractor initialization
Introduce an 'initialize()' function that does the actual init (session, cookies, config options) and can be called separately from the constructor __init__(). This allows, for example, adjusting config access inside a Job before most of it has already happened when calling 'extractor.find()'.
This commit is contained in:
@@ -22,18 +22,21 @@ class ReactorExtractor(BaseExtractor):
|
||||
|
||||
def __init__(self, match):
|
||||
BaseExtractor.__init__(self, match)
|
||||
|
||||
url = text.ensure_http_scheme(match.group(0), "http://")
|
||||
pos = url.index("/", 10)
|
||||
|
||||
self.root, self.path = url[:pos], url[pos:]
|
||||
self.session.headers["Referer"] = self.root
|
||||
self.gif = self.config("gif", False)
|
||||
self.root = url[:pos]
|
||||
self.path = url[pos:]
|
||||
|
||||
if self.category == "reactor":
|
||||
# set category based on domain name
|
||||
netloc = urllib.parse.urlsplit(self.root).netloc
|
||||
self.category = netloc.rpartition(".")[0]
|
||||
|
||||
def _init(self):
|
||||
self.session.headers["Referer"] = self.root
|
||||
self.gif = self.config("gif", False)
|
||||
|
||||
def items(self):
|
||||
data = self.metadata()
|
||||
yield Message.Directory, data
|
||||
|
||||
Reference in New Issue
Block a user