@@ -36,28 +36,28 @@ class GenericExtractor(Extractor):
|
||||
example = "generic:https://www.nongnu.org/lzip/"
|
||||
|
||||
def __init__(self, match):
    """Initialize the extractor from a matched input URL.

    ``match`` is the regex match object for the input URL.  This method
    reads its 'domain', 'generic', 'scheme' and 'path' named groups and
    the full matched text (group 0), and sets ``self.subcategory``,
    ``self.url``, ``self.scheme``, ``self.path`` and ``self.root``.
    """
    # Assigned before delegating to the base initializer; this ordering
    # is kept exactly as in the original code.
    self.subcategory = match['domain']
    Extractor.__init__(self, match)

    # Strip the "g(eneric):" prefix
    # and inform about "forced" or "fallback" mode
    if match['generic']:
        # Keep everything after the first ":" of the full match
        self.url = match[0].partition(":")[2]
    else:
        self.log.info("Falling back on generic information extractor.")
        self.url = match[0]

    # Make sure we have a scheme, or use https
    if match['scheme']:
        self.scheme = match['scheme']
    else:
        self.scheme = 'https://'
    # NOTE(review): presumably prepends self.scheme when self.url has no
    # scheme of its own -- confirm against text.ensure_http_scheme
    self.url = text.ensure_http_scheme(self.url, self.scheme)

    self.path = match['path']

    # Used to resolve relative image urls
    self.root = self.scheme + match['domain']
|
||||
|
||||
def items(self):
|
||||
"""Get page, extract metadata & images, yield them in suitable messages
|
||||
@@ -184,7 +184,7 @@ class GenericExtractor(Extractor):
|
||||
basematch = util.re(
|
||||
r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)").search(page)
|
||||
if basematch:
|
||||
self.baseurl = basematch.group('url').rstrip('/')
|
||||
self.baseurl = basematch['url'].rstrip('/')
|
||||
# Otherwise, extract the base url from self.url
|
||||
else:
|
||||
if self.url.endswith("/"):
|
||||
|
||||
Reference in New Issue
Block a user