allow BaseExtractors to use the domain of the matched URL

This commit is contained in:
Mike Fährmann
2022-02-10 01:38:50 +01:00
parent c0fddcefc5
commit b4f8e15a1f
2 changed files with 12 additions and 1 deletions

View File

@@ -607,6 +607,9 @@ class BaseExtractor(Extractor):
if group is not None:
if index:
self.category, self.root = self.instances[index-1]
if not self.root:
url = text.ensure_http_scheme(match.group(0))
self.root = url[:url.index("/", 8)]
else:
self.root = group
self.category = group.partition("://")[2]
@@ -624,7 +627,9 @@ class BaseExtractor(Extractor):
pattern_list = []
instance_list = cls.instances = []
for category, info in instances.items():
root = info["root"].rstrip("/")
root = info["root"]
if root:
root = root.rstrip("/")
instance_list.append((category, root))
pattern = info.get("pattern")

View File

@@ -349,6 +349,12 @@ def build_extractor_list():
for category, root in extr.instances:
base[category].append(extr.subcategory)
if category not in domains:
if not root:
# use domain from first matching test
for url, _ in extr._get_tests():
if extr.from_url(url).category == category:
root = url[:url.index("/", 8)]
break
domains[category] = root + "/"
# sort subcategory lists