[common] implement BaseExtractor class

Should be used when the same extractor logic applies to different
instances/domains of several sites, e.g. FoolFuuka, Shopify, etc.

This will replace the functionality of 'generate_extractors()' in
a more efficient way, by condensing everything into 1 class and not
dynamically generating an extractor class for each instance.
This commit is contained in:
Mike Fährmann
2021-01-26 03:40:14 +01:00
parent b549c53b36
commit 745a114c61

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2014-2020 Mike Fährmann
# Copyright 2014-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -527,6 +527,39 @@ class AsynchronousMixin():
messages.put(None)
class BaseExtractor(Extractor):
    """Base class for extractors that serve several instances of one site

    Intended for cases where the same extractor logic applies to different
    instances/domains of several sites. All instances are handled by a
    single class: update() builds one combined URL pattern, and __init__()
    works out from the match object which instance a URL belongs to.
    """
    # List of (category, root) tuples, filled in by update(). Its order
    # matches the order of the empty marker groups in the pattern that
    # update() returns.
    instances = None

    def __init__(self, match):
        """Select category and root for the matched instance

        'match' is expected to come from a pattern that begins with the
        string returned by update(), so that its first capture groups are
        the per-instance marker groups.
        """
        # Only derive the category when none is set yet
        # (presumably subclasses leave 'category' empty; verify in Extractor)
        if not self.category:
            # Every alternative built by update() ends in an empty capture
            # group. Only the group of the alternative that matched is not
            # None, so its index identifies the entry in 'instances'.
            for index, group in enumerate(match.groups()):
                if group is not None:
                    self.category, self.root = self.instances[index]
                    break
        Extractor.__init__(self, match)

    @classmethod
    def update(cls, instances):
        """Register 'instances' and return a pattern matching all of them

        'instances' maps a category name to an info dict with a "root" URL
        and an optional "pattern" regex for that instance's domain part.
        Entries found in the config under ("extractor",) / cls.basecategory
        that are dicts containing "root" are added to 'instances' in place,
        replacing entries with the same category name.

        A "pattern" value should not contain capturing groups of its own:
        __init__() relies on group indices lining up with 'cls.instances'.

        Returns a regex string that matches an optional 'http(s)://' prefix
        followed by the domain of any registered instance.
        """
        extra_instances = config.get(("extractor",), cls.basecategory)
        if extra_instances:
            for category, info in extra_instances.items():
                if isinstance(info, dict) and "root" in info:
                    instances[category] = info

        pattern_list = []
        instance_list = cls.instances = []
        for category, info in instances.items():
            root = info["root"]
            if root:
                # A trailing slash would end up in the generated pattern
                # and in every URL built as 'root + "/path"'; strip it so
                # "https://example.org/" behaves like "https://example.org"
                root = root.rstrip("/")
            instance_list.append((category, root))

            pattern = info.get("pattern")
            if not pattern:
                # Default: the escaped root without its 'scheme://' prefix
                pattern = re.escape(root[root.index(":") + 3:])
            # Empty capture group marking which alternative matched
            pattern_list.append(pattern + "()")

        return r"(?:https?://)?(?:" + "|".join(pattern_list) + r")"
def generate_extractors(extractor_data, symtable, classes):
"""Dynamically generate Extractor classes"""
extractors = config.get(("extractor",), classes[0].basecategory)