implement new extractor-module selection

This commit is contained in:
Mike Fährmann
2015-04-09 16:13:00 +02:00
parent cd4a699dd2
commit 652d60a5cd

View File

@@ -9,7 +9,6 @@
import os import os
import sys import sys
import re import re
import sqlite3
import importlib import importlib
from .extractor.common import Message from .extractor.common import Message
@@ -133,51 +132,57 @@ class ExtractorFinder():
def __init__(self, config): def __init__(self, config):
self.config = config self.config = config
self.match_list = list()
if "database" in config["general"]:
path = os.path.expanduser(config["general"]["database"])
conn = sqlite3.connect(path)
self.load_from_database(conn)
self.load_from_config(config)
def get_for_url(self, url): def get_for_url(self, url):
# TODO: implement general case name, match = self.find_pattern_match(url)
module = importlib.import_module(".extractor.8chan", __package__) if match:
for pattern in module.info["pattern"]: module = importlib.import_module(".extractor." + name, __package__)
match = re.match(pattern, url) klass = getattr(module, module.info["extractor"])
if match: return klass(match, self.config), module.info
klass = getattr(module, module.info["extractor"]) else:
return klass(match, self.config), module.info print("pattern mismatch")
print("pattern mismatch") return None
sys.exit()
def match(self, url): def find_pattern_match(self, url):
for category, regex in self.match_list: for category in self.config:
match = regex.match(url) for key, value in self.config[category].items():
if match: if(key.startswith("regex")):
module = importlib.import_module("."+category, __package__) print(value)
return module.Extractor(match, self.config) match = re.match(value, url)
if match:
return category, match
for name, info in self.extractor_metadata():
for pattern in info["pattern"]:
print(pattern)
match = re.match(pattern, url)
if match:
return name, match
return None return None
def load_from_database(self, db): def extractor_metadata(self):
query = ( path = os.path.join(os.path.dirname(__file__), "extractor")
"SELECT regex.re, category.name " for name in os.listdir(path):
"FROM regex JOIN category " extractor_path = os.path.join(path, name)
"ON regex.category_id = category.id" info = self.get_info_dict(extractor_path)
) if info is not None:
for row in db.execute(query): yield os.path.splitext(name)[0], info
self.add_match(row[1], row[0])
def load_from_config(self, conf): @staticmethod
for category in conf: def get_info_dict(extractor_path):
for key, value in conf[category].items():
if(key.startswith("regex")):
self.add_match(category, value)
def add_match(self, category, regex):
try: try:
# print(category, regex) with open(extractor_path) as f:
self.match_list.append( (category, re.compile(regex)) ) for index in range(30):
except: line = next(f)
print("[Warning] [{0}] failed to compile regular expression '{1}'" if line.startswith("info ="):
.format(category, regex)) break
else:
return None
info = [line[6:]]
for line in f:
info.append(line)
if line.startswith("}"):
break
except (StopIteration, OSError):
return None
return eval("".join(info))