From abbd45d0f44a74a2f4bbaa741ed23b48bcb6bb7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 8 Feb 2019 20:08:16 +0100 Subject: [PATCH] update handling of extractor URL patterns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When loading extractor classes during 'extractor.find(…)', their 'pattern' attribute will be replaced with a compiled version of itself. --- gallery_dl/extractor/__init__.py | 50 +++++++++++++++--------------- gallery_dl/extractor/imagehosts.py | 2 +- gallery_dl/extractor/reddit.py | 3 +- scripts/build_supportedsites.py | 14 +++++---- scripts/build_testresult_db.py | 3 +- test/test_extractor.py | 12 +++---- 6 files changed, 43 insertions(+), 41 deletions(-) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 8864afab..cfe6b0de 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -93,33 +93,34 @@ modules = [ def find(url): - """Find suitable extractor for the given url""" - for pattern, klass in _list_patterns(): - match = pattern.match(url) - if match and klass not in _blacklist: - return klass(match) + """Find a suitable extractor for the given URL""" + for cls in _list_classes(): + match = cls.pattern.match(url) + if match and cls not in _blacklist: + return cls(match) return None -def add(klass): - """Add 'klass' to the list of available extractors""" - _cache.append((re.compile(klass.pattern), klass)) +def add(cls): + """Add 'cls' to the list of available extractors""" + cls.pattern = re.compile(cls.pattern) + _cache.append(cls) + return cls def add_module(module): """Add all extractors in 'module' to the list of available extractors""" - tuples = [ - (re.compile(klass.pattern), klass) - for klass in _get_classes(module) - ] - _cache.extend(tuples) - return tuples + classes = _get_classes(module) + for cls in classes: + cls.pattern = re.compile(cls.pattern) + _cache.extend(classes) + return classes def extractors(): """Yield all available extractor classes""" return sorted( - set(klass for _, klass in _list_patterns()), + _list_classes(), key=lambda x: x.__name__ ) @@ -128,9 +129,9 @@ class blacklist(): """Context Manager to blacklist extractor modules""" def __init__(self, categories, extractors=None): self.extractors = extractors or [] - for _, klass in _list_patterns(): - if klass.category in categories: - self.extractors.append(klass) + for cls in _list_classes(): + if cls.category in categories: + self.extractors.append(cls) def __enter__(self): _blacklist.update(self.extractors) @@ -147,20 +148,19 @@ _blacklist = set() _module_iter = iter(modules) -def _list_patterns(): - """Yield all available (pattern, class) tuples""" +def _list_classes(): + """Yield all available extractor classes""" yield from _cache for module_name in _module_iter: - yield from add_module( - importlib.import_module("."+module_name, __package__) - ) + module = importlib.import_module("."+module_name, __package__) + yield from add_module(module) def _get_classes(module): """Return a list of all extractor classes in a module""" return [ - klass for klass in module.__dict__.values() if ( - hasattr(klass, "pattern") and klass.__module__ == module.__name__ + cls for cls in module.__dict__.values() if ( + hasattr(cls, "pattern") and cls.__module__ == module.__name__ ) ] diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index 54268a07..e249302b 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -228,7 +228,7 @@ class PostimgImageExtractor(ImagehostImageExtractor): class TurboimagehostImageExtractor(ImagehostImageExtractor): - """Extractor for single images from turboimagehost.com""" + """Extractor for single images from www.turboimagehost.com""" category = "turboimagehost" pattern = (r"(?:https?://)?((?:www\.)?turboimagehost\.com" r"/p/(\d+)/[^/?&#]+\.html)") diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index fc630ff6..69fe5b37 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -13,7 +13,6 @@ from .. import text, util, extractor, exception from ..cache import cache import datetime import time -import re class RedditExtractor(Extractor): @@ -27,7 +26,7 @@ class RedditExtractor(Extractor): self._visited = set() def items(self): - subre = re.compile(RedditSubmissionExtractor.pattern) + subre = RedditSubmissionExtractor.pattern submissions = self.submissions() depth = 0 diff --git a/scripts/build_supportedsites.py b/scripts/build_supportedsites.py index b4808111..995e1cd5 100755 --- a/scripts/build_supportedsites.py +++ b/scripts/build_supportedsites.py @@ -211,13 +211,15 @@ def get_domain(classes): if hasattr(cls, "root") and cls.root: return cls.root + "/" - if hasattr(cls, "test") and cls.test: - url = cls.test[0][0] - return url[:url.find("/", 8)+1] + if hasattr(cls, "https"): + scheme = "https" if cls.https else "http" + domain = cls.__doc__.split()[-1] + return "{}://{}/".format(scheme, domain) - scheme = "http" if hasattr(cls, "https") and not cls.https else "https" - host = cls.__doc__.split()[-1] - return scheme + "://" + host + "/" + test = next(cls._get_tests(), None) + if test: + url = test[0] + return url[:url.find("/", 8)+1] except (IndexError, AttributeError): pass return "" diff --git a/scripts/build_testresult_db.py b/scripts/build_testresult_db.py index 03dfd921..f1eca505 100755 --- a/scripts/build_testresult_db.py +++ b/scripts/build_testresult_db.py @@ -6,6 +6,7 @@ import datetime ROOTDIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) sys.path.insert(0, os.path.realpath(ROOTDIR)) + from gallery_dl import extractor, job, config from test.test_results import setup_test_config @@ -19,7 +20,7 @@ tests = [ if hasattr(extr, "test") and extr.test if len(sys.argv) <= 1 or extr.category in sys.argv - for idx, (url, result) in enumerate(extr.test) + for idx, (url, result) in enumerate(extr._get_tests()) if result ] diff --git a/test/test_extractor.py b/test/test_extractor.py index bf47fe62..80fe65d0 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -66,10 +66,10 @@ class TestExtractor(unittest.TestCase): uri = "fake:foobar" self.assertIsNone(extractor.find(uri)) - tuples = extractor.add_module(sys.modules[__name__]) - self.assertEqual(len(tuples), 1) - self.assertEqual(tuples[0][0].pattern, FakeExtractor.pattern) - self.assertEqual(tuples[0][1], FakeExtractor) + classes = extractor.add_module(sys.modules[__name__]) + self.assertEqual(len(classes), 1) + self.assertEqual(classes[0].pattern, FakeExtractor.pattern) + self.assertEqual(classes[0], FakeExtractor) self.assertIsInstance(extractor.find(uri), FakeExtractor) def test_blacklist(self): @@ -109,13 +109,13 @@ class TestExtractor(unittest.TestCase): matches = [] # ... and apply all regex patterns to each one - for pattern, extr2 in extractor._cache: + for extr2 in extractor._cache: # skip DirectlinkExtractor pattern if it isn't tested if extr1 != DLExtractor and extr2 == DLExtractor: continue - match = pattern.match(url) + match = extr2.pattern.match(url) if match: matches.append(match)