From abbd45d0f44a74a2f4bbaa741ed23b48bcb6bb7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Fri, 8 Feb 2019 20:08:16 +0100
Subject: [PATCH] update handling of extractor URL patterns
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When extractor classes are loaded during 'extractor.find(…)', the
'pattern' attribute of each class is replaced with its compiled form.
---
 gallery_dl/extractor/__init__.py   | 50 +++++++++++++++---------------
 gallery_dl/extractor/imagehosts.py |  2 +-
 gallery_dl/extractor/reddit.py     |  3 +-
 scripts/build_supportedsites.py    | 14 +++++----
 scripts/build_testresult_db.py     |  3 +-
 test/test_extractor.py             | 12 +++----
 6 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 8864afab..cfe6b0de 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -93,33 +93,34 @@ modules = [
 
 
 def find(url):
-    """Find suitable extractor for the given url"""
-    for pattern, klass in _list_patterns():
-        match = pattern.match(url)
-        if match and klass not in _blacklist:
-            return klass(match)
+    """Find a suitable extractor for the given URL"""
+    for cls in _list_classes():
+        match = cls.pattern.match(url)
+        if match and cls not in _blacklist:
+            return cls(match)
     return None
 
 
-def add(klass):
-    """Add 'klass' to the list of available extractors"""
-    _cache.append((re.compile(klass.pattern), klass))
+def add(cls):
+    """Add 'cls' to the list of available extractors"""
+    cls.pattern = re.compile(cls.pattern)
+    _cache.append(cls)
+    return cls
 
 
 def add_module(module):
     """Add all extractors in 'module' to the list of available extractors"""
-    tuples = [
-        (re.compile(klass.pattern), klass)
-        for klass in _get_classes(module)
-    ]
-    _cache.extend(tuples)
-    return tuples
+    classes = _get_classes(module)
+    for cls in classes:
+        cls.pattern = re.compile(cls.pattern)
+    _cache.extend(classes)
+    return classes
 
 
 def extractors():
     """Yield all available extractor classes"""
     return sorted(
-        set(klass for _, klass in _list_patterns()),
+        _list_classes(),
         key=lambda x: x.__name__
     )
 
@@ -128,9 +129,9 @@ class blacklist():
     """Context Manager to blacklist extractor modules"""
     def __init__(self, categories, extractors=None):
         self.extractors = extractors or []
-        for _, klass in _list_patterns():
-            if klass.category in categories:
-                self.extractors.append(klass)
+        for cls in _list_classes():
+            if cls.category in categories:
+                self.extractors.append(cls)
 
     def __enter__(self):
         _blacklist.update(self.extractors)
@@ -147,20 +148,19 @@ _blacklist = set()
 _module_iter = iter(modules)
 
 
-def _list_patterns():
-    """Yield all available (pattern, class) tuples"""
+def _list_classes():
+    """Yield all available extractor classes"""
     yield from _cache
 
     for module_name in _module_iter:
-        yield from add_module(
-            importlib.import_module("."+module_name, __package__)
-        )
+        module = importlib.import_module("."+module_name, __package__)
+        yield from add_module(module)
 
 
 def _get_classes(module):
     """Return a list of all extractor classes in a module"""
     return [
-        klass for klass in module.__dict__.values() if (
-            hasattr(klass, "pattern") and klass.__module__ == module.__name__
+        cls for cls in module.__dict__.values() if (
+            hasattr(cls, "pattern") and cls.__module__ == module.__name__
         )
     ]
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index 54268a07..e249302b 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -228,7 +228,7 @@ class PostimgImageExtractor(ImagehostImageExtractor):
 
 
 class TurboimagehostImageExtractor(ImagehostImageExtractor):
-    """Extractor for single images from turboimagehost.com"""
+    """Extractor for single images from www.turboimagehost.com"""
     category = "turboimagehost"
     pattern = (r"(?:https?://)?((?:www\.)?turboimagehost\.com"
                r"/p/(\d+)/[^/?&#]+\.html)")
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index fc630ff6..69fe5b37 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -13,7 +13,6 @@ from .. import text, util, extractor, exception
 from ..cache import cache
 import datetime
 import time
-import re
 
 
 class RedditExtractor(Extractor):
@@ -27,7 +26,7 @@ class RedditExtractor(Extractor):
         self._visited = set()
 
     def items(self):
-        subre = re.compile(RedditSubmissionExtractor.pattern)
+        subre = RedditSubmissionExtractor.pattern
         submissions = self.submissions()
         depth = 0
 
diff --git a/scripts/build_supportedsites.py b/scripts/build_supportedsites.py
index b4808111..995e1cd5 100755
--- a/scripts/build_supportedsites.py
+++ b/scripts/build_supportedsites.py
@@ -211,13 +211,15 @@ def get_domain(classes):
         if hasattr(cls, "root") and cls.root:
             return cls.root + "/"
 
-        if hasattr(cls, "test") and cls.test:
-            url = cls.test[0][0]
-            return url[:url.find("/", 8)+1]
+        if hasattr(cls, "https"):
+            scheme = "https" if cls.https else "http"
+            domain = cls.__doc__.split()[-1]
+            return "{}://{}/".format(scheme, domain)
 
-        scheme = "http" if hasattr(cls, "https") and not cls.https else "https"
-        host = cls.__doc__.split()[-1]
-        return scheme + "://" + host + "/"
+        test = next(cls._get_tests(), None)
+        if test:
+            url = test[0]
+            return url[:url.find("/", 8)+1]
     except (IndexError, AttributeError):
         pass
     return ""
diff --git a/scripts/build_testresult_db.py b/scripts/build_testresult_db.py
index 03dfd921..f1eca505 100755
--- a/scripts/build_testresult_db.py
+++ b/scripts/build_testresult_db.py
@@ -6,6 +6,7 @@ import datetime
 
 ROOTDIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 sys.path.insert(0, os.path.realpath(ROOTDIR))
+
 from gallery_dl import extractor, job, config
 from test.test_results import setup_test_config
 
@@ -19,7 +20,7 @@ tests = [
     if hasattr(extr, "test") and extr.test
     if len(sys.argv) <= 1 or extr.category in sys.argv
 
-    for idx, (url, result) in enumerate(extr.test)
+    for idx, (url, result) in enumerate(extr._get_tests())
     if result
 ]
 
diff --git a/test/test_extractor.py b/test/test_extractor.py
index bf47fe62..80fe65d0 100644
--- a/test/test_extractor.py
+++ b/test/test_extractor.py
@@ -66,10 +66,10 @@ class TestExtractor(unittest.TestCase):
         uri = "fake:foobar"
         self.assertIsNone(extractor.find(uri))
 
-        tuples = extractor.add_module(sys.modules[__name__])
-        self.assertEqual(len(tuples), 1)
-        self.assertEqual(tuples[0][0].pattern, FakeExtractor.pattern)
-        self.assertEqual(tuples[0][1], FakeExtractor)
+        classes = extractor.add_module(sys.modules[__name__])
+        self.assertEqual(len(classes), 1)
+        self.assertEqual(classes[0].pattern, FakeExtractor.pattern)
+        self.assertEqual(classes[0], FakeExtractor)
         self.assertIsInstance(extractor.find(uri), FakeExtractor)
 
     def test_blacklist(self):
@@ -109,13 +109,13 @@ class TestExtractor(unittest.TestCase):
             matches = []
 
             # ... and apply all regex patterns to each one
-            for pattern, extr2 in extractor._cache:
+            for extr2 in extractor._cache:
 
                 # skip DirectlinkExtractor pattern if it isn't tested
                 if extr1 != DLExtractor and extr2 == DLExtractor:
                     continue
 
-                match = pattern.match(url)
+                match = extr2.pattern.match(url)
                 if match:
                     matches.append(match)