diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index a573de6e..cf9d4909 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -8,14 +8,29 @@
 
 """Collection of functions that work on strings/text"""
 
-import re
 import sys
 import html
 import time
 import datetime
 import urllib.parse
+import re as re_module
 
-HTML_RE = re.compile("<[^>]+>")
+try:
+    re_compile = re_module._compiler.compile
+except AttributeError:
+    re_compile = re_module.sre_compile.compile
+
+HTML_RE = re_compile(r"<[^>]+>")
+PATTERN_CACHE = {}
+
+
+def re(pattern):
+    """Compile a regular expression pattern"""
+    try:
+        return PATTERN_CACHE[pattern]
+    except KeyError:
+        p = PATTERN_CACHE[pattern] = re_compile(pattern)
+        return p
 
 
 def remove_html(txt, repl=" ", sep=" "):
@@ -47,8 +62,8 @@ def slugify(value):
     Adapted from:
     https://github.com/django/django/blob/master/django/utils/text.py
     """
-    value = re.sub(r"[^\w\s-]", "", str(value).lower())
-    return re.sub(r"[-\s]+", "-", value).strip("-_")
+    value = re(r"[^\w\s-]").sub("", str(value).lower())
+    return re(r"[-\s]+").sub("-", value).strip("-_")
 
 
 def ensure_http_scheme(url, scheme="https://"):
@@ -199,7 +214,7 @@ def extract_from(txt, pos=None, default=""):
 def parse_unicode_escapes(txt):
     """Convert JSON Unicode escapes in 'txt' into actual characters"""
     if "\\u" in txt:
-        return re.sub(r"\\u([0-9a-fA-F]{4})", _hex_to_char, txt)
+        return re(r"\\u([0-9a-fA-F]{4})").sub(_hex_to_char, txt)
     return txt
 
 
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index fb6d3771..f06668f9 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -8,7 +8,6 @@
 
 """Utility functions and classes"""
 
-import re as re_module
 import os
 import sys
 import json
@@ -27,24 +26,6 @@ from http.cookiejar import Cookie
 from email.utils import mktime_tz, parsedate_tz
 from . import text, version, exception
 
-try:
-    re_compile = re_module._compiler.compile
-except AttributeError:
-    re_compile = re_module.sre_compile.compile
-
-CACHE_PATTERN = {}
-
-
-def re(pattern):
-    """Compile a regular expression pattern"""
-    try:
-        return CACHE_PATTERN[pattern]
-    except KeyError:
-        pass
-
-    p = CACHE_PATTERN[pattern] = re_compile(pattern)
-    return p
-
 
 def bencode(num, alphabet="0123456789"):
     """Encode an integer into a base-N encoded string"""
@@ -752,6 +733,9 @@ class CustomNone():
 _ff_ver = (datetime.date.today().toordinal() - 735506) // 28
 # _ch_ver = _ff_ver - 2
 
+re = text.re
+re_compile = text.re_compile
+
 NONE = CustomNone()
 EPOCH = datetime.datetime(1970, 1, 1)
 SECOND = datetime.timedelta(0, 1)
@@ -784,7 +768,7 @@ GLOBALS = {
     "hash_sha1": sha1,
     "hash_md5" : md5,
     "std" : ModuleProxy(),
-    "re" : re_module,
+    "re" : text.re_module,
     "exts_image" : EXTS_IMAGE,
     "exts_video" : EXTS_VIDEO,
     "exts_archive": EXTS_ARCHIVE,
diff --git a/test/test_text.py b/test/test_text.py
index 17f11656..85cc1d2d 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -23,6 +23,20 @@ INVALID_ALT = ((), [], {}, None, "")
 
 
 class TestText(unittest.TestCase):
 
+    def test_re(self):
+        p1 = text.re_compile("foo")
+        p2 = text.re("foo")
+        p3 = text.re("foo")
+
+        Pattern = text.re_module.Pattern
+        self.assertIsInstance(p1, Pattern)
+        self.assertIsInstance(p2, Pattern)
+        self.assertIsInstance(p3, Pattern)
+
+        self.assertEqual(p1, p2)
+        self.assertIsNot(p1, p2)
+        self.assertIs(p2, p3)
+
     def test_remove_html(self, f=text.remove_html):
         result = "Hello World."
 
diff --git a/test/test_util.py b/test/test_util.py
index 6613898b..00e8c4bc 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -13,7 +13,6 @@ import unittest
 from unittest.mock import patch
 
 import io
-import re
 import time
 import random
 import string
@@ -1042,21 +1041,6 @@ value = 123
         self.assertEqual(response.links.get("next"), None)
         self.assertEqual(response.close(), None)
 
-    def test_re(self):
-        Pattern = type(re.compile(""))
-
-        p1 = util.re_compile("foo")
-        p2 = util.re("foo")
-        p3 = util.re("foo")
-
-        self.assertIsInstance(p1, Pattern)
-        self.assertIsInstance(p2, Pattern)
-        self.assertIsInstance(p3, Pattern)
-
-        self.assertIsNot(p1, p2)
-        self.assertIs(p2, p3)
-        self.assertEqual(p1, p2)
-
 
 class TestExtractor():
     category = "test_category"