[util] move 're' functions to text.py

2025-06-23 20:05:20 +02:00
parent 92ac35c932
commit c08833aed9
4 changed files with 38 additions and 41 deletions
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -8,14 +8,29 @@

 """Collection of functions that work on strings/text"""

-import re
 import sys
 import html
 import time
 import datetime
 import urllib.parse
+import re as re_module

-HTML_RE = re.compile("<[^>]+>")
+try:
+    re_compile = re_module._compiler.compile
+except AttributeError:
+    re_compile = re_module.sre_compile.compile
+
+HTML_RE = re_compile(r"<[^>]+>")
+PATTERN_CACHE = {}
+
+
+def re(pattern):
+    """Compile a regular expression pattern"""
+    try:
+        return PATTERN_CACHE[pattern]
+    except KeyError:
+        p = PATTERN_CACHE[pattern] = re_compile(pattern)
+        return p


 def remove_html(txt, repl=" ", sep=" "):
@@ -47,8 +62,8 @@ def slugify(value):
    Adapted from:
    https://github.com/django/django/blob/master/django/utils/text.py
    """
-    value = re.sub(r"[^\w\s-]", "", str(value).lower())
-    return re.sub(r"[-\s]+", "-", value).strip("-_")
+    value = re(r"[^\w\s-]").sub("", str(value).lower())
+    return re(r"[-\s]+").sub("-", value).strip("-_")


 def ensure_http_scheme(url, scheme="https://"):
@@ -199,7 +214,7 @@ def extract_from(txt, pos=None, default=""):
 def parse_unicode_escapes(txt):
    """Convert JSON Unicode escapes in 'txt' into actual characters"""
    if "\\u" in txt:
-        return re.sub(r"\\u([0-9a-fA-F]{4})", _hex_to_char, txt)
+        return re(r"\\u([0-9a-fA-F]{4})").sub(_hex_to_char, txt)
    return txt


--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -8,7 +8,6 @@

 """Utility functions and classes"""

-import re as re_module
 import os
 import sys
 import json
@@ -27,24 +26,6 @@ from http.cookiejar import Cookie
 from email.utils import mktime_tz, parsedate_tz
 from . import text, version, exception

-try:
-    re_compile = re_module._compiler.compile
-except AttributeError:
-    re_compile = re_module.sre_compile.compile
-
-CACHE_PATTERN = {}
-
-
-def re(pattern):
-    """Compile a regular expression pattern"""
-    try:
-        return CACHE_PATTERN[pattern]
-    except KeyError:
-        pass
-
-    p = CACHE_PATTERN[pattern] = re_compile(pattern)
-    return p
-

 def bencode(num, alphabet="0123456789"):
    """Encode an integer into a base-N encoded string"""
@@ -752,6 +733,9 @@ class CustomNone():
 _ff_ver = (datetime.date.today().toordinal() - 735506) // 28
 #  _ch_ver = _ff_ver - 2

+re = text.re
+re_compile = text.re_compile
+
 NONE = CustomNone()
 EPOCH = datetime.datetime(1970, 1, 1)
 SECOND = datetime.timedelta(0, 1)
@@ -784,7 +768,7 @@ GLOBALS = {
    "hash_sha1": sha1,
    "hash_md5" : md5,
    "std"      : ModuleProxy(),
-    "re"       : re_module,
+    "re"       : text.re_module,
    "exts_image"  : EXTS_IMAGE,
    "exts_video"  : EXTS_VIDEO,
    "exts_archive": EXTS_ARCHIVE,
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -23,6 +23,20 @@ INVALID_ALT = ((), [], {}, None, "")

 class TestText(unittest.TestCase):

+    def test_re(self):
+        p1 = text.re_compile("foo")
+        p2 = text.re("foo")
+        p3 = text.re("foo")
+
+        Pattern = text.re_module.Pattern
+        self.assertIsInstance(p1, Pattern)
+        self.assertIsInstance(p2, Pattern)
+        self.assertIsInstance(p3, Pattern)
+
+        self.assertEqual(p1, p2)
+        self.assertIsNot(p1, p2)
+        self.assertIs(p2, p3)
+
    def test_remove_html(self, f=text.remove_html):
        result = "Hello World."

--- a/test/test_util.py
+++ b/test/test_util.py
@@ -13,7 +13,6 @@ import unittest
 from unittest.mock import patch

 import io
-import re
 import time
 import random
 import string
@@ -1042,21 +1041,6 @@ value = 123
        self.assertEqual(response.links.get("next"), None)
        self.assertEqual(response.close(), None)

-    def test_re(self):
-        Pattern = type(re.compile(""))
-
-        p1 = util.re_compile("foo")
-        p2 = util.re("foo")
-        p3 = util.re("foo")
-
-        self.assertIsInstance(p1, Pattern)
-        self.assertIsInstance(p2, Pattern)
-        self.assertIsInstance(p3, Pattern)
-
-        self.assertIsNot(p1, p2)
-        self.assertIs(p2, p3)
-        self.assertEqual(p1, p2)
-

 class TestExtractor():
    category = "test_category"