[util] move 're' functions to text.py

This commit is contained in:
Mike Fährmann
2025-06-23 20:05:20 +02:00
parent 92ac35c932
commit c08833aed9
4 changed files with 38 additions and 41 deletions

View File

@@ -8,14 +8,29 @@
"""Collection of functions that work on strings/text"""
import re
import sys
import html
import time
import datetime
import urllib.parse
import re as re_module
HTML_RE = re.compile("<[^>]+>")
# Bind the low-level pattern compiler of the 're' package: use its
# '_compiler' submodule when present, otherwise fall back to 'sre_compile'.
if hasattr(re_module, "_compiler"):
    re_compile = re_module._compiler.compile
else:
    re_compile = re_module.sre_compile.compile

# Matches a single HTML/XML tag: '<', one or more non-'>' chars, '>'
HTML_RE = re_compile(r"<[^>]+>")

# Maps pattern source -> compiled pattern object; filled by re()
PATTERN_CACHE = {}
def re(pattern):
    """Return the compiled form of 'pattern', compiling it only once"""
    compiled = PATTERN_CACHE.get(pattern)
    if compiled is None:
        # First request for this pattern: compile it and remember it
        compiled = PATTERN_CACHE[pattern] = re_compile(pattern)
    return compiled
def remove_html(txt, repl=" ", sep=" "):
@@ -47,8 +62,8 @@ def slugify(value):
Adapted from:
https://github.com/django/django/blob/master/django/utils/text.py
"""
value = re.sub(r"[^\w\s-]", "", str(value).lower())
return re.sub(r"[-\s]+", "-", value).strip("-_")
value = re(r"[^\w\s-]").sub("", str(value).lower())
return re(r"[-\s]+").sub("-", value).strip("-_")
def ensure_http_scheme(url, scheme="https://"):
@@ -199,7 +214,7 @@ def extract_from(txt, pos=None, default=""):
def parse_unicode_escapes(txt):
    """Convert JSON Unicode escapes in 'txt' into actual characters

    Returns 'txt' unchanged when it contains no backslash-u sequence.
    """
    # Cheap substring test first, so text without escapes skips the regex
    if "\\u" in txt:
        # re() is this module's caching pattern factory, not the stdlib
        # module, so substitute through the compiled pattern it returns
        return re(r"\\u([0-9a-fA-F]{4})").sub(_hex_to_char, txt)
    return txt

View File

@@ -8,7 +8,6 @@
"""Utility functions and classes"""
import re as re_module
import os
import sys
import json
@@ -27,24 +26,6 @@ from http.cookiejar import Cookie
from email.utils import mktime_tz, parsedate_tz
from . import text, version, exception
# Pick whichever submodule of 're' provides the pattern compiler:
# '_compiler' when the attribute exists, 'sre_compile' otherwise.
re_compile = (
    re_module._compiler
    if hasattr(re_module, "_compiler")
    else re_module.sre_compile
).compile

# Maps pattern source -> compiled pattern object; filled by re()
CACHE_PATTERN = {}
def re(pattern):
    """Fetch the compiled 'pattern' from the cache, compiling on a miss"""
    if pattern not in CACHE_PATTERN:
        # Not seen before: compile once and store for later calls
        CACHE_PATTERN[pattern] = re_compile(pattern)
    return CACHE_PATTERN[pattern]
def bencode(num, alphabet="0123456789"):
"""Encode an integer into a base-N encoded string"""
@@ -752,6 +733,9 @@ class CustomNone():
# Whole 28-day periods elapsed between day ordinal 735506 and today
# NOTE(review): presumably an estimated Firefox major version -- confirm
_ff_ver = (datetime.date.today().toordinal() - 735506) // 28
# _ch_ver = _ff_ver - 2
# Aliases for the regex helpers defined in the 'text' module
re = text.re
re_compile = text.re_compile
# Shared CustomNone instance
NONE = CustomNone()
# Naive datetime for 1970-01-01 00:00:00
EPOCH = datetime.datetime(1970, 1, 1)
# timedelta of zero days and one second
SECOND = datetime.timedelta(0, 1)
@@ -784,7 +768,7 @@ GLOBALS = {
"hash_sha1": sha1,
"hash_md5" : md5,
"std" : ModuleProxy(),
"re" : re_module,
"re" : text.re_module,
"exts_image" : EXTS_IMAGE,
"exts_video" : EXTS_VIDEO,
"exts_archive": EXTS_ARCHIVE,

View File

@@ -23,6 +23,20 @@ INVALID_ALT = ((), [], {}, None, "")
class TestText(unittest.TestCase):
def test_re(self):
    """text.re() must reuse its result; text.re_compile() builds a new one"""
    uncached = text.re_compile("foo")
    cached_first = text.re("foo")
    cached_again = text.re("foo")

    # All three results are compiled pattern objects
    pattern_type = text.re_module.Pattern
    for compiled in (uncached, cached_first, cached_again):
        self.assertIsInstance(compiled, pattern_type)

    # Equal to, but a different object than, a direct compile ...
    self.assertEqual(uncached, cached_first)
    self.assertIsNot(uncached, cached_first)
    # ... while repeated text.re() calls return the very same object
    self.assertIs(cached_first, cached_again)
def test_remove_html(self, f=text.remove_html):
result = "Hello World."

View File

@@ -13,7 +13,6 @@ import unittest
from unittest.mock import patch
import io
import re
import time
import random
import string
@@ -1042,21 +1041,6 @@ value = 123
self.assertEqual(response.links.get("next"), None)
self.assertEqual(response.close(), None)
def test_re(self):
    """util.re() must reuse its result; util.re_compile() builds a new one"""
    pattern_type = type(re.compile(""))

    uncached = util.re_compile("foo")
    cached_first = util.re("foo")
    cached_again = util.re("foo")

    # All three results are compiled pattern objects
    for compiled in (uncached, cached_first, cached_again):
        self.assertIsInstance(compiled, pattern_type)

    # A direct compile is a different object than the cached one ...
    self.assertIsNot(uncached, cached_first)
    # ... repeated util.re() calls return the very same object ...
    self.assertIs(cached_first, cached_again)
    # ... and the cached pattern still compares equal to a direct compile
    self.assertEqual(uncached, cached_first)
class TestExtractor():
category = "test_category"