replace standard library 're' uses with 'util.re()'

This commit is contained in:
Mike Fährmann
2025-06-06 12:26:21 +02:00
parent e1f03a5a93
commit b5c88b3d3e
45 changed files with 143 additions and 220 deletions

View File

@@ -7,9 +7,8 @@
"""Generic information extractor"""
from .common import Extractor, Message
from .. import config, text
from .. import config, text, util
import os.path
import re
class GenericExtractor(Extractor):
@@ -172,8 +171,8 @@ class GenericExtractor(Extractor):
r"(?:[^\"'<>\s]*)?" # optional query and fragment
)
imageurls_src = re.findall(imageurl_pattern_src, page)
imageurls_ext = re.findall(imageurl_pattern_ext, page)
imageurls_src = util.re(imageurl_pattern_src).findall(page)
imageurls_ext = util.re(imageurl_pattern_ext).findall(page)
imageurls = imageurls_src + imageurls_ext
# Resolve relative urls
@@ -182,8 +181,8 @@ class GenericExtractor(Extractor):
# by prepending a suitable base url.
#
# If the page contains a <base> element, use it as base url
basematch = re.search(
r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)", page)
basematch = util.re(
r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)").search(page)
if basematch:
self.baseurl = basematch.group('url').rstrip('/')
# Otherwise, extract the base url from self.url