pre-compile regex pattern for remove_html() and split_html()
This commit is contained in:
@@ -15,6 +15,8 @@ import datetime
|
||||
import urllib.parse
|
||||
|
||||
|
||||
# Compiled once at import time: matches a single tag-like span, i.e. a "<",
# one or more characters that are not ">", and the closing ">".
HTML_RE = re.compile(r"<[^>]+>")
|
||||
|
||||
INVALID_XML_CHARS = (
|
||||
"\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07",
|
||||
"\x08", "\x0b", "\x0c", "\x0e", "\x0f", "\x10", "\x11", "\x12",
|
||||
@@ -39,7 +41,7 @@ def clean_xml(xmldata, repl=""):
|
||||
def remove_html(txt, repl=" ", sep=" "):
|
||||
"""Remove html-tags from a string"""
|
||||
try:
|
||||
txt = re.sub("<[^>]+>", repl, txt)
|
||||
txt = HTML_RE.sub(repl, txt)
|
||||
except TypeError:
|
||||
return ""
|
||||
if sep:
|
||||
@@ -51,7 +53,7 @@ def split_html(txt, sep=None):
|
||||
"""Split input string by html-tags"""
|
||||
try:
|
||||
return [
|
||||
x.strip() for x in re.split("<[^>]+>", txt)
|
||||
x.strip() for x in HTML_RE.split(txt)
|
||||
if x and not x.isspace()
|
||||
]
|
||||
except TypeError:
|
||||
|
||||
Reference in New Issue
Block a user