[text] add 'extract_urls()' helper
This commit is contained in:
@@ -611,7 +611,7 @@ _CONVERSIONS = {
|
|||||||
"U": text.unescape,
|
"U": text.unescape,
|
||||||
"H": lambda s: text.unescape(text.remove_html(s)),
|
"H": lambda s: text.unescape(text.remove_html(s)),
|
||||||
"g": text.slugify,
|
"g": text.slugify,
|
||||||
"R": text.re(r"https?://[^\s\"'<>\\]+").findall,
|
"R": text.extract_urls,
|
||||||
"W": text.sanitize_whitespace,
|
"W": text.sanitize_whitespace,
|
||||||
"S": util.to_string,
|
"S": util.to_string,
|
||||||
"s": str,
|
"s": str,
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
# Copyright 2015-2025 Mike Fährmann
|
# Copyright 2015-2026 Mike Fährmann
|
||||||
#
|
#
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License version 2 as
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
@@ -231,6 +231,9 @@ def extract_from(txt, pos=None, default=""):
|
|||||||
return extr
|
return extr
|
||||||
|
|
||||||
|
|
||||||
|
# Return a list of all HTTP(S) URLs found in a string, in the order
# they appear; the list is empty when nothing matches.
# A match starts at "http://" or "https://" and extends over every
# following character that is not whitespace, a quote (" or '),
# an angle bracket (< or >), or a backslash.
# NOTE(review): 're' is presumably this module's pattern-compiling helper
# (it is called like a function), not the stdlib 're' module -- confirm.
extract_urls = re(r"https?://[^\s\"'<>\\]+").findall
|
||||||
|
|
||||||
|
|
||||||
def parse_unicode_escapes(txt):
|
def parse_unicode_escapes(txt):
|
||||||
"""Convert JSON Unicode escapes in 'txt' into actual characters"""
|
"""Convert JSON Unicode escapes in 'txt' into actual characters"""
|
||||||
if "\\u" in txt:
|
if "\\u" in txt:
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
# Copyright 2015-2025 Mike Fährmann
|
# Copyright 2015-2026 Mike Fährmann
|
||||||
#
|
#
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License version 2 as
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
@@ -391,6 +391,22 @@ class TestText(unittest.TestCase):
|
|||||||
self.assertEqual(e("[", "]"), "END")
|
self.assertEqual(e("[", "]"), "END")
|
||||||
self.assertEqual(e("[", "]"), "END")
|
self.assertEqual(e("[", "]"), "END")
|
||||||
|
|
||||||
|
def test_extract_urls(self, f=text.extract_urls):
|
||||||
|
txt = ""
|
||||||
|
self.assertEqual(f(txt), [])
|
||||||
|
|
||||||
|
txt = "<p>foo </p> & bar <p> </p>"
|
||||||
|
self.assertEqual(f(txt), [])
|
||||||
|
|
||||||
|
txt = """<p>
|
||||||
|
<a href="http://www.example.com">Lorem ipsum dolor sit amet</a>.
|
||||||
|
Duis aute irure <a href="http://blog.example.org/lorem?foo=bar">
|
||||||
|
http://blog.example.org</a>.
|
||||||
|
</p>"""
|
||||||
|
self.assertEqual(f(txt), ["http://www.example.com",
|
||||||
|
"http://blog.example.org/lorem?foo=bar",
|
||||||
|
"http://blog.example.org"])
|
||||||
|
|
||||||
def test_parse_unicode_escapes(self, f=text.parse_unicode_escapes):
|
def test_parse_unicode_escapes(self, f=text.parse_unicode_escapes):
|
||||||
self.assertEqual(f(""), "")
|
self.assertEqual(f(""), "")
|
||||||
self.assertEqual(f("foobar"), "foobar")
|
self.assertEqual(f("foobar"), "foobar")
|
||||||
|
|||||||
Reference in New Issue
Block a user