[text] add 'extract_urls()' helper

This commit is contained in:
Mike Fährmann
2026-02-06 20:46:35 +01:00
parent 98ef34a9be
commit c978fe18d4
3 changed files with 22 additions and 3 deletions

View File

@@ -611,7 +611,7 @@ _CONVERSIONS = {
"U": text.unescape, "U": text.unescape,
"H": lambda s: text.unescape(text.remove_html(s)), "H": lambda s: text.unescape(text.remove_html(s)),
"g": text.slugify, "g": text.slugify,
"R": text.re(r"https?://[^\s\"'<>\\]+").findall, "R": text.extract_urls,
"W": text.sanitize_whitespace, "W": text.sanitize_whitespace,
"S": util.to_string, "S": util.to_string,
"s": str, "s": str,

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2015-2025 Mike Fährmann # Copyright 2015-2026 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -231,6 +231,9 @@ def extract_from(txt, pos=None, default=""):
return extr return extr
# Pattern for http(s) URLs: scheme followed by every character up to the
# next whitespace, quote, angle bracket, or backslash
_URL_PATTERN = re(r"https?://[^\s\"'<>\\]+")

# extract_urls(txt) -> list of all URL strings found in 'txt'
extract_urls = _URL_PATTERN.findall
def parse_unicode_escapes(txt): def parse_unicode_escapes(txt):
"""Convert JSON Unicode escapes in 'txt' into actual characters""" """Convert JSON Unicode escapes in 'txt' into actual characters"""
if "\\u" in txt: if "\\u" in txt:

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2015-2025 Mike Fährmann # Copyright 2015-2026 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -391,6 +391,22 @@ class TestText(unittest.TestCase):
self.assertEqual(e("[", "]"), "END") self.assertEqual(e("[", "]"), "END")
self.assertEqual(e("[", "]"), "END") self.assertEqual(e("[", "]"), "END")
def test_extract_urls(self, f=text.extract_urls):
    # markup with two anchors; the second anchor also carries a URL
    # as its link text
    html = """<p>
<a href="http://www.example.com">Lorem ipsum dolor sit amet</a>.
Duis aute irure <a href="http://blog.example.org/lorem?foo=bar">
http://blog.example.org</a>.
</p>"""

    # (input text, expected URLs in order of appearance)
    cases = (
        ("", []),
        ("<p>foo </p> &amp; bar <p> </p>", []),
        (html, [
            "http://www.example.com",
            "http://blog.example.org/lorem?foo=bar",
            "http://blog.example.org",
        ]),
    )
    for txt, urls in cases:
        self.assertEqual(f(txt), urls)
def test_parse_unicode_escapes(self, f=text.parse_unicode_escapes): def test_parse_unicode_escapes(self, f=text.parse_unicode_escapes):
self.assertEqual(f(""), "") self.assertEqual(f(""), "")
self.assertEqual(f("foobar"), "foobar") self.assertEqual(f("foobar"), "foobar")