From c978fe18d4e7e2d693494fe42cbef9e8de00061f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Fri, 6 Feb 2026 20:46:35 +0100
Subject: [PATCH] [text] add 'extract_urls()' helper

---
NOTE(review): this patch was recovered from a whitespace-collapsed copy in
which angle-bracketed text had been stripped. The HTML markup inside the
'test_extract_urls' fixtures below is a reconstruction chosen to satisfy the
asserted results and the hunk line counts -- confirm against upstream. The
author e-mail address on the 'From:' header was lost as well and has not
been guessed.

 gallery_dl/formatter.py |  2 +-
 gallery_dl/text.py      |  5 ++++-
 test/test_text.py       | 18 +++++++++++++++++-
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py
index 1afea5a4..0f9004a3 100644
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@@ -611,7 +611,7 @@ _CONVERSIONS = {
     "U": text.unescape,
     "H": lambda s: text.unescape(text.remove_html(s)),
     "g": text.slugify,
-    "R": text.re(r"https?://[^\s\"'<>\\]+").findall,
+    "R": text.extract_urls,
     "W": text.sanitize_whitespace,
     "S": util.to_string,
     "s": str,
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 5b074d9e..9fdf05bc 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2025 Mike Fährmann
+# Copyright 2015-2026 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -231,6 +231,9 @@ def extract_from(txt, pos=None, default=""):
     return extr
 
 
+extract_urls = re(r"https?://[^\s\"'<>\\]+").findall
+
+
 def parse_unicode_escapes(txt):
     """Convert JSON Unicode escapes in 'txt' into actual characters"""
     if "\\u" in txt:
diff --git a/test/test_text.py b/test/test_text.py
index eac79069..681dda6b 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2025 Mike Fährmann
+# Copyright 2015-2026 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -391,6 +391,22 @@ class TestText(unittest.TestCase):
         self.assertEqual(e("[", "]"), "END")
         self.assertEqual(e("[", "]"), "END")
 
+    def test_extract_urls(self, f=text.extract_urls):
+        txt = ""
+        self.assertEqual(f(txt), [])
+
+        txt = "<p>foo </p> &amp; bar <p> </p>"
+        self.assertEqual(f(txt), [])
+
+        txt = """<p>
+<a href="http://www.example.com">Lorem ipsum dolor sit amet</a>.
+Duis aute irure <a href="http://blog.example.org/lorem?foo=bar">
+http://blog.example.org</a>.
+</p>"""
+        self.assertEqual(f(txt), ["http://www.example.com",
+                                  "http://blog.example.org/lorem?foo=bar",
+                                  "http://blog.example.org"])
+
     def test_parse_unicode_escapes(self, f=text.parse_unicode_escapes):
         self.assertEqual(f(""), "")
         self.assertEqual(f("foobar"), "foobar")