From 2d64e7622388520155ea93850548447cc0e1bae9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 7 Feb 2026 11:41:32 +0100 Subject: [PATCH] [job] implement 'follow' option (#8752) Follow and process URLs found in the given format string result. --- docs/configuration.rst | 16 +++++++++++++++- docs/gallery-dl.conf | 1 + gallery_dl/job.py | 34 ++++++++++++++++++++++++++++++++++ test/test_job.py | 42 ++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 90 insertions(+), 3 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 921342a2..3c32ab87 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -212,6 +212,20 @@ Description Specifying a default |Path|_ with ``""`` is required. +extractor.*.follow +------------------ +Type + `Format String`_ +Default + ``null`` +Example + * ``"{content}"`` + * ``"\fE body or html or text"`` +Description + Follow URLs in the given `Format String`_'s result and + process them with child extractors. 
+ + extractor.*.parent ------------------ Type @@ -10230,7 +10244,7 @@ Example * ``"foo"`` * ``"{username}"`` * ``"{title} ({id}).{extension}"`` - * ``"\fF {title.title()} ({num:>0:>0{len(str(a))}} / {count}).{extension}"`` + * ``"\fF {title.title()} ({num:>0{len(str(count))}} / {count}).{extension}"`` Description A `Format String`_ allows creating dynamic text by embedding metadata values directly into replacement fields diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index ffdd984a..62bfc4f6 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -14,6 +14,7 @@ "postprocessors": null, "skip" : true, "skip-filter" : null, + "follow" : null, "user-agent" : "auto", "referer" : true, diff --git a/gallery_dl/job.py b/gallery_dl/job.py index c70bbc2a..019c30b1 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -211,13 +211,31 @@ class Job(): msg = None process = True + if follow := self.extractor.config("follow"): + follow = formatter.parse(follow, None, util.identity).format_map + follow_urls = follow_kwdict = None + else: + follow = follow_urls = None + for msg, url, kwdict in messages: if msg == Message.Directory: + if follow_urls is not None: + for furl in follow_urls: + if self.metadata_url: + follow_kwdict[self.metadata_url] = furl + if self.pred_queue(furl, follow_kwdict): + self.handle_queue(furl, follow_kwdict) + follow_urls = None + self.update_kwdict(kwdict) if self.pred_post(url, kwdict): process = True self.handle_directory(kwdict) + if follow is not None: + follow_urls = self._collect_urls(follow(kwdict)) + if follow_urls is not None: + follow_kwdict = kwdict.copy() else: process = None if FLAGS.POST is not None: @@ -253,6 +271,13 @@ class Job(): if FLAGS.CHILD is not None: FLAGS.process("CHILD") + if follow_urls is not None: + for furl in follow_urls: + if self.metadata_url: + follow_kwdict[self.metadata_url] = furl + if self.pred_queue(furl, follow_kwdict): + self.handle_queue(furl, follow_kwdict) + return msg def 
handle_url(self, url, kwdict): @@ -300,6 +325,15 @@ class Job(): if init and init != "lazy": self.initialize() + def _collect_urls(self, source): + if not source: + return None + if isinstance(source, list): + return source + if isinstance(source, str): + if urls := text.extract_urls(source): + return urls + def _prepare_predicates(self, target, alt=None, skip=None): predicates = [] extr = self.extractor diff --git a/test/test_job.py b/test/test_job.py index c73577f3..92d15542 100644 --- a/test/test_job.py +++ b/test/test_job.py @@ -207,7 +207,7 @@ test:child def test_child(self): extr = TestExtractorParent.from_url("test:parent") - tjob = job.UrlJob(extr, depth=0) + tjob = self.jobclass(extr, depth=0) self.assertEqual(self._capture_stdout(tjob), 3 * """\ https://example.org/1.jpg https://example.org/2.jpg @@ -221,6 +221,20 @@ https://example.org/3.jpg tjob = self.jobclass(extr) tjob._init() + def test_opt_follow(self): + config.set((), "follow", "{user[bio]}") + + extr = TestExtractor.from_url("test:urls") + tjob = self.jobclass(extr) + self.assertEqual(self._capture_stdout(tjob), """\ +https://example.org/1.jpg +https://example.org/2.jpg +https://example.org/3.jpg +https://example1.org/content/abc +https://example2.org/content?query=123 +https://example3.org/content/#frag +""") + class TestInfoJob(TestJob): jobclass = job.InfoJob @@ -448,6 +462,21 @@ class TestDataJob(TestJob): tjob = self.jobclass(extr) tjob._init() + def test_opt_follow(self): + config.set((), "follow", "{user[bio]!R}") + + extr = TestExtractor.from_url("test:urls") + tjob = self.jobclass(extr, file=None) + tjob.run() + self.assertEqual(tjob.data_urls, [ + "https://example.org/1.jpg", + "https://example.org/2.jpg", + "https://example.org/3.jpg", + "https://example1.org/content/abc", + "https://example2.org/content?query=123", + "https://example3.org/content/#frag" + ]) + def test_resolve(self): extr = TestExtractorParent.from_url("test:parent:3") tjob = self.jobclass(extr, file=None, resolve=0) 
@@ -486,13 +515,22 @@ class TestExtractor(Extractor):
     subcategory = "test_subcategory"
     directory_fmt = ("{category}",)
     filename_fmt = "test_{filename}.{extension}"
-    pattern = r"test:(child|self)?$"
+    pattern = r"test:(child|self|urls)?$"
 
     def __init__(self, match):
         Extractor.__init__(self, match)
         self.user = {"id": 123, "name": "test"}
         if match[1] == "self":
             self.user["self"] = self.user
+        elif match[1] == "urls":
+            self.user["bio"] = """
+Site 1:
+* https://example1.org/content/abc
+Site 2:
+* https://example2.org/content?query=123
+
+Site 3 https://example3.org/content/#frag
+"""
 
     def items(self):
         root = "https://example.org"