[job] implement 'follow' option (#8752)

Follow and process URLs found in the given format string result.
2026-02-07 11:41:32 +01:00
parent c978fe18d4
commit 2d64e76223
4 changed files with 90 additions and 3 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -212,6 +212,20 @@ Description
    Specifying a default |Path|_ with ``""`` is required.
 extractor.*.follow
 ------------------
 Type
    `Format String`_
 Default
    ``null``
 Example
    * ``"{content}"``
    * ``"\fE body or html or text"``
 Description
    Follow URLs in the given `Format String`_'s result and
    process them with child extractors.
 extractor.*.parent
 ------------------
 Type
@@ -10230,7 +10244,7 @@ Example
    * ``"foo"``
    * ``"{username}"``
    * ``"{title} ({id}).{extension}"``
-    * ``"\fF {title.title()} ({num:>0:>0{len(str(a))}} / {count}).{extension}"``
+    * ``"\fF {title.title()} ({num:>0{len(str(count))}} / {count}).{extension}"``
 Description
    A `Format String`_ allows creating dynamic text
    by embedding metadata values directly into replacement fields
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -14,6 +14,7 @@
        "postprocessors": null,
        "skip"          : true,
        "skip-filter"   : null,
        "follow"        : null,
        "user-agent"    : "auto",
        "referer"       : true,
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -211,13 +211,31 @@ class Job():
        msg = None
        process = True
        if follow := self.extractor.config("follow"):
            follow = formatter.parse(follow, None, util.identity).format_map
            follow_urls = follow_kwdict = None
        else:
            follow = follow_urls = None
        for msg, url, kwdict in messages:
            if msg == Message.Directory:
                if follow_urls is not None:
                    for furl in follow_urls:
                        if self.metadata_url:
                            follow_kwdict[self.metadata_url] = furl
                        if self.pred_queue(furl, follow_kwdict):
                            self.handle_queue(furl, follow_kwdict)
                    follow_urls = None
                self.update_kwdict(kwdict)
                if self.pred_post(url, kwdict):
                    process = True
                    self.handle_directory(kwdict)
                    if follow is not None:
                        follow_urls = self._collect_urls(follow(kwdict))
                        if follow_urls is not None:
                            follow_kwdict = kwdict.copy()
                else:
                    process = None
                if FLAGS.POST is not None:
@@ -253,6 +271,13 @@ class Job():
                if FLAGS.CHILD is not None:
                    FLAGS.process("CHILD")
        if follow_urls is not None:
            for furl in follow_urls:
                if self.metadata_url:
                    follow_kwdict[self.metadata_url] = furl
                if self.pred_queue(furl, follow_kwdict):
                    self.handle_queue(furl, follow_kwdict)
        return msg
    def handle_url(self, url, kwdict):
@@ -300,6 +325,15 @@ class Job():
        if init and init != "lazy":
            self.initialize()
    def _collect_urls(self, source):
        """Return the URLs to follow for a 'follow' format string result

        A non-empty list is handed back unchanged.
        A non-empty string is passed to text.extract_urls() and its
        result is returned when it is truthy.
        Everything else (falsy values, other types, strings for which
        no URLs were found) results in None.
        """
        if source:
            if isinstance(source, list):
                return source
            if isinstance(source, str):
                found = text.extract_urls(source)
                if found:
                    return found
        return None
    def _prepare_predicates(self, target, alt=None, skip=None):
        predicates = []
        extr = self.extractor
--- a/test/test_job.py
+++ b/test/test_job.py
@@ -207,7 +207,7 @@ test:child
    def test_child(self):
        extr = TestExtractorParent.from_url("test:parent")
-        tjob = job.UrlJob(extr, depth=0)
+        tjob = self.jobclass(extr, depth=0)
        self.assertEqual(self._capture_stdout(tjob), 3 * """\
 https://example.org/1.jpg
 https://example.org/2.jpg
@@ -221,6 +221,20 @@ https://example.org/3.jpg
        tjob = self.jobclass(extr)
        tjob._init()
    def test_opt_follow(self):
        # Set the 'follow' option at the root config path, using the
        # user's 'bio' field as the format string to take URLs from.
        config.set((), "follow", "{user[bio]}")
        # "test:urls" makes TestExtractor attach a 'bio' text with URLs
        extr = TestExtractor.from_url("test:urls")
        tjob = self.jobclass(extr)
        # Expected output: the three example.org file URLs, followed by
        # the three URLs embedded in the bio text.
        self.assertEqual(self._capture_stdout(tjob), """\
 https://example.org/1.jpg
 https://example.org/2.jpg
 https://example.org/3.jpg
 https://example1.org/content/abc
 https://example2.org/content?query=123
 https://example3.org/content/#frag
 """)
 class TestInfoJob(TestJob):
    jobclass = job.InfoJob
@@ -448,6 +462,21 @@ class TestDataJob(TestJob):
        tjob = self.jobclass(extr)
        tjob._init()
    def test_opt_follow(self):
        # Same 'follow' setup as for the other job classes, but with an
        # '!R' conversion applied to the bio field.
        # NOTE(review): presumably '!R' turns the text into a list of
        # URLs, which would exercise the list branch of the URL
        # collection instead of the string branch - confirm against the
        # format string documentation.
        config.set((), "follow", "{user[bio]!R}")
        extr = TestExtractor.from_url("test:urls")
        # file=None: no output file is passed; results are inspected
        # through the job object after run()
        tjob = self.jobclass(extr, file=None)
        tjob.run()
        # The three example.org file URLs come first, followed by the
        # three URLs taken from the bio.
        self.assertEqual(tjob.data_urls, [
            "https://example.org/1.jpg",
            "https://example.org/2.jpg",
            "https://example.org/3.jpg",
            "https://example1.org/content/abc",
            "https://example2.org/content?query=123",
            "https://example3.org/content/#frag"
        ])
    def test_resolve(self):
        extr = TestExtractorParent.from_url("test:parent:3")
        tjob = self.jobclass(extr, file=None, resolve=0)
@@ -486,13 +515,22 @@ class TestExtractor(Extractor):
    subcategory = "test_subcategory"
    directory_fmt = ("{category}",)
    filename_fmt = "test_{filename}.{extension}"
-    pattern = r"test:(child|self)?$"
+    pattern = r"test:(child|self|urls)?$"
    def __init__(self, match):
        # 'match' is the pattern match for the "test:..." URL; its first
        # group is the optional variant name (child, self or urls).
        Extractor.__init__(self, match)
        # Base user metadata shared by all variants
        self.user = {"id": 123, "name": "test"}
        if match[1] == "self":
            # Make the user dict contain a reference to itself
            self.user["self"] = self.user
        elif match[1] == "urls":
            # Free-form text holding three URLs: two written as plain
            # text and one inside an HTML anchor's href attribute.
            self.user["bio"] = """
 Site 1:
 * https://example1.org/content/abc
 Site 2:
 * https://example2.org/content?query=123
 <a href="https://example3.org/content/#frag">Site 3</a>
 """
    def items(self):
        root = "https://example.org"