[job] implement 'follow' option (#8752)
Follow and process URLs found in the result of the given format string.
This commit is contained in:
@@ -212,6 +212,20 @@ Description
|
|||||||
Specifying a default |Path|_ with ``""`` is required.
|
Specifying a default |Path|_ with ``""`` is required.
|
||||||
|
|
||||||
|
|
||||||
|
extractor.*.follow
|
||||||
|
------------------
|
||||||
|
Type
|
||||||
|
`Format String`_
|
||||||
|
Default
|
||||||
|
``null``
|
||||||
|
Example
|
||||||
|
* ``"{content}"``
|
||||||
|
* ``"\fE body or html or text"``
|
||||||
|
Description
|
||||||
|
Follow URLs in the given `Format String`_'s result and
|
||||||
|
process them with child extractors.
|
||||||
|
|
||||||
|
|
||||||
extractor.*.parent
|
extractor.*.parent
|
||||||
------------------
|
------------------
|
||||||
Type
|
Type
|
||||||
@@ -10230,7 +10244,7 @@ Example
|
|||||||
* ``"foo"``
|
* ``"foo"``
|
||||||
* ``"{username}"``
|
* ``"{username}"``
|
||||||
* ``"{title} ({id}).{extension}"``
|
* ``"{title} ({id}).{extension}"``
|
||||||
* ``"\fF {title.title()} ({num:>0:>0{len(str(a))}} / {count}).{extension}"``
|
* ``"\fF {title.title()} ({num:>0{len(str(count))}} / {count}).{extension}"``
|
||||||
Description
|
Description
|
||||||
A `Format String`_ allows creating dynamic text
|
A `Format String`_ allows creating dynamic text
|
||||||
by embedding metadata values directly into replacement fields
|
by embedding metadata values directly into replacement fields
|
||||||
|
|||||||
@@ -14,6 +14,7 @@
|
|||||||
"postprocessors": null,
|
"postprocessors": null,
|
||||||
"skip" : true,
|
"skip" : true,
|
||||||
"skip-filter" : null,
|
"skip-filter" : null,
|
||||||
|
"follow" : null,
|
||||||
|
|
||||||
"user-agent" : "auto",
|
"user-agent" : "auto",
|
||||||
"referer" : true,
|
"referer" : true,
|
||||||
|
|||||||
@@ -211,13 +211,31 @@ class Job():
|
|||||||
msg = None
|
msg = None
|
||||||
process = True
|
process = True
|
||||||
|
|
||||||
|
if follow := self.extractor.config("follow"):
|
||||||
|
follow = formatter.parse(follow, None, util.identity).format_map
|
||||||
|
follow_urls = follow_kwdict = None
|
||||||
|
else:
|
||||||
|
follow = follow_urls = None
|
||||||
|
|
||||||
for msg, url, kwdict in messages:
|
for msg, url, kwdict in messages:
|
||||||
|
|
||||||
if msg == Message.Directory:
|
if msg == Message.Directory:
|
||||||
|
if follow_urls is not None:
|
||||||
|
for furl in follow_urls:
|
||||||
|
if self.metadata_url:
|
||||||
|
follow_kwdict[self.metadata_url] = furl
|
||||||
|
if self.pred_queue(furl, follow_kwdict):
|
||||||
|
self.handle_queue(furl, follow_kwdict)
|
||||||
|
follow_urls = None
|
||||||
|
|
||||||
self.update_kwdict(kwdict)
|
self.update_kwdict(kwdict)
|
||||||
if self.pred_post(url, kwdict):
|
if self.pred_post(url, kwdict):
|
||||||
process = True
|
process = True
|
||||||
self.handle_directory(kwdict)
|
self.handle_directory(kwdict)
|
||||||
|
if follow is not None:
|
||||||
|
follow_urls = self._collect_urls(follow(kwdict))
|
||||||
|
if follow_urls is not None:
|
||||||
|
follow_kwdict = kwdict.copy()
|
||||||
else:
|
else:
|
||||||
process = None
|
process = None
|
||||||
if FLAGS.POST is not None:
|
if FLAGS.POST is not None:
|
||||||
@@ -253,6 +271,13 @@ class Job():
|
|||||||
if FLAGS.CHILD is not None:
|
if FLAGS.CHILD is not None:
|
||||||
FLAGS.process("CHILD")
|
FLAGS.process("CHILD")
|
||||||
|
|
||||||
|
if follow_urls is not None:
|
||||||
|
for furl in follow_urls:
|
||||||
|
if self.metadata_url:
|
||||||
|
follow_kwdict[self.metadata_url] = furl
|
||||||
|
if self.pred_queue(furl, follow_kwdict):
|
||||||
|
self.handle_queue(furl, follow_kwdict)
|
||||||
|
|
||||||
return msg
|
return msg
|
||||||
|
|
||||||
def handle_url(self, url, kwdict):
|
def handle_url(self, url, kwdict):
|
||||||
@@ -300,6 +325,15 @@ class Job():
|
|||||||
if init and init != "lazy":
|
if init and init != "lazy":
|
||||||
self.initialize()
|
self.initialize()
|
||||||
|
|
||||||
|
def _collect_urls(self, source):
    """Return the URLs to follow for an evaluated 'follow' result.

    'source' is the value produced by the 'follow' format string.
    A non-empty list is returned unchanged; a non-empty string is
    passed to text.extract_urls() and its result is returned when it
    is truthy.

    Returns None in every other case (falsy 'source', a string that
    yields no URLs, or a value that is neither list nor str), so
    callers only need a single 'is not None' check.
    """
    if not source:
        return None
    if isinstance(source, list):
        return source
    if isinstance(source, str):
        # collapse an empty extraction result to None
        return text.extract_urls(source) or None
    # unsupported result type: nothing to follow
    return None
|
||||||
|
|
||||||
def _prepare_predicates(self, target, alt=None, skip=None):
|
def _prepare_predicates(self, target, alt=None, skip=None):
|
||||||
predicates = []
|
predicates = []
|
||||||
extr = self.extractor
|
extr = self.extractor
|
||||||
|
|||||||
@@ -207,7 +207,7 @@ test:child
|
|||||||
|
|
||||||
def test_child(self):
|
def test_child(self):
|
||||||
extr = TestExtractorParent.from_url("test:parent")
|
extr = TestExtractorParent.from_url("test:parent")
|
||||||
tjob = job.UrlJob(extr, depth=0)
|
tjob = self.jobclass(extr, depth=0)
|
||||||
self.assertEqual(self._capture_stdout(tjob), 3 * """\
|
self.assertEqual(self._capture_stdout(tjob), 3 * """\
|
||||||
https://example.org/1.jpg
|
https://example.org/1.jpg
|
||||||
https://example.org/2.jpg
|
https://example.org/2.jpg
|
||||||
@@ -221,6 +221,20 @@ https://example.org/3.jpg
|
|||||||
tjob = self.jobclass(extr)
|
tjob = self.jobclass(extr)
|
||||||
tjob._init()
|
tjob._init()
|
||||||
|
|
||||||
|
def test_opt_follow(self):
    """Check the job output when 'follow' is set to '{user[bio]}'.

    The captured stdout of a job for 'test:urls' must list the three
    example.org file URLs followed by the three URLs embedded in the
    extractor's 'bio' text.
    """
    config.set((), "follow", "{user[bio]}")

    extractor = TestExtractor.from_url("test:urls")
    output = self._capture_stdout(self.jobclass(extractor))
    self.assertEqual(output, """\
https://example.org/1.jpg
https://example.org/2.jpg
https://example.org/3.jpg
https://example1.org/content/abc
https://example2.org/content?query=123
https://example3.org/content/#frag
""")
|
||||||
|
|
||||||
|
|
||||||
class TestInfoJob(TestJob):
|
class TestInfoJob(TestJob):
|
||||||
jobclass = job.InfoJob
|
jobclass = job.InfoJob
|
||||||
@@ -448,6 +462,21 @@ class TestDataJob(TestJob):
|
|||||||
tjob = self.jobclass(extr)
|
tjob = self.jobclass(extr)
|
||||||
tjob._init()
|
tjob._init()
|
||||||
|
|
||||||
|
def test_opt_follow(self):
    """Check 'data_urls' when 'follow' is set to '{user[bio]!R}'.

    After running a job for 'test:urls' with file=None, the collected
    URLs must be the three example.org file URLs followed by the three
    URLs embedded in the extractor's 'bio' text.
    """
    config.set((), "follow", "{user[bio]!R}")

    expected_urls = [
        "https://example.org/1.jpg",
        "https://example.org/2.jpg",
        "https://example.org/3.jpg",
        "https://example1.org/content/abc",
        "https://example2.org/content?query=123",
        "https://example3.org/content/#frag"
    ]

    data_job = self.jobclass(TestExtractor.from_url("test:urls"), file=None)
    data_job.run()
    self.assertEqual(data_job.data_urls, expected_urls)
|
||||||
|
|
||||||
def test_resolve(self):
|
def test_resolve(self):
|
||||||
extr = TestExtractorParent.from_url("test:parent:3")
|
extr = TestExtractorParent.from_url("test:parent:3")
|
||||||
tjob = self.jobclass(extr, file=None, resolve=0)
|
tjob = self.jobclass(extr, file=None, resolve=0)
|
||||||
@@ -486,13 +515,22 @@ class TestExtractor(Extractor):
|
|||||||
subcategory = "test_subcategory"
|
subcategory = "test_subcategory"
|
||||||
directory_fmt = ("{category}",)
|
directory_fmt = ("{category}",)
|
||||||
filename_fmt = "test_{filename}.{extension}"
|
filename_fmt = "test_{filename}.{extension}"
|
||||||
pattern = r"test:(child|self)?$"
|
pattern = r"test:(child|self|urls)?$"
|
||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
self.user = {"id": 123, "name": "test"}
|
self.user = {"id": 123, "name": "test"}
|
||||||
if match[1] == "self":
|
if match[1] == "self":
|
||||||
self.user["self"] = self.user
|
self.user["self"] = self.user
|
||||||
|
elif match[1] == "urls":
|
||||||
|
self.user["bio"] = """
|
||||||
|
Site 1:
|
||||||
|
* https://example1.org/content/abc
|
||||||
|
Site 2:
|
||||||
|
* https://example2.org/content?query=123
|
||||||
|
|
||||||
|
<a href="https://example3.org/content/#frag">Site 3</a>
|
||||||
|
"""
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
root = "https://example.org"
|
root = "https://example.org"
|
||||||
|
|||||||
Reference in New Issue
Block a user