[job] add 'output.jsonl' option (#8953)
This commit is contained in:
@@ -8035,6 +8035,17 @@ Description
|
||||
Include fallback URLs in the output of ``-g/--get-urls``.
|
||||
|
||||
|
||||
output.jsonl
|
||||
------------
|
||||
Type
|
||||
``bool``
|
||||
Default
|
||||
``false``
|
||||
Description
|
||||
Output ``-j/--dump-json`` & ``-J/--resolve-json``
|
||||
data in `JSON Lines <https://jsonlines.org/>`__ format.
|
||||
|
||||
|
||||
output.private
|
||||
--------------
|
||||
Type
|
||||
|
||||
@@ -1276,6 +1276,7 @@
|
||||
{
|
||||
"ansi" : true,
|
||||
"fallback" : true,
|
||||
"jsonl" : false,
|
||||
"mode" : "auto",
|
||||
"private" : false,
|
||||
"progress" : true,
|
||||
|
||||
@@ -947,6 +947,7 @@ class DataJob(Job):
|
||||
self.data_meta = []
|
||||
self.exception = None
|
||||
self.ascii = config.get(("output",), "ascii", ensure_ascii)
|
||||
self.jsonl = config.get(("output",), "jsonl", False)
|
||||
self.resolve = 128 if resolve is True else (resolve or self.resolve)
|
||||
|
||||
private = config.get(("output",), "private")
|
||||
@@ -954,6 +955,8 @@ class DataJob(Job):
|
||||
|
||||
if self.resolve > 0:
|
||||
self.handle_queue = self.handle_queue_resolve
|
||||
if not self.jsonl:
|
||||
self.out = util.noop
|
||||
|
||||
def run(self):
|
||||
self._init()
|
||||
@@ -983,7 +986,7 @@ class DataJob(Job):
|
||||
for msg in self.data:
|
||||
util.transform_dict(msg[-1], util.number_to_string)
|
||||
|
||||
if self.file:
|
||||
if self.file and not self.jsonl:
|
||||
# dump to 'file'
|
||||
try:
|
||||
util.dump_json(self.data, self.file, self.ascii, 2)
|
||||
@@ -993,22 +996,30 @@ class DataJob(Job):
|
||||
|
||||
return 0
|
||||
|
||||
def out(self, msg):
|
||||
self.file.write(util.json_dumps(msg))
|
||||
self.file.write("\n")
|
||||
self.file.flush()
|
||||
|
||||
def handle_url(self, url, kwdict):
|
||||
kwdict = self.filter(kwdict)
|
||||
self.out(msg := (Message.Url, url, kwdict))
|
||||
self.data_urls.append(url)
|
||||
self.data_meta.append(kwdict)
|
||||
self.data.append((Message.Url, url, kwdict))
|
||||
self.data.append(msg)
|
||||
|
||||
def handle_directory(self, kwdict):
|
||||
kwdict = self.filter(kwdict)
|
||||
self.out(msg := (Message.Directory, kwdict))
|
||||
self.data_post.append(kwdict)
|
||||
self.data.append((Message.Directory, kwdict))
|
||||
self.data.append(msg)
|
||||
|
||||
def handle_queue(self, url, kwdict):
|
||||
kwdict = self.filter(kwdict)
|
||||
self.out(msg := (Message.Queue, url, kwdict))
|
||||
self.data_urls.append(url)
|
||||
self.data_meta.append(kwdict)
|
||||
self.data.append((Message.Queue, url, kwdict))
|
||||
self.data.append(msg)
|
||||
|
||||
def handle_queue_resolve(self, url, kwdict):
|
||||
if cls := kwdict.get("_extractor"):
|
||||
@@ -1018,9 +1029,10 @@ class DataJob(Job):
|
||||
|
||||
if not extr:
|
||||
kwdict = self.filter(kwdict)
|
||||
self.out(msg := (Message.Queue, url, kwdict))
|
||||
self.data_urls.append(url)
|
||||
self.data_meta.append(kwdict)
|
||||
return self.data.append((Message.Queue, url, kwdict))
|
||||
return self.data.append(msg)
|
||||
|
||||
job = self.__class__(extr, self, None, self.ascii, self.resolve-1)
|
||||
job.data = self.data
|
||||
|
||||
@@ -359,6 +359,25 @@ class TestDataJob(TestJob):
|
||||
self.assertEqual(tjob.data[-1][0], Message.Url)
|
||||
self.assertEqual(tjob.data[-1][2]["num"], "3")
|
||||
|
||||
def test_jsonl(self):
|
||||
extr = TestExtractor.from_url("test:")
|
||||
tjob = self.jobclass(extr, file=io.StringIO())
|
||||
with patch("gallery_dl.job.DataJob.out") as out:
|
||||
tjob.run()
|
||||
self.assertEqual(len(out.call_args_list), 0)
|
||||
|
||||
config.set(("output",), "jsonl", True)
|
||||
extr = TestExtractor.from_url("test:")
|
||||
file = io.StringIO()
|
||||
tjob = self.jobclass(extr, file=file)
|
||||
with patch("gallery_dl.job.DataJob.out") as out:
|
||||
tjob.run()
|
||||
self.assertEqual(len(out.call_args_list), 4)
|
||||
|
||||
tjob.run()
|
||||
for line in file.getvalue().split():
|
||||
self.assertRegex(line, r"""^\[[23],("http[^"]+",)?\{.+\}\]$""")
|
||||
|
||||
|
||||
class TestExtractor(Extractor):
|
||||
category = "test_category"
|
||||
|
||||
Reference in New Issue
Block a user