diff --git a/docs/configuration.rst b/docs/configuration.rst index 08b58dad..a0e13d3a 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -8035,6 +8035,17 @@ Description Include fallback URLs in the output of ``-g/--get-urls``. +output.jsonl +------------ +Type + ``bool`` +Default + ``false`` +Description + Output ``-j/--dump-json`` & ``-J/--resolve-json`` + data in `JSON Lines <https://jsonlines.org/>`__ format. + + output.private -------------- Type diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index addbcf05..1f1b1724 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -1276,6 +1276,7 @@ { "ansi" : true, "fallback" : true, + "jsonl" : false, "mode" : "auto", "private" : false, "progress" : true, diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 16c5c5e0..0a04653b 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -947,6 +947,7 @@ class DataJob(Job): self.data_meta = [] self.exception = None self.ascii = config.get(("output",), "ascii", ensure_ascii) + self.jsonl = config.get(("output",), "jsonl", False) self.resolve = 128 if resolve is True else (resolve or self.resolve) private = config.get(("output",), "private") @@ -954,6 +955,8 @@ class DataJob(Job): if self.resolve > 0: self.handle_queue = self.handle_queue_resolve + if not self.jsonl: + self.out = util.noop def run(self): self._init() @@ -983,7 +986,7 @@ class DataJob(Job): for msg in self.data: util.transform_dict(msg[-1], util.number_to_string) - if self.file: + if self.file and not self.jsonl: # dump to 'file' try: util.dump_json(self.data, self.file, self.ascii, 2) @@ -993,22 +996,30 @@ class DataJob(Job): return 0 + def out(self, msg): + self.file.write(util.json_dumps(msg)) + self.file.write("\n") + self.file.flush() + def handle_url(self, url, kwdict): kwdict = self.filter(kwdict) + self.out(msg := (Message.Url, url, kwdict)) self.data_urls.append(url) self.data_meta.append(kwdict) - self.data.append((Message.Url, url, kwdict)) + self.data.append(msg) def
handle_directory(self, kwdict): kwdict = self.filter(kwdict) + self.out(msg := (Message.Directory, kwdict)) self.data_post.append(kwdict) - self.data.append((Message.Directory, kwdict)) + self.data.append(msg) def handle_queue(self, url, kwdict): kwdict = self.filter(kwdict) + self.out(msg := (Message.Queue, url, kwdict)) self.data_urls.append(url) self.data_meta.append(kwdict) - self.data.append((Message.Queue, url, kwdict)) + self.data.append(msg) def handle_queue_resolve(self, url, kwdict): if cls := kwdict.get("_extractor"): @@ -1018,9 +1029,10 @@ class DataJob(Job): if not extr: kwdict = self.filter(kwdict) + self.out(msg := (Message.Queue, url, kwdict)) self.data_urls.append(url) self.data_meta.append(kwdict) - return self.data.append((Message.Queue, url, kwdict)) + return self.data.append(msg) job = self.__class__(extr, self, None, self.ascii, self.resolve-1) job.data = self.data diff --git a/test/test_job.py b/test/test_job.py index ec86c6ce..c59519c0 100644 --- a/test/test_job.py +++ b/test/test_job.py @@ -359,6 +359,25 @@ class TestDataJob(TestJob): self.assertEqual(tjob.data[-1][0], Message.Url) self.assertEqual(tjob.data[-1][2]["num"], "3") + def test_jsonl(self): + extr = TestExtractor.from_url("test:") + tjob = self.jobclass(extr, file=io.StringIO()) + with patch("gallery_dl.job.DataJob.out") as out: + tjob.run() + self.assertEqual(len(out.call_args_list), 0) + + config.set(("output",), "jsonl", True) + extr = TestExtractor.from_url("test:") + file = io.StringIO() + tjob = self.jobclass(extr, file=file) + with patch("gallery_dl.job.DataJob.out") as out: + tjob.run() + self.assertEqual(len(out.call_args_list), 4) + + tjob.run() + for line in file.getvalue().split(): + self.assertRegex(line, r"""^\[[23],("http[^"]+",)?\{.+\}\]$""") + class TestExtractor(Extractor): category = "test_category"