add '-j/--dump-json' option

This outputs the extractor results in JSON format rather than
downloading files.
This commit is contained in:
Mike Fährmann
2017-04-12 18:43:41 +02:00
parent c9a5650cf8
commit b43cd88101
3 changed files with 40 additions and 1 deletions

View File

@@ -6,6 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
import sys
import json
import hashlib
from . import extractor, downloader, config, util, output, exception
@@ -79,7 +80,7 @@ class Job():
)
# TODO: support for multiple message versions
def handle_url(self, url, kexwords):
def handle_url(self, url, keywords):
"""Handle Message.Url"""
def handle_directory(self, keywords):
@@ -265,3 +266,35 @@ class TestJob(DownloadJob):
"""Update the content hash"""
if self.content:
self.get_downloader(url).download(url, self.fileobj)
class DataJob(Job):
    """Collect extractor results and dump them as JSON"""

    def __init__(self, url, file=None):
        """Prepare a job that writes the results for 'url' to 'file'

        'file' is the object the JSON text is written to (it needs a
        write() method); if omitted or None, the current sys.stdout is used.
        """
        Job.__init__(self, url)
        # Resolve sys.stdout at call time instead of in the signature:
        # a default argument is evaluated only once, when the function is
        # defined, and would miss any later replacement of sys.stdout.
        self.file = sys.stdout if file is None else file
        self.data = []
        # config option 'output.ascii'; escape non-ASCII characters if unset
        self.ensure_ascii = config.get(("output", "ascii"), True)

    def run(self):
        """Iterate over all extractor messages and write them to 'file'

        Every message is stored as a copy in 'self.data'. If the extractor
        raises an exception, collecting stops and a final
        (exception class name, exception message) entry is stored instead.
        The collected list is then written as indented JSON with sorted
        keys, followed by a newline.
        """
        # collect data
        try:
            for msg in self.extractor:
                if msg[0] in (Message.Headers, Message.Cookies):
                    # convert the payload into a plain dict
                    copy = (msg[0], dict(msg[1]))
                else:
                    # shallow-copy every part that offers a copy() method
                    # (presumably so that later changes made by the
                    # extractor do not alter stored entries - TODO confirm)
                    copy = [
                        part.copy() if hasattr(part, "copy") else part
                        for part in msg
                    ]
                self.data.append(copy)
        except Exception as exc:
            # record the error as the last entry and still dump
            # everything collected up to this point
            self.data.append((exc.__class__.__name__, str(exc)))

        # dump to 'file'
        json.dump(
            self.data, self.file,
            sort_keys=True, indent=2, ensure_ascii=self.ensure_ascii
        )
        self.file.write("\n")