add '-j/--dump-json' option
this outputs the extractor-results in JSON format rather then downloading files
This commit is contained in:
@@ -83,6 +83,8 @@ def main():
|
|||||||
jobtype.maxdepth = args.list_urls
|
jobtype.maxdepth = args.list_urls
|
||||||
elif args.list_keywords:
|
elif args.list_keywords:
|
||||||
jobtype = job.KeywordJob
|
jobtype = job.KeywordJob
|
||||||
|
elif args.list_data:
|
||||||
|
jobtype = job.DataJob
|
||||||
else:
|
else:
|
||||||
jobtype = job.DownloadJob
|
jobtype = job.DownloadJob
|
||||||
|
|
||||||
|
|||||||
@@ -6,6 +6,7 @@
|
|||||||
# it under the terms of the GNU General Public License version 2 as
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
# published by the Free Software Foundation.
|
# published by the Free Software Foundation.
|
||||||
|
|
||||||
|
import sys
|
||||||
import json
|
import json
|
||||||
import hashlib
|
import hashlib
|
||||||
from . import extractor, downloader, config, util, output, exception
|
from . import extractor, downloader, config, util, output, exception
|
||||||
@@ -79,7 +80,7 @@ class Job():
|
|||||||
)
|
)
|
||||||
# TODO: support for multiple message versions
|
# TODO: support for multiple message versions
|
||||||
|
|
||||||
def handle_url(self, url, kexwords):
|
def handle_url(self, url, keywords):
|
||||||
"""Handle Message.Url"""
|
"""Handle Message.Url"""
|
||||||
|
|
||||||
def handle_directory(self, keywords):
|
def handle_directory(self, keywords):
|
||||||
@@ -265,3 +266,35 @@ class TestJob(DownloadJob):
|
|||||||
"""Update the content hash"""
|
"""Update the content hash"""
|
||||||
if self.content:
|
if self.content:
|
||||||
self.get_downloader(url).download(url, self.fileobj)
|
self.get_downloader(url).download(url, self.fileobj)
|
||||||
|
|
||||||
|
|
||||||
|
class DataJob(Job):
|
||||||
|
"""Collect extractor results and dump them"""
|
||||||
|
|
||||||
|
def __init__(self, url, file=sys.stdout):
|
||||||
|
Job.__init__(self, url)
|
||||||
|
self.file = file
|
||||||
|
self.data = []
|
||||||
|
self.ensure_ascii = config.get(("output", "ascii"), True)
|
||||||
|
|
||||||
|
def run(self):
|
||||||
|
# collect data
|
||||||
|
try:
|
||||||
|
for msg in self.extractor:
|
||||||
|
if msg[0] in (Message.Headers, Message.Cookies):
|
||||||
|
copy = (msg[0], dict(msg[1]))
|
||||||
|
else:
|
||||||
|
copy = [
|
||||||
|
part.copy() if hasattr(part, "copy") else part
|
||||||
|
for part in msg
|
||||||
|
]
|
||||||
|
self.data.append(copy)
|
||||||
|
except Exception as exc:
|
||||||
|
self.data.append((exc.__class__.__name__, str(exc)))
|
||||||
|
|
||||||
|
# dump to 'file'
|
||||||
|
json.dump(
|
||||||
|
self.data, self.file,
|
||||||
|
sort_keys=True, indent=2, ensure_ascii=self.ensure_ascii
|
||||||
|
)
|
||||||
|
self.file.write("\n")
|
||||||
|
|||||||
@@ -63,6 +63,10 @@ def build_parser():
|
|||||||
"-g", "--get-urls", dest="list_urls", action="count",
|
"-g", "--get-urls", dest="list_urls", action="count",
|
||||||
help="Print download urls",
|
help="Print download urls",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-j", "--dump-json", dest="list_data", action="store_true",
|
||||||
|
help="Print JSON information",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-d", "--dest",
|
"-d", "--dest",
|
||||||
metavar="DEST", action=ConfigAction, dest="base-directory",
|
metavar="DEST", action=ConfigAction, dest="base-directory",
|
||||||
|
|||||||
Reference in New Issue
Block a user