add --write-unsupported option (#15)

This commit is contained in:
Mike Fährmann
2017-05-27 16:16:57 +02:00
parent bf452a8516
commit 25bcdc8aa9
3 changed files with 22 additions and 12 deletions

View File

@@ -100,8 +100,14 @@ def main():
file = open(args.inputfile) file = open(args.inputfile)
import itertools import itertools
urls = itertools.chain(urls, sanatize_input(file)) urls = itertools.chain(urls, sanatize_input(file))
except OSError as err: except OSError as exc:
log.error(err) log.warning("input-file: %s", exc)
if args.unsupportedfile:
try:
job.Job.ufile = open(args.unsupportedfile, "w")
except OSError as exc:
log.warning("unsupported-URL file: %s", exc)
for url in urls: for url in urls:
try: try:

View File

@@ -15,6 +15,7 @@ from .extractor.message import Message
class Job(): class Job():
"""Base class for Job-types""" """Base class for Job-types"""
ufile = None
def __init__(self, url): def __init__(self, url):
self.url = url self.url = url
@@ -111,6 +112,10 @@ class Job():
kwdict["category"] = self.extractor.category kwdict["category"] = self.extractor.category
kwdict["subcategory"] = self.extractor.subcategory kwdict["subcategory"] = self.extractor.subcategory
def _write_unsupported(self, url):
if self.ufile:
print(url, file=self.ufile, flush=True)
class DownloadJob(Job): class DownloadJob(Job):
"""Download images into appropriate directory/filename locations""" """Download images into appropriate directory/filename locations"""
@@ -138,7 +143,7 @@ class DownloadJob(Job):
try: try:
DownloadJob(url).run() DownloadJob(url).run()
except exception.NoExtractorError: except exception.NoExtractorError:
pass self._write_unsupported(url)
def handle_headers(self, headers): def handle_headers(self, headers):
self.get_downloader("http:").set_headers(headers) self.get_downloader("http:").set_headers(headers)
@@ -205,7 +210,7 @@ class UrlJob(Job):
Job.__init__(self, url) Job.__init__(self, url)
self.depth = depth self.depth = depth
if depth == self.maxdepth: if depth == self.maxdepth:
self.handle_queue = self._print self.handle_queue = print
@staticmethod @staticmethod
def handle_url(url, _): def handle_url(url, _):
@@ -215,13 +220,7 @@ class UrlJob(Job):
try: try:
UrlJob(url, self.depth + 1).run() UrlJob(url, self.depth + 1).run()
except exception.NoExtractorError: except exception.NoExtractorError:
pass self._write_unsupported(url)
@staticmethod
def _print(url):
if url.startswith("nofollow:"):
url = url[9:]
print(url)
class TestJob(DownloadJob): class TestJob(DownloadJob):

View File

@@ -67,7 +67,7 @@ def build_parser():
) )
parser.add_argument( parser.add_argument(
"-g", "--get-urls", dest="list_urls", action="count", "-g", "--get-urls", dest="list_urls", action="count",
help="Print download urls", help="Print URLs instead of downloading",
) )
parser.add_argument( parser.add_argument(
"-j", "--dump-json", dest="list_data", action="store_true", "-j", "--dump-json", dest="list_data", action="store_true",
@@ -139,6 +139,11 @@ def build_parser():
metavar="OPT", action=ParseAction, dest="options", default=[], metavar="OPT", action=ParseAction, dest="options", default=[],
help="Additional '<key>=<value>' option values", help="Additional '<key>=<value>' option values",
) )
parser.add_argument(
"--write-unsupported", metavar="FILE", dest="unsupportedfile",
help=("Write URLs, which get emitted by other extractors but cannot "
"be handled, to FILE"),
)
parser.add_argument( parser.add_argument(
"--list-extractors", dest="list_extractors", action="store_true", "--list-extractors", dest="list_extractors", action="store_true",
help=("Print a list of extractor classes " help=("Print a list of extractor classes "