add --write-unsupported option (#15)
This commit is contained in:
@@ -100,8 +100,14 @@ def main():
|
|||||||
file = open(args.inputfile)
|
file = open(args.inputfile)
|
||||||
import itertools
|
import itertools
|
||||||
urls = itertools.chain(urls, sanatize_input(file))
|
urls = itertools.chain(urls, sanatize_input(file))
|
||||||
except OSError as err:
|
except OSError as exc:
|
||||||
log.error(err)
|
log.warning("input-file: %s", exc)
|
||||||
|
|
||||||
|
if args.unsupportedfile:
|
||||||
|
try:
|
||||||
|
job.Job.ufile = open(args.unsupportedfile, "w")
|
||||||
|
except OSError as exc:
|
||||||
|
log.warning("unsupported-URL file: %s", exc)
|
||||||
|
|
||||||
for url in urls:
|
for url in urls:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ from .extractor.message import Message
|
|||||||
|
|
||||||
class Job():
|
class Job():
|
||||||
"""Base class for Job-types"""
|
"""Base class for Job-types"""
|
||||||
|
ufile = None
|
||||||
|
|
||||||
def __init__(self, url):
|
def __init__(self, url):
|
||||||
self.url = url
|
self.url = url
|
||||||
@@ -111,6 +112,10 @@ class Job():
|
|||||||
kwdict["category"] = self.extractor.category
|
kwdict["category"] = self.extractor.category
|
||||||
kwdict["subcategory"] = self.extractor.subcategory
|
kwdict["subcategory"] = self.extractor.subcategory
|
||||||
|
|
||||||
|
def _write_unsupported(self, url):
|
||||||
|
if self.ufile:
|
||||||
|
print(url, file=self.ufile, flush=True)
|
||||||
|
|
||||||
|
|
||||||
class DownloadJob(Job):
|
class DownloadJob(Job):
|
||||||
"""Download images into appropriate directory/filename locations"""
|
"""Download images into appropriate directory/filename locations"""
|
||||||
@@ -138,7 +143,7 @@ class DownloadJob(Job):
|
|||||||
try:
|
try:
|
||||||
DownloadJob(url).run()
|
DownloadJob(url).run()
|
||||||
except exception.NoExtractorError:
|
except exception.NoExtractorError:
|
||||||
pass
|
self._write_unsupported(url)
|
||||||
|
|
||||||
def handle_headers(self, headers):
|
def handle_headers(self, headers):
|
||||||
self.get_downloader("http:").set_headers(headers)
|
self.get_downloader("http:").set_headers(headers)
|
||||||
@@ -205,7 +210,7 @@ class UrlJob(Job):
|
|||||||
Job.__init__(self, url)
|
Job.__init__(self, url)
|
||||||
self.depth = depth
|
self.depth = depth
|
||||||
if depth == self.maxdepth:
|
if depth == self.maxdepth:
|
||||||
self.handle_queue = self._print
|
self.handle_queue = print
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def handle_url(url, _):
|
def handle_url(url, _):
|
||||||
@@ -215,13 +220,7 @@ class UrlJob(Job):
|
|||||||
try:
|
try:
|
||||||
UrlJob(url, self.depth + 1).run()
|
UrlJob(url, self.depth + 1).run()
|
||||||
except exception.NoExtractorError:
|
except exception.NoExtractorError:
|
||||||
pass
|
self._write_unsupported(url)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _print(url):
|
|
||||||
if url.startswith("nofollow:"):
|
|
||||||
url = url[9:]
|
|
||||||
print(url)
|
|
||||||
|
|
||||||
|
|
||||||
class TestJob(DownloadJob):
|
class TestJob(DownloadJob):
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ def build_parser():
|
|||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-g", "--get-urls", dest="list_urls", action="count",
|
"-g", "--get-urls", dest="list_urls", action="count",
|
||||||
help="Print download urls",
|
help="Print URLs instead of downloading",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-j", "--dump-json", dest="list_data", action="store_true",
|
"-j", "--dump-json", dest="list_data", action="store_true",
|
||||||
@@ -139,6 +139,11 @@ def build_parser():
|
|||||||
metavar="OPT", action=ParseAction, dest="options", default=[],
|
metavar="OPT", action=ParseAction, dest="options", default=[],
|
||||||
help="Additional '<key>=<value>' option values",
|
help="Additional '<key>=<value>' option values",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--write-unsupported", metavar="FILE", dest="unsupportedfile",
|
||||||
|
help=("Write URLs, which get emitted by other extractors but cannot "
|
||||||
|
"be handled, to FILE"),
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--list-extractors", dest="list_extractors", action="store_true",
|
"--list-extractors", dest="list_extractors", action="store_true",
|
||||||
help=("Print a list of extractor classes "
|
help=("Print a list of extractor classes "
|
||||||
|
|||||||
Reference in New Issue
Block a user