improve '--write-pages' (#737)
- move code into its own function
- add enumeration index to filenames
- dump responses regardless of status code
This commit is contained in:
@@ -40,6 +40,7 @@ class Extractor():
|
||||
self._cookiefile = None
|
||||
self._cookiejar = self.session.cookies
|
||||
self._parentdir = ""
|
||||
self._write_pages = self.config("write-pages", False)
|
||||
self._retries = self.config("retries", 4)
|
||||
self._timeout = self.config("timeout", 30)
|
||||
self._verify = self.config("verify", True)
|
||||
@@ -91,22 +92,13 @@ class Extractor():
|
||||
raise exception.HttpError(exc)
|
||||
else:
|
||||
code = response.status_code
|
||||
if self._write_pages:
|
||||
self._dump_response(response)
|
||||
if 200 <= code < 400 or fatal is None and \
|
||||
(400 <= code < 500) or not fatal and \
|
||||
(400 <= code < 429 or 431 <= code < 500):
|
||||
if encoding:
|
||||
response.encoding = encoding
|
||||
|
||||
if config.get((), "write_pages", False):
|
||||
# Write the response content to a .dump file
|
||||
# in the current directory.
|
||||
# The file name is derived from the response
|
||||
# url, replacing special characters with "_"
|
||||
r = re.compile(r"[\\\\|/<>:\"?*&=#]+")
|
||||
outfilename = r.sub('_', response.url) + '.dump'
|
||||
with open(outfilename, 'wb') as outfile:
|
||||
outfile.write(response.content)
|
||||
|
||||
return response
|
||||
if notfound and code == 404:
|
||||
raise exception.NotFoundError(notfound)
|
||||
@@ -321,6 +313,24 @@ class Extractor():
|
||||
result.append((Message.Queue, url, {"_extractor": extr}))
|
||||
return iter(result)
|
||||
|
||||
@staticmethod
def _dump_response(response):
    """Write the body of 'response' to a .dump file in the current directory.

    The file name has the form "<index>_<url>.dump", where <index> is a
    zero-padded counter shared by all extractors (stored on the
    Extractor class) and <url> is the response URL with each run of
    special characters replaced by a single "_".

    The URL-derived part is truncated so that the complete file name
    does not exceed 255 characters; without this, a long URL makes
    open() fail with OSError. The leading index keeps truncated names
    unique.
    """
    if hasattr(Extractor, "_dump_index"):
        Extractor._dump_index += 1
    else:
        # First call: create the shared counter and compile the
        # sanitizer only once.
        Extractor._dump_index = 1
        Extractor._dump_sanitize = re.compile(r"[\\\\|/<>:\"?*&=#]+").sub

    prefix = "{:>03}_".format(Extractor._dump_index)
    suffix = ".dump"

    # NOTE(review): 255 is the usual per-name limit on common
    # filesystems; the cut is made in characters, which assumes the
    # sanitized URL is (percent-encoded) ASCII -- confirm.
    limit = 255 - len(prefix) - len(suffix)
    urlpart = Extractor._dump_sanitize('_', response.url)[:limit]

    outfilename = prefix + urlpart + suffix
    with open(outfilename, 'wb') as outfile:
        outfile.write(response.content)
|
||||
|
||||
@classmethod
|
||||
def _get_tests(cls):
|
||||
"""Yield an extractor's test cases as (URL, RESULTS) tuples"""
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017-2019 Mike Fährmann
|
||||
# Copyright 2017-2020 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -175,7 +175,7 @@ def build_parser():
|
||||
)
|
||||
output.add_argument(
|
||||
"--write-pages",
|
||||
dest="write_pages", nargs=0, action=ConfigConstAction, const=True,
|
||||
dest="write-pages", nargs=0, action=ConfigConstAction, const=True,
|
||||
help=("Write downloaded intermediary pages to files "
|
||||
"in the current directory to debug problems"),
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user