improve '--write-pages' (#737)
- move code into its own function
- add enumeration index to filenames
- dump responses regardless of status code
This commit is contained in:
@@ -40,6 +40,7 @@ class Extractor():
|
|||||||
self._cookiefile = None
|
self._cookiefile = None
|
||||||
self._cookiejar = self.session.cookies
|
self._cookiejar = self.session.cookies
|
||||||
self._parentdir = ""
|
self._parentdir = ""
|
||||||
|
self._write_pages = self.config("write-pages", False)
|
||||||
self._retries = self.config("retries", 4)
|
self._retries = self.config("retries", 4)
|
||||||
self._timeout = self.config("timeout", 30)
|
self._timeout = self.config("timeout", 30)
|
||||||
self._verify = self.config("verify", True)
|
self._verify = self.config("verify", True)
|
||||||
@@ -91,22 +92,13 @@ class Extractor():
|
|||||||
raise exception.HttpError(exc)
|
raise exception.HttpError(exc)
|
||||||
else:
|
else:
|
||||||
code = response.status_code
|
code = response.status_code
|
||||||
|
if self._write_pages:
|
||||||
|
self._dump_response(response)
|
||||||
if 200 <= code < 400 or fatal is None and \
|
if 200 <= code < 400 or fatal is None and \
|
||||||
(400 <= code < 500) or not fatal and \
|
(400 <= code < 500) or not fatal and \
|
||||||
(400 <= code < 429 or 431 <= code < 500):
|
(400 <= code < 429 or 431 <= code < 500):
|
||||||
if encoding:
|
if encoding:
|
||||||
response.encoding = encoding
|
response.encoding = encoding
|
||||||
|
|
||||||
if config.get((), "write_pages", False):
|
|
||||||
# Write the response content to a .dump file
|
|
||||||
# in the current directory.
|
|
||||||
# The file name is derived from the response
|
|
||||||
# url, replacing special characters with "_"
|
|
||||||
r = re.compile(r"[\\\\|/<>:\"?*&=#]+")
|
|
||||||
outfilename = r.sub('_', response.url) + '.dump'
|
|
||||||
with open(outfilename, 'wb') as outfile:
|
|
||||||
outfile.write(response.content)
|
|
||||||
|
|
||||||
return response
|
return response
|
||||||
if notfound and code == 404:
|
if notfound and code == 404:
|
||||||
raise exception.NotFoundError(notfound)
|
raise exception.NotFoundError(notfound)
|
||||||
@@ -321,6 +313,24 @@ class Extractor():
|
|||||||
result.append((Message.Queue, url, {"_extractor": extr}))
|
result.append((Message.Queue, url, {"_extractor": extr}))
|
||||||
return iter(result)
|
return iter(result)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _dump_response(response):
|
||||||
|
"""Write the response content to a .dump file in the current directory.
|
||||||
|
|
||||||
|
The file name is derived from the response url,
|
||||||
|
replacing special characters with "_"
|
||||||
|
"""
|
||||||
|
if hasattr(Extractor, "_dump_index"):
|
||||||
|
Extractor._dump_index += 1
|
||||||
|
else:
|
||||||
|
Extractor._dump_index = 1
|
||||||
|
Extractor._dump_sanitize = re.compile(r"[\\\\|/<>:\"?*&=#]+").sub
|
||||||
|
|
||||||
|
outfilename = "{:>03}_{}.dump".format(
|
||||||
|
Extractor._dump_index, Extractor._dump_sanitize('_', response.url))
|
||||||
|
with open(outfilename, 'wb') as outfile:
|
||||||
|
outfile.write(response.content)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _get_tests(cls):
|
def _get_tests(cls):
|
||||||
"""Yield an extractor's test cases as (URL, RESULTS) tuples"""
|
"""Yield an extractor's test cases as (URL, RESULTS) tuples"""
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
# Copyright 2017-2019 Mike Fährmann
|
# Copyright 2017-2020 Mike Fährmann
|
||||||
#
|
#
|
||||||
# This program is free software; you can redistribute it and/or modify
|
# This program is free software; you can redistribute it and/or modify
|
||||||
# it under the terms of the GNU General Public License version 2 as
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
@@ -175,7 +175,7 @@ def build_parser():
|
|||||||
)
|
)
|
||||||
output.add_argument(
|
output.add_argument(
|
||||||
"--write-pages",
|
"--write-pages",
|
||||||
dest="write_pages", nargs=0, action=ConfigConstAction, const=True,
|
dest="write-pages", nargs=0, action=ConfigConstAction, const=True,
|
||||||
help=("Write downloaded intermediary pages to files "
|
help=("Write downloaded intermediary pages to files "
|
||||||
"in the current directory to debug problems"),
|
"in the current directory to debug problems"),
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user