include 'http-metadata' in '-K' output
This commit is contained in:
@@ -14,8 +14,6 @@ from requests.exceptions import RequestException, ConnectionError, Timeout
|
|||||||
from .common import DownloaderBase
|
from .common import DownloaderBase
|
||||||
from .. import text, util
|
from .. import text, util
|
||||||
|
|
||||||
from email.utils import parsedate_tz
|
|
||||||
from datetime import datetime
|
|
||||||
from ssl import SSLError
|
from ssl import SSLError
|
||||||
try:
|
try:
|
||||||
from OpenSSL.SSL import Error as OpenSSLError
|
from OpenSSL.SSL import Error as OpenSSLError
|
||||||
@@ -197,7 +195,7 @@ class HttpDownloader(DownloaderBase):
|
|||||||
|
|
||||||
# set metadata from HTTP headers
|
# set metadata from HTTP headers
|
||||||
if self.metadata:
|
if self.metadata:
|
||||||
kwdict[self.metadata] = self._extract_metadata(response)
|
kwdict[self.metadata] = util.extract_headers(response)
|
||||||
pathfmt.build_path()
|
pathfmt.build_path()
|
||||||
if pathfmt.exists():
|
if pathfmt.exists():
|
||||||
pathfmt.temppath = ""
|
pathfmt.temppath = ""
|
||||||
@@ -305,22 +303,6 @@ class HttpDownloader(DownloaderBase):
|
|||||||
|
|
||||||
t1 = t2
|
t1 = t2
|
||||||
|
|
||||||
def _extract_metadata(self, response):
|
|
||||||
headers = response.headers
|
|
||||||
data = dict(headers)
|
|
||||||
|
|
||||||
hcd = headers.get("content-disposition")
|
|
||||||
if hcd:
|
|
||||||
name = text.extr(hcd, 'filename="', '"')
|
|
||||||
if name:
|
|
||||||
text.nameext_from_url(name, data)
|
|
||||||
|
|
||||||
hlm = headers.get("last-modified")
|
|
||||||
if hlm:
|
|
||||||
data["date"] = datetime(*parsedate_tz(hlm)[:6])
|
|
||||||
|
|
||||||
return data
|
|
||||||
|
|
||||||
def _find_extension(self, response):
|
def _find_extension(self, response):
|
||||||
"""Get filename extension from MIME type"""
|
"""Get filename extension from MIME type"""
|
||||||
mtype = response.headers.get("Content-Type", "image/jpeg")
|
mtype = response.headers.get("Content-Type", "image/jpeg")
|
||||||
|
|||||||
@@ -32,11 +32,8 @@ class Job():
|
|||||||
self.pathfmt = None
|
self.pathfmt = None
|
||||||
self.kwdict = {}
|
self.kwdict = {}
|
||||||
self.status = 0
|
self.status = 0
|
||||||
self.url_key = extr.config("url-metadata")
|
|
||||||
|
|
||||||
path_key = extr.config("path-metadata")
|
|
||||||
path_proxy = output.PathfmtProxy(self)
|
path_proxy = output.PathfmtProxy(self)
|
||||||
|
|
||||||
self._logger_extra = {
|
self._logger_extra = {
|
||||||
"job" : self,
|
"job" : self,
|
||||||
"extractor": extr,
|
"extractor": extr,
|
||||||
@@ -56,12 +53,16 @@ class Job():
|
|||||||
extr.category = pextr.category
|
extr.category = pextr.category
|
||||||
extr.subcategory = pextr.subcategory
|
extr.subcategory = pextr.subcategory
|
||||||
|
|
||||||
|
self.metadata_url = extr.config("url-metadata")
|
||||||
|
self.metadata_http = extr.config("http-metadata")
|
||||||
|
metadata_path = extr.config("path-metadata")
|
||||||
|
|
||||||
# user-supplied metadata
|
# user-supplied metadata
|
||||||
kwdict = extr.config("keywords")
|
kwdict = extr.config("keywords")
|
||||||
if kwdict:
|
if kwdict:
|
||||||
self.kwdict.update(kwdict)
|
self.kwdict.update(kwdict)
|
||||||
if path_key:
|
if metadata_path:
|
||||||
self.kwdict[path_key] = path_proxy
|
self.kwdict[metadata_path] = path_proxy
|
||||||
|
|
||||||
# predicates
|
# predicates
|
||||||
self.pred_url = self._prepare_predicates("image", True)
|
self.pred_url = self._prepare_predicates("image", True)
|
||||||
@@ -120,8 +121,8 @@ class Job():
|
|||||||
"""Call the appropriate message handler"""
|
"""Call the appropriate message handler"""
|
||||||
if msg[0] == Message.Url:
|
if msg[0] == Message.Url:
|
||||||
_, url, kwdict = msg
|
_, url, kwdict = msg
|
||||||
if self.url_key:
|
if self.metadata_url:
|
||||||
kwdict[self.url_key] = url
|
kwdict[self.metadata_url] = url
|
||||||
if self.pred_url(url, kwdict):
|
if self.pred_url(url, kwdict):
|
||||||
self.update_kwdict(kwdict)
|
self.update_kwdict(kwdict)
|
||||||
self.handle_url(url, kwdict)
|
self.handle_url(url, kwdict)
|
||||||
@@ -132,8 +133,8 @@ class Job():
|
|||||||
|
|
||||||
elif msg[0] == Message.Queue:
|
elif msg[0] == Message.Queue:
|
||||||
_, url, kwdict = msg
|
_, url, kwdict = msg
|
||||||
if self.url_key:
|
if self.metadata_url:
|
||||||
kwdict[self.url_key] = url
|
kwdict[self.metadata_url] = url
|
||||||
if self.pred_queue(url, kwdict):
|
if self.pred_queue(url, kwdict):
|
||||||
self.handle_queue(url, kwdict)
|
self.handle_queue(url, kwdict)
|
||||||
|
|
||||||
@@ -557,6 +558,11 @@ class KeywordJob(Job):
|
|||||||
def handle_url(self, url, kwdict):
|
def handle_url(self, url, kwdict):
|
||||||
stdout_write("\nKeywords for filenames and --filter:\n"
|
stdout_write("\nKeywords for filenames and --filter:\n"
|
||||||
"------------------------------------\n")
|
"------------------------------------\n")
|
||||||
|
|
||||||
|
if self.metadata_http:
|
||||||
|
kwdict[self.metadata_http] = util.extract_headers(
|
||||||
|
self.extractor.request(url, method="HEAD"))
|
||||||
|
|
||||||
self.print_kwdict(kwdict)
|
self.print_kwdict(kwdict)
|
||||||
raise exception.StopExtraction()
|
raise exception.StopExtraction()
|
||||||
|
|
||||||
|
|||||||
@@ -274,6 +274,23 @@ Response Headers
|
|||||||
fp.write(response.content)
|
fp.write(response.content)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_headers(response):
    """Build a metadata dict from the headers of an HTTP response

    Returns a new dict holding a copy of every entry of 'response.headers'.
    Two headers receive extra treatment:

    - content-disposition: when it contains a quoted 'filename="..."' value,
      that name is passed to text.nameext_from_url() together with the
      result dict (presumably adding filename/extension entries -- confirm
      against the 'text' module)
    - last-modified: when it parses as a valid date, the result is stored
      under "date" as a naive datetime object (the timezone offset reported
      by parsedate_tz() is discarded)

    A missing, unparseable, or out-of-range last-modified value leaves
    "date" unset instead of raising an exception.
    """
    headers = response.headers
    data = dict(headers)

    hcd = headers.get("content-disposition")
    if hcd:
        name = text.extr(hcd, 'filename="', '"')
        if name:
            text.nameext_from_url(name, data)

    hlm = headers.get("last-modified")
    if hlm:
        # parsedate_tz() returns None when it cannot parse the value;
        # slicing that result directly would raise a TypeError
        parsed = parsedate_tz(hlm)
        if parsed:
            try:
                data["date"] = datetime.datetime(*parsed[:6])
            except ValueError:
                # syntactically parseable but with out-of-range fields
                # (e.g. day 32); header metadata is best-effort, so the
                # "date" entry is simply omitted
                pass

    return data
|
||||||
|
|
||||||
|
|
||||||
@functools.lru_cache(maxsize=None)
|
@functools.lru_cache(maxsize=None)
|
||||||
def git_head():
|
def git_head():
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user