include 'http-metadata' in '-K' output

This commit is contained in:
Mike Fährmann
2022-11-07 16:33:26 +01:00
parent e2401c96ee
commit 39d9c362e4
3 changed files with 33 additions and 28 deletions

View File

@@ -14,8 +14,6 @@ from requests.exceptions import RequestException, ConnectionError, Timeout
from .common import DownloaderBase from .common import DownloaderBase
from .. import text, util from .. import text, util
from email.utils import parsedate_tz
from datetime import datetime
from ssl import SSLError from ssl import SSLError
try: try:
from OpenSSL.SSL import Error as OpenSSLError from OpenSSL.SSL import Error as OpenSSLError
@@ -197,7 +195,7 @@ class HttpDownloader(DownloaderBase):
# set metadata from HTTP headers # set metadata from HTTP headers
if self.metadata: if self.metadata:
kwdict[self.metadata] = self._extract_metadata(response) kwdict[self.metadata] = util.extract_headers(response)
pathfmt.build_path() pathfmt.build_path()
if pathfmt.exists(): if pathfmt.exists():
pathfmt.temppath = "" pathfmt.temppath = ""
@@ -305,22 +303,6 @@ class HttpDownloader(DownloaderBase):
t1 = t2 t1 = t2
def _extract_metadata(self, response):
headers = response.headers
data = dict(headers)
hcd = headers.get("content-disposition")
if hcd:
name = text.extr(hcd, 'filename="', '"')
if name:
text.nameext_from_url(name, data)
hlm = headers.get("last-modified")
if hlm:
data["date"] = datetime(*parsedate_tz(hlm)[:6])
return data
def _find_extension(self, response): def _find_extension(self, response):
"""Get filename extension from MIME type""" """Get filename extension from MIME type"""
mtype = response.headers.get("Content-Type", "image/jpeg") mtype = response.headers.get("Content-Type", "image/jpeg")

View File

@@ -32,11 +32,8 @@ class Job():
self.pathfmt = None self.pathfmt = None
self.kwdict = {} self.kwdict = {}
self.status = 0 self.status = 0
self.url_key = extr.config("url-metadata")
path_key = extr.config("path-metadata")
path_proxy = output.PathfmtProxy(self) path_proxy = output.PathfmtProxy(self)
self._logger_extra = { self._logger_extra = {
"job" : self, "job" : self,
"extractor": extr, "extractor": extr,
@@ -56,12 +53,16 @@ class Job():
extr.category = pextr.category extr.category = pextr.category
extr.subcategory = pextr.subcategory extr.subcategory = pextr.subcategory
self.metadata_url = extr.config("url-metadata")
self.metadata_http = extr.config("http-metadata")
metadata_path = extr.config("path-metadata")
# user-supplied metadata # user-supplied metadata
kwdict = extr.config("keywords") kwdict = extr.config("keywords")
if kwdict: if kwdict:
self.kwdict.update(kwdict) self.kwdict.update(kwdict)
if path_key: if metadata_path:
self.kwdict[path_key] = path_proxy self.kwdict[metadata_path] = path_proxy
# predicates # predicates
self.pred_url = self._prepare_predicates("image", True) self.pred_url = self._prepare_predicates("image", True)
@@ -120,8 +121,8 @@ class Job():
"""Call the appropriate message handler""" """Call the appropriate message handler"""
if msg[0] == Message.Url: if msg[0] == Message.Url:
_, url, kwdict = msg _, url, kwdict = msg
if self.url_key: if self.metadata_url:
kwdict[self.url_key] = url kwdict[self.metadata_url] = url
if self.pred_url(url, kwdict): if self.pred_url(url, kwdict):
self.update_kwdict(kwdict) self.update_kwdict(kwdict)
self.handle_url(url, kwdict) self.handle_url(url, kwdict)
@@ -132,8 +133,8 @@ class Job():
elif msg[0] == Message.Queue: elif msg[0] == Message.Queue:
_, url, kwdict = msg _, url, kwdict = msg
if self.url_key: if self.metadata_url:
kwdict[self.url_key] = url kwdict[self.metadata_url] = url
if self.pred_queue(url, kwdict): if self.pred_queue(url, kwdict):
self.handle_queue(url, kwdict) self.handle_queue(url, kwdict)
@@ -557,6 +558,11 @@ class KeywordJob(Job):
def handle_url(self, url, kwdict):
    """Print the keywords collected for 'url', then stop extraction.

    If 'self.metadata_http' is set, a HEAD request is sent to 'url' and
    the result of util.extract_headers() for its response is stored in
    'kwdict' under that key before printing.
    """
    banner = ("\nKeywords for filenames and --filter:\n"
              "------------------------------------\n")
    stdout_write(banner)

    http_key = self.metadata_http
    if http_key:
        # HEAD request: only the response headers are used here
        response = self.extractor.request(url, method="HEAD")
        kwdict[http_key] = util.extract_headers(response)

    self.print_kwdict(kwdict)
    raise exception.StopExtraction()

View File

@@ -274,6 +274,23 @@ Response Headers
fp.write(response.content) fp.write(response.content)
def extract_headers(response):
    """Build a metadata dict from the HTTP headers of 'response'.

    The result starts as a copy of all response headers. In addition:
    - when text.extr() finds a value between 'filename="' and '"' in the
      Content-Disposition header, that name is passed to
      text.nameext_from_url() together with the result dict
    - when Last-Modified holds a parsable date, it is stored under "date"
      as a datetime built from the first six parsed fields

    A malformed Last-Modified value is ignored instead of raising.
    """
    headers = response.headers
    data = dict(headers)

    hcd = headers.get("content-disposition")
    if hcd:
        name = text.extr(hcd, 'filename="', '"')
        if name:
            text.nameext_from_url(name, data)

    hlm = headers.get("last-modified")
    if hlm:
        # parsedate_tz() returns None for values it cannot parse
        parsed = parsedate_tz(hlm)
        if parsed:
            try:
                data["date"] = datetime.datetime(*parsed[:6])
            except ValueError:
                # parsable syntax but out-of-range fields (e.g. day 32);
                # this metadata is best-effort, so leave "date" unset
                pass

    return data
@functools.lru_cache(maxsize=None) @functools.lru_cache(maxsize=None)
def git_head(): def git_head():
try: try: