[imagevenue] detect 404 image files (#7570)
This commit is contained in:
@@ -23,6 +23,7 @@ class ImagehostImageExtractor(Extractor):
|
||||
_params = None
|
||||
_cookies = None
|
||||
_encoding = None
|
||||
_validate = None
|
||||
|
||||
def __init__(self, match):
|
||||
Extractor.__init__(self, match)
|
||||
@@ -57,6 +58,8 @@ class ImagehostImageExtractor(Extractor):
|
||||
data.update(self.metadata(page))
|
||||
if self._https and url.startswith("http:"):
|
||||
url = "https:" + url[5:]
|
||||
if self._validate is not None:
|
||||
data["_http_validate"] = self._validate
|
||||
|
||||
yield Message.Directory, data
|
||||
yield Message.Url, url, data
|
||||
@@ -164,6 +167,14 @@ class ImagevenueImageExtractor(ImagehostImageExtractor):
|
||||
filename, pos = text.extract(page, 'alt="', '"', pos)
|
||||
return url, text.unescape(filename)
|
||||
|
||||
def _validate(self, response):
|
||||
hget = response.headers.get
|
||||
return not (
|
||||
hget("content-length") == "14396" and
|
||||
hget("content-type") == "image/jpeg" and
|
||||
hget("last-modified") == "Mon, 04 May 2020 07:19:52 GMT"
|
||||
)
|
||||
|
||||
|
||||
class ImagetwistImageExtractor(ImagehostImageExtractor):
|
||||
"""Extractor for single images from imagetwist.com"""
|
||||
|
||||
Reference in New Issue
Block a user