[imagevenue] detect 404 image files (#7570)

This commit is contained in:
Mike Fährmann
2025-05-24 22:03:05 +02:00
parent 5e07d30d91
commit e469dc0da8
2 changed files with 27 additions and 1 deletions

View File

@@ -23,6 +23,7 @@ class ImagehostImageExtractor(Extractor):
_params = None
_cookies = None
_encoding = None
_validate = None
def __init__(self, match):
Extractor.__init__(self, match)
@@ -57,6 +58,8 @@ class ImagehostImageExtractor(Extractor):
data.update(self.metadata(page))
if self._https and url.startswith("http:"):
url = "https:" + url[5:]
if self._validate is not None:
data["_http_validate"] = self._validate
yield Message.Directory, data
yield Message.Url, url, data
@@ -164,6 +167,14 @@ class ImagevenueImageExtractor(ImagehostImageExtractor):
filename, pos = text.extract(page, 'alt="', '"', pos)
return url, text.unescape(filename)
def _validate(self, response):
    """Return False when 'response' looks like the 404 placeholder image

    The placeholder is recognized purely by its HTTP headers: all three
    of 'content-length', 'content-type', and 'last-modified' have to
    equal the known fingerprint values. Any mismatch (or missing header)
    means the response is accepted and True is returned.
    """
    headers = response.headers

    # Bail out with "valid" at the first header that deviates from the
    # fingerprint; lookups happen in the same order as the comparison
    # chain they replace.
    if headers.get("content-length") != "14396":
        return True
    if headers.get("content-type") != "image/jpeg":
        return True
    return headers.get("last-modified") != "Mon, 04 May 2020 07:19:52 GMT"
class ImagetwistImageExtractor(ImagehostImageExtractor):
"""Extractor for single images from imagetwist.com"""