diff --git a/docs/configuration.rst b/docs/configuration.rst
index e31f6e27..f8b9be28 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -6166,6 +6166,20 @@ Description
     instead of downloading a potentially broken file.
 
 
+downloader.http.validate-html
+-----------------------------
+Type
+    ``bool``
+Default
+    ``true``
+Description
+    Check for unexpected HTML responses.
+
+    Fail file downloads with a ``text/html``
+    `Content-Type header <https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Content-Type>`__
+    when expecting a media file instead.
+
+
 downloader.ytdl.cmdline-args
 ----------------------------
 Type
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 1723ec33..4d6cecef 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -1020,7 +1020,8 @@
             "headers" : null,
             "retry-codes" : [],
             "sleep-429" : 60.0,
-            "validate" : true
+            "validate" : true,
+            "validate-html" : true
         },
 
         "ytdl":
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index ac0a03ad..94aa8f33 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -29,6 +29,7 @@ class HttpDownloader(DownloaderBase):
         self.metadata = extractor.config("http-metadata")
         self.progress = self.config("progress", 3.0)
         self.validate = self.config("validate", True)
+        self.validate_html = self.config("validate-html", True)
         self.headers = self.config("headers")
         self.minsize = self.config("filesize-min")
         self.maxsize = self.config("filesize-max")
@@ -204,8 +205,8 @@ class HttpDownloader(DownloaderBase):
                     return False
 
             # check for invalid responses
-            validate = kwdict.get("_http_validate")
-            if validate and self.validate:
+            if self.validate and \
+                    (validate := kwdict.get("_http_validate")) is not None:
                 try:
                     result = validate(response)
                 except Exception:
@@ -219,6 +220,16 @@ class HttpDownloader(DownloaderBase):
                     self.release_conn(response)
                     self.log.warning("Invalid response")
                     return False
+            if self.validate_html and response.headers.get(
+                    "content-type", "").startswith("text/html") and \
+                    pathfmt.extension not in ("html", "htm"):
+                if response.history:
+                    self.log.warning("HTTP redirect to '%s'", response.url)
+                else:
+                    self.log.warning("HTML response")
+                # release the connection, same as the 'Invalid response' exit
+                self.release_conn(response)
+                return False
 
             # check file size
             size = text.parse_int(size, None)
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
index 64491fcd..2e3f0d8b 100644
--- a/gallery_dl/extractor/fantia.py
+++ b/gallery_dl/extractor/fantia.py
@@ -56,7 +56,6 @@ class FantiaExtractor(Extractor):
                         "%s#post-content-id-%s", content["visible_status"],
                         post["post_url"], content["id"])
 
-                post["_http_validate"] = self._validate_response
                 for file in files:
                     post.update(file)
                     post["num"] += 1
@@ -91,10 +90,6 @@ class FantiaExtractor(Extractor):
         self.headers["X-CSRF-Token"] = text.extr(
             page, 'name="csrf-token" content="', '"')
 
-    def _validate_response(self, response):
-        return not response.history or not response.headers.get(
-            "content-type", "").startswith("text/html")
-
     def _get_post_data(self, post_id):
         """Fetch and process post data"""
         url = self.root+"/api/v1/posts/"+post_id