[motherless] detect 404 / 'File not found' pages

This commit is contained in:
Mike Fährmann
2025-08-08 08:20:59 +02:00
parent 16acfbd1e7
commit ee47c70079
2 changed files with 27 additions and 3 deletions

View File

@@ -9,7 +9,7 @@
"""Extractors for https://motherless.com/"""
from .common import Extractor, Message
from .. import text, util
from .. import text, util, exception
from ..cache import memcache
from datetime import timedelta
@@ -23,6 +23,17 @@ class MotherlessExtractor(Extractor):
filename_fmt = "{id} {title}.{extension}"
archive_fmt = "{id}"
def request(self, url, **kwargs):
    """Fetch 'url' and raise NotFoundError for error / 'not found' pages

    The response body is scanned for the markers listed below.
    Once a response passes this check, 'self.request' is rebound to the
    plain Extractor.request, so later calls on this instance
    are no longer scanned.
    """
    response = Extractor.request(self, url, **kwargs)

    # byte markers identifying an error page in the response body
    not_found_markers = (
        b'<div class="error-page',
        b">The page you're looking for cannot be found.<",
    )
    body = response.content
    if any(marker in body for marker in not_found_markers):
        raise exception.NotFoundError("page")

    # shadow this method with the base implementation on the instance
    self.request = Extractor.request.__get__(self)
    return response
def _extract_media(self, path):
url = f"{self.root}/{path}"
page = self.request(url).text