[hentaifoundry] add 'html' description test case

This commit is contained in:
Mike Fährmann
2025-08-02 17:29:07 +02:00
parent 12f56b888c
commit afa720d3e0
3 changed files with 26 additions and 9 deletions

View File

@@ -34,7 +34,7 @@ class HentaifoundryExtractor(Extractor):
def _init(self):
if self.config("descriptions") == "html":
self._process_description = self._process_html_description
self._process_description = self._process_description_html
def items(self):
self._init_site_filters()
@@ -81,9 +81,9 @@ class HentaifoundryExtractor(Extractor):
"artist" : text.unescape(extr('/profile">', '<')),
"_body" : extr(
'<div class="boxbody"', '<div class="boxfooter"'),
"_description": extr(
"description": self._process_description(extr(
"<div class='picDescript'>", '</section>')
.replace("\r\n", "\n"),
.replace("\r\n", "\n")),
"ratings" : [text.unescape(r) for r in text.extract_iter(extr(
"class='ratings_box'", "</div>"), "title='", "'")],
"date" : text.parse_datetime(extr("datetime='", "'")),
@@ -94,7 +94,6 @@ class HentaifoundryExtractor(Extractor):
">Tags </span>", "</div>")),
}
data["description"] = self._process_description(data["_description"])
body = data["_body"]
if "<object " in body:
data["src"] = text.urljoin(self.root, text.unescape(text.extr(
@@ -111,14 +110,14 @@ class HentaifoundryExtractor(Extractor):
return text.nameext_from_url(data["src"], data)
def _process_html_description(self, description: str):
def _process_description(self, description):
    """Run 'description' through text.remove_html, then text.unescape."""
    # Both extra arguments to remove_html are empty strings.
    without_markup = text.remove_html(description, "", "")
    return text.unescape(without_markup)
def _process_description_html(self, description):
pos1 = description.rfind('</div') # picDescript
pos2 = description.rfind('</div', None, pos1) # boxBody
return str.strip(description[0:pos2])
def _process_description(self, description):
    """Run 'description' through text.remove_html, then text.unescape."""
    # Both extra arguments to remove_html are empty strings.
    without_markup = text.remove_html(description, "", "")
    return text.unescape(without_markup)
def _parse_story(self, html):
"""Collect url and metadata for a story"""
extr = text.extract_from(html)