[zerochan] parse JSON-LD data (#7178)

This commit is contained in:
Mike Fährmann
2025-03-17 15:51:41 +01:00
parent 6532cf9075
commit d746e025a0
2 changed files with 68 additions and 8 deletions

View File

@@ -64,16 +64,22 @@ class ZerochanExtractor(BooruExtractor):
def _parse_entry_html(self, entry_id):
url = "{}/{}".format(self.root, entry_id)
extr = text.extract_from(self.request(url).text)
page = self.request(url).text
try:
jsonld = self._extract_jsonld(page)
except Exception:
return {"id": entry_id}
extr = text.extract_from(page)
data = {
"id" : text.parse_int(entry_id),
"author" : text.parse_unicode_escapes(extr(' "name": "', '"')),
"file_url": extr('"contentUrl": "', '"'),
"date" : text.parse_datetime(extr('"datePublished": "', '"')),
"width" : text.parse_int(extr('"width": "', ' ')),
"height" : text.parse_int(extr('"height": "', ' ')),
"size" : text.parse_bytes(extr('"contentSize": "', 'B')),
"author" : jsonld["author"]["name"],
"file_url": jsonld["contentUrl"],
"date" : text.parse_datetime(jsonld["datePublished"]),
"width" : text.parse_int(jsonld["width"][:-3]),
"height" : text.parse_int(jsonld["height"][:-3]),
"size" : text.parse_bytes(jsonld["contentSize"][:-1]),
"path" : text.split_html(extr(
'class="breadcrumbs', '</nav>'))[2:],
"uploader": extr('href="/user/', '"'),
@@ -86,7 +92,7 @@ class ZerochanExtractor(BooruExtractor):
tags = data["tags"] = []
for tag in html.split("<li class=")[1:]:
category = text.extr(tag, '"', '"')
name = text.extr(tag, 'data-tag="', '"')
name = text.unescape(text.extr(tag, 'data-tag="', '"'))
tags.append(category.partition(" ")[0].capitalize() + ":" + name)
return data

View File

@@ -250,4 +250,58 @@ __tests__ = (
],
},
{
"#url" : "https://www.zerochan.net/4354955",
"#comment" : "quotes in HTML tags",
"#category": ("booru", "zerochan", "image"),
"#class" : zerochan.ZerochanImageExtractor,
"#auth" : False,
"#options" : {"metadata": False},
"tags": [
"Mangaka:Tory Patterson",
"Studio:SEGA",
"Game:Sonic Origins",
"Series:Sonic the Hedgehog",
"Character:Miles \"Tails\" Prower",
"Theme:Airplane",
"Theme:Flying",
"Theme:Fox",
"Source:Character Sheet",
"Source:Official Art",
"Source:Official Art from X",
"Source:X (Twitter)",
],
},
{
"#url" : "https://www.zerochan.net/2275437",
"#comment" : "unicode escapes with surrogate pair (#7178)",
"#category": ("booru", "zerochan", "image"),
"#class" : zerochan.ZerochanImageExtractor,
"#auth" : False,
"#options" : {"metadata": False},
"author" : "MAYO🍚",
"date" : "dt:2018-02-25 16:03:48",
"extension": "png",
"file_url" : "https://static.zerochan.net/Kongou.full.2275437.png",
"filename" : "Kongou.full.2275437",
"width" : 1047,
"height" : 1365,
"id" : 2275437,
"size" : 502784,
"source" : "",
"uploader" : "SubaruSumeragi",
"path" : [
"Kantai Collection",
"Kongou",
],
"tags" : [
"Mangaka:MAYO🍚",
"Game:Kantai Collection",
"Character:Kongou"
],
},
)