[common] add '_extract_jsonld' method (#5272)

This commit is contained in:
Mike Fährmann
2025-01-12 11:07:48 +01:00
parent 88f1ef7c3c
commit 3f48e2f820
4 changed files with 9 additions and 11 deletions

View File

@@ -26,8 +26,7 @@ class BbcGalleryExtractor(GalleryExtractor):
example = "https://www.bbc.co.uk/programmes/PATH"
def metadata(self, page):
data = util.json_loads(text.extr(
page, '<script type="application/ld+json">', '</script>'))
data = self._extract_jsonld(page)
return {
"programme": self.gallery_url.split("/")[4],
"path": list(util.unique_sequence(

View File

@@ -9,7 +9,7 @@
"""Extractors for https://ci-en.net/"""
from .common import Extractor, Message
from .. import text, util
from .. import text
BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)"
@@ -56,11 +56,8 @@ class CienArticleExtractor(CienExtractor):
self.root, self.groups[0], self.groups[1])
page = self.request(url, notfound="article").text
post = util.json_loads(text.extr(
page, '<script type="application/ld+json">', '</script>'))[0]
files = self._extract_files(page)
post = self._extract_jsonld(page)[0]
post["post_url"] = url
post["post_id"] = text.parse_int(self.groups[1])
post["count"] = len(files)

View File

@@ -587,6 +587,10 @@ class Extractor():
return True
return False
def _extract_jsonld(self, page):
    """Return the parsed JSON-LD data embedded in 'page'

    Takes whatever text.extr() yields for the span between
    '<script type="application/ld+json">' and '</script>'
    and passes it to util.json_loads().
    """
    # NOTE(review): behavior for pages without a JSON-LD script tag
    # depends on text.extr() / util.json_loads() -- confirm with callers
    payload = text.extr(
        page, '<script type="application/ld+json">', "</script>")
    return util.json_loads(payload)
def _prepare_ddosguard_cookies(self):
if not self.cookies.get("__ddg2", domain=self.cookies_domain):
self.cookies.set(

View File

@@ -9,7 +9,7 @@
"""Extractors for https://www.imagefap.com/"""
from .common import Extractor, Message
from .. import text, util, exception
from .. import text, exception
BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?imagefap\.com"
@@ -129,13 +129,11 @@ class ImagefapImageExtractor(ImagefapExtractor):
url, pos = text.extract(
page, 'original="', '"')
info, pos = text.extract(
page, '<script type="application/ld+json">', '</script>', pos)
image_id, pos = text.extract(
page, 'id="imageid_input" value="', '"', pos)
gallery_id, pos = text.extract(
page, 'id="galleryid_input" value="', '"', pos)
info = util.json_loads(info)
info = self._extract_jsonld(page)
return url, text.nameext_from_url(url, {
"title": text.unescape(info["name"]),