[common] add '_extract_nextdata' method

This commit is contained in:
Mike Fährmann
2025-01-12 11:14:33 +01:00
parent 3f48e2f820
commit 1ae3ac5e39
4 changed files with 13 additions and 14 deletions

View File

@@ -591,6 +591,10 @@ class Extractor():
return util.json_loads(text.extr(
page, '<script type="application/ld+json">', "</script>"))
def _extract_nextdata(self, page):
    """Return the decoded JSON payload of the '__NEXT_DATA__' script tag

    Passes 'page' to text.extr() together with the opening marker of the
    '__NEXT_DATA__' script element and the closing '</script>' tag, then
    hands the extracted text to util.json_loads() and returns its result.
    """
    # Leading space in the opening marker is intentional and kept verbatim.
    nextdata = text.extr(
        page, ' id="__NEXT_DATA__" type="application/json">', "</script>")
    return util.json_loads(nextdata)
def _prepare_ddosguard_cookies(self):
if not self.cookies.get("__ddg2", domain=self.cookies_domain):
self.cookies.set(

View File

@@ -43,8 +43,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
ChapterExtractor.__init__(self, match, url)
def metadata(self, page):
data = util.json_loads(text.extr(
page, 'id="__NEXT_DATA__" type="application/json">', '<'))
data = self._extract_nextdata(page)
chapter = (data["props"]["pageProps"]["dehydratedState"]
["queries"][0]["state"]["data"]["data"])
manga = chapter["comicNode"]["data"]

View File

@@ -286,15 +286,12 @@ class PatreonExtractor(Extractor):
return [genmap[ft] for ft in filetypes]
def _extract_bootstrap(self, page):
data = text.extr(
page, 'id="__NEXT_DATA__" type="application/json">', '</script')
if data:
try:
data = util.json_loads(data)
env = data["props"]["pageProps"]["bootstrapEnvelope"]
return env.get("pageBootstrap") or env["bootstrap"]
except Exception as exc:
self.log.debug("%s: %s", exc.__class__.__name__, exc)
try:
data = self._extract_nextdata(page)
env = data["props"]["pageProps"]["bootstrapEnvelope"]
return env.get("pageBootstrap") or env["bootstrap"]
except Exception as exc:
self.log.debug("%s: %s", exc.__class__.__name__, exc)
bootstrap = text.extr(
page, 'window.patreon = {"bootstrap":', '},"apiServer"')

View File

@@ -10,7 +10,7 @@
"""Extractors for https://www.slideshare.net/"""
from .common import GalleryExtractor
from .. import text, util
from .. import text
class SlidesharePresentationExtractor(GalleryExtractor):
@@ -31,8 +31,7 @@ class SlidesharePresentationExtractor(GalleryExtractor):
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
data = util.json_loads(text.extr(
page, 'id="__NEXT_DATA__" type="application/json">', '</script>'))
data = self._extract_nextdata(page)
self.slideshow = slideshow = data["props"]["pageProps"]["slideshow"]
return {