[common] add '_extract_jsonld' method (#5272)
This commit is contained in:
@@ -26,8 +26,7 @@ class BbcGalleryExtractor(GalleryExtractor):
|
|||||||
example = "https://www.bbc.co.uk/programmes/PATH"
|
example = "https://www.bbc.co.uk/programmes/PATH"
|
||||||
|
|
||||||
def metadata(self, page):
|
def metadata(self, page):
|
||||||
data = util.json_loads(text.extr(
|
data = self._extract_jsonld(page)
|
||||||
page, '<script type="application/ld+json">', '</script>'))
|
|
||||||
return {
|
return {
|
||||||
"programme": self.gallery_url.split("/")[4],
|
"programme": self.gallery_url.split("/")[4],
|
||||||
"path": list(util.unique_sequence(
|
"path": list(util.unique_sequence(
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
"""Extractors for https://ci-en.net/"""
|
"""Extractors for https://ci-en.net/"""
|
||||||
|
|
||||||
from .common import Extractor, Message
|
from .common import Extractor, Message
|
||||||
from .. import text, util
|
from .. import text
|
||||||
|
|
||||||
BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)"
|
BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)"
|
||||||
|
|
||||||
@@ -56,11 +56,8 @@ class CienArticleExtractor(CienExtractor):
|
|||||||
self.root, self.groups[0], self.groups[1])
|
self.root, self.groups[0], self.groups[1])
|
||||||
page = self.request(url, notfound="article").text
|
page = self.request(url, notfound="article").text
|
||||||
|
|
||||||
post = util.json_loads(text.extr(
|
|
||||||
page, '<script type="application/ld+json">', '</script>'))[0]
|
|
||||||
|
|
||||||
files = self._extract_files(page)
|
files = self._extract_files(page)
|
||||||
|
post = self._extract_jsonld(page)[0]
|
||||||
post["post_url"] = url
|
post["post_url"] = url
|
||||||
post["post_id"] = text.parse_int(self.groups[1])
|
post["post_id"] = text.parse_int(self.groups[1])
|
||||||
post["count"] = len(files)
|
post["count"] = len(files)
|
||||||
|
|||||||
@@ -587,6 +587,10 @@ class Extractor():
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def _extract_jsonld(self, page):
|
||||||
|
return util.json_loads(text.extr(
|
||||||
|
page, '<script type="application/ld+json">', "</script>"))
|
||||||
|
|
||||||
def _prepare_ddosguard_cookies(self):
|
def _prepare_ddosguard_cookies(self):
|
||||||
if not self.cookies.get("__ddg2", domain=self.cookies_domain):
|
if not self.cookies.get("__ddg2", domain=self.cookies_domain):
|
||||||
self.cookies.set(
|
self.cookies.set(
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
"""Extractors for https://www.imagefap.com/"""
|
"""Extractors for https://www.imagefap.com/"""
|
||||||
|
|
||||||
from .common import Extractor, Message
|
from .common import Extractor, Message
|
||||||
from .. import text, util, exception
|
from .. import text, exception
|
||||||
|
|
||||||
BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?imagefap\.com"
|
BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?imagefap\.com"
|
||||||
|
|
||||||
@@ -129,13 +129,11 @@ class ImagefapImageExtractor(ImagefapExtractor):
|
|||||||
|
|
||||||
url, pos = text.extract(
|
url, pos = text.extract(
|
||||||
page, 'original="', '"')
|
page, 'original="', '"')
|
||||||
info, pos = text.extract(
|
|
||||||
page, '<script type="application/ld+json">', '</script>', pos)
|
|
||||||
image_id, pos = text.extract(
|
image_id, pos = text.extract(
|
||||||
page, 'id="imageid_input" value="', '"', pos)
|
page, 'id="imageid_input" value="', '"', pos)
|
||||||
gallery_id, pos = text.extract(
|
gallery_id, pos = text.extract(
|
||||||
page, 'id="galleryid_input" value="', '"', pos)
|
page, 'id="galleryid_input" value="', '"', pos)
|
||||||
info = util.json_loads(info)
|
info = self._extract_jsonld(page)
|
||||||
|
|
||||||
return url, text.nameext_from_url(url, {
|
return url, text.nameext_from_url(url, {
|
||||||
"title": text.unescape(info["name"]),
|
"title": text.unescape(info["name"]),
|
||||||
|
|||||||
Reference in New Issue
Block a user