[issuu] fix extraction (#2483)
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2019-2020 Mike Fährmann
|
||||
# Copyright 2019-2022 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@@ -9,7 +9,7 @@
|
||||
"""Extractors for https://issuu.com/"""
|
||||
|
||||
from .common import GalleryExtractor, Extractor, Message
|
||||
from .. import text, util
|
||||
from .. import text
|
||||
import json
|
||||
|
||||
|
||||
@@ -22,33 +22,30 @@ class IssuuBase():
|
||||
class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
|
||||
"""Extractor for a single publication"""
|
||||
subcategory = "publication"
|
||||
directory_fmt = ("{category}", "{document[userName]}",
|
||||
"{document[originalPublishDate]} {document[title]}")
|
||||
directory_fmt = ("{category}", "{document[username]}",
|
||||
"{document[date]:%Y-%m-%d} {document[title]}")
|
||||
filename_fmt = "{num:>03}.{extension}"
|
||||
archive_fmt = "{document[id]}_{num}"
|
||||
archive_fmt = "{document[publicationId]}_{num}"
|
||||
pattern = r"(?:https?://)?issuu\.com(/[^/?#]+/docs/[^/?#]+)"
|
||||
test = ("https://issuu.com/issuu/docs/motions-1-2019/", {
|
||||
"pattern": r"https://image.isu.pub/190916155301-\w+/jpg/page_\d+.jpg",
|
||||
"count" : 36,
|
||||
"keyword": {
|
||||
"document": {
|
||||
"access" : "public",
|
||||
"articleStories": list,
|
||||
"contentRating" : dict,
|
||||
"access" : "PUBLIC",
|
||||
"contentRating" : {
|
||||
"isAdsafe" : True,
|
||||
"isExplicit": False,
|
||||
"isReviewed": True,
|
||||
},
|
||||
"date" : "dt:2019-09-16 00:00:00",
|
||||
"description" : "re:Motions, the brand new publication by I",
|
||||
"documentId" : r"re:\d+-d99ec95935f15091b040cb8060f05510",
|
||||
"documentName" : "motions-1-2019",
|
||||
"downloadState" : "NOT_AVAILABLE",
|
||||
"id" : r"re:\d+-d99ec95935f15091b040cb8060f05510",
|
||||
"isConverting" : False,
|
||||
"isQuarantined" : False,
|
||||
"lang" : "en",
|
||||
"language" : "English",
|
||||
"downloadable" : False,
|
||||
"pageCount" : 36,
|
||||
"publicationId" : "d99ec95935f15091b040cb8060f05510",
|
||||
"title" : "Motions by Issuu - Issue 1",
|
||||
"userName" : "issuu",
|
||||
"username" : "issuu",
|
||||
},
|
||||
"extension": "jpg",
|
||||
"filename" : r"re:page_\d+",
|
||||
@@ -58,17 +55,18 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
|
||||
|
||||
def metadata(self, page):
|
||||
data = json.loads(text.extract(
|
||||
page, 'window.__INITIAL_STATE__ =', ';\n')[0])
|
||||
page, '<script data-json="', '"')[0].replace(""", '"'))
|
||||
|
||||
doc = data["document"]
|
||||
doc["lang"] = doc["language"]
|
||||
doc["language"] = util.code_to_language(doc["language"])
|
||||
doc = data["initialDocumentData"]["document"]
|
||||
doc["date"] = text.parse_datetime(
|
||||
doc["originalPublishDate"], "%Y-%m-%d")
|
||||
doc["originalPublishDateInISOString"], "%Y-%m-%dT%H:%M:%S.%fZ")
|
||||
|
||||
self._cnt = text.parse_int(doc["pageCount"])
|
||||
self._tpl = "https://{}/{}/jpg/page_{{}}.jpg".format(
|
||||
data["config"]["hosts"]["image"], doc["id"])
|
||||
self._tpl = "https://{}/{}-{}/jpg/page_{{}}.jpg".format(
|
||||
data["config"]["hosts"]["image"],
|
||||
doc["revisionId"],
|
||||
doc["publicationId"],
|
||||
)
|
||||
|
||||
return {"document": doc}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user