[issuu] fix extractors (#7317)

This commit is contained in:
Mike Fährmann
2025-04-08 11:56:26 +02:00
parent 7f7af12c2f
commit 76040f9d68
2 changed files with 14 additions and 11 deletions

View File

@@ -29,9 +29,11 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
example = "https://issuu.com/issuu/docs/TITLE/"
def metadata(self, page):
pos = page.rindex('id="initial-data"')
data = util.json_loads(text.unescape(text.rextract(
page, '<script data-json="', '"', pos)[0]))
data = text.extr(
page, '{\\"documentTextVersion\\":', ']\\n"])</script>')
data = util.json_loads(text.unescape(
'{"":' + data.replace('\\"', '"')))
doc = data["initialDocumentData"]["document"]
doc["date"] = text.parse_datetime(
@@ -39,7 +41,7 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
self._cnt = text.parse_int(doc["pageCount"])
self._tpl = "https://{}/{}-{}/jpg/page_{{}}.jpg".format(
data["config"]["hosts"]["image"],
"image.isu.pub", # data["config"]["hosts"]["image"],
doc["revisionId"],
doc["publicationId"],
)
@@ -66,9 +68,8 @@ class IssuuUserExtractor(IssuuBase, Extractor):
url = base + "/" + str(pnum) if pnum > 1 else base
try:
html = self.request(url).text
data = util.json_loads(text.unescape(text.extr(
html, '</main></div><script data-json="', '" id="')))
docs = data["docs"]
data = text.extr(html, '\\"docs\\":', '}]\\n"]')
docs = util.json_loads(data.replace('\\"', '"'))
except Exception as exc:
self.log.debug("", exc_info=exc)
return

View File

@@ -5,6 +5,7 @@
# published by the Free Software Foundation.
from gallery_dl.extractor import issuu
from gallery_dl import exception
__tests__ = (
@@ -35,10 +36,11 @@ __tests__ = (
},
{
"#url" : "https://issuu.com/foodhome1955/docs/fh_winter2025-issuu-011625",
"#comment" : "HTML escapes",
"#class" : issuu.IssuuPublicationExtractor,
"#count" : 84,
"#url" : "https://issuu.com/foodhome1955/docs/fh_winter2025-issuu-011625",
"#comment" : "HTML escapes",
"#class" : issuu.IssuuPublicationExtractor,
"#exception": exception.NotFoundError,
"#count" : 84,
"document": {
"access" : "PUBLIC",