[issuu] fix extractors (#7317)

This commit is contained in:
Mike Fährmann
2025-04-08 11:56:26 +02:00
parent 7f7af12c2f
commit 76040f9d68
2 changed files with 14 additions and 11 deletions

View File

@@ -29,9 +29,11 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
example = "https://issuu.com/issuu/docs/TITLE/" example = "https://issuu.com/issuu/docs/TITLE/"
def metadata(self, page): def metadata(self, page):
pos = page.rindex('id="initial-data"')
data = util.json_loads(text.unescape(text.rextract( data = text.extr(
page, '<script data-json="', '"', pos)[0])) page, '{\\"documentTextVersion\\":', ']\\n"])</script>')
data = util.json_loads(text.unescape(
'{"":' + data.replace('\\"', '"')))
doc = data["initialDocumentData"]["document"] doc = data["initialDocumentData"]["document"]
doc["date"] = text.parse_datetime( doc["date"] = text.parse_datetime(
@@ -39,7 +41,7 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
self._cnt = text.parse_int(doc["pageCount"]) self._cnt = text.parse_int(doc["pageCount"])
self._tpl = "https://{}/{}-{}/jpg/page_{{}}.jpg".format( self._tpl = "https://{}/{}-{}/jpg/page_{{}}.jpg".format(
data["config"]["hosts"]["image"], "image.isu.pub", # data["config"]["hosts"]["image"],
doc["revisionId"], doc["revisionId"],
doc["publicationId"], doc["publicationId"],
) )
@@ -66,9 +68,8 @@ class IssuuUserExtractor(IssuuBase, Extractor):
url = base + "/" + str(pnum) if pnum > 1 else base url = base + "/" + str(pnum) if pnum > 1 else base
try: try:
html = self.request(url).text html = self.request(url).text
data = util.json_loads(text.unescape(text.extr( data = text.extr(html, '\\"docs\\":', '}]\\n"]')
html, '</main></div><script data-json="', '" id="'))) docs = util.json_loads(data.replace('\\"', '"'))
docs = data["docs"]
except Exception as exc: except Exception as exc:
self.log.debug("", exc_info=exc) self.log.debug("", exc_info=exc)
return return

View File

@@ -5,6 +5,7 @@
# published by the Free Software Foundation. # published by the Free Software Foundation.
from gallery_dl.extractor import issuu from gallery_dl.extractor import issuu
from gallery_dl import exception
__tests__ = ( __tests__ = (
@@ -35,10 +36,11 @@ __tests__ = (
}, },
{ {
"#url" : "https://issuu.com/foodhome1955/docs/fh_winter2025-issuu-011625", "#url" : "https://issuu.com/foodhome1955/docs/fh_winter2025-issuu-011625",
"#comment" : "HTML escapes", "#comment" : "HTML escapes",
"#class" : issuu.IssuuPublicationExtractor, "#class" : issuu.IssuuPublicationExtractor,
"#count" : 84, "#exception": exception.NotFoundError,
"#count" : 84,
"document": { "document": {
"access" : "PUBLIC", "access" : "PUBLIC",