- use 'datetime.fromisoformat()' when possible (#7671) - return a datetime-compatible object for invalid datetimes (instead of a 'str' value)
82 lines
2.7 KiB
Python
82 lines
2.7 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright 2019-2025 Mike Fährmann
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
# published by the Free Software Foundation.
|
|
|
|
"""Extractors for https://issuu.com/"""
|
|
|
|
from .common import GalleryExtractor, Extractor, Message
|
|
from .. import text, util
|
|
|
|
|
|
class IssuuBase:
    """Shared attributes for all issuu extractors"""
    # Site identifier shared by the issuu extractor classes
    category = "issuu"
    # Scheme + host that site-relative paths are appended to
    root = "https://issuu.com"
|
|
|
|
|
|
class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
    """Extractor for the pages of one issuu publication"""
    subcategory = "publication"
    directory_fmt = ("{category}", "{document[username]}",
                     "{document[date]:%Y-%m-%d} {document[title]}")
    filename_fmt = "{num:>03}.{extension}"
    archive_fmt = "{document[publicationId]}_{num}"
    pattern = r"(?:https?://)?issuu\.com(/[^/?#]+/docs/[^/?#]+)"
    example = "https://issuu.com/issuu/docs/TITLE/"

    def metadata(self, page):
        """Parse the document data embedded in 'page'

        Stores the page count in 'self.count' and the common image URL
        prefix in 'self.base', then returns {"document": <document dict>}
        with an added "date" entry.
        """
        # The payload sits inside an escaped JSON string in the page;
        # text.extr() drops the opening marker, so a dummy '{"":' key
        # is prepended to make the remainder a parseable object again.
        raw = text.extr(
            page, '{\\"documentTextVersion\\":', ']\\n"])</script>')
        unquoted = raw.replace('\\"', '"')
        payload = util.json_loads(text.unescape('{"":' + unquoted))

        document = payload["initialDocumentData"]["document"]
        published = document["originalPublishDateInISOString"]
        document["date"] = self.parse_datetime_iso(published)

        self.count = text.parse_int(document["pageCount"])
        revision = document["revisionId"]
        publication = document["publicationId"]
        self.base = (f"https://image.isu.pub/{revision}-"
                     f"{publication}/jpg/page_")

        return {"document": document}

    def images(self, page):
        """Return a list of (url, None) tuples, one per page image

        Pages are numbered from 1 up to and including 'self.count'.
        """
        prefix = self.base
        results = []
        for num in range(1, self.count + 1):
            results.append((f"{prefix}{num}.jpg", None))
        return results
|
|
|
|
|
|
class IssuuUserExtractor(IssuuBase, Extractor):
    """Extractor for every publication of an issuu user/publisher"""
    subcategory = "user"
    pattern = r"(?:https?://)?issuu\.com/([^/?#]+)(?:/(\d*))?$"
    example = "https://issuu.com/USER"

    def items(self):
        """Yield one Message.Queue entry per publication of the user

        Starts at the page number given in the URL (default 1) and keeps
        requesting following pages until one cannot be fetched or parsed,
        or until a page lists fewer than 48 publications.
        """
        username, page_num = self.groups
        profile_url = self.root + "/" + username
        page_num = text.parse_int(page_num, 1)

        while True:
            # The first page lives at the bare profile URL
            if page_num > 1:
                page_url = profile_url + "/" + str(page_num)
            else:
                page_url = profile_url

            try:
                response = self.request(page_url)
                raw = text.extr(response.text, '\\"docs\\":', '}]\\n"]')
                publications = util.json_loads(raw.replace('\\"', '"'))
            except Exception as exc:
                # Any request or parsing failure ends the listing
                self.log.traceback(exc)
                return

            for publication in publications:
                target = self.root + "/" + publication["uri"]
                publication["_extractor"] = IssuuPublicationExtractor
                yield Message.Queue, target, publication

            # NOTE(review): 48 is presumably the site's page size - confirm
            if len(publications) < 48:
                break
            page_num += 1
|