[xenforo:media-album] extract 'album' metadata (#8902)

- add 'callback' argument to _pagination()
- generalize 'author' metadata collection
This commit is contained in:
Mike Fährmann
2026-01-22 15:02:15 +01:00
parent fb0d639f68
commit 16a59140c5

View File

@@ -107,13 +107,13 @@ class XenforoExtractor(BaseExtractor):
url = base + url
yield Message.Url, url, data
def items_media(self, path, pnum):
def items_media(self, path, pnum, callback=None):
if (order := self.config("order-posts")) and \
order[0] in ("d", "r"):
pages = self._pagination_reverse(path, pnum)
pages = self._pagination_reverse(path, pnum, callback)
reverse = True
else:
pages = self._pagination(path, pnum)
pages = self._pagination(path, pnum, callback)
reverse = False
if self.config("metadata"):
@@ -186,7 +186,7 @@ class XenforoExtractor(BaseExtractor):
if cookie.domain.endswith(self.cookies_domain)
}
def _pagination(self, base, pnum=None):
def _pagination(self, base, pnum=None, callback=None):
base = self.root + base
if pnum is None:
@@ -196,17 +196,18 @@ class XenforoExtractor(BaseExtractor):
url = f"{base}/page-{pnum}"
pnum = None
page = self.request_page(url).text
if callback is not None:
callback(page)
while True:
page = self.request_page(url).text
yield page
if pnum is None or "pageNav-jump--next" not in page:
return
pnum += 1
url = f"{base}/page-{pnum}"
page = self.request_page(f"{base}/page-{pnum}").text
def _pagination_reverse(self, base, pnum=None):
def _pagination_reverse(self, base, pnum=None, callback=None):
base = self.root + base
url = f"{base}/page-{'9999' if pnum is None else pnum}"
@@ -220,6 +221,8 @@ class XenforoExtractor(BaseExtractor):
pnum = text.parse_int(url[url.rfind("-")+1:], 1)
page = response.text
if callback is not None:
callback(page)
while True:
yield page
@@ -237,46 +240,6 @@ class XenforoExtractor(BaseExtractor):
return text.unescape(text.extr(
html, "blockMessage--error", "</").rpartition(">")[2].strip())
def _parse_thread(self, page):
try:
data = self._extract_jsonld(page)
except ValueError:
return {}
schema = data.get("mainEntity", data)
author = schema["author"]
stats = schema["interactionStatistic"]
url_t = schema.get("url") or schema.get("@id") or ""
thread = {
"id" : url_t[url_t.rfind(".")+1:-1],
"url" : url_t,
"title": schema["headline"],
"date" : self.parse_datetime_iso(schema["datePublished"]),
"tags" : (schema["keywords"].split(", ")
if "keywords" in schema else ()),
"section": schema["articleSection"],
"author" : author.get("name") or "",
}
if url_a := author.get("url"):
thread["author_url"] = url_a
thread["author_slug"], _, thread["author_id"] = \
url_a[url_a.rfind("/", 0, -1)+1:-1].rpartition(".")
else:
thread["author_url"] = ""
thread["author_slug"] = text.slugify(thread["author"][:15])
thread["author_id"] = thread["author"][15:]
if isinstance(stats, list):
thread["views"] = stats[0]["userInteractionCount"]
thread["posts"] = stats[1]["userInteractionCount"]
else:
thread["views"] = -1
thread["posts"] = stats["userInteractionCount"]
return thread
def _parse_post(self, html):
extr = text.extract_from(html)
@@ -303,6 +266,70 @@ class XenforoExtractor(BaseExtractor):
return post
def _parse_thread(self, page):
try:
data = self._extract_jsonld(page)
except ValueError:
return {}
main = data.get("mainEntity", data)
url = main.get("url") or main.get("@id") or ""
self.kwdict["thread"] = thread = self._parse_author(main["author"], {
"id" : url[url.rfind(".")+1:-1],
"url" : url,
"title": main["headline"],
"date" : self.parse_datetime_iso(main["datePublished"]),
"tags" : (main["keywords"].split(", ")
if "keywords" in main else ()),
"section": main["articleSection"],
})
stats = main["interactionStatistic"]
if isinstance(stats, list):
thread["views"] = stats[0]["userInteractionCount"]
thread["posts"] = stats[1]["userInteractionCount"]
else:
thread["views"] = -1
thread["posts"] = stats["userInteractionCount"]
return thread
def _parse_album(self, page):
main = self._extract_jsonld(page)["mainEntity"]
url = main.get("url") or main.get("@id") or ""
slug, _, id = url[url.rfind("/", 0, -1)+1:-1].rpartition(".")
self.kwdict["album"] = album = self._parse_author(main["author"], {
"id" : id,
"url" : url,
"slug" : text.unquote(slug),
"title": main["headline"],
"description": main.get("description"),
"date": self.parse_datetime_iso(main["dateCreated"]),
})
stats = main["interactionStatistic"]
if isinstance(stats, list):
album["count"] = stats[0]["userInteractionCount"]
album["likes"] = stats[1]["userInteractionCount"]
album["views"] = stats[2]["userInteractionCount"]
album["comments"] = stats[3]["userInteractionCount"]
return album
def _parse_author(self, author, data):
data["author"] = author.get("name") or ""
if url := author.get("url"):
data["author_url"] = url
data["author_slug"], _, data["author_id"] = \
url[url.rfind("/", 0, -1)+1:-1].rpartition(".")
else:
data["author_url"] = ""
data["author_slug"] = text.slugify(data["author"][:15])
data["author_id"] = data["author"][15:]
return data
def _extract_media(self, url, file):
media = {}
name, _, media["id"] = file.rpartition(".")
@@ -314,7 +341,6 @@ class XenforoExtractor(BaseExtractor):
schema = self._extract_jsonld(page)
main = schema["mainEntity"]
author = main["author"]
stats = main["interactionStatistic"]
media = text.nameext_from_name(main["name"], {
@@ -327,18 +353,9 @@ class XenforoExtractor(BaseExtractor):
w["name"].partition(" ")[0]) or 0,
"height": (h := main.get("height")) and text.parse_int(
h["name"].partition(" ")[0]) or 0,
"author": author.get("name") or "",
})
if url_a := author.get("url"):
media["author_url"] = url_a
media["author_slug"], _, media["author_id"] = \
url_a[url_a.rfind("/", 0, -1)+1:-1].rpartition(".")
else:
media["author_url"] = ""
media["author_slug"] = text.slugify(media["author"][15:])
media["author_id"] = media["author"][15:]
self._parse_author(main["author"], media)
if ext := main.get("encodingFormat"):
media["extension"] = ext
@@ -398,7 +415,7 @@ class XenforoPostExtractor(XenforoExtractor):
raise exception.NotFoundError("post")
html = text.extract(page, "<article ", "<footer", pos-200)[0]
self.kwdict["thread"] = self._parse_thread(page)
self._parse_thread(page)
return (self._parse_post(html),)
@@ -422,7 +439,7 @@ class XenforoThreadExtractor(XenforoExtractor):
for page in pages:
if "thread" not in self.kwdict:
self.kwdict["thread"] = self._parse_thread(page)
self._parse_thread(page)
posts = text.extract_iter(page, "<article ", "<footer")
if reverse:
posts = list(posts)
@@ -479,7 +496,7 @@ class XenforoMediaUserExtractor(XenforoExtractor):
class XenforoMediaAlbumExtractor(XenforoExtractor):
subcategory = "media-album"
directory_fmt = ("{category}", "Media", "Albums",
"{album_slug} ({album_id})")
"{album[slug]} ({album[id]})")
filename_fmt = "{filename}.{extension}"
archive_fmt = "{id}"
pattern = (BASE_PATTERN + r"(/(?:index\.php\?)?"
@@ -487,9 +504,8 @@ class XenforoMediaAlbumExtractor(XenforoExtractor):
example = "https://simpcity.cr/media/albums/ALBUM.123/"
def items(self):
slug, _, self.kwdict["album_id"] = self.groups[-2].rpartition(".")
self.kwdict["album_slug"] = text.unquote(slug)
return self.items_media(self.groups[-3], self.groups[-1])
return self.items_media(
self.groups[-3], self.groups[-1], self._parse_album)
class XenforoMediaCategoryExtractor(XenforoExtractor):