[bellazon] add 'order-posts' option (#8248)

This commit is contained in:
Mike Fährmann
2025-09-20 22:12:01 +02:00
parent c991f05d52
commit 8796ad02ba
4 changed files with 55 additions and 3 deletions

View File

@@ -1600,6 +1600,22 @@ Description
``image``, ``video``, ``mediacollection``, ``embed``, ``text``.
extractor.bellazon.order-posts
------------------------------
Type
``string``
Default
``"desc"``
Description
Controls the order in which
posts of a ``thread`` are processed.
``"asc"``
Ascending order (oldest first)
``"desc"`` | ``"reverse"``
Descending order (newest first)
extractor.bellazon.quoted
-------------------------
Type

View File

@@ -156,7 +156,8 @@
},
"bellazon":
{
"quoted": false
"order-posts": "desc",
"quoted" : false
},
"bilibili":
{

View File

@@ -91,6 +91,28 @@ class BellazonExtractor(Extractor):
pnum += 1
url = f"{base}/page/{pnum}/"
def _pagination_reverse(self, base, pnum=None):
    """Yield the HTML of a thread's pages in descending page order.

    base -- thread path, appended to ``self.root``
    pnum -- page number to start from; when not given, iteration
            starts from the highest page the site reports

    Pages are yielded from the starting page down to page 1.
    """
    base = f"{self.root}{base}"

    # Without an explicit start page, request an out-of-range page
    # number to force a redirect to the highest page number.
    url = f"{base}/page/{pnum or 9999}/"
    with self.request(url) as response:
        # Take the actual page number from the final (post-redirect) URL;
        # a URL without a trailing "/page/N/" part is treated as page 1.
        parts = response.url.rsplit("/", 3)
        pnum = text.parse_int(parts[2]) if parts[1] == "page" else 1
        page = response.text

    while True:
        yield page

        pnum -= 1
        if pnum > 1:
            url = f"{base}/page/{pnum}/"
        elif pnum == 1:
            # page 1 is requested through the plain thread URL
            url = f"{base}/"
        else:
            return

        page = self.request(url).text
def _parse_thread(self, page):
schema = self._extract_jsonld(page)
author = schema["author"]
@@ -166,10 +188,22 @@ class BellazonThreadExtractor(BellazonExtractor):
example = "https://www.bellazon.com/main/topic/123-SLUG/"
def posts(self):
    """Yield the parsed posts of a thread in the configured order.

    The 'order-posts' option selects the direction: a value starting
    with "d" or "r" (e.g. "desc", "reverse"), or no value at all,
    yields posts in descending order; any other value yields them in
    ascending order.
    """
    order = self.config("order-posts")
    if order and order[0] not in ("d", "r"):
        pages = self._pagination(*self.groups)
        reverse = False
    else:
        pages = self._pagination_reverse(*self.groups)
        reverse = True

    for page in pages:
        # thread metadata is parsed once, from the first page received
        if "thread" not in self.kwdict:
            self.kwdict["thread"] = self._parse_thread(page)

        posts = text.extract_iter(page, "<article ", "</article>")
        if reverse:
            # extract_iter() results are consumed front to back, so they
            # are collected before being walked in the opposite direction
            posts = reversed(list(posts))

        for html in posts:
            yield self._parse_post(html)

View File

@@ -244,6 +244,7 @@ __tests__ = (
"#url" : "https://www.bellazon.com/main/topic/1774-zhang-ziyi/",
"#class" : bellazon.BellazonThreadExtractor,
"#range" : "1-5",
"#options" : {"order-posts": "asc"},
"#results" : (
"http://img292.echo.cx/my.php?image=4moon011rk.jpg",
"http://img294.echo.cx/my.php?image=heroclip3jb.jpg",