[bellazon] update (#8247)

- include 'filename' in the default filename_fmt and archive_fmt, since 'id'
  alone is not guaranteed to be unique, even within the same post
  https://www.bellazon.com/main/topic/3556-bipasha-basu/page/2/#findComment-2536060
- support 'inline' files
- ignore '/profile/' links
- do not increment 'num' on ignored files
This commit is contained in:
Mike Fährmann
2025-09-21 16:20:17 +02:00
parent 6df17cc621
commit 997e7422ff
2 changed files with 31 additions and 9 deletions

View File

@@ -20,13 +20,16 @@ class BellazonExtractor(Extractor):
root = "https://www.bellazon.com/main"
directory_fmt = ("{category}", "{thread[section]}",
"{thread[title]} ({thread[id]})")
filename_fmt = "{post[id]}_{num:>02}_{id}.{extension}"
archive_fmt = "{post[id]}/{filename}"
filename_fmt = "{post[id]}_{num:>02}_{id}_{filename}.{extension}"
archive_fmt = "{post[id]}/{id}_{filename}"
def items(self):
native = (f"{self.root}/", f"{self.root[6:]}/")
extract_urls = text.re(
r'(?s)<((?:video .*?<source src|a [^>]*?href)="([^"]+).*?)</a>'
r'(?s)<('
r'(?:video .*?<source src|a [^>]*?href)="([^"]+).*?</a>'
r'|img [^>]*?src="([^"]+)"[^>]*>'
r')'
).findall
if self.config("quoted", False):
@@ -44,9 +47,14 @@ class BellazonExtractor(Extractor):
post["count"] = data["count"] = len(urls)
yield Message.Directory, data
for data["num"], (info, url) in enumerate(urls, 1):
url = text.unescape(url)
data["num"] = 0
for info, url, url_img in urls:
url = text.unescape(url or url_img)
if url.startswith(native):
if "/uploads/emoticons/" in url or "/profile/" in url:
continue
data["num"] += 1
if not (alt := text.extr(info, ' alt="', '"')) or (
alt.startswith("post-") and "_thumb." in alt):
name = url
@@ -60,13 +68,13 @@ class BellazonExtractor(Extractor):
elif "/core/interface/file/attachment.php" in url:
if not dc["id"]:
dc["id"] = url.rpartition("?id=")[2]
if (pos := info.find(">")) >= 0 and \
(name := info[pos+1:].strip()):
if name := text.extr(info, ">", "<").strip():
text.nameext_from_url(name, dc)
if url[0] == "/":
url = f"https:{url}"
yield Message.Url, url, dc
else:
yield Message.Queue, url, data

View File

@@ -134,7 +134,7 @@ __tests__ = (
"extension": "mp4",
"filename" : r"re:^\d+$",
"id" : r"re:6361\d\d\d",
"num" : range(3, 12),
"num" : range(2, 11),
"post" : {
"author_id" : "101807",
"author_slug": "rogerdanish",
@@ -190,6 +190,20 @@ __tests__ = (
"id" : "10919171",
},
{
"#url" : "https://www.bellazon.com/main/topic/66334-charly-jordan/page/3/#findComment-4602714",
"#comment" : "'/profile/' link",
"#class" : bellazon.BellazonPostExtractor,
"#count" : 0,
},
{
"#url" : "https://www.bellazon.com/main/topic/66334-charly-jordan/page/3/#findComment-4603172",
"#comment" : "'inline' image",
"#class" : bellazon.BellazonPostExtractor,
"#results" : "https://www.bellazon.com/main/uploads/monthly_2018_04/30602369_1891291154222843_1650952189830496256_n.jpg.33e6ab78dd0e8723f790ad4f58f3761a.jpg",
},
{
"#url" : "https://www.bellazon.com/main/topic/57872-millie-brady/",
"#class" : bellazon.BellazonThreadExtractor,
@@ -244,7 +258,7 @@ __tests__ = (
"#url" : "https://www.bellazon.com/main/topic/1774-zhang-ziyi/",
"#class" : bellazon.BellazonThreadExtractor,
"#range" : "1-5",
"#options" : {"prder-posts": "asc"},
"#options" : {"order-posts": "asc"},
"#results" : (
"http://img292.echo.cx/my.php?image=4moon011rk.jpg",
"http://img294.echo.cx/my.php?image=heroclip3jb.jpg",