From 88bfc0991c88ba9452c39c982a837fbca01fe847 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 12 Sep 2025 17:21:34 +0200 Subject: [PATCH] [bellazon] add initial support (#7480) --- docs/supportedsites.md | 6 + gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/bellazon.py | 165 ++++++++++++++++++++++++ test/results/bellazon.py | 213 +++++++++++++++++++++++++++++++ 4 files changed, 385 insertions(+) create mode 100644 gallery_dl/extractor/bellazon.py create mode 100644 test/results/bellazon.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a913c6bc..e8e836eb 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -133,6 +133,12 @@ Consider all listed sites to potentially be NSFW. Collections, Galleries, User Profiles + + Bellazon + https://www.bellazon.com/ + Forums, Posts, Threads + + Bilibili https://www.bilibili.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index fe61c428..b32fcd11 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -31,6 +31,7 @@ modules = [ "batoto", "bbc", "behance", + "bellazon", "bilibili", "blogger", "bluesky", diff --git a/gallery_dl/extractor/bellazon.py b/gallery_dl/extractor/bellazon.py new file mode 100644 index 00000000..5c9b9cd2 --- /dev/null +++ b/gallery_dl/extractor/bellazon.py @@ -0,0 +1,165 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.bellazon.com/""" + +from .common import Extractor, Message +from .. import text, exception + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?bellazon\.com/main" + + +class BellazonExtractor(Extractor): + """Base class for bellazon extractors""" + category = "bellazon" + root = "https://www.bellazon.com/main" + directory_fmt = ("{category}", "{thread[section]}", + "{thread[title]} ({thread[id]})") + filename_fmt = "{post[id]}_{num:>02}_{id}.{extension}" + archive_fmt = "{post[id]}/{filename}" + + def items(self): + extract_urls = text.re(r']*?href="([^"]+)".*?)').findall + native = f"{self.root}/" + + for post in self.posts(): + urls = extract_urls(post["content"]) + data = {"post": post} + post["count"] = data["count"] = len(urls) + + yield Message.Directory, data + for data["num"], (info, url) in enumerate(urls, 1): + url = text.unescape(url) + if url.startswith(native): + if not (alt := text.extr(info, ' alt="', '"')) or ( + alt.startswith("post-") and "_thumb." in alt): + name = url + else: + name = text.unescape(alt) + dc = text.nameext_from_url(name, data.copy()) + dc["id"] = text.extr(info, 'data-fileid="', '"') + if ext := text.extr(info, 'data-fileext="', '"'): + dc["extension"] = ext + yield Message.Url, url, dc + else: + yield Message.Queue, url, data + + def _pagination(self, base, pnum=None): + base = f"{self.root}{base}" + + if pnum is None: + url = f"{base}/" + pnum = 1 + else: + url = f"{base}/page/{pnum}/" + pnum = None + + while True: + page = self.request(url).text + + yield page + + if pnum is None or ' rel="next" ' not in page or text.extr( + page, " rel=\"next\" data-page='", "'") == str(pnum): + return + pnum += 1 + url = f"{base}/page/{pnum}/" + + def _parse_thread(self, page): + schema = self._extract_jsonld(page) + author = schema["author"] + stats = schema["interactionStatistic"] + url_t = schema["url"] + url_a = author["url"] + + path = text.split_html(text.extr( + page, '