From 48ac41605d5cd9a85d6bcbd31ddda9e4f6d6f501 Mon Sep 17 00:00:00 2001 From: SpiffyChatterbox <168247181+SpiffyChatterbox@users.noreply.github.com> Date: Mon, 16 Jun 2025 06:10:42 -0400 Subject: [PATCH] [redbust] add support (#6759 #6918 #7043) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * init - Redbust.com Support * Added Test Could use a second set of eyes on this * update 'gallery' extractor - extract more metadata - simplify image extraction - support legacy galleries * add tests * update 'image' extractor * add 'tag' extractor * add 'archive' extractor * restrict 'image' extractor pattern * update docs/supportedsites * replace quotes inside f-string --------- Co-authored-by: Mike Fährmann --- docs/supportedsites.md | 6 + gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/redbust.py | 186 +++++++++++++++++++++++++++++++ scripts/supportedsites.py | 4 + test/results/redbust.py | 100 +++++++++++++++++ 5 files changed, 297 insertions(+) create mode 100644 gallery_dl/extractor/redbust.py create mode 100644 test/results/redbust.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3598ae4d..5af473a5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -787,6 +787,12 @@ Consider all listed sites to potentially be NSFW. Favorites, Pools, Posts, Tag Searches + + RedBust + https://redbust.com/ + Archives, Galleries, Categories, individual Images, Tag Searches + + Reddit https://www.reddit.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index e656718f..11b6ecdc 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -145,6 +145,7 @@ modules = [ "reactor", "readcomiconline", "realbooru", + "redbust", "reddit", "redgifs", "rule34us", diff --git a/gallery_dl/extractor/redbust.py b/gallery_dl/extractor/redbust.py new file mode 100644 index 00000000..3a3af13e --- /dev/null +++ b/gallery_dl/extractor/redbust.py @@ -0,0 +1,186 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://redbust.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. import text + +BASE_PATTERN = r"(?:https?://)?redbust\.com" + + +class RedbustExtractor(Extractor): + """Base class for RedBust extractors""" + category = "redbust" + root = "https://redbust.com" + filename_fmt = "{filename}.{extension}" + + def items(self): + data = {"_extractor": RedbustGalleryExtractor} + for url in self.galleries(): + yield Message.Queue, url, data + + def _pagination(self, path, page=None): + if page is None: + url = f"{self.root}{path}/" + base = url + "page/" + page = self.request(url).text + else: + base = f"{self.root}{path}/page/" + + pnum = 1 + while True: + for post in text.extract_iter( + page, '

', "rel="): + yield text.extr(post, 'href="', '"') + + pnum += 1 + url = f"{base}{pnum}/" + if url not in page: + return + page = self.request(url).text + + +class RedbustGalleryExtractor(GalleryExtractor, RedbustExtractor): + """Extractor for RedBust galleries""" + pattern = BASE_PATTERN + r"/([\w-]+)/?$" + example = "https://redbust.com/TITLE/" + + def items(self): + url = f"{self.root}/{self.groups[0]}/" + self.page = page = self.request(url).text + + self.gallery_id = gid = text.extr( + page, "', "rel="): + url = text.extr(post, 'href="', '"') + yield Message.Queue, url, data + + pnum += 1 + url = f"{base}{pnum}/" + if url not in page: + return + page = self.request(url).text + + def metadata(self, _): + extr = text.extract_from(self.page) + + return { + "gallery_id" : self.gallery_id, + "gallery_slug": self.groups[0], + "categories" : text.split_html(extr( + '
  • ', "
  • "))[::2], + "title" : text.unescape(extr('class="post-title">', "<")), + "date" : text.parse_datetime( + extr('class="post-byline">', "<").strip(), "%B %d, %Y"), + "views" : text.parse_int(extr("", "v").replace(",", "")), + "tags" : text.split_html(extr( + 'class="post-tags">', ""): + if src := text.extr(img, 'src="', '"'): + path, _, end = src.rpartition("-") + if "x" in end: + url = f"{path}.{end.rpartition('.')[2]}" + data = None if src == url else {"_fallback": (src,)} + else: + url = src + data = None + results.append((url, data)) + + if not results: + # fallback for older galleries + for path in text.extract_iter( + self.page, '