[nudostarforum] add support (#8664)

Add support for the nudostar.com forum (a XenForo-based forum site). This is separate from the existing nudostar.py extractor, which handles nudostar.tv.

Supports:
- Thread extraction with pagination
- Individual post extraction
- Authentication via xf_user cookie or username/password
- Internal attachments (both linked and embedded images)
- External image host URLs (queued for recursive processing)
2025-12-11 11:03:17 -05:00
parent 484a15ff83
commit 1eaaffffbb
3 changed files with 231 additions and 0 deletions
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -146,6 +146,7 @@ modules = [
    "nozomi",
    "nsfwalbum",
    "nudostar",
+    "nudostarforum",
    "okporn",
    "paheal",
    "patreon",
--- a/gallery_dl/extractor/nudostarforum.py
+++ b/gallery_dl/extractor/nudostarforum.py
@@ -0,0 +1,201 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://nudostar.com/forum/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache
+
+# URL prefix shared by the extractor patterns below: optional scheme,
+# optional "www.", then nudostar.com/forum
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?nudostar\.com/forum"
+
+
+class NudostarforumExtractor(Extractor):
+    """Base class for nudostar forum extractors
+
+    Subclasses implement posts(), which yields post dicts as built by
+    _parse_post() and stores thread metadata in self.kwdict["thread"].
+    """
+    category = "nudostarforum"
+    cookies_domain = "nudostar.com"
+    cookies_names = ("xf_user",)
+    root = "https://nudostar.com/forum"
+    directory_fmt = ("{category}", "{thread[title]} ({thread[id]})")
+    filename_fmt = "{post[id]}_{num:>02}_{filename}.{extension}"
+    archive_fmt = "{post[id]}/{filename}"
+
+    def items(self):
+        """Yield a directory message per post, then its files and links
+
+        Internal attachments are emitted as Message.Url and external
+        links as Message.Queue. 'num' numbers both kinds, starting at 1,
+        and 'count' is their combined total for the post.
+        """
+        self.login()
+
+        for post in self.posts():
+            internal, external = self._extract_post_urls(post["content"])
+
+            data = {"post": post}
+            post["count"] = data["count"] = len(internal) + len(external)
+            yield Message.Directory, "", data
+
+            data["num"] = 0
+            for url in internal:
+                data["num"] += 1
+                text.nameext_from_url(url, data)
+                yield Message.Url, url, data
+
+            for url in external:
+                data["num"] += 1
+                yield Message.Queue, url, data
+
+    def _extract_post_urls(self, content):
+        """Collect attachment and external URLs from a post's HTML
+
+        Returns an (internal, external) tuple of lists. 'internal' holds
+        absolute URLs of forum attachments, 'external' holds URLs that
+        start with "http" and do not mention nudostar.com. Every URL is
+        listed at most once, in order of first appearance per attribute.
+        """
+        internal = []
+        external = []
+        seen = set()
+
+        # Extract URLs from both href= and src= attributes
+        for attr in ('href="', 'src="'):
+            for url in text.extract_iter(content, attr, '"'):
+                # Attribute values are HTML-escaped (e.g. '&amp;')
+                url = text.unescape(url)
+                # Normalize site-relative paths to full URLs *before*
+                # deduplicating, so the relative and absolute form of
+                # the same file are only counted once
+                if url.startswith("/"):
+                    url = "https://nudostar.com" + url
+                if url in seen:
+                    continue
+
+                # Internal attachments
+                if "/forum/attachments/" in url:
+                    # Skip numeric-only IDs and non-file links
+                    path = url.rstrip("/")
+                    if path.split(".")[-1].isdigit() and "-" not in path:
+                        continue
+                    if "upload?" in url:
+                        continue
+                    seen.add(url)
+                    internal.append(url)
+
+                # External image hosts
+                elif url.startswith("http") and "nudostar.com" not in url:
+                    seen.add(url)
+                    external.append(url)
+
+        return internal, external
+
+    def request_page(self, url):
+        """Request 'url', turning HTTP 403 into an AuthRequired error
+
+        Any other HttpError is re-raised unchanged.
+        """
+        try:
+            return self.request(url)
+        except exception.HttpError as exc:
+            if exc.status == 403:
+                raise exception.AuthRequired(
+                    ("username & password", "authenticated cookies"), None,
+                    "Login required to view this content")
+            raise
+
+    def login(self):
+        """Log in with username & password unless cookies are present
+
+        Does nothing when the 'xf_user' cookie check succeeds or when no
+        username is configured.
+        """
+        if self.cookies_check(self.cookies_names):
+            return
+
+        username, password = self._get_auth_info()
+        if username:
+            self.cookies_update(self._login_impl(username, password))
+
+    @cache(maxage=365*86400, keyarg=1)
+    def _login_impl(self, username, password):
+        """Submit the login form and return the resulting cookies
+
+        Returns a dict mapping cookie names to values for all session
+        cookies whose domain ends with 'cookies_domain'.
+        Raises exception.AuthenticationError when the login request was
+        not redirected or its response carries no 'xf_user' cookie.
+        """
+        self.log.info("Logging in as %s", username)
+
+        # The login form embeds a token that must be sent back
+        url = f"{self.root}/login/"
+        page = self.request(url).text
+        token = text.extr(page, 'name="_xfToken" value="', '"')
+
+        url = f"{self.root}/login/login"
+        data = {
+            "_xfToken" : token,
+            "login"    : username,
+            "password" : password,
+            "remember" : "1",
+            "_xfRedirect": self.root + "/",
+        }
+        response = self.request(url, method="POST", data=data)
+
+        if not response.history or "xf_user" not in response.cookies:
+            raise exception.AuthenticationError()
+
+        return {
+            cookie.name: cookie.value
+            for cookie in self.cookies
+            if cookie.domain.endswith(self.cookies_domain)
+        }
+
+    def _pagination(self, base, pnum=None):
+        """Yield the HTML of consecutive pages below 'base'
+
+        Without 'pnum', start at the first page and continue for as long
+        as a page contains a "next" navigation marker. With 'pnum', yield
+        only that single page.
+        """
+        if pnum is None:
+            url = f"{self.root}{base}/"
+            pnum = 1
+        else:
+            url = f"{self.root}{base}/page-{pnum}"
+            # None marks single-page mode for the loop below
+            pnum = None
+
+        while True:
+            page = self.request_page(url).text
+            yield page
+
+            if pnum is None or "pageNav-jump--next" not in page:
+                return
+            pnum += 1
+            url = f"{self.root}{base}/page-{pnum}"
+
+    def _parse_thread(self, page):
+        """Return a dict with the thread's 'id' and 'title' from 'page'"""
+        extr = text.extract_from(page)
+
+        title = text.unescape(extr("<title>", "<"))
+        # Keep only the part before the last " | " separator
+        if " | " in title:
+            title = title.rpartition(" | ")[0]
+
+        thread_id = extr('data-content-key="thread-', '"')
+
+        return {
+            "id"   : thread_id,
+            "title": title.strip(),
+        }
+
+    def _parse_post(self, html):
+        """Return a dict of post metadata extracted from article HTML
+
+        'author', 'id' and 'date' are read in document order; 'content'
+        is the unmodified 'html' argument.
+        """
+        extr = text.extract_from(html)
+
+        return {
+            "author": extr('data-author="', '"'),
+            "id"    : extr('data-content="post-', '"'),
+            "date"  : extr('datetime="', '"'),
+            "content": html,  # Pass full article HTML for URL extraction
+        }
+
+
+class NudostarforumPostExtractor(NudostarforumExtractor):
+    """Extractor for individual posts on nudostar forum"""
+    subcategory = "post"
+    pattern = (rf"{BASE_PATTERN}"
+               rf"/threads/[^/?#]+\.(\d+)/post-(\d+)")
+    example = "https://nudostar.com/forum/threads/NAME.12345/post-67890"
+
+    def posts(self):
+        """Return a 1-tuple holding the parsed post named in the URL
+
+        Also stores the thread metadata in self.kwdict["thread"].
+        Raises exception.NotFoundError when the post's <article> element
+        cannot be located in the fetched page.
+        """
+        post_id = self.groups[1]
+        url = f"{self.root}/posts/{post_id}/"
+        page = self.request_page(url).text
+
+        pos = page.find(f'data-content="post-{post_id}"')
+        if pos < 0:
+            raise exception.NotFoundError("post")
+
+        # Search backwards for the tag carrying the attribute instead of
+        # assuming it opens within a fixed number of characters
+        begin = page.rfind("<article ", 0, pos)
+        html = None
+        if begin >= 0:
+            html = text.extract(page, "<article ", "</article>", begin)[0]
+        if html is None:
+            raise exception.NotFoundError("post")
+
+        self.kwdict["thread"] = self._parse_thread(page)
+        return (self._parse_post(html),)
+
+
+class NudostarforumThreadExtractor(NudostarforumExtractor):
+    """Extractor for threads on nudostar forum"""
+    subcategory = "thread"
+    pattern = rf"{BASE_PATTERN}(/threads/[^/?#]+\.(\d+))(?:/page-(\d+))?"
+    example = "https://nudostar.com/forum/threads/NAME.12345/"
+
+    def posts(self):
+        """Yield one parsed post per <article> on each thread page"""
+        base, _thread_id, page_num = self.groups
+        parse_post = self._parse_post
+
+        for html_page in self._pagination(base, page_num):
+            # Parse thread metadata only while no 'thread' entry exists
+            if "thread" not in self.kwdict:
+                self.kwdict["thread"] = self._parse_thread(html_page)
+
+            articles = text.extract_iter(html_page, "<article ", "</article>")
+            yield from map(parse_post, articles)
--- a/test/results/nudostarforum.py
+++ b/test/results/nudostarforum.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import nudostarforum
+
+
+# Each entry names a URL plus the category tuple and extractor class
+# expected for it; no entry lists expected download results.
+__tests__ = (
+# Thread URL without a page suffix
+{
+    "#url"     : "https://nudostar.com/forum/threads/aspen-rae.106714/",
+    "#category": ("", "nudostarforum", "thread"),
+    "#class"   : nudostarforum.NudostarforumThreadExtractor,
+},
+
+# Thread URL with an explicit "/page-N" suffix
+{
+    "#url"     : "https://nudostar.com/forum/threads/aspen-rae.106714/page-2",
+    "#category": ("", "nudostarforum", "thread"),
+    "#class"   : nudostarforum.NudostarforumThreadExtractor,
+},
+
+# Single-post URL ("/post-N" below a thread path)
+{
+    "#url"     : "https://nudostar.com/forum/threads/name.12345/post-67890",
+    "#category": ("", "nudostarforum", "post"),
+    "#class"   : nudostarforum.NudostarforumPostExtractor,
+},
+
+)