[xenforo] implement generic XenForo forum extractors

support
- https://simpcity.cr/
- https://nudostar.com/forum/ (#8333)
2025-12-11 21:26:34 +01:00
parent 814085062a
commit ab2c03b39e
7 changed files with 186 additions and 278 deletions
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -146,7 +146,6 @@ modules = [
    "nozomi",
    "nsfwalbum",
    "nudostar",
-    "nudostarforum",
    "okporn",
    "paheal",
    "patreon",
@@ -187,7 +186,6 @@ modules = [
    "senmanga",
    "sexcom",
    "shimmie2",
-    "simpcity",
    "simplyhentai",
    "sizebooru",
    "skeb",
@@ -235,6 +233,7 @@ modules = [
    "wikifeet",
    "wikimedia",
    "xasiat",
+    "xenforo",
    "xfolio",
    "xhamster",
    "xvideos",
--- a/gallery_dl/extractor/nudostarforum.py
+++ b/gallery_dl/extractor/nudostarforum.py
@@ -1,201 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://nudostar.com/forum/"""
-
-from .common import Extractor, Message
-from .. import text, exception
-from ..cache import cache
-
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?nudostar\.com/forum"
-
-
-class NudostarforumExtractor(Extractor):
-    """Base class for nudostar forum extractors"""
-    category = "nudostarforum"
-    cookies_domain = "nudostar.com"
-    cookies_names = ("xf_user",)
-    root = "https://nudostar.com/forum"
-    directory_fmt = ("{category}", "{thread[title]} ({thread[id]})")
-    filename_fmt = "{post[id]}_{num:>02}_{filename}.{extension}"
-    archive_fmt = "{post[id]}/{filename}"
-
-    def items(self):
-        self.login()
-
-        for post in self.posts():
-            internal, external = self._extract_post_urls(post["content"])
-
-            data = {"post": post}
-            post["count"] = data["count"] = len(internal) + len(external)
-            yield Message.Directory, "", data
-
-            data["num"] = 0
-            for url in internal:
-                data["num"] += 1
-                text.nameext_from_url(url, data)
-                yield Message.Url, url, data
-
-            for url in external:
-                data["num"] += 1
-                yield Message.Queue, url, data
-
-    def _extract_post_urls(self, content):
-        """Extract image and video URLs from post content"""
-        internal = []
-        external = []
-        seen = set()
-
-        # Extract URLs from both href= and src= attributes
-        for attr in ('href="', 'src="'):
-            for url in text.extract_iter(content, attr, '"'):
-                if url in seen:
-                    continue
-
-                # Internal attachments
-                if "/forum/attachments/" in url:
-                    # Skip numeric-only IDs and non-file links
-                    path = url.rstrip("/")
-                    if path.split(".")[-1].isdigit() and "-" not in path:
-                        continue
-                    if "upload?" in url:
-                        continue
-                    seen.add(url)
-                    # Normalize to full URL
-                    if url.startswith("/"):
-                        url = "https://nudostar.com" + url
-                    internal.append(url)
-
-                # External image hosts
-                elif url.startswith("http") and "nudostar.com" not in url:
-                    seen.add(url)
-                    external.append(url)
-
-        return internal, external
-
-    def request_page(self, url):
-        try:
-            return self.request(url)
-        except exception.HttpError as exc:
-            if exc.status == 403:
-                raise exception.AuthRequired(
-                    ("username & password", "authenticated cookies"), None,
-                    "Login required to view this content")
-            raise
-
-    def login(self):
-        if self.cookies_check(self.cookies_names):
-            return
-
-        username, password = self._get_auth_info()
-        if username:
-            self.cookies_update(self._login_impl(username, password))
-
-    @cache(maxage=365*86400, keyarg=1)
-    def _login_impl(self, username, password):
-        self.log.info("Logging in as %s", username)
-
-        url = f"{self.root}/login/"
-        page = self.request(url).text
-        token = text.extr(page, 'name="_xfToken" value="', '"')
-
-        url = f"{self.root}/login/login"
-        data = {
-            "_xfToken" : token,
-            "login"    : username,
-            "password" : password,
-            "remember" : "1",
-            "_xfRedirect": self.root + "/",
-        }
-        response = self.request(url, method="POST", data=data)
-
-        if not response.history or "xf_user" not in response.cookies:
-            raise exception.AuthenticationError()
-
-        return {
-            cookie.name: cookie.value
-            for cookie in self.cookies
-            if cookie.domain.endswith(self.cookies_domain)
-        }
-
-    def _pagination(self, base, pnum=None):
-        if pnum is None:
-            url = f"{self.root}{base}/"
-            pnum = 1
-        else:
-            url = f"{self.root}{base}/page-{pnum}"
-            pnum = None
-
-        while True:
-            page = self.request_page(url).text
-            yield page
-
-            if pnum is None or "pageNav-jump--next" not in page:
-                return
-            pnum += 1
-            url = f"{self.root}{base}/page-{pnum}"
-
-    def _parse_thread(self, page):
-        extr = text.extract_from(page)
-
-        title = text.unescape(extr("<title>", "<"))
-        if " | " in title:
-            title = title.rpartition(" | ")[0]
-
-        thread_id = extr('data-content-key="thread-', '"')
-
-        return {
-            "id"   : thread_id,
-            "title": title.strip(),
-        }
-
-    def _parse_post(self, html):
-        extr = text.extract_from(html)
-
-        return {
-            "author": extr('data-author="', '"'),
-            "id"    : extr('data-content="post-', '"'),
-            "date"  : extr('datetime="', '"'),
-            "content": html,  # Pass full article HTML for URL extraction
-        }
-
-
-class NudostarforumPostExtractor(NudostarforumExtractor):
-    """Extractor for individual posts on nudostar forum"""
-    subcategory = "post"
-    pattern = (rf"{BASE_PATTERN}"
-               rf"/threads/[^/?#]+\.(\d+)/post-(\d+)")
-    example = "https://nudostar.com/forum/threads/NAME.12345/post-67890"
-
-    def posts(self):
-        thread_id, post_id = self.groups
-        url = f"{self.root}/posts/{post_id}/"
-        page = self.request_page(url).text
-
-        pos = page.find(f'data-content="post-{post_id}"')
-        if pos < 0:
-            raise exception.NotFoundError("post")
-        html = text.extract(page, "<article ", "</article>", pos-200)[0]
-
-        self.kwdict["thread"] = self._parse_thread(page)
-        return (self._parse_post(html),)
-
-
-class NudostarforumThreadExtractor(NudostarforumExtractor):
-    """Extractor for threads on nudostar forum"""
-    subcategory = "thread"
-    pattern = rf"{BASE_PATTERN}(/threads/[^/?#]+\.(\d+))(?:/page-(\d+))?"
-    example = "https://nudostar.com/forum/threads/NAME.12345/"
-
-    def posts(self):
-        path, thread_id, pnum = self.groups
-
-        for page in self._pagination(path, pnum):
-            if "thread" not in self.kwdict:
-                self.kwdict["thread"] = self._parse_thread(page)
-
-            for html in text.extract_iter(page, "<article ", "</article>"):
-                yield self._parse_post(html)
--- a/gallery_dl/extractor/simpcity.py
+++ b/gallery_dl/extractor/xenforo.py
@@ -6,36 +6,38 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-"""Extractors for https://simpcity.cr/"""
+"""Extractors for XenForo forums"""

-from .common import Extractor, Message
+from .common import BaseExtractor, Message
 from .. import text, exception
 from ..cache import cache

-BASE_PATTERN = r"(?:https?://)?(?:www\.)?simpcity\.(?:cr|su)"

-
-class SimpcityExtractor(Extractor):
-    """Base class for simpcity extractors"""
-    category = "simpcity"
-    cookies_domain = "simpcity.cr"
+class XenforoExtractor(BaseExtractor):
+    """Base class for xenforo extractors"""
+    basecategory = "xenforo"
+    #  cookies_domain = "simpcity.cr"
    cookies_names = ("ogaddgmetaprof_user",)
-    root = "https://simpcity.cr"
    directory_fmt = ("{category}", "{thread[section]}",
                     "{thread[title]} ({thread[id]})")
    filename_fmt = "{post[id]}_{num:>02}_{id}_{filename}.{extension}"
    archive_fmt = "{post[id]}/{type[0]}{id}_{filename}"

+    def __init__(self, match):
+        BaseExtractor.__init__(self, match)
+        self.cookies_domain = "." + self.root.split("/")[2]
+        self.cookies_names = self.config_instance("cookies")
+
    def items(self):
        self.login()

        extract_urls = text.re(
            r'(?s)(?:'
            r'<video (.*?\ssrc="[^"]+".*?)</video>'
-            r'|<a [^>]*?href="'
-            r'(?:https://[^"]+)?(/attachments/[^"]+".*?)</a>'
-            r'|<div [^>]*?data-src="'
-            r'(?:https://[^"]+)?(/attachments/[^"]+".*?)/>'
+            r'|<a [^>]*?href="[^"]*?'
+            r'(/attachments/[^"]+".*?)</a>'
+            r'|<div [^>]*?data-src="[^"]*?'
+            r'(/attachments/[^"]+".*?)/>'
            r'|(?:<a [^>]*?href="|<iframe [^>]*?src="|'
            r'''onclick="loadMedia\(this, ')([^"']+)'''
            r')'
@@ -50,6 +52,7 @@ class SimpcityExtractor(Extractor):
            post["count"] = data["count"] = len(urls)
            yield Message.Directory, "", data

+            id_last = None
            data["num"] = data["num_internal"] = data["num_external"] = 0
            for video, inl1, inl2, ext in urls:
                if ext:
@@ -71,12 +74,14 @@ class SimpcityExtractor(Extractor):
                    yield Message.Url, url, data

                elif (inline := inl1 or inl2):
-                    data["num"] += 1
-                    data["num_internal"] += 1
-                    data["type"] = "inline"
                    path = inline[:inline.find('"')]
                    name, _, id = path[path.rfind("/", 0, -1):].strip(
                        "/").rpartition(".")
+                    if id == id_last:
+                        id_last = None
+                        continue
+                    else:
+                        id_last = id
                    data["id"] = text.parse_int(id)
                    if alt := text.extr(inline, 'alt="', '"'):
                        text.nameext_from_name(alt, data)
@@ -85,6 +90,9 @@ class SimpcityExtractor(Extractor):
                    else:
                        data["filename"], _, data["extension"] = \
                            name.rpartition("-")
+                    data["num"] += 1
+                    data["num_internal"] += 1
+                    data["type"] = "inline"
                    yield Message.Url, self.root + path, data

    def request_page(self, url):
@@ -180,10 +188,15 @@ class SimpcityExtractor(Extractor):
            html, "blockMessage--error", "</").rpartition(">")[2].strip())

    def _parse_thread(self, page):
-        schema = self._extract_jsonld(page)["mainEntity"]
+        try:
+            data = self._extract_jsonld(page)
+        except ValueError:
+            return {}
+
+        schema = data.get("mainEntity", data)
        author = schema["author"]
        stats = schema["interactionStatistic"]
-        url_t = schema["url"]
+        url_t = schema.get("url") or schema.get("@id") or ""
        url_a = author.get("url") or ""

        thread = {
@@ -191,8 +204,6 @@ class SimpcityExtractor(Extractor):
            "url"  : url_t,
            "title": schema["headline"],
            "date" : self.parse_datetime_iso(schema["datePublished"]),
-            "views": stats[0]["userInteractionCount"],
-            "posts": stats[1]["userInteractionCount"],
            "tags" : (schema["keywords"].split(", ")
                      if "keywords" in schema else ()),
            "section"   : schema["articleSection"],
@@ -202,6 +213,13 @@ class SimpcityExtractor(Extractor):
            "author_url": url_a,
        }

+        if isinstance(stats, list):
+            thread["views"] = stats[0]["userInteractionCount"]
+            thread["posts"] = stats[1]["userInteractionCount"]
+        else:
+            thread["views"] = -1
+            thread["posts"] = stats["userInteractionCount"]
+
        return thread

    def _parse_post(self, html):
@@ -210,13 +228,11 @@ class SimpcityExtractor(Extractor):
        post = {
            "author": extr('data-author="', '"'),
            "id": extr('data-content="post-', '"'),
-            "author_url": extr('itemprop="url" content="', '"'),
+            "author_url": (extr('itemprop="url" content="', '"') or
+                           extr('<a href="', '"')),
            "date": self.parse_datetime_iso(extr('datetime="', '"')),
-            "content": (
-                extr('<div itemprop="text">',
-                     '<div class="js-selectToQuote') or
-                extr('<div >',
-                     '<div class="js-selectToQuote')).strip(),
+            "content": extr('class="message-body',
+                            '<div class="js-selectToQuote'),
            "attachments": extr('<section class="message-attachments">',
                                '</section>'),
        }
@@ -224,16 +240,35 @@ class SimpcityExtractor(Extractor):
        url_a = post["author_url"]
        post["author_id"] = url_a[url_a.rfind(".")+1:-1]

+        con = post["content"]
+        if (pos := con.find('<div class="bbWrapper')) >= 0:
+            con = con[pos:]
+        post["content"] = con.strip()
+
        return post


-class SimpcityPostExtractor(SimpcityExtractor):
+BASE_PATTERN = XenforoExtractor.update({
+    "simpcity": {
+        "root": "https://simpcity.cr",
+        "pattern": r"(?:www\.)?simpcity\.(?:cr|su)",
+        "cookies": ("ogaddgmetaprof_user",),
+    },
+    "nudostarforum": {
+        "root": "https://nudostar.com/forum",
+        "pattern": r"(?:www\.)?nudostar\.com/forum",
+        "cookies": ("xf_user",),
+    },
+})
+
+
+class XenforoPostExtractor(XenforoExtractor):
    subcategory = "post"
    pattern = rf"{BASE_PATTERN}/(?:threads/[^/?#]+/post-|posts/)(\d+)"
    example = "https://simpcity.cr/threads/TITLE.12345/post-54321"

    def posts(self):
-        post_id = self.groups[0]
+        post_id = self.groups[-1]
        url = f"{self.root}/posts/{post_id}/"
        page = self.request_page(url).text

@@ -246,18 +281,21 @@ class SimpcityPostExtractor(SimpcityExtractor):
        return (self._parse_post(html),)


-class SimpcityThreadExtractor(SimpcityExtractor):
+class XenforoThreadExtractor(XenforoExtractor):
    subcategory = "thread"
    pattern = rf"{BASE_PATTERN}(/threads/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
    example = "https://simpcity.cr/threads/TITLE.12345/"

    def posts(self):
+        path = self.groups[-2]
+        pnum = self.groups[-1]
+
        if (order := self.config("order-posts")) and \
                order[0] not in ("d", "r"):
-            pages = self._pagination(*self.groups)
+            pages = self._pagination(path, pnum)
            reverse = False
        else:
-            pages = self._pagination_reverse(*self.groups)
+            pages = self._pagination_reverse(path, pnum)
            reverse = True

        for page in pages:
@@ -271,13 +309,18 @@ class SimpcityThreadExtractor(SimpcityExtractor):
                yield self._parse_post(html)


-class SimpcityForumExtractor(SimpcityExtractor):
+class XenforoForumExtractor(XenforoExtractor):
    subcategory = "forum"
-    pattern = rf"{BASE_PATTERN}(/forums/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
+    pattern = rf"{BASE_PATTERN}(/forums/(?:[^/?#]+\.)?[^/?#]+)(?:/page-(\d+))?"
    example = "https://simpcity.cr/forums/TITLE.123/"

    def items(self):
-        data = {"_extractor": SimpcityThreadExtractor}
-        for page in self._pagination(*self.groups):
-            for path in text.extract_iter(page, ' uix-href="', '"'):
+        extract_threads = text.re(
+            r'(/threads/[^"]+)"[^>]+data-xf-init=').findall
+
+        data = {"_extractor": XenforoThreadExtractor}
+        path = self.groups[-2]
+        pnum = self.groups[-1]
+        for page in self._pagination(path, pnum):
+            for path in extract_threads(page):
                yield Message.Queue, f"{self.root}{text.unquote(path)}", data