# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://nudostar.com/forum/"""

from .common import Extractor, Message
from .. import text, exception
from ..cache import cache

BASE_PATTERN = r"(?:https?://)?(?:www\.)?nudostar\.com/forum"


class NudostarforumExtractor(Extractor):
    """Base class for nudostar forum extractors

    Provides login handling, page iteration and per-post URL extraction.
    items() iterates over self.posts(), which is not defined in this
    part of the class (presumably supplied elsewhere, e.g. by subclasses).
    """
    category = "nudostarforum"
    cookies_domain = "nudostar.com"
    # cookie names handed to cookies_check() by login()
    cookies_names = ("xf_user",)
    root = "https://nudostar.com/forum"
    directory_fmt = ("{category}", "{thread[title]} ({thread[id]})")
    filename_fmt = "{post[id]}_{num:>02}_{filename}.{extension}"
    archive_fmt = "{post[id]}/{filename}"

    def items(self):
        """Yield directory, download and queue messages for every post

        For each post, one Message.Directory is emitted first, followed by
        a Message.Url per internal attachment and a Message.Queue per
        external link. 'num' counts continuously across both groups.
        """
        self.login()

        for post in self.posts():
            internal, external = self._extract_post_urls(post["content"])

            data = {"post": post}
            post["count"] = data["count"] = len(internal) + len(external)
            yield Message.Directory, "", data

            data["num"] = 0
            for url in internal:
                data["num"] += 1
                text.nameext_from_url(url, data)
                yield Message.Url, url, data

            for url in external:
                data["num"] += 1
                yield Message.Queue, url, data

    def _extract_post_urls(self, content):
        """Extract attachment and external URLs from a post's HTML content

        Scans every href="..." and src="..." attribute value in 'content'.

        Returns a tuple (internal, external):
          internal -- absolute URLs of forum attachments
          external -- absolute URLs that do not contain "nudostar.com"

        Each URL is reported at most once. Deduplication is done on the
        decoded, absolute form of the URL, so a relative and an absolute
        link to the same attachment are not reported twice.
        """
        internal = []
        external = []
        seen = set()

        # Extract URLs from both href= and src= attributes
        for attr in ('href="', 'src="'):
            for url in text.extract_iter(content, attr, '"'):
                # attribute values are HTML-escaped; decode entities
                # such as "&amp;" before using the value as a URL
                url = text.unescape(url)

                # Internal attachments
                if "/forum/attachments/" in url:
                    # Skip numeric-only IDs and non-file links
                    path = url.rstrip("/")
                    if path.split(".")[-1].isdigit() and "-" not in path:
                        continue
                    if "upload?" in url:
                        continue
                    # Normalize to full URL *before* the duplicate check
                    if url.startswith("/"):
                        url = "https://nudostar.com" + url
                    target = internal

                # External image hosts
                elif url.startswith("http") and "nudostar.com" not in url:
                    target = external

                # Anything else (other on-site links, anchors, ...)
                else:
                    continue

                if url not in seen:
                    seen.add(url)
                    target.append(url)

        return internal, external

    def request_page(self, url):
        """Request 'url' and return the response

        An HTTP 403 error is re-raised as exception.AuthRequired;
        every other exception.HttpError propagates unchanged.
        """
        try:
            return self.request(url)
        except exception.HttpError as exc:
            if exc.status == 403:
                raise exception.AuthRequired(
                    ("username & password", "authenticated cookies"), None,
                    "Login required to view this content")
            raise

    def login(self):
        """Log in with username & password unless cookies are already set

        Does nothing when cookies_check() succeeds for 'cookies_names'
        or when no username is configured.
        """
        if self.cookies_check(self.cookies_names):
            return

        username, password = self._get_auth_info()
        if username:
            self.cookies_update(self._login_impl(username, password))

    @cache(maxage=365*86400, keyarg=1)
    def _login_impl(self, username, password):
        """Submit the login form and return the resulting session cookies

        Returns a dict mapping cookie names to values for all cookies whose
        domain ends with 'cookies_domain'.

        Raises exception.AuthenticationError when the login POST was not
        redirected or did not set an "xf_user" cookie.
        """
        self.log.info("Logging in as %s", username)

        # the login form carries a token that has to be sent back
        url = f"{self.root}/login/"
        page = self.request(url).text
        token = text.extr(page, 'name="_xfToken" value="', '"')

        url = f"{self.root}/login/login"
        data = {
            "_xfToken"   : token,
            "login"      : username,
            "password"   : password,
            "remember"   : "1",
            "_xfRedirect": self.root + "/",
        }
        response = self.request(url, method="POST", data=data)

        if not response.history or "xf_user" not in response.cookies:
            raise exception.AuthenticationError()

        return {
            cookie.name: cookie.value
            for cookie in self.cookies
            if cookie.domain.endswith(self.cookies_domain)
        }

    def _pagination(self, base, pnum=None):
        """Yield the HTML of the pages found under 'base'

        Without 'pnum', iteration starts at "{root}{base}/" and continues
        with "page-2", "page-3", ... for as long as the current page
        contains the "pageNav-jump--next" marker.

        With an explicit 'pnum', only that single page is yielded.
        """
        if pnum is None:
            url = f"{self.root}{base}/"
            pnum = 1
        else:
            url = f"{self.root}{base}/page-{pnum}"
            # 'None' marks single-page mode for the loop below
            pnum = None

        while True:
            page = self.request_page(url).text
            yield page

            if pnum is None or "pageNav-jump--next" not in page:
                return
            pnum += 1
            url = f"{self.root}{base}/page-{pnum}"

    # NOTE(review): _parse_thread() follows here in the original file; its
    # definition is cut off in the reviewed excerpt and was left untouched.