diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 5efb4276..a913c6bc 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -883,6 +883,12 @@ Consider all listed sites to potentially be NSFW. Boards, Likes, Pins, User Pins, related Pins, Search Results + + SimpCity Forums + https://simpcity.cr/ + Forums, Posts, Threads + + Simply Hentai https://www.simply-hentai.com/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index fe89cd60..fe61c428 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -171,6 +171,7 @@ modules = [ "senmanga", "sexcom", "shimmie2", + "simpcity", "simplyhentai", "sizebooru", "skeb", diff --git a/gallery_dl/extractor/simpcity.py b/gallery_dl/extractor/simpcity.py new file mode 100644 index 00000000..8cc7e388 --- /dev/null +++ b/gallery_dl/extractor/simpcity.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- + +# Copyright 2025 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://simpcity.cr/""" + +from .common import Extractor, Message +from .. import text, exception + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?simpcity\.(?:cr|su)" + + +class SimpcityExtractor(Extractor): + """Base class for simpcity extractors""" + category = "simpcity" + root = "https://simpcity.cr" + + def items(self): + extract_urls = text.re(r' href="([^"]+)').findall + + for post in self.posts(): + urls = extract_urls(post["content"]) + data = {"post": post} + post["count"] = data["count"] = len(urls) + for data["num"], url in enumerate(urls, 1): + yield Message.Queue, url, data + + def request_page(self, url): + try: + return self.request(url).text + except exception.HttpError as exc: + if exc.status == 403 and b">Log in<" in exc.response.content: + msg = text.extr(exc.response.text, "blockMessage--error", "")[2].strip()) + raise + + def _pagination(self, base, pnum=None): + base = f"{self.root}{base}" + + if pnum is None: + url = base + pnum = 1 + else: + url = f"{base}/page-{pnum}" + pnum = None + + while True: + page = self.request_page(url) + + yield page + + if pnum is None or "pageNav-jump--next" not in page: + return + pnum += 1 + url = f"{base}/page-{pnum}" + + def _parse_thread(self, page): + schema = self._extract_jsonld(page)["mainEntity"] + author = schema["author"] + stats = schema["interactionStatistic"] + url_t = schema["url"] + url_a = author["url"] + + thread = { + "id" : url_t[url_t.rfind(".")+1:-1], + "url" : url_t, + "title": schema["headline"], + "date" : text.parse_datetime(schema["datePublished"]), + "views": stats[0]["userInteractionCount"], + "posts": stats[1]["userInteractionCount"], + "tags" : (schema["keywords"].split(", ") + if "keywords" in schema else ()), + "section" : schema["articleSection"], + "author" : author["name"], + "author_id" : url_a[url_a.rfind(".")+1:-1], + "author_url": url_a, + } + + return thread + + def _parse_post(self, html): + extr = text.extract_from(html) + + post = { + "author": extr('data-author="', '"'), + "id": extr('data-content="post-', '"'), + "author_url": extr('itemprop="url" content="', '"'), + "date": text.parse_datetime(extr('datetime="', '"')), + "content": extr('
', "\t\t
").strip(), + } + + url_a = post["author_url"] + post["author_id"] = url_a[url_a.rfind(".")+1:-1] + + return post + + +class SimpcityPostExtractor(SimpcityExtractor): + subcategory = "post" + pattern = rf"{BASE_PATTERN}/(?:threads/[^/?#]+/post-|posts/)(\d+)" + example = "https://simpcity.cr/threads/TITLE.12345/post-54321" + + def posts(self): + post_id = self.groups[0] + url = f"{self.root}/posts/{post_id}/" + page = self.request_page(url) + + pos = page.find(f'data-content="post-{post_id}"') + if pos < 0: + raise exception.NotFoundError("post") + html = text.extract(page, "
", pos-200)[0] + + self.kwdict["thread"] = self._parse_thread(page) + return (self._parse_post(html),) + + +class SimpcityThreadExtractor(SimpcityExtractor): + subcategory = "thread" + pattern = rf"{BASE_PATTERN}(/threads/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?" + example = "https://simpcity.cr/threads/TITLE.12345/" + + def posts(self): + for page in self._pagination(*self.groups): + if "thread" not in self.kwdict: + self.kwdict["thread"] = self._parse_thread(page) + for html in text.extract_iter(page, "
"): + yield self._parse_post(html) + + +class SimpcityForumExtractor(SimpcityExtractor): + subcategory = "forum" + pattern = rf"{BASE_PATTERN}(/forums/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?" + example = "https://simpcity.cr/forums/TITLE.123/" + + def items(self): + data = {"_extractor": SimpcityThreadExtractor} + for page in self._pagination(*self.groups): + for path in text.extract_iter(page, ' uix-href="', '"'): + yield Message.Queue, f"{self.root}{text.unquote(path)}", data diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index f8fb4a62..328cd258 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -159,6 +159,7 @@ CATEGORY_MAP = { "sensescans" : "Sense-Scans", "sexcom" : "Sex.com", "silverpic" : "SilverPic.com", + "simpcity" : "SimpCity Forums", "simplyhentai" : "Simply Hentai", "sizebooru" : "Size Booru", "slickpic" : "SlickPic", diff --git a/test/results/simpcity.py b/test/results/simpcity.py new file mode 100644 index 00000000..a5dee989 --- /dev/null +++ b/test/results/simpcity.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from gallery_dl.extractor import simpcity +from gallery_dl import exception + + +__tests__ = ( +{ + "#url" : "https://simpcity.cr/threads/ririkana-rr_loveit.10731/post-1753131", + "#class" : simpcity.SimpcityPostExtractor, + "#auth" : True, + "#results" : "https://jpg5.su/img/coWRwo", + + "count" : 1, + "num" : 1, + "post" : { + "author" : "Zebrabobinn", + "author_id" : "171827", + "author_url": "https://simpcity.cr/members/zebrabobinn.171827/", + "count" : 1, + "date" : "dt:2023-03-08 12:59:10", + "id" : "1753131", + "content" : """\ +
\ +""", + }, + "thread": { + "author" : "eula", + "author_id" : "54987", + "author_url": "https://simpcity.cr/members/eula.54987/", + "date" : "dt:2022-03-11 17:15:59", + "id" : "10731", + "posts" : range(320, 500), + "section" : "Asians", + "title" : "Ririkana | RR_loveit", + "url" : "https://simpcity.cr/threads/ririkana-rr_loveit.10731/", + "views" : range(790_000, 900_000), + "tags" : [ + "asian", + "big ass", + "gravure", + "japanese", + "japanese big ass", + "small tits", + "thicc", + ], + }, +}, + +{ + "#url" : "https://simpcity.cr/threads/ririkana-rr_loveit.10731/post-1753131", + "#class" : simpcity.SimpcityPostExtractor, + "#auth" : False, + "#exception": exception.AuthRequired, +}, + +{ + "#url" : "https://simpcity.cr/threads/alua-tatakai.89490/", + "#class" : simpcity.SimpcityThreadExtractor, + "#auth" : True, + "#pattern" : r"https://(jpg5\.su/img/\w+|bunkr\.\w+/[fiv]/\w+|pixeldrain.com/l/\w+|alua.com/tatakai)|/goto/post", + "#count" : 29, + + "count" : int, + "num" : int, + "post" : { + "author" : str, + "author_id" : r"re:\d+", + "author_url": str, + "content" : str, + "count" : int, + "date" : "type:datetime", + "id" : r"re:\d+", + }, + "thread": { + "author" : "Ekalamosus", + "author_id" : "1036155", + "author_url": "https://simpcity.cr/members/ekalamosus.1036155/", + "date" : "dt:2022-07-31 15:40:14", + "id" : "89490", + "posts" : 45, + "section" : "Asians", + "title" : "Alua tatakai", + "url" : "https://simpcity.cr/threads/alua-tatakai.89490/", + "views" : range(47_000, 60_000), + "tags" : [ + "alter", + "alua", + "pinay", + ], + }, +}, + +{ + "#url" : "https://simpcity.su/threads/angel-chan-wlep-wlop-menruinyanko_.12948/", + "#class" : simpcity.SimpcityThreadExtractor, +}, + +{ + "#url" : "https://simpcity.cr/forums/asians.48/", + "#class" : simpcity.SimpcityForumExtractor, + "#pattern" : simpcity.SimpcityThreadExtractor.pattern, + "#range" : "1-100", + "#count" : 100, +}, + +)