@@ -883,6 +883,12 @@ Consider all listed sites to potentially be NSFW.
|
|||||||
<td>Boards, Likes, Pins, User Pins, related Pins, Search Results</td>
|
<td>Boards, Likes, Pins, User Pins, related Pins, Search Results</td>
|
||||||
<td></td>
|
<td></td>
|
||||||
</tr>
|
</tr>
|
||||||
|
<tr id="simpcity" title="simpcity">
|
||||||
|
<td>SimpCity Forums</td>
|
||||||
|
<td>https://simpcity.cr/</td>
|
||||||
|
<td>Forums, Posts, Threads</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
<tr id="simplyhentai" title="simplyhentai">
|
<tr id="simplyhentai" title="simplyhentai">
|
||||||
<td>Simply Hentai</td>
|
<td>Simply Hentai</td>
|
||||||
<td>https://www.simply-hentai.com/</td>
|
<td>https://www.simply-hentai.com/</td>
|
||||||
|
|||||||
@@ -171,6 +171,7 @@ modules = [
|
|||||||
"senmanga",
|
"senmanga",
|
||||||
"sexcom",
|
"sexcom",
|
||||||
"shimmie2",
|
"shimmie2",
|
||||||
|
"simpcity",
|
||||||
"simplyhentai",
|
"simplyhentai",
|
||||||
"sizebooru",
|
"sizebooru",
|
||||||
"skeb",
|
"skeb",
|
||||||
|
|||||||
145
gallery_dl/extractor/simpcity.py
Normal file
145
gallery_dl/extractor/simpcity.py
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# Copyright 2025 Mike Fährmann
|
||||||
|
#
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
|
# published by the Free Software Foundation.
|
||||||
|
|
||||||
|
"""Extractors for https://simpcity.cr/"""
|
||||||
|
|
||||||
|
from .common import Extractor, Message
|
||||||
|
from .. import text, exception
|
||||||
|
|
||||||
|
# matches the current (.cr) and former (.su) SimpCity domains
BASE_PATTERN = r"(?:https?://)?(?:www\.)?simpcity\.(?:cr|su)"
|
||||||
|
|
||||||
|
|
||||||
|
class SimpcityExtractor(Extractor):
    """Base class for simpcity extractors

    SimpCity is a XenForo forum whose posts only *link* external content;
    subclasses implement 'posts()' and this base class emits every URL
    found in each post's HTML as a queued URL for other extractors.
    """
    category = "simpcity"
    root = "https://simpcity.cr"

    def items(self):
        # collect every href attribute inside a post's rendered HTML
        extract_urls = text.re(r' href="([^"]+)').findall

        for post in self.posts():
            urls = extract_urls(post["content"])
            data = {"post": post}
            # store the link count both per-post and in the shared metadata
            post["count"] = data["count"] = len(urls)
            for data["num"], url in enumerate(urls, 1):
                yield Message.Queue, url, data

    def request_page(self, url):
        """Fetch 'url' and return its HTML text

        Raises AuthRequired when the server answers 403 with a login
        prompt, forwarding the site's own error message.
        """
        try:
            return self.request(url).text
        except exception.HttpError as exc:
            # a 403 containing a login link means members-only content
            if exc.status == 403 and b">Log in<" in exc.response.content:
                msg = text.extr(exc.response.text, "blockMessage--error", "</")
                raise exception.AuthRequired(
                    "'authenticated cookies'", None,
                    msg.rpartition(">")[2].strip())
            raise

    def _pagination(self, base, pnum=None):
        """Yield HTML pages for a thread or forum listing

        'base' is the path below 'root'. With 'pnum' of None, start at
        the first page and follow 'next' links until none remain;
        otherwise fetch exactly the single page 'pnum'.
        """
        base = f"{self.root}{base}"

        if pnum is None:
            url = base
            pnum = 1
        else:
            url = f"{base}/page-{pnum}"
            pnum = None  # single-page mode: stop after the first yield

        while True:
            page = self.request_page(url)

            yield page

            if pnum is None or "pageNav-jump--next" not in page:
                return
            pnum += 1
            url = f"{base}/page-{pnum}"

    def _parse_thread(self, page):
        """Extract thread metadata from a page's embedded JSON-LD schema"""
        schema = self._extract_jsonld(page)["mainEntity"]
        author = schema["author"]
        stats = schema["interactionStatistic"]
        url_t = schema["url"]
        url_a = author["url"]

        thread = {
            # the numeric ID is the final '.NNN' component of the URL path
            # (the slice drops the trailing '/')
            "id"   : url_t[url_t.rfind(".")+1:-1],
            "url"  : url_t,
            "title": schema["headline"],
            "date" : text.parse_datetime(schema["datePublished"]),
            # interactionStatistic order: views first, then replies
            # — assumed stable for this XenForo theme; TODO confirm
            "views": stats[0]["userInteractionCount"],
            "posts": stats[1]["userInteractionCount"],
            "tags" : (schema["keywords"].split(", ")
                      if "keywords" in schema else ()),
            "section"   : schema["articleSection"],
            "author"    : author["name"],
            "author_id" : url_a[url_a.rfind(".")+1:-1],
            "author_url": url_a,
        }

        return thread

    def _parse_post(self, html):
        """Extract post metadata from one '<article>' HTML fragment

        NOTE: extract_from() consumes the string sequentially, so the
        field order below must match the marker order in the page HTML.
        """
        extr = text.extract_from(html)

        post = {
            "author": extr('data-author="', '"'),
            "id": extr('data-content="post-', '"'),
            "author_url": extr('itemprop="url" content="', '"'),
            "date": text.parse_datetime(extr('datetime="', '"')),
            "content": extr('<div itemprop="text">', "\t\t</div>").strip(),
        }

        # derive the numeric author ID from the profile URL, as above
        url_a = post["author_url"]
        post["author_id"] = url_a[url_a.rfind(".")+1:-1]

        return post
|
||||||
|
|
||||||
|
|
||||||
|
class SimpcityPostExtractor(SimpcityExtractor):
    """Extractor for a single post of a simpcity thread"""
    subcategory = "post"
    pattern = rf"{BASE_PATTERN}/(?:threads/[^/?#]+/post-|posts/)(\d+)"
    example = "https://simpcity.cr/threads/TITLE.12345/post-54321"

    def posts(self):
        pid = self.groups[0]
        page = self.request_page(f"{self.root}/posts/{pid}/")

        anchor = page.find(f'data-content="post-{pid}"')
        if anchor < 0:
            raise exception.NotFoundError("post")
        # the '<article>' tag opens shortly before its data-content
        # attribute, so search backwards from just before the anchor
        article = text.extract(page, "<article ", "</article>", anchor-200)[0]

        self.kwdict["thread"] = self._parse_thread(page)
        return (self._parse_post(article),)
|
||||||
|
|
||||||
|
|
||||||
|
class SimpcityThreadExtractor(SimpcityExtractor):
    """Extractor for all posts of a simpcity thread"""
    subcategory = "thread"
    pattern = rf"{BASE_PATTERN}(/threads/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
    example = "https://simpcity.cr/threads/TITLE.12345/"

    def posts(self):
        find_articles = text.extract_iter
        for page in self._pagination(*self.groups):
            # thread-level metadata only needs to be parsed once
            if "thread" not in self.kwdict:
                self.kwdict["thread"] = self._parse_thread(page)
            for fragment in find_articles(page, "<article ", "</article>"):
                yield self._parse_post(fragment)
|
||||||
|
|
||||||
|
|
||||||
|
class SimpcityForumExtractor(SimpcityExtractor):
    """Extractor for simpcity forum listings"""
    subcategory = "forum"
    pattern = rf"{BASE_PATTERN}(/forums/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
    example = "https://simpcity.cr/forums/TITLE.123/"

    def items(self):
        # queue every linked thread for SimpcityThreadExtractor
        meta = {"_extractor": SimpcityThreadExtractor}
        root = self.root
        for page in self._pagination(*self.groups):
            for href in text.extract_iter(page, ' uix-href="', '"'):
                url = f"{root}{text.unquote(href)}"
                yield Message.Queue, url, meta
|
||||||
@@ -159,6 +159,7 @@ CATEGORY_MAP = {
|
|||||||
"sensescans" : "Sense-Scans",
|
"sensescans" : "Sense-Scans",
|
||||||
"sexcom" : "Sex.com",
|
"sexcom" : "Sex.com",
|
||||||
"silverpic" : "SilverPic.com",
|
"silverpic" : "SilverPic.com",
|
||||||
|
"simpcity" : "SimpCity Forums",
|
||||||
"simplyhentai" : "Simply Hentai",
|
"simplyhentai" : "Simply Hentai",
|
||||||
"sizebooru" : "Size Booru",
|
"sizebooru" : "Size Booru",
|
||||||
"slickpic" : "SlickPic",
|
"slickpic" : "SlickPic",
|
||||||
|
|||||||
112
test/results/simpcity.py
Normal file
112
test/results/simpcity.py
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
|
# published by the Free Software Foundation.
|
||||||
|
|
||||||
|
from gallery_dl.extractor import simpcity
|
||||||
|
from gallery_dl import exception
|
||||||
|
|
||||||
|
|
||||||
|
__tests__ = (
# single post: one external image link, full post + thread metadata
{
    "#url"     : "https://simpcity.cr/threads/ririkana-rr_loveit.10731/post-1753131",
    "#class"   : simpcity.SimpcityPostExtractor,
    "#auth"    : True,
    "#results" : "https://jpg5.su/img/coWRwo",

    "count" : 1,
    "num"   : 1,
    "post"  : {
        "author"    : "Zebrabobinn",
        "author_id" : "171827",
        "author_url": "https://simpcity.cr/members/zebrabobinn.171827/",
        "count"     : 1,
        "date"      : "dt:2023-03-08 12:59:10",
        "id"        : "1753131",
        "content"   : """\
<div class="bbWrapper"><a href="https://jpg5.su/img/coWRwo" target="_blank" class="link link--external" rel="noopener"><img src="https://simp6.jpg5.su/images/FqsNcNCaIAITBEL.md.jpg" data-url="https://simp6.jpg5.su/images/FqsNcNCaIAITBEL.md.jpg" class="bbImage " loading="lazy"
\t\talt="FqsNcNCaIAITBEL.md.jpg" title="FqsNcNCaIAITBEL.md.jpg" style="" width="" height="" /></a></div>\
""",
    },
    "thread": {
        "author"    : "eula",
        "author_id" : "54987",
        "author_url": "https://simpcity.cr/members/eula.54987/",
        "date"      : "dt:2022-03-11 17:15:59",
        "id"        : "10731",
        "posts"     : range(320, 500),
        "section"   : "Asians",
        "title"     : "Ririkana | RR_loveit",
        "url"       : "https://simpcity.cr/threads/ririkana-rr_loveit.10731/",
        "views"     : range(790_000, 900_000),
        "tags"      : [
            "asian",
            "big ass",
            "gravure",
            "japanese",
            "japanese big ass",
            "small tits",
            "thicc",
        ],
    },
},

# same post without authentication: must raise AuthRequired
{
    "#url"      : "https://simpcity.cr/threads/ririkana-rr_loveit.10731/post-1753131",
    "#class"    : simpcity.SimpcityPostExtractor,
    "#auth"     : False,
    "#exception": exception.AuthRequired,
},

# full thread: type checks for posts, exact thread metadata
{
    "#url"    : "https://simpcity.cr/threads/alua-tatakai.89490/",
    "#class"  : simpcity.SimpcityThreadExtractor,
    "#auth"   : True,
    "#pattern" : r"https://(jpg5\.su/img/\w+|bunkr\.\w+/[fiv]/\w+|pixeldrain.com/l/\w+|alua.com/tatakai)|/goto/post",
    "#count"  : 29,

    "count" : int,
    "num"   : int,
    "post"  : {
        "author"    : str,
        "author_id" : r"re:\d+",
        "author_url": str,
        "content"   : str,
        "count"     : int,
        "date"      : "type:datetime",
        "id"        : r"re:\d+",
    },
    "thread": {
        "author"    : "Ekalamosus",
        "author_id" : "1036155",
        "author_url": "https://simpcity.cr/members/ekalamosus.1036155/",
        "date"      : "dt:2022-07-31 15:40:14",
        "id"        : "89490",
        "posts"     : 45,
        "section"   : "Asians",
        "title"     : "Alua tatakai",
        "url"       : "https://simpcity.cr/threads/alua-tatakai.89490/",
        "views"     : range(47_000, 60_000),
        "tags"      : [
            "alter",
            "alua",
            "pinay",
        ],
    },
},

# legacy .su domain: pattern match only
{
    "#url"  : "https://simpcity.su/threads/angel-chan-wlep-wlop-menruinyanko_.12948/",
    "#class": simpcity.SimpcityThreadExtractor,
},

# forum listing: every queued URL must match the thread pattern
{
    "#url"    : "https://simpcity.cr/forums/asians.48/",
    "#class"  : simpcity.SimpcityForumExtractor,
    "#pattern": simpcity.SimpcityThreadExtractor.pattern,
    "#range"  : "1-100",
    "#count"  : 100,
},

)
|
||||||
Reference in New Issue
Block a user