Files
gallery-dl/gallery_dl/extractor/simpcity.py
2025-10-17 22:49:41 +02:00

187 lines
6.1 KiB
Python

# -*- coding: utf-8 -*-
# Copyright 2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://simpcity.cr/"""
from .common import Extractor, Message
from .. import text, exception
BASE_PATTERN = r"(?:https?://)?(?:www\.)?simpcity\.(?:cr|su)"
class SimpcityExtractor(Extractor):
"""Base class for simpcity extractors"""
category = "simpcity"
root = "https://simpcity.cr"
def items(self):
extract_urls = text.re(
r'<(?:a [^>]*?href|iframe [^>]*?src)="([^"]+)').findall
for post in self.posts():
urls = extract_urls(post["content"])
data = {"post": post}
post["count"] = data["count"] = len(urls)
yield Message.Directory, data
for data["num"], url in enumerate(urls, 1):
yield Message.Queue, url, data
def request_page(self, url):
try:
return self.request(url)
except exception.HttpError as exc:
if exc.status == 403 and b">Log in<" in exc.response.content:
msg = text.extr(exc.response.text, "blockMessage--error", "</")
raise exception.AuthRequired(
"'authenticated cookies'", None,
msg.rpartition(">")[2].strip())
raise
def _pagination(self, base, pnum=None):
base = f"{self.root}{base}"
if pnum is None:
url = f"{base}/"
pnum = 1
else:
url = f"{base}/page-{pnum}"
pnum = None
while True:
page = self.request_page(url).text
yield page
if pnum is None or "pageNav-jump--next" not in page:
return
pnum += 1
url = f"{base}/page-{pnum}"
def _pagination_reverse(self, base, pnum=None):
base = f"{self.root}{base}"
url = f"{base}/page-9999" # force redirect to last page
with self.request_page(url) as response:
url = response.url
if url[-1] == "/":
pnum = 1
else:
pnum = text.parse_int(url[url.rfind("-")+1:], 1)
page = response.text
while True:
yield page
pnum -= 1
if pnum > 1:
url = f"{base}/page-{pnum}"
elif pnum == 1:
url = f"{base}/"
else:
return
page = self.request_page(url).text
def _parse_thread(self, page):
schema = self._extract_jsonld(page)["mainEntity"]
author = schema["author"]
stats = schema["interactionStatistic"]
url_t = schema["url"]
url_a = author.get("url") or ""
thread = {
"id" : url_t[url_t.rfind(".")+1:-1],
"url" : url_t,
"title": schema["headline"],
"date" : self.parse_datetime_iso(schema["datePublished"]),
"views": stats[0]["userInteractionCount"],
"posts": stats[1]["userInteractionCount"],
"tags" : (schema["keywords"].split(", ")
if "keywords" in schema else ()),
"section" : schema["articleSection"],
"author" : author.get("name") or "",
"author_id" : (url_a[url_a.rfind(".")+1:-1] if url_a else
(author.get("name") or "")[15:]),
"author_url": url_a,
}
return thread
def _parse_post(self, html):
extr = text.extract_from(html)
post = {
"author": extr('data-author="', '"'),
"id": extr('data-content="post-', '"'),
"author_url": extr('itemprop="url" content="', '"'),
"date": self.parse_datetime_iso(extr('datetime="', '"')),
"content": extr('<div itemprop="text">',
'<div class="js-selectToQuote').strip(),
}
url_a = post["author_url"]
post["author_id"] = url_a[url_a.rfind(".")+1:-1]
return post
class SimpcityPostExtractor(SimpcityExtractor):
subcategory = "post"
pattern = rf"{BASE_PATTERN}/(?:threads/[^/?#]+/post-|posts/)(\d+)"
example = "https://simpcity.cr/threads/TITLE.12345/post-54321"
def posts(self):
post_id = self.groups[0]
url = f"{self.root}/posts/{post_id}/"
page = self.request_page(url).text
pos = page.find(f'data-content="post-{post_id}"')
if pos < 0:
raise exception.NotFoundError("post")
html = text.extract(page, "<article ", "</article>", pos-200)[0]
self.kwdict["thread"] = self._parse_thread(page)
return (self._parse_post(html),)
class SimpcityThreadExtractor(SimpcityExtractor):
subcategory = "thread"
pattern = rf"{BASE_PATTERN}(/threads/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
example = "https://simpcity.cr/threads/TITLE.12345/"
def posts(self):
if (order := self.config("order-posts")) and \
order[0] not in ("d", "r"):
pages = self._pagination(*self.groups)
reverse = False
else:
pages = self._pagination_reverse(*self.groups)
reverse = True
for page in pages:
if "thread" not in self.kwdict:
self.kwdict["thread"] = self._parse_thread(page)
posts = text.extract_iter(page, "<article ", "</article>")
if reverse:
posts = list(posts)
posts.reverse()
for html in posts:
yield self._parse_post(html)
class SimpcityForumExtractor(SimpcityExtractor):
subcategory = "forum"
pattern = rf"{BASE_PATTERN}(/forums/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
example = "https://simpcity.cr/forums/TITLE.123/"
def items(self):
data = {"_extractor": SimpcityThreadExtractor}
for page in self._pagination(*self.groups):
for path in text.extract_iter(page, ' uix-href="', '"'):
yield Message.Queue, f"{self.root}{text.unquote(path)}", data