228 lines
7.3 KiB
Python
228 lines
7.3 KiB
Python
# -*- coding: utf-8 -*-

# Copyright 2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
|
|
|
|
"""Extractors for https://simpcity.cr/"""
|
|
|
|
from .common import Extractor, Message
|
|
from .. import text, exception
|
|
from ..cache import cache
|
|
|
|
# URL prefix used to build the extractor 'pattern' regexes below:
# optional scheme, optional 'www.', then simpcity.cr or simpcity.su
BASE_PATTERN = r"(?:https?://)?(?:www\.)?simpcity\.(?:cr|su)"
|
|
|
|
|
|
class SimpcityExtractor(Extractor):
    """Base class for simpcity extractors"""
    category = "simpcity"
    cookies_domain = "simpcity.cr"
    # login() is skipped when cookies with these names are already present
    cookies_names = ("ogaddgmetaprof_user",)
    root = "https://simpcity.cr"

    def items(self):
        """Yield one Directory message per post and a Queue message per URL

        Posts come from the subclass-provided posts() method. URLs are the
        'href' values of <a> elements and the 'src' values of <iframe>
        elements found in each post's "content".
        """
        self.login()

        extract_urls = text.re(
            r'<(?:a [^>]*?href|iframe [^>]*?src)="([^"]+)').findall

        for post in self.posts():
            urls = extract_urls(post["content"])
            data = {"post": post}
            post["count"] = data["count"] = len(urls)
            yield Message.Directory, data
            # the same 'data' dict is yielded for every URL;
            # only its "num" entry is updated per iteration
            for data["num"], url in enumerate(urls, 1):
                yield Message.Queue, url, data

    def request_page(self, url):
        """Request 'url' and return the response

        A 403 response containing a '>Log in<' marker is re-raised as
        AuthRequired with the error message extracted from the page.
        Every other HttpError propagates unchanged.
        """
        try:
            return self.request(url)
        except exception.HttpError as exc:
            if exc.status == 403 and b">Log in<" in exc.response.content:
                raise exception.AuthRequired(
                    ("username & password", "authenticated cookies"), None,
                    self._extract_error(exc.response.text))
            raise

    def login(self):
        """Log in with username & password unless auth cookies exist"""
        if self.cookies_check(self.cookies_names):
            return

        username, password = self._get_auth_info()
        if username:
            self.cookies_update(self._login_impl(username, password))

    # NOTE(review): presumably cached for up to one year, keyed on the
    # 'username' argument (keyarg=1) -- confirm against the cache module
    @cache(maxage=365*86400, keyarg=1)
    def _login_impl(self, username, password):
        """Submit the login form and return the resulting cookies

        Returns a dict mapping cookie names to values for all session
        cookies whose domain ends with 'cookies_domain'.

        Raises AuthenticationError with the site's error message when
        the login POST request was not redirected
        (empty 'response.history').
        """
        self.log.info("Logging in as %s", username)

        url = f"{self.root}/login/login"
        # fetch the login form first to obtain its '_xfToken' value
        page = self.request(url).text
        data = {
            "_xfToken": text.extr(page, 'name="_xfToken" value="', '"'),
            "login"   : username,
            "password": password,
            "remember": "1",
            "_xfRedirect": "",
        }
        response = self.request(url, method="POST", data=data)

        if not response.history:
            err = self._extract_error(response.text)
            raise exception.AuthenticationError(f'"{err}"')

        return {
            cookie.name: cookie.value
            for cookie in self.cookies
            if cookie.domain.endswith(self.cookies_domain)
        }

    def _pagination(self, base, pnum=None):
        """Yield page contents of 'base' in ascending page order

        Without 'pnum', iteration starts at '{base}/' and continues with
        'page-2', 'page-3', ... for as long as the current page contains
        a 'pageNav-jump--next' marker.
        With 'pnum', only the page with that number is yielded.
        """
        base = f"{self.root}{base}"

        if pnum is None:
            url = f"{base}/"
            pnum = 1
        else:
            url = f"{base}/page-{pnum}"
            # 'None' signals the loop below to stop after this one page
            pnum = None

        while True:
            page = self.request_page(url).text

            yield page

            if pnum is None or "pageNav-jump--next" not in page:
                return
            pnum += 1
            url = f"{base}/page-{pnum}"

    def _pagination_reverse(self, base, pnum=None):
        """Yield page contents of 'base' from the last page to the first

        The value passed as 'pnum' is not used; it is overwritten with
        the page number taken from the final URL of the initial request.
        """
        base = f"{self.root}{base}"

        url = f"{base}/page-9999"  # force redirect to last page
        with self.request_page(url) as response:
            url = response.url
            if url[-1] == "/":
                # final URL has no 'page-N' component
                pnum = 1
            else:
                pnum = text.parse_int(url[url.rfind("-")+1:], 1)
            page = response.text

        while True:
            yield page

            pnum -= 1
            if pnum > 1:
                url = f"{base}/page-{pnum}"
            elif pnum == 1:
                # the first page is requested without a 'page-N' suffix
                url = f"{base}/"
            else:
                return

            page = self.request_page(url).text

    def _extract_error(self, html):
        """Return the unescaped text of the 'blockMessage--error' element"""
        return text.unescape(text.extr(
            html, "blockMessage--error", "</").rpartition(">")[2].strip())

    @staticmethod
    def _id_from_url(url):
        """Return the ID at the end of a '.../SLUG.ID/' or '.../ID/' URL

        Only the last path segment is inspected, so the dot inside the
        host name is never mistaken for the SLUG/ID separator, and a
        missing trailing slash does not cut off the last character.
        Returns "" for an empty 'url'.
        """
        return url.rstrip("/").rpartition("/")[2].rpartition(".")[2]

    def _parse_thread(self, page):
        """Build thread metadata from the JSON-LD data of 'page'

        Returns a dict with the keys "id", "url", "title", "date",
        "views", "posts", "tags", "section", "author", "author_id",
        and "author_url".
        """
        schema = self._extract_jsonld(page)["mainEntity"]
        author = schema["author"]
        stats = schema["interactionStatistic"]
        url_t = schema["url"]
        url_a = author.get("url") or ""

        thread = {
            "id"   : self._id_from_url(url_t),
            "url"  : url_t,
            "title": schema["headline"],
            "date" : text.parse_datetime(schema["datePublished"]),
            "views": stats[0]["userInteractionCount"],
            "posts": stats[1]["userInteractionCount"],
            "tags" : (schema["keywords"].split(", ")
                      if "keywords" in schema else ()),
            "section"   : schema["articleSection"],
            "author"    : author.get("name") or "",
            # without an author URL, fall back to the author name minus
            # its first 15 characters
            # NOTE(review): presumably strips a fixed prefix such as
            # 'Deleted member ' -- confirm
            "author_id" : (self._id_from_url(url_a) if url_a else
                           (author.get("name") or "")[15:]),
            "author_url": url_a,
        }

        return thread

    def _parse_post(self, html):
        """Build post metadata from the HTML of one <article> element

        Returns a dict with the keys "author", "id", "author_url",
        "date", "content", and "author_id".
        """
        extr = text.extract_from(html)

        # NOTE(review): 'extr' presumably searches forward from the end
        # of its previous match, making the order of these calls
        # significant -- confirm against text.extract_from
        post = {
            "author": extr('data-author="', '"'),
            "id": extr('data-content="post-', '"'),
            "author_url": extr('itemprop="url" content="', '"'),
            "date": text.parse_datetime(extr('datetime="', '"')),
            "content": extr('<div itemprop="text">',
                            '<div class="js-selectToQuote').strip(),
        }

        post["author_id"] = self._id_from_url(post["author_url"])

        return post
|
|
|
|
|
|
class SimpcityPostExtractor(SimpcityExtractor):
    """Extractor for a single simpcity post"""
    subcategory = "post"
    pattern = rf"{BASE_PATTERN}/(?:threads/[^/?#]+/post-|posts/)(\d+)"
    example = "https://simpcity.cr/threads/TITLE.12345/post-54321"

    def posts(self):
        """Return a 1-tuple with the parsed post named by the input URL

        Also stores the metadata of the post's thread in
        'self.kwdict["thread"]'.

        Raises NotFoundError when the fetched page contains no
        'data-content="post-ID"' marker for the requested post.
        """
        post_id = self.groups[0]
        page = self.request_page(f"{self.root}/posts/{post_id}/").text

        marker = f'data-content="post-{post_id}"'
        offset = page.find(marker)
        if offset < 0:
            raise exception.NotFoundError("post")

        # begin the search 200 characters ahead of the marker so that the
        # '<article ' opening this post is part of the searched range
        article = text.extract(
            page, "<article ", "</article>", offset - 200)[0]

        self.kwdict["thread"] = self._parse_thread(page)
        return (self._parse_post(article),)
|
|
|
|
|
|
class SimpcityThreadExtractor(SimpcityExtractor):
    """Extractor for all posts of a simpcity thread"""
    subcategory = "thread"
    pattern = rf"{BASE_PATTERN}(/threads/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
    example = "https://simpcity.cr/threads/TITLE.12345/"

    def posts(self):
        """Yield the parsed posts of every page of the thread

        Posts are yielded in ascending order only when the 'order-posts'
        option is set to a value not starting with "d" or "r".
        Otherwise pages are walked from last to first and the posts of
        each page are yielded in reverse.
        """
        order = self.config("order-posts")
        ascending = bool(order) and order[0] not in ("d", "r")

        if ascending:
            paginate = self._pagination
        else:
            paginate = self._pagination_reverse

        for page in paginate(*self.groups):
            # thread metadata is parsed once, from the first page fetched
            if "thread" not in self.kwdict:
                self.kwdict["thread"] = self._parse_thread(page)

            articles = text.extract_iter(page, "<article ", "</article>")
            if not ascending:
                articles = reversed(list(articles))

            for article in articles:
                yield self._parse_post(article)
|
|
|
|
|
|
class SimpcityForumExtractor(SimpcityExtractor):
    """Extractor for the threads listed in a simpcity forum"""
    subcategory = "forum"
    pattern = rf"{BASE_PATTERN}(/forums/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
    example = "https://simpcity.cr/forums/TITLE.123/"

    def items(self):
        """Yield a Queue message for every thread link on the forum pages

        Each ' uix-href="..."' value found on a page is unquoted, joined
        with 'self.root', and queued for SimpcityThreadExtractor.
        """
        # This override replaces the base-class items(), so log in here
        # as well; otherwise configured username & password would never
        # be applied to the forum listing requests.
        self.login()

        data = {"_extractor": SimpcityThreadExtractor}
        for page in self._pagination(*self.groups):
            for path in text.extract_iter(page, ' uix-href="', '"'):
                yield Message.Queue, f"{self.root}{text.unquote(path)}", data
|