[xenforo] implement generic XenForo forum extractors
support - https://simpcity.cr/ - https://nudostar.com/forum/ (#8333)
This commit is contained in:
@@ -146,7 +146,6 @@ modules = [
|
||||
"nozomi",
|
||||
"nsfwalbum",
|
||||
"nudostar",
|
||||
"nudostarforum",
|
||||
"okporn",
|
||||
"paheal",
|
||||
"patreon",
|
||||
@@ -187,7 +186,6 @@ modules = [
|
||||
"senmanga",
|
||||
"sexcom",
|
||||
"shimmie2",
|
||||
"simpcity",
|
||||
"simplyhentai",
|
||||
"sizebooru",
|
||||
"skeb",
|
||||
@@ -235,6 +233,7 @@ modules = [
|
||||
"wikifeet",
|
||||
"wikimedia",
|
||||
"xasiat",
|
||||
"xenforo",
|
||||
"xfolio",
|
||||
"xhamster",
|
||||
"xvideos",
|
||||
|
||||
@@ -1,201 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extractors for https://nudostar.com/forum/"""
|
||||
|
||||
from .common import Extractor, Message
|
||||
from .. import text, exception
|
||||
from ..cache import cache
|
||||
|
||||
# URL prefix shared by the extractor patterns in this module: an optional
# scheme and optional "www.", followed by the nudostar.com forum path.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?nudostar\.com/forum"
|
||||
|
||||
|
||||
class NudostarforumExtractor(Extractor):
    """Base class for nudostar forum extractors

    Subclasses supply posts(), an iterable of post dicts as produced by
    _parse_post(), and are expected to store thread metadata in
    self.kwdict["thread"].
    """
    category = "nudostarforum"
    cookies_domain = "nudostar.com"
    # login() skips the credential login when these cookies are present
    cookies_names = ("xf_user",)
    root = "https://nudostar.com/forum"
    directory_fmt = ("{category}", "{thread[title]} ({thread[id]})")
    filename_fmt = "{post[id]}_{num:>02}_{filename}.{extension}"
    archive_fmt = "{post[id]}/{filename}"

    def items(self):
        """Yield the messages for every post returned by posts()

        Per post: one Directory message, then a Url message for each
        internal attachment, then a Queue message for each external link.
        'num' counts across both groups; 'count' is their combined total.
        """
        self.login()

        for post in self.posts():
            internal, external = self._extract_post_urls(post["content"])

            data = {"post": post}
            post["count"] = data["count"] = len(internal) + len(external)
            yield Message.Directory, "", data

            data["num"] = 0
            for url in internal:
                data["num"] += 1
                text.nameext_from_url(url, data)
                yield Message.Url, url, data

            for url in external:
                data["num"] += 1
                yield Message.Queue, url, data

    def _extract_post_urls(self, content):
        """Extract image and video URLs from post content

        Scans every href="..." and src="..." value in 'content' and
        returns a tuple (internal, external) of two lists:

        internal -- forum attachment URLs, made absolute
        external -- http(s) URLs that do not mention nudostar.com

        Each URL appears at most once. Attachments are de-duplicated on
        their absolute form, so a relative and an absolute reference to
        the same attachment yield a single entry.
        """
        internal = []
        external = []
        seen = set()

        # Extract URLs from both href= and src= attributes
        for attr in ('href="', 'src="'):
            for url in text.extract_iter(content, attr, '"'):

                # Internal attachments
                if "/forum/attachments/" in url:
                    # Skip numeric-only IDs and non-file links
                    path = url.rstrip("/")
                    if path.split(".")[-1].isdigit() and "-" not in path:
                        continue
                    if "upload?" in url:
                        continue

                    # Normalize to full URL *before* the duplicate check;
                    # checking the raw value let the same attachment
                    # through twice when it was referenced both as
                    # "/forum/..." and as "https://nudostar.com/forum/..."
                    if url.startswith("/"):
                        url = "https://nudostar.com" + url
                    if url in seen:
                        continue
                    seen.add(url)
                    internal.append(url)

                # External image hosts
                elif url.startswith("http") and "nudostar.com" not in url:
                    if url in seen:
                        continue
                    seen.add(url)
                    external.append(url)

        return internal, external

    def request_page(self, url):
        """Request 'url' and return the response

        An HttpError with status 403 is re-raised as AuthRequired;
        any other HttpError propagates unchanged.
        """
        try:
            return self.request(url)
        except exception.HttpError as exc:
            if exc.status == 403:
                raise exception.AuthRequired(
                    ("username & password", "authenticated cookies"), None,
                    "Login required to view this content")
            raise

    def login(self):
        """Log in with configured credentials unless cookies are present

        Does nothing when the cookies named in 'cookies_names' are
        already available or when no username is configured.
        """
        if self.cookies_check(self.cookies_names):
            return

        username, password = self._get_auth_info()
        if username:
            self.cookies_update(self._login_impl(username, password))

    # NOTE(review): keyarg=1 presumably keys the cached result on
    # 'username'; confirm against the cache module
    @cache(maxage=365*86400, keyarg=1)
    def _login_impl(self, username, password):
        """Submit the forum login form and return the session cookies

        Returns a dict (name -> value) of all session cookies whose
        domain ends with 'cookies_domain'.

        Raises AuthenticationError when the login response has no
        redirect history or did not set an 'xf_user' cookie.
        """
        self.log.info("Logging in as %s", username)

        # The login form embeds a token that must be posted back
        url = f"{self.root}/login/"
        page = self.request(url).text
        token = text.extr(page, 'name="_xfToken" value="', '"')

        url = f"{self.root}/login/login"
        data = {
            "_xfToken"   : token,
            "login"      : username,
            "password"   : password,
            "remember"   : "1",
            "_xfRedirect": self.root + "/",
        }
        response = self.request(url, method="POST", data=data)

        if not response.history or "xf_user" not in response.cookies:
            raise exception.AuthenticationError()

        return {
            cookie.name: cookie.value
            for cookie in self.cookies
            if cookie.domain.endswith(self.cookies_domain)
        }

    def _pagination(self, base, pnum=None):
        """Yield the HTML of the pages found under 'base'

        base -- path relative to 'root', e.g. "/threads/NAME.12345"
        pnum -- page number to fetch; when given, only that single page
                is yielded. When None, iteration starts at the first
                page and continues for as long as a page contains the
                "pageNav-jump--next" marker.
        """
        if pnum is None:
            url = f"{self.root}{base}/"
            pnum = 1
        else:
            url = f"{self.root}{base}/page-{pnum}"
            # None marks single-page mode for the loop below
            pnum = None

        while True:
            page = self.request_page(url).text
            yield page

            if pnum is None or "pageNav-jump--next" not in page:
                return
            pnum += 1
            url = f"{self.root}{base}/page-{pnum}"

    def _parse_thread(self, page):
        """Return a dict with the thread's 'id' and 'title'

        The title is taken from the <title> element, with everything
        from the last " | " separator onwards removed.
        """
        extr = text.extract_from(page)

        title = text.unescape(extr("<title>", "<"))
        if " | " in title:
            title = title.rpartition(" | ")[0]

        thread_id = extr('data-content-key="thread-', '"')

        return {
            "id"   : thread_id,
            "title": title.strip(),
        }

    def _parse_post(self, html):
        """Return a dict with 'author', 'id', 'date' and 'content'

        'html' is the markup of a single post <article>; it is stored
        unmodified as 'content'.
        """
        extr = text.extract_from(html)

        return {
            "author": extr('data-author="', '"'),
            "id"    : extr('data-content="post-', '"'),
            "date"  : extr('datetime="', '"'),
            "content": html,  # Pass full article HTML for URL extraction
        }
|
||||
|
||||
|
||||
class NudostarforumPostExtractor(NudostarforumExtractor):
    """Extractor for individual posts on nudostar forum"""
    subcategory = "post"
    pattern = rf"{BASE_PATTERN}/threads/[^/?#]+\.(\d+)/post-(\d+)"
    example = "https://nudostar.com/forum/threads/NAME.12345/post-67890"

    def posts(self):
        """Return a 1-tuple holding the post named in the URL

        Also stores the parsed thread metadata in self.kwdict["thread"].
        Raises NotFoundError when the fetched page has no marker for
        the requested post.
        """
        # groups are (thread id, post id); only the post id is needed
        _, post_id = self.groups
        response = self.request_page(f"{self.root}/posts/{post_id}/")
        page = response.text

        marker = f'data-content="post-{post_id}"'
        offset = page.find(marker)
        if offset == -1:
            raise exception.NotFoundError("post")

        # Begin the search a little before the marker so that the
        # opening <article tag preceding it is picked up
        article = text.extract(
            page, "<article ", "</article>", offset - 200)[0]

        self.kwdict["thread"] = self._parse_thread(page)
        return (self._parse_post(article),)
|
||||
|
||||
|
||||
class NudostarforumThreadExtractor(NudostarforumExtractor):
    """Extractor for threads on nudostar forum"""
    subcategory = "thread"
    pattern = rf"{BASE_PATTERN}(/threads/[^/?#]+\.(\d+))(?:/page-(\d+))?"
    example = "https://nudostar.com/forum/threads/NAME.12345/"

    def posts(self):
        """Yield a parsed post for every <article> on the thread's pages

        Thread metadata is parsed from the first page on which
        self.kwdict does not yet contain a "thread" entry.
        """
        # groups are (thread path, thread id, optional page number)
        thread_path, _, page_number = self.groups

        for page in self._pagination(thread_path, page_number):
            if "thread" not in self.kwdict:
                self.kwdict["thread"] = self._parse_thread(page)

            articles = text.extract_iter(page, "<article ", "</article>")
            yield from map(self._parse_post, articles)
|
||||
@@ -6,36 +6,38 @@
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extractors for https://simpcity.cr/"""
|
||||
"""Extractors for XenForo forums"""
|
||||
|
||||
from .common import Extractor, Message
|
||||
from .common import BaseExtractor, Message
|
||||
from .. import text, exception
|
||||
from ..cache import cache
|
||||
|
||||
BASE_PATTERN = r"(?:https?://)?(?:www\.)?simpcity\.(?:cr|su)"
|
||||
|
||||
|
||||
class SimpcityExtractor(Extractor):
|
||||
"""Base class for simpcity extractors"""
|
||||
category = "simpcity"
|
||||
cookies_domain = "simpcity.cr"
|
||||
class XenforoExtractor(BaseExtractor):
|
||||
"""Base class for xenforo extractors"""
|
||||
basecategory = "xenforo"
|
||||
# cookies_domain = "simpcity.cr"
|
||||
cookies_names = ("ogaddgmetaprof_user",)
|
||||
root = "https://simpcity.cr"
|
||||
directory_fmt = ("{category}", "{thread[section]}",
|
||||
"{thread[title]} ({thread[id]})")
|
||||
filename_fmt = "{post[id]}_{num:>02}_{id}_{filename}.{extension}"
|
||||
archive_fmt = "{post[id]}/{type[0]}{id}_{filename}"
|
||||
|
||||
def __init__(self, match):
|
||||
BaseExtractor.__init__(self, match)
|
||||
self.cookies_domain = "." + self.root.split("/")[2]
|
||||
self.cookies_names = self.config_instance("cookies")
|
||||
|
||||
def items(self):
|
||||
self.login()
|
||||
|
||||
extract_urls = text.re(
|
||||
r'(?s)(?:'
|
||||
r'<video (.*?\ssrc="[^"]+".*?)</video>'
|
||||
r'|<a [^>]*?href="'
|
||||
r'(?:https://[^"]+)?(/attachments/[^"]+".*?)</a>'
|
||||
r'|<div [^>]*?data-src="'
|
||||
r'(?:https://[^"]+)?(/attachments/[^"]+".*?)/>'
|
||||
r'|<a [^>]*?href="[^"]*?'
|
||||
r'(/attachments/[^"]+".*?)</a>'
|
||||
r'|<div [^>]*?data-src="[^"]*?'
|
||||
r'(/attachments/[^"]+".*?)/>'
|
||||
r'|(?:<a [^>]*?href="|<iframe [^>]*?src="|'
|
||||
r'''onclick="loadMedia\(this, ')([^"']+)'''
|
||||
r')'
|
||||
@@ -50,6 +52,7 @@ class SimpcityExtractor(Extractor):
|
||||
post["count"] = data["count"] = len(urls)
|
||||
yield Message.Directory, "", data
|
||||
|
||||
id_last = None
|
||||
data["num"] = data["num_internal"] = data["num_external"] = 0
|
||||
for video, inl1, inl2, ext in urls:
|
||||
if ext:
|
||||
@@ -71,12 +74,14 @@ class SimpcityExtractor(Extractor):
|
||||
yield Message.Url, url, data
|
||||
|
||||
elif (inline := inl1 or inl2):
|
||||
data["num"] += 1
|
||||
data["num_internal"] += 1
|
||||
data["type"] = "inline"
|
||||
path = inline[:inline.find('"')]
|
||||
name, _, id = path[path.rfind("/", 0, -1):].strip(
|
||||
"/").rpartition(".")
|
||||
if id == id_last:
|
||||
id_last = None
|
||||
continue
|
||||
else:
|
||||
id_last = id
|
||||
data["id"] = text.parse_int(id)
|
||||
if alt := text.extr(inline, 'alt="', '"'):
|
||||
text.nameext_from_name(alt, data)
|
||||
@@ -85,6 +90,9 @@ class SimpcityExtractor(Extractor):
|
||||
else:
|
||||
data["filename"], _, data["extension"] = \
|
||||
name.rpartition("-")
|
||||
data["num"] += 1
|
||||
data["num_internal"] += 1
|
||||
data["type"] = "inline"
|
||||
yield Message.Url, self.root + path, data
|
||||
|
||||
def request_page(self, url):
|
||||
@@ -180,10 +188,15 @@ class SimpcityExtractor(Extractor):
|
||||
html, "blockMessage--error", "</").rpartition(">")[2].strip())
|
||||
|
||||
def _parse_thread(self, page):
|
||||
schema = self._extract_jsonld(page)["mainEntity"]
|
||||
try:
|
||||
data = self._extract_jsonld(page)
|
||||
except ValueError:
|
||||
return {}
|
||||
|
||||
schema = data.get("mainEntity", data)
|
||||
author = schema["author"]
|
||||
stats = schema["interactionStatistic"]
|
||||
url_t = schema["url"]
|
||||
url_t = schema.get("url") or schema.get("@id") or ""
|
||||
url_a = author.get("url") or ""
|
||||
|
||||
thread = {
|
||||
@@ -191,8 +204,6 @@ class SimpcityExtractor(Extractor):
|
||||
"url" : url_t,
|
||||
"title": schema["headline"],
|
||||
"date" : self.parse_datetime_iso(schema["datePublished"]),
|
||||
"views": stats[0]["userInteractionCount"],
|
||||
"posts": stats[1]["userInteractionCount"],
|
||||
"tags" : (schema["keywords"].split(", ")
|
||||
if "keywords" in schema else ()),
|
||||
"section" : schema["articleSection"],
|
||||
@@ -202,6 +213,13 @@ class SimpcityExtractor(Extractor):
|
||||
"author_url": url_a,
|
||||
}
|
||||
|
||||
if isinstance(stats, list):
|
||||
thread["views"] = stats[0]["userInteractionCount"]
|
||||
thread["posts"] = stats[1]["userInteractionCount"]
|
||||
else:
|
||||
thread["views"] = -1
|
||||
thread["posts"] = stats["userInteractionCount"]
|
||||
|
||||
return thread
|
||||
|
||||
def _parse_post(self, html):
|
||||
@@ -210,13 +228,11 @@ class SimpcityExtractor(Extractor):
|
||||
post = {
|
||||
"author": extr('data-author="', '"'),
|
||||
"id": extr('data-content="post-', '"'),
|
||||
"author_url": extr('itemprop="url" content="', '"'),
|
||||
"author_url": (extr('itemprop="url" content="', '"') or
|
||||
extr('<a href="', '"')),
|
||||
"date": self.parse_datetime_iso(extr('datetime="', '"')),
|
||||
"content": (
|
||||
extr('<div itemprop="text">',
|
||||
'<div class="js-selectToQuote') or
|
||||
extr('<div >',
|
||||
'<div class="js-selectToQuote')).strip(),
|
||||
"content": extr('class="message-body',
|
||||
'<div class="js-selectToQuote'),
|
||||
"attachments": extr('<section class="message-attachments">',
|
||||
'</section>'),
|
||||
}
|
||||
@@ -224,16 +240,35 @@ class SimpcityExtractor(Extractor):
|
||||
url_a = post["author_url"]
|
||||
post["author_id"] = url_a[url_a.rfind(".")+1:-1]
|
||||
|
||||
con = post["content"]
|
||||
if (pos := con.find('<div class="bbWrapper')) >= 0:
|
||||
con = con[pos:]
|
||||
post["content"] = con.strip()
|
||||
|
||||
return post
|
||||
|
||||
|
||||
class SimpcityPostExtractor(SimpcityExtractor):
|
||||
BASE_PATTERN = XenforoExtractor.update({
|
||||
"simpcity": {
|
||||
"root": "https://simpcity.cr",
|
||||
"pattern": r"(?:www\.)?simpcity\.(?:cr|su)",
|
||||
"cookies": ("ogaddgmetaprof_user",),
|
||||
},
|
||||
"nudostarforum": {
|
||||
"root": "https://nudostar.com/forum",
|
||||
"pattern": r"(?:www\.)?nudostar\.com/forum",
|
||||
"cookies": ("xf_user",),
|
||||
},
|
||||
})
|
||||
|
||||
|
||||
class XenforoPostExtractor(XenforoExtractor):
|
||||
subcategory = "post"
|
||||
pattern = rf"{BASE_PATTERN}/(?:threads/[^/?#]+/post-|posts/)(\d+)"
|
||||
example = "https://simpcity.cr/threads/TITLE.12345/post-54321"
|
||||
|
||||
def posts(self):
|
||||
post_id = self.groups[0]
|
||||
post_id = self.groups[-1]
|
||||
url = f"{self.root}/posts/{post_id}/"
|
||||
page = self.request_page(url).text
|
||||
|
||||
@@ -246,18 +281,21 @@ class SimpcityPostExtractor(SimpcityExtractor):
|
||||
return (self._parse_post(html),)
|
||||
|
||||
|
||||
class SimpcityThreadExtractor(SimpcityExtractor):
|
||||
class XenforoThreadExtractor(XenforoExtractor):
|
||||
subcategory = "thread"
|
||||
pattern = rf"{BASE_PATTERN}(/threads/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
|
||||
example = "https://simpcity.cr/threads/TITLE.12345/"
|
||||
|
||||
def posts(self):
|
||||
path = self.groups[-2]
|
||||
pnum = self.groups[-1]
|
||||
|
||||
if (order := self.config("order-posts")) and \
|
||||
order[0] not in ("d", "r"):
|
||||
pages = self._pagination(*self.groups)
|
||||
pages = self._pagination(path, pnum)
|
||||
reverse = False
|
||||
else:
|
||||
pages = self._pagination_reverse(*self.groups)
|
||||
pages = self._pagination_reverse(path, pnum)
|
||||
reverse = True
|
||||
|
||||
for page in pages:
|
||||
@@ -271,13 +309,18 @@ class SimpcityThreadExtractor(SimpcityExtractor):
|
||||
yield self._parse_post(html)
|
||||
|
||||
|
||||
class SimpcityForumExtractor(SimpcityExtractor):
|
||||
class XenforoForumExtractor(XenforoExtractor):
|
||||
subcategory = "forum"
|
||||
pattern = rf"{BASE_PATTERN}(/forums/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
|
||||
pattern = rf"{BASE_PATTERN}(/forums/(?:[^/?#]+\.)?[^/?#]+)(?:/page-(\d+))?"
|
||||
example = "https://simpcity.cr/forums/TITLE.123/"
|
||||
|
||||
def items(self):
|
||||
data = {"_extractor": SimpcityThreadExtractor}
|
||||
for page in self._pagination(*self.groups):
|
||||
for path in text.extract_iter(page, ' uix-href="', '"'):
|
||||
extract_threads = text.re(
|
||||
r'(/threads/[^"]+)"[^>]+data-xf-init=').findall
|
||||
|
||||
data = {"_extractor": XenforoThreadExtractor}
|
||||
path = self.groups[-2]
|
||||
pnum = self.groups[-1]
|
||||
for page in self._pagination(path, pnum):
|
||||
for path in extract_threads(page):
|
||||
yield Message.Queue, f"{self.root}{text.unquote(path)}", data
|
||||
Reference in New Issue
Block a user