[simpcity] extract 'tiktok' media embeds (#8994)

This commit is contained in:
Mike Fährmann
2026-02-04 11:16:32 +01:00
parent b4351b8193
commit 9379397eec
2 changed files with 44 additions and 4 deletions

View File

@@ -9,7 +9,7 @@
"""Extractors for XenForo forums""" """Extractors for XenForo forums"""
from .common import BaseExtractor, Message from .common import BaseExtractor, Message
from .. import text, exception from .. import text, util, exception
from ..cache import cache from ..cache import cache
import binascii import binascii
@@ -46,10 +46,10 @@ class XenforoExtractor(BaseExtractor):
base = root if (pos := root.find("/", 8)) < 0 else root[:pos] base = root if (pos := root.find("/", 8)) < 0 else root[:pos]
for post in self.posts(): for post in self.posts():
urls = extract_urls(post["content"]) urls = extract_urls(post["content"])
if "data-s9e-mediaembed-iframe=" in post["content"]:
self._extract_embeds(urls, post)
if post["attachments"]: if post["attachments"]:
for att in text.extract_iter( self._extract_attachments(urls, post)
post["attachments"], "<li", "</li>"):
urls.append((None, att[att.find('href="')+6:], None, None))
data = {"post": post} data = {"post": post}
post["count"] = data["count"] = len(urls) post["count"] = data["count"] = len(urls)
@@ -340,6 +340,37 @@ class XenforoExtractor(BaseExtractor):
data["author_id"] = data["author"][15:] data["author_id"] = data["author"][15:]
return data return data
def _extract_attachments(self, urls, post):
    """Append one entry to 'urls' per '<li>' item in post["attachments"]"""
    href = 'href="'
    for item in text.extract_iter(post["attachments"], "<li", "</li>"):
        # keep everything following the first 'href="' marker;
        # only the second tuple slot is filled for attachments
        start = item.find(href) + 6
        entry = (None, item[start:], None, None)
        urls.append(entry)
def _extract_embeds(self, urls, post):
    """Append URLs of s9e media embeds in post["content"] to 'urls'

    Each "data-s9e-mediaembed-iframe" attribute value is decoded as JSON
    and read as a flat sequence of alternating names and values.
    Only embeds of type 'tiktok' are turned into a URL; any other type
    is reported with a warning and skipped.
    """
    marker = "data-s9e-mediaembed-iframe='"
    for raw in text.extract_iter(post["content"], marker, "'"):
        # fold the alternating name/value items into a dict
        attrs = {}
        name = None
        for item in util.json_loads(raw):
            if name is not None:
                attrs[name] = item
                name = None
            else:
                name = item

        src = attrs.get("src")
        if not src:
            # nothing to build a URL from; dump the attributes for debugging
            self.log.debug(attrs)
            continue

        kind = attrs.get("data-s9e-mediaembed")
        if kind != "tiktok":
            self.log.warning("%s: Unsupported media embed type '%s'",
                             post["id"], kind)
            continue

        # the video ID is whatever follows the last '#' in 'src'
        video_id = src[src.rfind("#")+1:]
        # only the fourth tuple slot is filled for embeds
        urls.append(
            (None, None, None, "https://www.tiktok.com/@/video/" + video_id))
def _extract_media(self, url, file): def _extract_media(self, url, file):
media = {} media = {}
name, _, media["id"] = file.rpartition(".") name, _, media["id"] = file.rpartition(".")

View File

@@ -240,6 +240,15 @@ __tests__ = (
), ),
}, },
{
"#url" : "https://simpcity.cr/threads/arianaskyeshelby-itsarianaskyebaby-busty.1237895/post-40205575",
"#comment" : "tiktok s9e media embed iframe (#8994)",
"#category": ("xenforo", "simpcity", "post"),
"#class" : xenforo.XenforoPostExtractor,
"#auth" : True,
"#results" : "https://www.tiktok.com/@/video/7556556034794425631",
},
{ {
"#url" : "https://simpcity.cr/threads/alua-tatakai.89490/", "#url" : "https://simpcity.cr/threads/alua-tatakai.89490/",
"#category": ("xenforo", "simpcity", "thread"), "#category": ("xenforo", "simpcity", "thread"),