[simpcity] extract attachment/inline files (#8560)

This commit is contained in:
Mike Fährmann
2025-11-26 18:18:41 +01:00
parent aac1d8f36d
commit 25ac5099cf
2 changed files with 152 additions and 13 deletions

View File

@@ -21,20 +21,66 @@ class SimpcityExtractor(Extractor):
cookies_domain = "simpcity.cr"
cookies_names = ("ogaddgmetaprof_user",)
root = "https://simpcity.cr"
directory_fmt = ("{category}", "{thread[section]}",
"{thread[title]} ({thread[id]})")
filename_fmt = "{post[id]}_{num:>02}_{id}_{filename}.{extension}"
archive_fmt = "{post[id]}/{type[0]}{id}_{filename}"
def items(self):
    """Yield directory, download and queue messages for every post.

    The 'content' HTML of each post and, when non-empty, its
    'attachments' HTML are scanned with one four-group pattern.
    Each match fills exactly one group, which selects the message:

    - generic link / iframe  -> Message.Queue with the raw URL
    - <video> element        -> Message.Url with its 'src' URL
    - '/attachments/' link   -> Message.Url with self.root + path

    The same 'data' dict is updated in place and reused for every
    message of a post; 'num' counts all files, 'num_internal' and
    'num_external' count hosted and queued files respectively.
    """
    self.login()

    # findall() returns (video, inline_a, inline_div, external) tuples;
    # groups that did not take part in a match are empty strings.
    # 'ata-src' also matches 'data-src': the leading 'd' is consumed
    # by the lazy '[^>]*?' in front of it.
    extract_urls = text.re(
        r'(?s)<(?:'
        r'video (.*?\ssrc="[^"]+".*?)</video>'
        r'|a [^>]*?href="(?:https://[^"]+)?(/attachments/[^"]+".*?)</a>'
        r'|div [^>]*?ata-src="(?:https://[^"]+)?(/attachments/[^"]+".*?)/>'
        r'|(?:a [^>]*?href|iframe [^>]*?src)="([^"]+)'
        r')'
    ).findall

    for post in self.posts():
        urls = extract_urls(post["content"])
        if post["attachments"]:
            urls.extend(extract_urls(post["attachments"]))

        data = {"post": post}
        post["count"] = data["count"] = len(urls)
        yield Message.Directory, data

        data["num"] = data["num_internal"] = data["num_external"] = 0
        for video, inl1, inl2, ext in urls:
            if ext:
                # link to another site: hand it to a matching extractor
                data["num"] += 1
                data["num_external"] += 1
                data["type"] = "external"
                yield Message.Queue, ext, data

            elif video:
                data["num"] += 1
                data["num_internal"] += 1
                data["type"] = "video"
                url = text.extr(video, 'src="', '"')
                text.nameext_from_url(url, data)
                # the part of the filename before the first '-' is
                # parsed as the numeric file ID
                data["id"] = text.parse_int(
                    data["filename"].partition("-")[0])
                yield Message.Url, url, data

            elif (inline := inl1 or inl2):
                data["num"] += 1
                data["num_internal"] += 1
                data["type"] = "inline"
                # the capture continues past the URL's closing quote,
                # so cut the path off at the first '"'
                path = inline[:inline.find('"')]
                # last path segment, e.g. 'name-jpg.65924'
                slug = path[path.rfind("/", 0, -1):].strip("/")
                name, _, file_id = slug.rpartition(".")
                data["id"] = text.parse_int(file_id)
                if alt := text.extr(inline, 'alt="', '"'):
                    # prefer the original filename from the 'alt' text
                    text.nameext_from_name(alt, data)
                    if not data["extension"]:
                        data["extension"] = name.rpartition("-")[2]
                else:
                    data["filename"], _, data["extension"] = \
                        name.rpartition("-")
                yield Message.Url, self.root + path, data
def request_page(self, url):
try:
@@ -166,6 +212,8 @@ class SimpcityExtractor(Extractor):
'<div class="js-selectToQuote') or
extr('<div >',
'<div class="js-selectToQuote')).strip(),
"attachments": extr('<section class="message-attachments">',
'</section>'),
}
url_a = post["author_url"]

View File

@@ -13,7 +13,7 @@ __tests__ = (
"#url" : "https://simpcity.cr/threads/ririkana-rr_loveit.10731/post-1753131",
"#class" : simpcity.SimpcityPostExtractor,
"#auth" : True,
"#results" : "https://jpg5.su/img/coWRwo",
"#results" : "https://jpg6.su/img/coWRwo",
"count" : 1,
"num" : 1,
@@ -25,7 +25,7 @@ __tests__ = (
"date" : "dt:2023-03-08 12:59:10",
"id" : "1753131",
"content" : """\
<div class="bbWrapper"><a href="https://jpg5.su/img/coWRwo" target="_blank" class="link link--external" rel="noopener"><img src="https://simp6.jpg5.su/images/FqsNcNCaIAITBEL.md.jpg" data-url="https://simp6.jpg5.su/images/FqsNcNCaIAITBEL.md.jpg" class="bbImage " loading="lazy"
<div class="bbWrapper"><a href="https://jpg6.su/img/coWRwo" target="_blank" class="link link--external" rel="noopener"><img src="https://simp6.jpg6.su/images/FqsNcNCaIAITBEL.md.jpg" data-url="https://simp6.jpg6.su/images/FqsNcNCaIAITBEL.md.jpg" class="bbImage " loading="lazy"
\t\talt="FqsNcNCaIAITBEL.md.jpg" title="FqsNcNCaIAITBEL.md.jpg" style="" width="" height="" /></a></div>\
""",
},
@@ -65,7 +65,7 @@ __tests__ = (
"#class" : simpcity.SimpcityPostExtractor,
"#auth" : True,
"#results" : (
"https://jpg5.su/img/NNFssUg",
"https://jpg6.su/img/NNFssUg",
"https://saint2.cr/embed/nPy1kG3w55V",
"https://saint2.cr/embed/c0KhPjU4-F3",
"https://saint2.cr/embed/sZWnVZ_mQsV",
@@ -78,7 +78,10 @@ __tests__ = (
"#comment" : "quote in post content (#8214)",
"#class" : simpcity.SimpcityPostExtractor,
"#auth" : True,
"#results" : ("/goto/post?id=13358068", "https://cyberdrop.me/a/Sh9GlG38"),
"#results" : (
"/goto/post?id=13358068",
"https://cyberdrop.cr/a/Sh9GlG38",
),
},
{
@@ -87,10 +90,10 @@ __tests__ = (
"#class" : simpcity.SimpcityPostExtractor,
"#auth" : True,
"#results" : (
"https://jpg5.su/img/aKroBJp",
"https://jpg5.su/img/aKroy2E",
"https://jpg5.su/img/aKrofqa",
"https://jpg5.su/img/aKroDgo",
"https://jpg6.su/img/aKroBJp",
"https://jpg6.su/img/aKroy2E",
"https://jpg6.su/img/aKrofqa",
"https://jpg6.su/img/aKroDgo",
"https://bunkr.cr/v/6sErIc9pjrnQ3",
),
@@ -114,12 +117,100 @@ __tests__ = (
},
},
{
"#url" : "https://simpcity.cr/threads/sophia-diamond.10049/post-10891",
"#class" : simpcity.SimpcityPostExtractor,
"#auth" : True,
"#results" : (
"https://brandarmy.com/SophiaDiamond",
"https://www.tiktok.com/@sophia.ilysm?lang=en",
"https://www.instagram.com/sophiadiamond/",
"https://simpcity.cr/attachments/sophiadiamond_239636842_558607608495946_5357173067872834144_n-jpg.65924/",
),
"count" : 4,
"num" : range(1, 4),
"num_external": range(1, 3),
"num_internal": {0, 1},
"type" : {"inline", "external"},
"post" : {
"attachments": "",
"author" : "inoncognito",
"author_id" : "",
"author_url" : "",
"count" : 4,
"date" : "dt:2022-03-11 00:41:28",
"id" : "10891",
"content" : str,
},
"thread" : {
"author" : "inoncognito",
"author_id" : "53824",
"author_url": "https://simpcity.cr/members/inoncognito.53824/",
"date" : "dt:2022-03-11 00:41:28",
"id" : "10049",
"posts" : range(1_000, 2_000),
"section" : "TikTok",
"title" : "Sophia Diamond",
"url" : "https://simpcity.cr/threads/sophia-diamond.10049/",
"views" : range(4_200_000, 6_000_000),
"tags" : [
"busty",
"diamond",
"slut",
"sophia",
"sophiadiamond",
"tease",
"teen",
"tiktok",
"tits",
],
},
},
{
"#url" : "https://simpcity.cr/threads/sophia-diamond.10049/post-18744",
"#class" : simpcity.SimpcityPostExtractor,
"#auth" : True,
"#results" : "https://simpcity.cr/attachments/sophiadiamondcancunbikiniwp-png.36179/",
"count" : 1,
"extension" : "png",
"filename" : "SophiaDiamondCancunBikiniWP",
"id" : 36179,
"num" : 1,
"num_external": 0,
"num_internal": 1,
"type" : "inline",
"post" : {
"author" : "ElyseGooner",
"author_id" : "65059",
"author_url" : "https://simpcity.cr/members/elysegooner.65059/",
"count" : 1,
"date" : "dt:2022-03-11 22:39:06",
"id" : "18744",
"attachments": str,
"content" : """\
<div class="bbWrapper">Collage</div>
</div>\
""",
},
"thread" : {
"date" : "dt:2022-03-11 00:41:28",
"id" : "10049",
"section" : "TikTok",
"title" : "Sophia Diamond",
},
},
{
"#url" : "https://simpcity.cr/threads/alua-tatakai.89490/",
"#class" : simpcity.SimpcityThreadExtractor,
"#auth" : True,
"#pattern" : r"https://(jpg5\.su/img/\w+|bunkr\.\w+/[fiv]/\w+|pixeldrain.com/l/\w+|alua.com/tatakai)|/goto/post",
"#count" : 29,
"#pattern" : r"https://(jpg6\.su/img/\w+|bunkr\.\w+/[fiv]/\w+|pixeldrain.com/l/\w+|alua.com/tatakai)|/goto/post",
"#count" : range(100, 300),
"count" : int,
"num" : int,