[simpcity] extract attachment/inline files (#8560)
This commit is contained in:
@@ -21,20 +21,66 @@ class SimpcityExtractor(Extractor):
|
||||
cookies_domain = "simpcity.cr"
|
||||
cookies_names = ("ogaddgmetaprof_user",)
|
||||
root = "https://simpcity.cr"
|
||||
directory_fmt = ("{category}", "{thread[section]}",
|
||||
"{thread[title]} ({thread[id]})")
|
||||
filename_fmt = "{post[id]}_{num:>02}_{id}_{filename}.{extension}"
|
||||
archive_fmt = "{post[id]}/{type[0]}{id}_{filename}"
|
||||
|
||||
def items(self):
    """Yield a directory message per post, then one message per link/file.

    For every post returned by ``self.posts()``, the post's ``content``
    and (when non-empty) ``attachments`` HTML are scanned with a single
    regular expression.  Each match is classified as

    * ``external`` -- any other ``<a href>`` / ``<iframe src>`` URL,
      yielded as ``Message.Queue``;
    * ``video``    -- a ``<video>`` element with a ``src`` attribute,
      yielded as ``Message.Url``;
    * ``inline``   -- an ``/attachments/...`` link or ``...-src`` div,
      yielded as ``Message.Url`` with ``self.root`` prepended.

    ``data["num"]`` counts every yielded entry of a post, while
    ``data["num_internal"]`` / ``data["num_external"]`` count the
    video+inline and external entries separately.
    """
    self.login()

    # One alternation with four capture groups; a match fills exactly
    # one of them (the others come back as '' from findall):
    #   1. markup inside a <video ...> tag up to </video>
    #   2. '/attachments/...' path + closing quote + markup up to </a>
    #   3. '/attachments/...' path + closing quote + markup up to '/>'
    #   4. URL of any other <a href="..."> or <iframe src="...">
    # NOTE(review): 'text.re' presumably returns a compiled pattern;
    # confirm against the 'text' helper module.
    extract_urls = text.re(
        r'(?s)<(?:'
        r'video (.*?\ssrc="[^"]+".*?)</video>'
        r'|a [^>]*?href="(?:https://[^"]+)?(/attachments/[^"]+".*?)</a>'
        r'|div [^>]*?ata-src="(?:https://[^"]+)?(/attachments/[^"]+".*?)/>'
        r'|(?:a [^>]*?href|iframe [^>]*?src)="([^"]+)'
        r')'
    ).findall

    for post in self.posts():
        urls = extract_urls(post["content"])
        if post["attachments"]:
            urls.extend(extract_urls(post["attachments"]))

        data = {"post": post}
        post["count"] = data["count"] = len(urls)
        yield Message.Directory, data

        # The same 'data' dict is updated in place and re-yielded for
        # every entry; fields set for an earlier file (id, filename,
        # extension) therefore remain present on later external entries.
        data["num"] = data["num_internal"] = data["num_external"] = 0
        for video, inl1, inl2, ext in urls:
            if ext:
                data["num"] += 1
                data["num_external"] += 1
                data["type"] = "external"
                yield Message.Queue, ext, data

            elif video:
                data["num"] += 1
                data["num_internal"] += 1
                data["type"] = "video"
                url = text.extr(video, 'src="', '"')
                # presumably fills data["filename"] / data["extension"]
                # from the URL; "filename" is read right below
                text.nameext_from_url(url, data)
                # the ID is the part of the file name before the
                # first '-'
                data["id"] = text.parse_int(
                    data["filename"].partition("-")[0])
                yield Message.Url, url, data

            elif (inline := inl1 or inl2):
                data["num"] += 1
                data["num_internal"] += 1
                data["type"] = "inline"
                # the captured text starts with the path and runs on
                # past its closing quote; cut at that quote
                path = inline[:inline.find('"')]
                # last path segment looks like '<name>-<ext>.<id>',
                # optionally followed by a trailing '/'
                name, _, file_id = path[path.rfind("/", 0, -1):].strip(
                    "/").rpartition(".")
                data["id"] = text.parse_int(file_id)
                if alt := text.extr(inline, 'alt="', '"'):
                    # prefer the name given in the alt attribute
                    text.nameext_from_name(alt, data)
                    if not data["extension"]:
                        # alt text had no extension; fall back to the
                        # '-<ext>' suffix of the path segment
                        data["extension"] = name.rpartition("-")[2]
                else:
                    data["filename"], _, data["extension"] = \
                        name.rpartition("-")
                yield Message.Url, self.root + path, data
|
||||
|
||||
def request_page(self, url):
|
||||
try:
|
||||
@@ -166,6 +212,8 @@ class SimpcityExtractor(Extractor):
|
||||
'<div class="js-selectToQuote') or
|
||||
extr('<div >',
|
||||
'<div class="js-selectToQuote')).strip(),
|
||||
"attachments": extr('<section class="message-attachments">',
|
||||
'</section>'),
|
||||
}
|
||||
|
||||
url_a = post["author_url"]
|
||||
|
||||
@@ -13,7 +13,7 @@ __tests__ = (
|
||||
"#url" : "https://simpcity.cr/threads/ririkana-rr_loveit.10731/post-1753131",
|
||||
"#class" : simpcity.SimpcityPostExtractor,
|
||||
"#auth" : True,
|
||||
"#results" : "https://jpg5.su/img/coWRwo",
|
||||
"#results" : "https://jpg6.su/img/coWRwo",
|
||||
|
||||
"count" : 1,
|
||||
"num" : 1,
|
||||
@@ -25,7 +25,7 @@ __tests__ = (
|
||||
"date" : "dt:2023-03-08 12:59:10",
|
||||
"id" : "1753131",
|
||||
"content" : """\
|
||||
<div class="bbWrapper"><a href="https://jpg5.su/img/coWRwo" target="_blank" class="link link--external" rel="noopener"><img src="https://simp6.jpg5.su/images/FqsNcNCaIAITBEL.md.jpg" data-url="https://simp6.jpg5.su/images/FqsNcNCaIAITBEL.md.jpg" class="bbImage " loading="lazy"
|
||||
<div class="bbWrapper"><a href="https://jpg6.su/img/coWRwo" target="_blank" class="link link--external" rel="noopener"><img src="https://simp6.jpg6.su/images/FqsNcNCaIAITBEL.md.jpg" data-url="https://simp6.jpg6.su/images/FqsNcNCaIAITBEL.md.jpg" class="bbImage " loading="lazy"
|
||||
\t\talt="FqsNcNCaIAITBEL.md.jpg" title="FqsNcNCaIAITBEL.md.jpg" style="" width="" height="" /></a></div>\
|
||||
""",
|
||||
},
|
||||
@@ -65,7 +65,7 @@ __tests__ = (
|
||||
"#class" : simpcity.SimpcityPostExtractor,
|
||||
"#auth" : True,
|
||||
"#results" : (
|
||||
"https://jpg5.su/img/NNFssUg",
|
||||
"https://jpg6.su/img/NNFssUg",
|
||||
"https://saint2.cr/embed/nPy1kG3w55V",
|
||||
"https://saint2.cr/embed/c0KhPjU4-F3",
|
||||
"https://saint2.cr/embed/sZWnVZ_mQsV",
|
||||
@@ -78,7 +78,10 @@ __tests__ = (
|
||||
"#comment" : "quote in post content (#8214)",
|
||||
"#class" : simpcity.SimpcityPostExtractor,
|
||||
"#auth" : True,
|
||||
"#results" : ("/goto/post?id=13358068", "https://cyberdrop.me/a/Sh9GlG38"),
|
||||
"#results" : (
|
||||
"/goto/post?id=13358068",
|
||||
"https://cyberdrop.cr/a/Sh9GlG38",
|
||||
),
|
||||
},
|
||||
|
||||
{
|
||||
@@ -87,10 +90,10 @@ __tests__ = (
|
||||
"#class" : simpcity.SimpcityPostExtractor,
|
||||
"#auth" : True,
|
||||
"#results" : (
|
||||
"https://jpg5.su/img/aKroBJp",
|
||||
"https://jpg5.su/img/aKroy2E",
|
||||
"https://jpg5.su/img/aKrofqa",
|
||||
"https://jpg5.su/img/aKroDgo",
|
||||
"https://jpg6.su/img/aKroBJp",
|
||||
"https://jpg6.su/img/aKroy2E",
|
||||
"https://jpg6.su/img/aKrofqa",
|
||||
"https://jpg6.su/img/aKroDgo",
|
||||
"https://bunkr.cr/v/6sErIc9pjrnQ3",
|
||||
),
|
||||
|
||||
@@ -114,12 +117,100 @@ __tests__ = (
|
||||
},
|
||||
},
|
||||
|
||||
{
|
||||
"#url" : "https://simpcity.cr/threads/sophia-diamond.10049/post-10891",
|
||||
"#class" : simpcity.SimpcityPostExtractor,
|
||||
"#auth" : True,
|
||||
"#results" : (
|
||||
"https://brandarmy.com/SophiaDiamond",
|
||||
"https://www.tiktok.com/@sophia.ilysm?lang=en",
|
||||
"https://www.instagram.com/sophiadiamond/",
|
||||
"https://simpcity.cr/attachments/sophiadiamond_239636842_558607608495946_5357173067872834144_n-jpg.65924/",
|
||||
),
|
||||
|
||||
"count" : 4,
|
||||
"num" : range(1, 4),
|
||||
"num_external": range(1, 3),
|
||||
"num_internal": {0, 1},
|
||||
"type" : {"inline", "external"},
|
||||
"post" : {
|
||||
"attachments": "",
|
||||
"author" : "inoncognito",
|
||||
"author_id" : "",
|
||||
"author_url" : "",
|
||||
"count" : 4,
|
||||
"date" : "dt:2022-03-11 00:41:28",
|
||||
"id" : "10891",
|
||||
"content" : str,
|
||||
},
|
||||
"thread" : {
|
||||
"author" : "inoncognito",
|
||||
"author_id" : "53824",
|
||||
"author_url": "https://simpcity.cr/members/inoncognito.53824/",
|
||||
"date" : "dt:2022-03-11 00:41:28",
|
||||
"id" : "10049",
|
||||
"posts" : range(1_000, 2_000),
|
||||
"section" : "TikTok",
|
||||
"title" : "Sophia Diamond",
|
||||
"url" : "https://simpcity.cr/threads/sophia-diamond.10049/",
|
||||
"views" : range(4_200_000, 6_000_000),
|
||||
"tags" : [
|
||||
"busty",
|
||||
"diamond",
|
||||
"slut",
|
||||
"sophia",
|
||||
"sophiadiamond",
|
||||
"tease",
|
||||
"teen",
|
||||
"tiktok",
|
||||
"tits",
|
||||
],
|
||||
},
|
||||
},
|
||||
|
||||
{
|
||||
"#url" : "https://simpcity.cr/threads/sophia-diamond.10049/post-18744",
|
||||
"#class" : simpcity.SimpcityPostExtractor,
|
||||
"#auth" : True,
|
||||
"#results" : "https://simpcity.cr/attachments/sophiadiamondcancunbikiniwp-png.36179/",
|
||||
|
||||
"count" : 1,
|
||||
"extension" : "png",
|
||||
"filename" : "SophiaDiamondCancunBikiniWP",
|
||||
"id" : 36179,
|
||||
"num" : 1,
|
||||
"num_external": 0,
|
||||
"num_internal": 1,
|
||||
"type" : "inline",
|
||||
"post" : {
|
||||
"author" : "ElyseGooner",
|
||||
"author_id" : "65059",
|
||||
"author_url" : "https://simpcity.cr/members/elysegooner.65059/",
|
||||
"count" : 1,
|
||||
"date" : "dt:2022-03-11 22:39:06",
|
||||
"id" : "18744",
|
||||
"attachments": str,
|
||||
"content" : """\
|
||||
<div class="bbWrapper">Collage</div>
|
||||
|
||||
|
||||
</div>\
|
||||
""",
|
||||
},
|
||||
"thread" : {
|
||||
"date" : "dt:2022-03-11 00:41:28",
|
||||
"id" : "10049",
|
||||
"section" : "TikTok",
|
||||
"title" : "Sophia Diamond",
|
||||
},
|
||||
},
|
||||
|
||||
{
|
||||
"#url" : "https://simpcity.cr/threads/alua-tatakai.89490/",
|
||||
"#class" : simpcity.SimpcityThreadExtractor,
|
||||
"#auth" : True,
|
||||
"#pattern" : r"https://(jpg5\.su/img/\w+|bunkr\.\w+/[fiv]/\w+|pixeldrain.com/l/\w+|alua.com/tatakai)|/goto/post",
|
||||
"#count" : 29,
|
||||
"#pattern" : r"https://(jpg6\.su/img/\w+|bunkr\.\w+/[fiv]/\w+|pixeldrain.com/l/\w+|alua.com/tatakai)|/goto/post",
|
||||
"#count" : range(100, 300),
|
||||
|
||||
"count" : int,
|
||||
"num" : int,
|
||||
|
||||
Reference in New Issue
Block a user