[simpcity] extract attachment/inline files (#8560)
This commit is contained in:
@@ -21,20 +21,66 @@ class SimpcityExtractor(Extractor):
|
|||||||
cookies_domain = "simpcity.cr"
|
cookies_domain = "simpcity.cr"
|
||||||
cookies_names = ("ogaddgmetaprof_user",)
|
cookies_names = ("ogaddgmetaprof_user",)
|
||||||
root = "https://simpcity.cr"
|
root = "https://simpcity.cr"
|
||||||
|
directory_fmt = ("{category}", "{thread[section]}",
|
||||||
|
"{thread[title]} ({thread[id]})")
|
||||||
|
filename_fmt = "{post[id]}_{num:>02}_{id}_{filename}.{extension}"
|
||||||
|
archive_fmt = "{post[id]}/{type[0]}{id}_{filename}"
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
self.login()
|
self.login()
|
||||||
|
|
||||||
extract_urls = text.re(
|
extract_urls = text.re(
|
||||||
r'<(?:a [^>]*?href|iframe [^>]*?src)="([^"]+)').findall
|
r'(?s)<(?:'
|
||||||
|
r'video (.*?\ssrc="[^"]+".*?)</video>'
|
||||||
|
r'|a [^>]*?href="(?:https://[^"]+)?(/attachments/[^"]+".*?)</a>'
|
||||||
|
r'|div [^>]*?ata-src="(?:https://[^"]+)?(/attachments/[^"]+".*?)/>'
|
||||||
|
r'|(?:a [^>]*?href|iframe [^>]*?src)="([^"]+)'
|
||||||
|
r')'
|
||||||
|
).findall
|
||||||
|
|
||||||
for post in self.posts():
|
for post in self.posts():
|
||||||
urls = extract_urls(post["content"])
|
urls = extract_urls(post["content"])
|
||||||
|
if post["attachments"]:
|
||||||
|
urls.extend(extract_urls(post["attachments"]))
|
||||||
|
|
||||||
data = {"post": post}
|
data = {"post": post}
|
||||||
post["count"] = data["count"] = len(urls)
|
post["count"] = data["count"] = len(urls)
|
||||||
yield Message.Directory, data
|
yield Message.Directory, data
|
||||||
for data["num"], url in enumerate(urls, 1):
|
|
||||||
yield Message.Queue, url, data
|
data["num"] = data["num_internal"] = data["num_external"] = 0
|
||||||
|
for video, inl1, inl2, ext in urls:
|
||||||
|
if ext:
|
||||||
|
data["num"] += 1
|
||||||
|
data["num_external"] += 1
|
||||||
|
data["type"] = "external"
|
||||||
|
yield Message.Queue, ext, data
|
||||||
|
|
||||||
|
elif video:
|
||||||
|
data["num"] += 1
|
||||||
|
data["num_internal"] += 1
|
||||||
|
data["type"] = "video"
|
||||||
|
url = text.extr(video, 'src="', '"')
|
||||||
|
text.nameext_from_url(url, data)
|
||||||
|
data["id"] = text.parse_int(
|
||||||
|
data["filename"].partition("-")[0])
|
||||||
|
yield Message.Url, url, data
|
||||||
|
|
||||||
|
elif (inline := inl1 or inl2):
|
||||||
|
data["num"] += 1
|
||||||
|
data["num_internal"] += 1
|
||||||
|
data["type"] = "inline"
|
||||||
|
path = inline[:inline.find('"')]
|
||||||
|
name, _, id = path[path.rfind("/", 0, -1):].strip(
|
||||||
|
"/").rpartition(".")
|
||||||
|
data["id"] = text.parse_int(id)
|
||||||
|
if alt := text.extr(inline, 'alt="', '"'):
|
||||||
|
text.nameext_from_name(alt, data)
|
||||||
|
if not data["extension"]:
|
||||||
|
data["extension"] = name.rpartition("-")[2]
|
||||||
|
else:
|
||||||
|
data["filename"], _, data["extension"] = \
|
||||||
|
name.rpartition("-")
|
||||||
|
yield Message.Url, self.root + path, data
|
||||||
|
|
||||||
def request_page(self, url):
|
def request_page(self, url):
|
||||||
try:
|
try:
|
||||||
@@ -166,6 +212,8 @@ class SimpcityExtractor(Extractor):
|
|||||||
'<div class="js-selectToQuote') or
|
'<div class="js-selectToQuote') or
|
||||||
extr('<div >',
|
extr('<div >',
|
||||||
'<div class="js-selectToQuote')).strip(),
|
'<div class="js-selectToQuote')).strip(),
|
||||||
|
"attachments": extr('<section class="message-attachments">',
|
||||||
|
'</section>'),
|
||||||
}
|
}
|
||||||
|
|
||||||
url_a = post["author_url"]
|
url_a = post["author_url"]
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ __tests__ = (
|
|||||||
"#url" : "https://simpcity.cr/threads/ririkana-rr_loveit.10731/post-1753131",
|
"#url" : "https://simpcity.cr/threads/ririkana-rr_loveit.10731/post-1753131",
|
||||||
"#class" : simpcity.SimpcityPostExtractor,
|
"#class" : simpcity.SimpcityPostExtractor,
|
||||||
"#auth" : True,
|
"#auth" : True,
|
||||||
"#results" : "https://jpg5.su/img/coWRwo",
|
"#results" : "https://jpg6.su/img/coWRwo",
|
||||||
|
|
||||||
"count" : 1,
|
"count" : 1,
|
||||||
"num" : 1,
|
"num" : 1,
|
||||||
@@ -25,7 +25,7 @@ __tests__ = (
|
|||||||
"date" : "dt:2023-03-08 12:59:10",
|
"date" : "dt:2023-03-08 12:59:10",
|
||||||
"id" : "1753131",
|
"id" : "1753131",
|
||||||
"content" : """\
|
"content" : """\
|
||||||
<div class="bbWrapper"><a href="https://jpg5.su/img/coWRwo" target="_blank" class="link link--external" rel="noopener"><img src="https://simp6.jpg5.su/images/FqsNcNCaIAITBEL.md.jpg" data-url="https://simp6.jpg5.su/images/FqsNcNCaIAITBEL.md.jpg" class="bbImage " loading="lazy"
|
<div class="bbWrapper"><a href="https://jpg6.su/img/coWRwo" target="_blank" class="link link--external" rel="noopener"><img src="https://simp6.jpg6.su/images/FqsNcNCaIAITBEL.md.jpg" data-url="https://simp6.jpg6.su/images/FqsNcNCaIAITBEL.md.jpg" class="bbImage " loading="lazy"
|
||||||
\t\talt="FqsNcNCaIAITBEL.md.jpg" title="FqsNcNCaIAITBEL.md.jpg" style="" width="" height="" /></a></div>\
|
\t\talt="FqsNcNCaIAITBEL.md.jpg" title="FqsNcNCaIAITBEL.md.jpg" style="" width="" height="" /></a></div>\
|
||||||
""",
|
""",
|
||||||
},
|
},
|
||||||
@@ -65,7 +65,7 @@ __tests__ = (
|
|||||||
"#class" : simpcity.SimpcityPostExtractor,
|
"#class" : simpcity.SimpcityPostExtractor,
|
||||||
"#auth" : True,
|
"#auth" : True,
|
||||||
"#results" : (
|
"#results" : (
|
||||||
"https://jpg5.su/img/NNFssUg",
|
"https://jpg6.su/img/NNFssUg",
|
||||||
"https://saint2.cr/embed/nPy1kG3w55V",
|
"https://saint2.cr/embed/nPy1kG3w55V",
|
||||||
"https://saint2.cr/embed/c0KhPjU4-F3",
|
"https://saint2.cr/embed/c0KhPjU4-F3",
|
||||||
"https://saint2.cr/embed/sZWnVZ_mQsV",
|
"https://saint2.cr/embed/sZWnVZ_mQsV",
|
||||||
@@ -78,7 +78,10 @@ __tests__ = (
|
|||||||
"#comment" : "quote in post content (#8214)",
|
"#comment" : "quote in post content (#8214)",
|
||||||
"#class" : simpcity.SimpcityPostExtractor,
|
"#class" : simpcity.SimpcityPostExtractor,
|
||||||
"#auth" : True,
|
"#auth" : True,
|
||||||
"#results" : ("/goto/post?id=13358068", "https://cyberdrop.me/a/Sh9GlG38"),
|
"#results" : (
|
||||||
|
"/goto/post?id=13358068",
|
||||||
|
"https://cyberdrop.cr/a/Sh9GlG38",
|
||||||
|
),
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
@@ -87,10 +90,10 @@ __tests__ = (
|
|||||||
"#class" : simpcity.SimpcityPostExtractor,
|
"#class" : simpcity.SimpcityPostExtractor,
|
||||||
"#auth" : True,
|
"#auth" : True,
|
||||||
"#results" : (
|
"#results" : (
|
||||||
"https://jpg5.su/img/aKroBJp",
|
"https://jpg6.su/img/aKroBJp",
|
||||||
"https://jpg5.su/img/aKroy2E",
|
"https://jpg6.su/img/aKroy2E",
|
||||||
"https://jpg5.su/img/aKrofqa",
|
"https://jpg6.su/img/aKrofqa",
|
||||||
"https://jpg5.su/img/aKroDgo",
|
"https://jpg6.su/img/aKroDgo",
|
||||||
"https://bunkr.cr/v/6sErIc9pjrnQ3",
|
"https://bunkr.cr/v/6sErIc9pjrnQ3",
|
||||||
),
|
),
|
||||||
|
|
||||||
@@ -114,12 +117,100 @@ __tests__ = (
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://simpcity.cr/threads/sophia-diamond.10049/post-10891",
|
||||||
|
"#class" : simpcity.SimpcityPostExtractor,
|
||||||
|
"#auth" : True,
|
||||||
|
"#results" : (
|
||||||
|
"https://brandarmy.com/SophiaDiamond",
|
||||||
|
"https://www.tiktok.com/@sophia.ilysm?lang=en",
|
||||||
|
"https://www.instagram.com/sophiadiamond/",
|
||||||
|
"https://simpcity.cr/attachments/sophiadiamond_239636842_558607608495946_5357173067872834144_n-jpg.65924/",
|
||||||
|
),
|
||||||
|
|
||||||
|
"count" : 4,
|
||||||
|
"num" : range(1, 4),
|
||||||
|
"num_external": range(1, 3),
|
||||||
|
"num_internal": {0, 1},
|
||||||
|
"type" : {"inline", "external"},
|
||||||
|
"post" : {
|
||||||
|
"attachments": "",
|
||||||
|
"author" : "inoncognito",
|
||||||
|
"author_id" : "",
|
||||||
|
"author_url" : "",
|
||||||
|
"count" : 4,
|
||||||
|
"date" : "dt:2022-03-11 00:41:28",
|
||||||
|
"id" : "10891",
|
||||||
|
"content" : str,
|
||||||
|
},
|
||||||
|
"thread" : {
|
||||||
|
"author" : "inoncognito",
|
||||||
|
"author_id" : "53824",
|
||||||
|
"author_url": "https://simpcity.cr/members/inoncognito.53824/",
|
||||||
|
"date" : "dt:2022-03-11 00:41:28",
|
||||||
|
"id" : "10049",
|
||||||
|
"posts" : range(1_000, 2_000),
|
||||||
|
"section" : "TikTok",
|
||||||
|
"title" : "Sophia Diamond",
|
||||||
|
"url" : "https://simpcity.cr/threads/sophia-diamond.10049/",
|
||||||
|
"views" : range(4_200_000, 6_000_000),
|
||||||
|
"tags" : [
|
||||||
|
"busty",
|
||||||
|
"diamond",
|
||||||
|
"slut",
|
||||||
|
"sophia",
|
||||||
|
"sophiadiamond",
|
||||||
|
"tease",
|
||||||
|
"teen",
|
||||||
|
"tiktok",
|
||||||
|
"tits",
|
||||||
|
],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://simpcity.cr/threads/sophia-diamond.10049/post-18744",
|
||||||
|
"#class" : simpcity.SimpcityPostExtractor,
|
||||||
|
"#auth" : True,
|
||||||
|
"#results" : "https://simpcity.cr/attachments/sophiadiamondcancunbikiniwp-png.36179/",
|
||||||
|
|
||||||
|
"count" : 1,
|
||||||
|
"extension" : "png",
|
||||||
|
"filename" : "SophiaDiamondCancunBikiniWP",
|
||||||
|
"id" : 36179,
|
||||||
|
"num" : 1,
|
||||||
|
"num_external": 0,
|
||||||
|
"num_internal": 1,
|
||||||
|
"type" : "inline",
|
||||||
|
"post" : {
|
||||||
|
"author" : "ElyseGooner",
|
||||||
|
"author_id" : "65059",
|
||||||
|
"author_url" : "https://simpcity.cr/members/elysegooner.65059/",
|
||||||
|
"count" : 1,
|
||||||
|
"date" : "dt:2022-03-11 22:39:06",
|
||||||
|
"id" : "18744",
|
||||||
|
"attachments": str,
|
||||||
|
"content" : """\
|
||||||
|
<div class="bbWrapper">Collage</div>
|
||||||
|
|
||||||
|
|
||||||
|
</div>\
|
||||||
|
""",
|
||||||
|
},
|
||||||
|
"thread" : {
|
||||||
|
"date" : "dt:2022-03-11 00:41:28",
|
||||||
|
"id" : "10049",
|
||||||
|
"section" : "TikTok",
|
||||||
|
"title" : "Sophia Diamond",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "https://simpcity.cr/threads/alua-tatakai.89490/",
|
"#url" : "https://simpcity.cr/threads/alua-tatakai.89490/",
|
||||||
"#class" : simpcity.SimpcityThreadExtractor,
|
"#class" : simpcity.SimpcityThreadExtractor,
|
||||||
"#auth" : True,
|
"#auth" : True,
|
||||||
"#pattern" : r"https://(jpg5\.su/img/\w+|bunkr\.\w+/[fiv]/\w+|pixeldrain.com/l/\w+|alua.com/tatakai)|/goto/post",
|
"#pattern" : r"https://(jpg6\.su/img/\w+|bunkr\.\w+/[fiv]/\w+|pixeldrain.com/l/\w+|alua.com/tatakai)|/goto/post",
|
||||||
"#count" : 29,
|
"#count" : range(100, 300),
|
||||||
|
|
||||||
"count" : int,
|
"count" : int,
|
||||||
"num" : int,
|
"num" : int,
|
||||||
|
|||||||
Reference in New Issue
Block a user