[subscribestar] fix 'content' extraction (#7486)

and extract 'tags' metadata

Authored by: prowlguru

Co-authored-by: prowlguru <183935626+prowlguru@users.noreply.github.com>
This commit is contained in:
Mike Fährmann
2025-05-10 20:57:55 +02:00
parent 1da44d8fe6
commit 98fdcd4d72
2 changed files with 48 additions and 7 deletions

View File

@@ -40,8 +40,14 @@ class SubscribestarExtractor(Extractor):
for post_html in self.posts():
media = self._media_from_post(post_html)
data = self._data_from_post(post_html)
data["title"] = text.unescape(text.extr(
data["content"], "<h1>", "</h1>"))
content = data["content"]
if "<html><body>" in content:
data["content"] = content = text.extr(
content, "<body>", "</body>")
data["title"] = text.unescape(
text.extr(content, "<h1>", "</h1>"))
yield Message.Directory, data
for num, item in enumerate(media, 1):
item.update(data)
@@ -189,7 +195,12 @@ class SubscribestarExtractor(Extractor):
"author_nick": text.unescape(extr('>', '<')),
"date" : self._parse_datetime(extr(
'class="post-date">', '</').rpartition(">")[2]),
"content" : extr('<body>', '</body>').strip(),
"content" : extr(
'<div class="post-content" data-role="post_content-text">',
'</div><div class="post-uploads for-youtube"').strip(),
"tags" : list(text.extract_iter(extr(
'<div class="post_tags for-post">',
'<div class="post-actions">'), '?tag=', '"')),
}
def _parse_datetime(self, dt):
@@ -243,7 +254,12 @@ class SubscribestarPostExtractor(SubscribestarExtractor):
"post_id" : text.parse_int(extr('data-id="', '"')),
"date" : self._parse_datetime(extr(
'<div class="section-title_date">', '<')),
"content" : extr('<body>', '</body>').strip(),
"content" : extr(
'<div class="post-content" data-role="post_content-text">',
'</div><div class="post-uploads for-youtube"').strip(),
"tags" : list(text.extract_iter(extr(
'<div class="post_tags for-post">',
'<div class="post-actions">'), '?tag=', '"')),
"author_name": text.unescape(extr(
'class="star_link" href="/', '"')),
"author_id" : text.parse_int(extr('data-user-id="', '"')),

View File

@@ -24,6 +24,7 @@ __tests__ = (
"id" : int,
"num" : int,
"post_id" : int,
"tags" : list,
"title" : str,
"type" : r"re:image|video|attachment",
"url" : str,
@@ -44,6 +45,7 @@ __tests__ = (
"#url" : "https://subscribestar.adult/kanashiipanda",
"#category": ("", "subscribestar", "user-adult"),
"#class" : subscribestar.SubscribestarUserExtractor,
"#auth" : True,
"#range" : "1-10",
"#count" : 10,
},
@@ -59,14 +61,15 @@ __tests__ = (
"author_nick": "SubscribeStar",
"content" : r"re:<h1>Brand Guidelines and Assets</h1>",
"date" : "dt:2020-05-07 12:33:00",
"extension" : "",
"filename" : "uploads",
"extension" : "jpg",
"filename" : "8ff61299-b249-47dc-880a-cdacc9081c62",
"group" : "imgs_and_videos",
"height" : 291,
"id" : 203885,
"num" : 1,
"pinned" : False,
"post_id" : 102468,
"tags" : [],
"title" : "Brand Guidelines and Assets",
"type" : "image",
"width" : 700,
@@ -78,7 +81,7 @@ __tests__ = (
"#category": ("", "subscribestar", "post"),
"#class" : subscribestar.SubscribestarPostExtractor,
"#range" : "2",
"#pattern" : r"https://ss-uploads-prod\.b-cdn\.net/uploads_v2/users/11/posts/920015/bc018a55-9668-47f4-a664-b5fd66b56aaa\.pdf\?token=.+",
"#pattern" : r"https://ss-uploads-prod\.b-cdn\.net/uploads_v2/users/11/posts/920015/bc018a55-9668-47f4-a664-b5fd66b56aaa\.pdf",
"date" : "dt:2023-05-30 09:20:00",
"extension": "pdf",
@@ -91,10 +94,32 @@ __tests__ = (
"type" : "attachment",
},
{
"#url" : "https://www.subscribestar.com/posts/1851025",
"#comment" : "content / title not inside <body> (#7486)",
"#category": ("", "subscribestar", "post"),
"#class" : subscribestar.SubscribestarPostExtractor,
"author_id" : 581352,
"author_name": "inelia-benz",
"author_nick": "Inelia Benz",
"content" : "<h1>Listening to Sasquatch - Driving to the Rez - Episode 243 - Part One</h1>\n\n<p>Topics we cover:</p>\n\n<p>Tree breaks, Foot stomps, Tracks and trackways, Hoots/calls with answers, \nTree structures, nests, Portal Cracks, Shapeshifting, Shimmer/invisibility \ncloaking, direct physical interaction inside the cloaking field, manipulation \nof canoe while we are in it, face to face interactions with multiple individuals \nteen aged and adult, male and female, cloaked and not cloaked, \nand vocalizations like drops of water. Truly amazing stories.</p>\n\n<p><a href=\"https://www.subscribestar.com/posts/1853792\" data-href=\"https://www.subscribestar.com/posts/1853792\">Go To Part Two</a></p>\n\n<p><a href=\"/away?url=aHR0cHM6Ly92aWRlby5pbmVsaWFiZW56LmNvbS9saXN0ZW5pbmctdG8tc2Fz%0AcXVhdGNoLWRyaXZpbmctdG8tdGhlLXJlei1lcGlzb2RlLTI0My1wYXJ0LW9u%0AZQ==%0A\" data-href=\"https://video.ineliabenz.com/listening-to-sasquatch-driving-to-the-rez-episode-243-part-one\">Watch the Video</a></p>\n\n<p><a href=\"/away?url=aHR0cHM6Ly9pbmVsaWEuc3Vic3RhY2suY29tL3AvbGlzdGVuaW5nLXRvLXNh%0Ac3F1YXRjaA==%0A\" data-href=\"https://inelia.substack.com/p/listening-to-sasquatch\">Read the article</a></p>\n\n<p>Audio is attached to this post.</p>",
"date" : "dt:2025-05-07 13:23:00",
"extension" : {"mp3", "jpg"},
"filename" : {"dttr-243-sasquatch-part1", "38cba130-3a31-4d8d-b326-7e5d3704801f"},
"id" : {0, 4627253},
"num" : range(1, 2),
"post_id" : 1851025,
"tags" : [],
"title" : "Listening to Sasquatch - Driving to the Rez - Episode 243 - Part One",
"type" : {"audio", "image"},
},
{
"#url" : "https://subscribestar.adult/posts/22950",
"#category": ("", "subscribestar", "post-adult"),
"#class" : subscribestar.SubscribestarPostExtractor,
"#auth" : True,
"#count" : 1,
"date": "dt:2019-04-28 07:32:00",