From 98fdcd4d723a1bf948da96cc7bac609db17d16cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 10 May 2025 20:57:55 +0200 Subject: [PATCH] [subscribestar] fix 'content' extraction (#7486) and extract 'tags' metadata Authored by: prowlguru Co-authored-by: prowlguru <183935626+prowlguru@users.noreply.github.com> --- gallery_dl/extractor/subscribestar.py | 24 +++++++++++++++++---- test/results/subscribestar.py | 31 ++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 1054a630..110cf3eb 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -40,8 +40,14 @@ class SubscribestarExtractor(Extractor): for post_html in self.posts(): media = self._media_from_post(post_html) data = self._data_from_post(post_html) - data["title"] = text.unescape(text.extr( - data["content"], "

", "

")) + + content = data["content"] + if "" in content: + data["content"] = content = text.extr( + content, "", "") + data["title"] = text.unescape( + text.extr(content, "

", "

")) + yield Message.Directory, data for num, item in enumerate(media, 1): item.update(data) @@ -189,7 +195,12 @@ class SubscribestarExtractor(Extractor): "author_nick": text.unescape(extr('>', '<')), "date" : self._parse_datetime(extr( 'class="post-date">', '")[2]), - "content" : extr('', '').strip(), + "content" : extr( + '
', + '
', + '
'), '?tag=', '"')), } def _parse_datetime(self, dt): @@ -243,7 +254,12 @@ class SubscribestarPostExtractor(SubscribestarExtractor): "post_id" : text.parse_int(extr('data-id="', '"')), "date" : self._parse_datetime(extr( '
', '<')), - "content" : extr('', '').strip(), + "content" : extr( + '
', + '
', + '
'), '?tag=', '"')), "author_name": text.unescape(extr( 'class="star_link" href="/', '"')), "author_id" : text.parse_int(extr('data-user-id="', '"')), diff --git a/test/results/subscribestar.py b/test/results/subscribestar.py index 66474ff1..94850ac3 100644 --- a/test/results/subscribestar.py +++ b/test/results/subscribestar.py @@ -24,6 +24,7 @@ __tests__ = ( "id" : int, "num" : int, "post_id" : int, + "tags" : list, "title" : str, "type" : r"re:image|video|attachment", "url" : str, @@ -44,6 +45,7 @@ __tests__ = ( "#url" : "https://subscribestar.adult/kanashiipanda", "#category": ("", "subscribestar", "user-adult"), "#class" : subscribestar.SubscribestarUserExtractor, + "#auth" : True, "#range" : "1-10", "#count" : 10, }, @@ -59,14 +61,15 @@ __tests__ = ( "author_nick": "SubscribeStar", "content" : r"re:

Brand Guidelines and Assets

", "date" : "dt:2020-05-07 12:33:00", - "extension" : "", - "filename" : "uploads", + "extension" : "jpg", + "filename" : "8ff61299-b249-47dc-880a-cdacc9081c62", "group" : "imgs_and_videos", "height" : 291, "id" : 203885, "num" : 1, "pinned" : False, "post_id" : 102468, + "tags" : [], "title" : "Brand Guidelines and Assets", "type" : "image", "width" : 700, @@ -78,7 +81,7 @@ __tests__ = ( "#category": ("", "subscribestar", "post"), "#class" : subscribestar.SubscribestarPostExtractor, "#range" : "2", - "#pattern" : r"https://ss-uploads-prod\.b-cdn\.net/uploads_v2/users/11/posts/920015/bc018a55-9668-47f4-a664-b5fd66b56aaa\.pdf\?token=.+", + "#pattern" : r"https://ss-uploads-prod\.b-cdn\.net/uploads_v2/users/11/posts/920015/bc018a55-9668-47f4-a664-b5fd66b56aaa\.pdf", "date" : "dt:2023-05-30 09:20:00", "extension": "pdf", @@ -91,10 +94,32 @@ __tests__ = ( "type" : "attachment", }, +{ + "#url" : "https://www.subscribestar.com/posts/1851025", + "#comment" : "content / title not inside (#7486)", + "#category": ("", "subscribestar", "post"), + "#class" : subscribestar.SubscribestarPostExtractor, + + "author_id" : 581352, + "author_name": "inelia-benz", + "author_nick": "Inelia Benz", + "content" : "

Listening to Sasquatch - Driving to the Rez - Episode 243 - Part One

\n\n

Topics we cover:

\n\n

Tree breaks, Foot stomps, Tracks and trackways, Hoots/calls with answers, \nTree structures, nests, Portal Cracks, Shapeshifting, Shimmer/invisibility \ncloaking, direct physical interaction inside the cloaking field, manipulation \nof canoe while we are in it, face to face interactions with multiple individuals \nteen aged and adult, male and female, cloaked and not cloaked, \nand vocalizations like drops of water. Truly amazing stories.

\n\n

Go To Part Two

\n\n

Watch the Video

\n\n

Read the article

\n\n

Audio is attached to this post.

", + "date" : "dt:2025-05-07 13:23:00", + "extension" : {"mp3", "jpg"}, + "filename" : {"dttr-243-sasquatch-part1", "38cba130-3a31-4d8d-b326-7e5d3704801f"}, + "id" : {0, 4627253}, + "num" : range(1, 2), + "post_id" : 1851025, + "tags" : [], + "title" : "Listening to Sasquatch - Driving to the Rez - Episode 243 - Part One", + "type" : {"audio", "image"}, +}, + { "#url" : "https://subscribestar.adult/posts/22950", "#category": ("", "subscribestar", "post-adult"), "#class" : subscribestar.SubscribestarPostExtractor, + "#auth" : True, "#count" : 1, "date": "dt:2019-04-28 07:32:00",