From e5c91d33ec048b824e2864a5e1e080c7dedb283c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Wed, 30 Jul 2025 10:37:32 +0200
Subject: [PATCH] [blogger] fix video extraction (#7892)

---
 gallery_dl/extractor/blogger.py | 42 +++++++++++++++++++++++----------
 test/results/blogspot.py        |  4 ++--
 2 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 796d9d1d..af434460 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -31,6 +31,11 @@ class BloggerExtractor(BaseExtractor):
         self.blog = self.root.rpartition("/")[2]
         self.videos = self.config("videos", True)
 
+        if self.videos:
+            self.findall_video = util.re(
+                r"""src=["'](https?://www\.blogger\.com"""
+                r"""/video\.g\?token=[^"']+)""").findall
+
     def items(self):
         blog = self.api.blog_by_url("http://" + self.blog)
         blog["pages"] = blog["pages"]["totalItems"]
@@ -43,8 +48,6 @@ class BloggerExtractor(BaseExtractor):
             r'blogger\.googleusercontent\.com/img|'
             r'lh\d+(?:-\w+)?\.googleusercontent\.com|'
             r'\d+\.bp\.blogspot\.com)/[^"]+)').findall
-        findall_video = util.re(
-            r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
         metadata = self.metadata()
 
         for post in self.posts(blog):
@@ -54,16 +57,10 @@ class BloggerExtractor(BaseExtractor):
             for idx, url in enumerate(files):
                 files[idx] = original(url)
 
-            if self.videos and 'id="BLOG_video-' in content:
-                page = self.request(post["url"]).text
-                for url in findall_video(page):
-                    page = self.request(url).text
-                    video_config = util.json_loads(text.extr(
-                        page, 'var VIDEO_CONFIG =', '\n'))
-                    files.append(max(
-                        video_config["streams"],
-                        key=lambda x: x["format_id"],
-                    )["play_url"])
+            if self.videos and (
+                    'id="BLOG_video-' in content or
+                    'class="BLOG_video_' in content):
+                self._extract_videos(files, post)
 
             post["author"] = post["author"]["displayName"]
             post["replies"] = post["replies"]["totalItems"]
@@ -87,6 +84,27 @@ class BloggerExtractor(BaseExtractor):
     def metadata(self):
         """Return additional metadata"""
 
+    def _extract_videos(self, files, post):
+        url = f"https://{self.blog}/feeds/posts/default/{post['id']}"
+        params = {
+            "alt"          : "json",
+            "v"            : "2",
+            "dynamicviews" : "1",
+            "rewriteforssl": "true",
+        }
+
+        data = self.request_json(url, params=params)
+        html = data["entry"]["content"]["$t"]
+
+        for url in self.findall_video(html):
+            page = self.request(url).text
+            video_config = util.json_loads(text.extr(
+                page, 'var VIDEO_CONFIG =', '\n'))
+            files.append(max(
+                video_config["streams"],
+                key=lambda x: x["format_id"],
+            )["play_url"])
+
 
 BASE_PATTERN = BloggerExtractor.update({
     "blogspot": {
diff --git a/test/results/blogspot.py b/test/results/blogspot.py
index 7ef33c6c..5d70c3a4 100644
--- a/test/results/blogspot.py
+++ b/test/results/blogspot.py
@@ -47,8 +47,8 @@ __tests__ = (
 },
 
 {
-    "#url"     : "http://cfnmscenesinmovies.blogspot.com/2011/11/cfnm-scene-jenna-fischer-in-office.html",
-    "#comment" : "video (#587)",
+    "#url"     : "https://hotgrannysomas.blogspot.com/2012/08/para-amantes-del-buen-sexo-anal-los.html",
+    "#comment" : "video",
     "#category": ("blogger", "blogspot", "post"),
     "#class"   : blogger.BloggerPostExtractor,
     "#pattern" : r"https://.+\.googlevideo\.com/videoplayback",