From 2001cf12e878a4936dcd180d3408e08299711110 Mon Sep 17 00:00:00 2001 From: CasualYT31 <21147925+CasualYT31@users.noreply.github.com> Date: Sun, 16 Mar 2025 15:00:08 +0000 Subject: [PATCH 1/5] [tiktok] Add retry mechanism to rehydration data extraction #7098 --- gallery_dl/extractor/tiktok.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py index 30f310d6..eaf942d1 100644 --- a/gallery_dl/extractor/tiktok.py +++ b/gallery_dl/extractor/tiktok.py @@ -6,6 +6,7 @@ """Extractors for https://www.tiktok.com/""" +from json.decoder import JSONDecodeError from .common import Extractor, Message from .. import text, util, ytdl, exception @@ -143,12 +144,22 @@ class TiktokExtractor(Extractor): def _sanitize_url(self, url): return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1)) - def _extract_rehydration_data(self, url): - html = self.request(url).text - data = text.extr( - html, '') - return util.json_loads(data)["__DEFAULT_SCOPE__"] + def _extract_rehydration_data(self, url, *, retries=1): + try: + html = self.request(url).text + data = text.extr( + html, '') + return util.json_loads(data)["__DEFAULT_SCOPE__"] + except JSONDecodeError: + # We failed to retrieve rehydration data. This happens relatively + # frequently, so retry if we're told to do so. + self.log.warning("%s: Failed to retrieve rehydration data, trying " + "%d more time%s", url, retries, + "" if retries == 1 else "s") + if retries > 0: + return self._extract_rehydration_data(url, retries=retries-1) + raise def _extract_audio(self, post): audio = post["music"] @@ -179,7 +190,7 @@ class TiktokExtractor(Extractor): elif status == 10204: self.log.error("%s: Requested post not available", url) elif status == 10231: - self.log.error("%s: Region locked - Try downloading with a" + self.log.error("%s: Region locked - Try downloading with a " "VPN/proxy connection", url) else: self.log.error( From c7685bdfc775cb8e40f53e91215032ecaf6c00d9 Mon Sep 17 00:00:00 2001 From: CasualYT31 <21147925+CasualYT31@users.noreply.github.com> Date: Mon, 17 Mar 2025 11:00:52 +0000 Subject: [PATCH 2/5] [tiktok] Incorporate --retries and --http-timeout into rehydration data extraction --- gallery_dl/extractor/tiktok.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py index eaf942d1..ddbb7d07 100644 --- a/gallery_dl/extractor/tiktok.py +++ b/gallery_dl/extractor/tiktok.py @@ -144,7 +144,9 @@ class TiktokExtractor(Extractor): def _sanitize_url(self, url): return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1)) - def _extract_rehydration_data(self, url, *, retries=1): + def _extract_rehydration_data(self, url, *, retries=None): + if retries is None: + retries = self._retries try: html = self.request(url).text data = text.extr( @@ -153,10 +155,12 @@ class TiktokExtractor(Extractor): return util.json_loads(data)["__DEFAULT_SCOPE__"] except JSONDecodeError: # We failed to retrieve rehydration data. This happens relatively - # frequently, so retry if we're told to do so. + # frequently when making many requests, so retry. self.log.warning("%s: Failed to retrieve rehydration data, trying " - "%d more time%s", url, retries, - "" if retries == 1 else "s") + "%d more time%s and delaying for %d second(s)", + url, retries, "" if retries == 1 else "s", + self._timeout) + self.sleep(self._timeout, "retry") if retries > 0: return self._extract_rehydration_data(url, retries=retries-1) raise From d6d2b1fba0a001392449d02cfcb05817451054f5 Mon Sep 17 00:00:00 2001 From: CasualYT31 <21147925+CasualYT31@users.noreply.github.com> Date: Wed, 19 Mar 2025 18:50:06 +0000 Subject: [PATCH 3/5] [tiktok] Queue links from user profiles instead of returning a list of links [tiktok] Move avatar download to the user extractor, which results in more accurate metadata output (it would previously write the metadata of the video which the avatar was scraped from) [tiktok] Fix tests and remove redundant user profile test --- gallery_dl/extractor/tiktok.py | 82 +++++++++++++--------------------- test/results/tiktok.py | 40 +++++++---------- 2 files changed, 47 insertions(+), 75 deletions(-) diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py index ddbb7d07..a87bd7b4 100644 --- a/gallery_dl/extractor/tiktok.py +++ b/gallery_dl/extractor/tiktok.py @@ -26,14 +26,8 @@ class TiktokExtractor(Extractor): def _init(self): self.audio = self.config("audio", True) self.video = self.config("videos", True) - if not self.config("avatar", True): - self.avatar = util.false def items(self): - # We assume that all of the URLs served by urls() come from the same - # author. - downloaded_avatar = not self.avatar() - for tiktok_url in self.urls(): tiktok_url = self._sanitize_url(tiktok_url) data = self._extract_rehydration_data(tiktok_url) @@ -50,18 +44,10 @@ class TiktokExtractor(Extractor): post = video_detail["itemInfo"]["itemStruct"] author = post["author"] - post["user"] = user = author["uniqueId"] + post["user"] = author["uniqueId"] post["date"] = text.parse_timestamp(post["createTime"]) original_title = title = post["desc"] - if not downloaded_avatar: - avatar_url = author["avatarLarger"] - avatar = self._generate_avatar( - avatar_url, post, user, author["id"]) - yield Message.Directory, avatar - yield Message.Url, avatar_url, avatar - downloaded_avatar = True - yield Message.Directory, post ytdl_media = False @@ -112,35 +98,6 @@ class TiktokExtractor(Extractor): }) yield Message.Url, "ytdl:" + tiktok_url, post - # If we couldn't download the avatar because the given user has no - # posts, we'll need to make a separate request for the user's page - # and download the avatar that way. - if not downloaded_avatar: - user_name = self.avatar() - profile_url = "https://www.tiktok.com/@{}".format(user_name) - data = self._extract_rehydration_data(profile_url) - data = data["webapp.user-detail"]["userInfo"]["user"] - data["user"] = user_name - avatar_url = data["avatarLarger"] - avatar = self._generate_avatar( - avatar_url, data, user_name, data["id"]) - yield Message.Directory, avatar - yield Message.Url, avatar_url, avatar - - def avatar(self): - return False - - def _generate_avatar(self, avatar_url, data, user_name, user_id): - avatar = text.nameext_from_url(avatar_url, data.copy()) - avatar.update({ - "type" : "avatar", - "title" : "@" + user_name, - "id" : user_id, - "img_id": avatar["filename"].partition("~")[0], - "num" : 0, - }) - return avatar - def _sanitize_url(self, url): return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1)) @@ -245,7 +202,10 @@ class TiktokUserExtractor(TiktokExtractor): pattern = BASE_PATTERN + r"/@([\w_.-]+)/?(?:$|\?|#)" example = "https://www.tiktok.com/@USER" - def urls(self): + def _init(self): + self.avatar = self.config("avatar", True) + + def items(self): """Attempt to use yt-dlp/youtube-dl to extract links from a user's page""" @@ -278,19 +238,39 @@ class TiktokUserExtractor(TiktokExtractor): ytdl_instance = ytdl.construct_YoutubeDL( module, self, user_opts, extr_opts) - # transfer cookies to ytdl + # Transfer cookies to ytdl. if self.cookies: set_cookie = ytdl_instance.cookiejar.set_cookie for cookie in self.cookies: set_cookie(cookie) + user_name = self.groups[0] + profile_url = "{}/@{}".format(self.root, user_name) + if self.avatar: + avatar_url, avatar = self._generate_avatar(user_name, profile_url) + yield Message.Directory, avatar + yield Message.Url, avatar_url, avatar + with ytdl_instance as ydl: info_dict = ydl._YoutubeDL__extract_info( - "{}/@{}".format(self.root, self.groups[0]), - ydl.get_info_extractor("TikTokUser"), + profile_url, ydl.get_info_extractor("TikTokUser"), False, {}, True) # This should include video and photo posts in /video/ URL form. - return [video["url"] for video in info_dict["entries"]] + for video in info_dict["entries"]: + data = {"_extractor": TiktokPostExtractor} + yield Message.Queue, video["url"].partition("?")[0], data - def avatar(self): - return self.groups[0] + def _generate_avatar(self, user_name, profile_url): + data = self._extract_rehydration_data(profile_url) + data = data["webapp.user-detail"]["userInfo"]["user"] + data["user"] = user_name + avatar_url = data["avatarLarger"] + avatar = text.nameext_from_url(avatar_url, data.copy()) + avatar.update({ + "type" : "avatar", + "title" : "@" + user_name, + "id" : data["id"], + "img_id": avatar["filename"].partition("~")[0], + "num" : 0, + }) + return (avatar_url, avatar) diff --git a/test/results/tiktok.py b/test/results/tiktok.py index 9cd73a92..d38540b5 100644 --- a/test/results/tiktok.py +++ b/test/results/tiktok.py @@ -7,7 +7,8 @@ from gallery_dl.extractor import tiktok PATTERN = r"https://p1[69]-[^/?#.]+\.tiktokcdn[^/?#.]*\.com/[^/?#]+/\w+~.*\.jpe?g" -PATTERN_WITH_AUDIO = r"(?:" + PATTERN + r"|ytdl:http.+)" +PATTERN_WITH_AUDIO = r"(?:" + PATTERN + r"|https://v\d+m?\.tiktokcdn[^/?#.]*\.com/[^?#]+\?[^/?#]+)" +USER_PATTERN = r"(https://www.tiktok.com/@([\w_.-]+)/video/(\d+)|" + PATTERN + r")" __tests__ = ( @@ -17,7 +18,7 @@ __tests__ = ( "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, "#pattern" : PATTERN, - "#options" : {"videos": False}, + "#options" : {"videos": False, "audio": False}, }, { @@ -26,7 +27,7 @@ __tests__ = ( "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, "#pattern" : PATTERN, - "#options" : {"videos": False}, + "#options" : {"videos": False, "audio": False}, }, { @@ -35,7 +36,7 @@ __tests__ = ( "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, "#pattern" : PATTERN, - "#options" : {"videos": False}, + "#options" : {"videos": False, "audio": False}, }, { @@ -44,7 +45,7 @@ __tests__ = ( "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, "#pattern" : PATTERN, - "#options" : {"videos": False}, + "#options" : {"videos": False, "audio": False}, }, { @@ -53,7 +54,7 @@ __tests__ = ( "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, "#pattern" : PATTERN, - "#options" : {"videos": False}, + "#options" : {"videos": False, "audio": False}, }, { @@ -62,7 +63,7 @@ __tests__ = ( "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, "#pattern" : PATTERN, - "#options" : {"videos": False}, + "#options" : {"videos": False, "audio": False}, }, { @@ -71,7 +72,7 @@ __tests__ = ( "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, "#pattern" : PATTERN, - "#options" : {"videos": False}, + "#options" : {"videos": False, "audio": False}, }, { @@ -80,7 +81,7 @@ __tests__ = ( "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, "#pattern" : PATTERN, - "#options" : {"videos": False}, + "#options" : {"videos": False, "audio": False}, }, { @@ -89,7 +90,7 @@ __tests__ = ( "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, "#pattern" : PATTERN, - "#options" : {"videos": False}, + "#options" : {"videos": False, "audio": False}, }, { @@ -97,7 +98,7 @@ __tests__ = ( "#comment" : "deleted post", "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, - "#options" : {"videos": False}, + "#options" : {"videos": False, "audio": False}, "count" : 0, }, @@ -107,7 +108,7 @@ __tests__ = ( "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, "#urls" : "ytdl:https://www.tiktok.com/@memezar/video/7449708266168274208", - "#options" : {"videos": True}, + "#options" : {"videos": True, "audio": True}, }, { @@ -116,7 +117,7 @@ __tests__ = ( "#category" : ("", "tiktok", "post"), "#class" : tiktok.TiktokPostExtractor, "#urls" : "ytdl:https://www.tiktok.com/@memezar/video/7449708266168274208", - "#options" : {"videos": True}, + "#options" : {"videos": True, "audio": True}, }, { @@ -241,17 +242,8 @@ __tests__ = ( "#comment" : "User profile", "#category" : ("", "tiktok", "user"), "#class" : tiktok.TiktokUserExtractor, - "#pattern" : PATTERN_WITH_AUDIO, - "#options" : {"videos": True, "tiktok-range": "1-10"}, -}, - -{ - "#url" : "https://www.tiktok.com/@chillezy/", - "#comment" : "User profile without audio or videos", - "#category" : ("", "tiktok", "user"), - "#class" : tiktok.TiktokUserExtractor, - "#pattern" : PATTERN, - "#options" : {"videos": False, "tiktok-range": "1-10"}, + "#pattern" : USER_PATTERN, + "#options" : {"videos": True, "audio": True, "tiktok-range": "1-10"}, }, { From 7b791405b6969e11f7afd57f6611a960040cfa00 Mon Sep 17 00:00:00 2001 From: CasualYT31 <21147925+CasualYT31@users.noreply.github.com> Date: Wed, 19 Mar 2025 19:14:02 +0000 Subject: [PATCH 4/5] [tiktok] Address review comments on _extract_rehydration_data() --- gallery_dl/extractor/tiktok.py | 41 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py index a87bd7b4..54320eb2 100644 --- a/gallery_dl/extractor/tiktok.py +++ b/gallery_dl/extractor/tiktok.py @@ -6,7 +6,6 @@ """Extractors for https://www.tiktok.com/""" -from json.decoder import JSONDecodeError from .common import Extractor, Message from .. import text, util, ytdl, exception @@ -101,26 +100,26 @@ class TiktokExtractor(Extractor): def _sanitize_url(self, url): return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1)) - def _extract_rehydration_data(self, url, *, retries=None): - if retries is None: - retries = self._retries - try: - html = self.request(url).text - data = text.extr( - html, '') - return util.json_loads(data)["__DEFAULT_SCOPE__"] - except JSONDecodeError: - # We failed to retrieve rehydration data. This happens relatively - # frequently when making many requests, so retry. - self.log.warning("%s: Failed to retrieve rehydration data, trying " - "%d more time%s and delaying for %d second(s)", - url, retries, "" if retries == 1 else "s", - self._timeout) - self.sleep(self._timeout, "retry") - if retries > 0: - return self._extract_rehydration_data(url, retries=retries-1) - raise + def _extract_rehydration_data(self, url): + tries = 0 + while True: + try: + html = self.request(url).text + data = text.extr( + html, '') + return util.json_loads(data)["__DEFAULT_SCOPE__"] + except ValueError: + # We failed to retrieve rehydration data. This happens + # relatively frequently when making many requests, so + # retry. + tries += 1 + self.log.warning("%s: Failed to retrieve rehydration data " + "(%s/%s)", url.rpartition("/")[2], tries, + self._retries) + self.sleep(self._timeout, "retry") + if tries >= self._retries: + raise def _extract_audio(self, post): audio = post["music"] From 685660bcb1c3c43ebba091e753ccc4a22418105f Mon Sep 17 00:00:00 2001 From: CasualYT31 <21147925+CasualYT31@users.noreply.github.com> Date: Wed, 19 Mar 2025 19:21:33 +0000 Subject: [PATCH 5/5] [tiktok] Don't skip last retry --- gallery_dl/extractor/tiktok.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py index 54320eb2..4c1da7ae 100644 --- a/gallery_dl/extractor/tiktok.py +++ b/gallery_dl/extractor/tiktok.py @@ -113,13 +113,13 @@ class TiktokExtractor(Extractor): # We failed to retrieve rehydration data. This happens # relatively frequently when making many requests, so # retry. + if tries >= self._retries: + raise tries += 1 self.log.warning("%s: Failed to retrieve rehydration data " "(%s/%s)", url.rpartition("/")[2], tries, self._retries) self.sleep(self._timeout, "retry") - if tries >= self._retries: - raise def _extract_audio(self, post): audio = post["music"]