From 2001cf12e878a4936dcd180d3408e08299711110 Mon Sep 17 00:00:00 2001
From: CasualYT31 <21147925+CasualYT31@users.noreply.github.com>
Date: Sun, 16 Mar 2025 15:00:08 +0000
Subject: [PATCH 1/5] [tiktok] Add retry mechanism to rehydration data
extraction
#7098
---
gallery_dl/extractor/tiktok.py | 25 ++++++++++++++++++-------
1 file changed, 18 insertions(+), 7 deletions(-)
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
index 30f310d6..eaf942d1 100644
--- a/gallery_dl/extractor/tiktok.py
+++ b/gallery_dl/extractor/tiktok.py
@@ -6,6 +6,7 @@
"""Extractors for https://www.tiktok.com/"""
+from json.decoder import JSONDecodeError
from .common import Extractor, Message
from .. import text, util, ytdl, exception
@@ -143,12 +144,22 @@ class TiktokExtractor(Extractor):
def _sanitize_url(self, url):
return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1))
- def _extract_rehydration_data(self, url):
- html = self.request(url).text
- data = text.extr(
- html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
- 'type="application/json">', '</script>')
- return util.json_loads(data)["__DEFAULT_SCOPE__"]
+ def _extract_rehydration_data(self, url, *, retries=1):
+ try:
+ html = self.request(url).text
+ data = text.extr(
+ html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
+ 'type="application/json">', '</script>')
+ return util.json_loads(data)["__DEFAULT_SCOPE__"]
+ except JSONDecodeError:
+ # We failed to retrieve rehydration data. This happens relatively
+ # frequently, so retry if we're told to do so.
+ self.log.warning("%s: Failed to retrieve rehydration data, trying "
+ "%d more time%s", url, retries,
+ "" if retries == 1 else "s")
+ if retries > 0:
+ return self._extract_rehydration_data(url, retries=retries-1)
+ raise
def _extract_audio(self, post):
audio = post["music"]
@@ -179,7 +190,7 @@ class TiktokExtractor(Extractor):
elif status == 10204:
self.log.error("%s: Requested post not available", url)
elif status == 10231:
- self.log.error("%s: Region locked - Try downloading with a"
+ self.log.error("%s: Region locked - Try downloading with a "
"VPN/proxy connection", url)
else:
self.log.error(
From c7685bdfc775cb8e40f53e91215032ecaf6c00d9 Mon Sep 17 00:00:00 2001
From: CasualYT31 <21147925+CasualYT31@users.noreply.github.com>
Date: Mon, 17 Mar 2025 11:00:52 +0000
Subject: [PATCH 2/5] [tiktok] Incorporate --retries and --http-timeout into
rehydration data extraction
---
gallery_dl/extractor/tiktok.py | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
index eaf942d1..ddbb7d07 100644
--- a/gallery_dl/extractor/tiktok.py
+++ b/gallery_dl/extractor/tiktok.py
@@ -144,7 +144,9 @@ class TiktokExtractor(Extractor):
def _sanitize_url(self, url):
return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1))
- def _extract_rehydration_data(self, url, *, retries=1):
+ def _extract_rehydration_data(self, url, *, retries=None):
+ if retries is None:
+ retries = self._retries
try:
html = self.request(url).text
data = text.extr(
@@ -153,10 +155,12 @@ class TiktokExtractor(Extractor):
return util.json_loads(data)["__DEFAULT_SCOPE__"]
except JSONDecodeError:
# We failed to retrieve rehydration data. This happens relatively
- # frequently, so retry if we're told to do so.
+ # frequently when making many requests, so retry.
self.log.warning("%s: Failed to retrieve rehydration data, trying "
- "%d more time%s", url, retries,
- "" if retries == 1 else "s")
+ "%d more time%s and delaying for %d second(s)",
+ url, retries, "" if retries == 1 else "s",
+ self._timeout)
+ self.sleep(self._timeout, "retry")
if retries > 0:
return self._extract_rehydration_data(url, retries=retries-1)
raise
From d6d2b1fba0a001392449d02cfcb05817451054f5 Mon Sep 17 00:00:00 2001
From: CasualYT31 <21147925+CasualYT31@users.noreply.github.com>
Date: Wed, 19 Mar 2025 18:50:06 +0000
Subject: [PATCH 3/5] [tiktok] Queue links from user profiles instead of
returning a list of links
[tiktok] Move avatar download to the user extractor, which results in more accurate metadata output (it would previously write the metadata of the video which the avatar was scraped from)
[tiktok] Fix tests and remove redundant user profile test
---
gallery_dl/extractor/tiktok.py | 82 +++++++++++++---------------------
test/results/tiktok.py | 40 +++++++----------
2 files changed, 47 insertions(+), 75 deletions(-)
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
index ddbb7d07..a87bd7b4 100644
--- a/gallery_dl/extractor/tiktok.py
+++ b/gallery_dl/extractor/tiktok.py
@@ -26,14 +26,8 @@ class TiktokExtractor(Extractor):
def _init(self):
self.audio = self.config("audio", True)
self.video = self.config("videos", True)
- if not self.config("avatar", True):
- self.avatar = util.false
def items(self):
- # We assume that all of the URLs served by urls() come from the same
- # author.
- downloaded_avatar = not self.avatar()
-
for tiktok_url in self.urls():
tiktok_url = self._sanitize_url(tiktok_url)
data = self._extract_rehydration_data(tiktok_url)
@@ -50,18 +44,10 @@ class TiktokExtractor(Extractor):
post = video_detail["itemInfo"]["itemStruct"]
author = post["author"]
- post["user"] = user = author["uniqueId"]
+ post["user"] = author["uniqueId"]
post["date"] = text.parse_timestamp(post["createTime"])
original_title = title = post["desc"]
- if not downloaded_avatar:
- avatar_url = author["avatarLarger"]
- avatar = self._generate_avatar(
- avatar_url, post, user, author["id"])
- yield Message.Directory, avatar
- yield Message.Url, avatar_url, avatar
- downloaded_avatar = True
-
yield Message.Directory, post
ytdl_media = False
@@ -112,35 +98,6 @@ class TiktokExtractor(Extractor):
})
yield Message.Url, "ytdl:" + tiktok_url, post
- # If we couldn't download the avatar because the given user has no
- # posts, we'll need to make a separate request for the user's page
- # and download the avatar that way.
- if not downloaded_avatar:
- user_name = self.avatar()
- profile_url = "https://www.tiktok.com/@{}".format(user_name)
- data = self._extract_rehydration_data(profile_url)
- data = data["webapp.user-detail"]["userInfo"]["user"]
- data["user"] = user_name
- avatar_url = data["avatarLarger"]
- avatar = self._generate_avatar(
- avatar_url, data, user_name, data["id"])
- yield Message.Directory, avatar
- yield Message.Url, avatar_url, avatar
-
- def avatar(self):
- return False
-
- def _generate_avatar(self, avatar_url, data, user_name, user_id):
- avatar = text.nameext_from_url(avatar_url, data.copy())
- avatar.update({
- "type" : "avatar",
- "title" : "@" + user_name,
- "id" : user_id,
- "img_id": avatar["filename"].partition("~")[0],
- "num" : 0,
- })
- return avatar
-
def _sanitize_url(self, url):
return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1))
@@ -245,7 +202,10 @@ class TiktokUserExtractor(TiktokExtractor):
pattern = BASE_PATTERN + r"/@([\w_.-]+)/?(?:$|\?|#)"
example = "https://www.tiktok.com/@USER"
- def urls(self):
+ def _init(self):
+ self.avatar = self.config("avatar", True)
+
+ def items(self):
"""Attempt to use yt-dlp/youtube-dl to extract links from a
user's page"""
@@ -278,19 +238,39 @@ class TiktokUserExtractor(TiktokExtractor):
ytdl_instance = ytdl.construct_YoutubeDL(
module, self, user_opts, extr_opts)
- # transfer cookies to ytdl
+ # Transfer cookies to ytdl.
if self.cookies:
set_cookie = ytdl_instance.cookiejar.set_cookie
for cookie in self.cookies:
set_cookie(cookie)
+ user_name = self.groups[0]
+ profile_url = "{}/@{}".format(self.root, user_name)
+ if self.avatar:
+ avatar_url, avatar = self._generate_avatar(user_name, profile_url)
+ yield Message.Directory, avatar
+ yield Message.Url, avatar_url, avatar
+
with ytdl_instance as ydl:
info_dict = ydl._YoutubeDL__extract_info(
- "{}/@{}".format(self.root, self.groups[0]),
- ydl.get_info_extractor("TikTokUser"),
+ profile_url, ydl.get_info_extractor("TikTokUser"),
False, {}, True)
# This should include video and photo posts in /video/ URL form.
- return [video["url"] for video in info_dict["entries"]]
+ for video in info_dict["entries"]:
+ data = {"_extractor": TiktokPostExtractor}
+ yield Message.Queue, video["url"].partition("?")[0], data
- def avatar(self):
- return self.groups[0]
+ def _generate_avatar(self, user_name, profile_url):
+ data = self._extract_rehydration_data(profile_url)
+ data = data["webapp.user-detail"]["userInfo"]["user"]
+ data["user"] = user_name
+ avatar_url = data["avatarLarger"]
+ avatar = text.nameext_from_url(avatar_url, data.copy())
+ avatar.update({
+ "type" : "avatar",
+ "title" : "@" + user_name,
+ "id" : data["id"],
+ "img_id": avatar["filename"].partition("~")[0],
+ "num" : 0,
+ })
+ return (avatar_url, avatar)
diff --git a/test/results/tiktok.py b/test/results/tiktok.py
index 9cd73a92..d38540b5 100644
--- a/test/results/tiktok.py
+++ b/test/results/tiktok.py
@@ -7,7 +7,8 @@
from gallery_dl.extractor import tiktok
PATTERN = r"https://p1[69]-[^/?#.]+\.tiktokcdn[^/?#.]*\.com/[^/?#]+/\w+~.*\.jpe?g"
-PATTERN_WITH_AUDIO = r"(?:" + PATTERN + r"|ytdl:http.+)"
+PATTERN_WITH_AUDIO = r"(?:" + PATTERN + r"|https://v\d+m?\.tiktokcdn[^/?#.]*\.com/[^?#]+\?[^/?#]+)"
+USER_PATTERN = r"(https://www.tiktok.com/@([\w_.-]+)/video/(\d+)|" + PATTERN + r")"
__tests__ = (
@@ -17,7 +18,7 @@ __tests__ = (
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
"#pattern" : PATTERN,
- "#options" : {"videos": False},
+ "#options" : {"videos": False, "audio": False},
},
{
@@ -26,7 +27,7 @@ __tests__ = (
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
"#pattern" : PATTERN,
- "#options" : {"videos": False},
+ "#options" : {"videos": False, "audio": False},
},
{
@@ -35,7 +36,7 @@ __tests__ = (
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
"#pattern" : PATTERN,
- "#options" : {"videos": False},
+ "#options" : {"videos": False, "audio": False},
},
{
@@ -44,7 +45,7 @@ __tests__ = (
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
"#pattern" : PATTERN,
- "#options" : {"videos": False},
+ "#options" : {"videos": False, "audio": False},
},
{
@@ -53,7 +54,7 @@ __tests__ = (
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
"#pattern" : PATTERN,
- "#options" : {"videos": False},
+ "#options" : {"videos": False, "audio": False},
},
{
@@ -62,7 +63,7 @@ __tests__ = (
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
"#pattern" : PATTERN,
- "#options" : {"videos": False},
+ "#options" : {"videos": False, "audio": False},
},
{
@@ -71,7 +72,7 @@ __tests__ = (
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
"#pattern" : PATTERN,
- "#options" : {"videos": False},
+ "#options" : {"videos": False, "audio": False},
},
{
@@ -80,7 +81,7 @@ __tests__ = (
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
"#pattern" : PATTERN,
- "#options" : {"videos": False},
+ "#options" : {"videos": False, "audio": False},
},
{
@@ -89,7 +90,7 @@ __tests__ = (
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
"#pattern" : PATTERN,
- "#options" : {"videos": False},
+ "#options" : {"videos": False, "audio": False},
},
{
@@ -97,7 +98,7 @@ __tests__ = (
"#comment" : "deleted post",
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
- "#options" : {"videos": False},
+ "#options" : {"videos": False, "audio": False},
"count" : 0,
},
@@ -107,7 +108,7 @@ __tests__ = (
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
"#urls" : "ytdl:https://www.tiktok.com/@memezar/video/7449708266168274208",
- "#options" : {"videos": True},
+ "#options" : {"videos": True, "audio": True},
},
{
@@ -116,7 +117,7 @@ __tests__ = (
"#category" : ("", "tiktok", "post"),
"#class" : tiktok.TiktokPostExtractor,
"#urls" : "ytdl:https://www.tiktok.com/@memezar/video/7449708266168274208",
- "#options" : {"videos": True},
+ "#options" : {"videos": True, "audio": True},
},
{
@@ -241,17 +242,8 @@ __tests__ = (
"#comment" : "User profile",
"#category" : ("", "tiktok", "user"),
"#class" : tiktok.TiktokUserExtractor,
- "#pattern" : PATTERN_WITH_AUDIO,
- "#options" : {"videos": True, "tiktok-range": "1-10"},
-},
-
-{
- "#url" : "https://www.tiktok.com/@chillezy/",
- "#comment" : "User profile without audio or videos",
- "#category" : ("", "tiktok", "user"),
- "#class" : tiktok.TiktokUserExtractor,
- "#pattern" : PATTERN,
- "#options" : {"videos": False, "tiktok-range": "1-10"},
+ "#pattern" : USER_PATTERN,
+ "#options" : {"videos": True, "audio": True, "tiktok-range": "1-10"},
},
{
From 7b791405b6969e11f7afd57f6611a960040cfa00 Mon Sep 17 00:00:00 2001
From: CasualYT31 <21147925+CasualYT31@users.noreply.github.com>
Date: Wed, 19 Mar 2025 19:14:02 +0000
Subject: [PATCH 4/5] [tiktok] Address review comments on
_extract_rehydration_data()
---
gallery_dl/extractor/tiktok.py | 41 +++++++++++++++++-----------------
1 file changed, 20 insertions(+), 21 deletions(-)
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
index a87bd7b4..54320eb2 100644
--- a/gallery_dl/extractor/tiktok.py
+++ b/gallery_dl/extractor/tiktok.py
@@ -6,7 +6,6 @@
"""Extractors for https://www.tiktok.com/"""
-from json.decoder import JSONDecodeError
from .common import Extractor, Message
from .. import text, util, ytdl, exception
@@ -101,26 +100,26 @@ class TiktokExtractor(Extractor):
def _sanitize_url(self, url):
return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1))
- def _extract_rehydration_data(self, url, *, retries=None):
- if retries is None:
- retries = self._retries
- try:
- html = self.request(url).text
- data = text.extr(
- html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
- 'type="application/json">', '</script>')
- return util.json_loads(data)["__DEFAULT_SCOPE__"]
- except JSONDecodeError:
- # We failed to retrieve rehydration data. This happens relatively
- # frequently when making many requests, so retry.
- self.log.warning("%s: Failed to retrieve rehydration data, trying "
- "%d more time%s and delaying for %d second(s)",
- url, retries, "" if retries == 1 else "s",
- self._timeout)
- self.sleep(self._timeout, "retry")
- if retries > 0:
- return self._extract_rehydration_data(url, retries=retries-1)
- raise
+ def _extract_rehydration_data(self, url):
+ tries = 0
+ while True:
+ try:
+ html = self.request(url).text
+ data = text.extr(
+ html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
+ 'type="application/json">', '</script>')
+ return util.json_loads(data)["__DEFAULT_SCOPE__"]
+ except ValueError:
+ # We failed to retrieve rehydration data. This happens
+ # relatively frequently when making many requests, so
+ # retry.
+ tries += 1
+ self.log.warning("%s: Failed to retrieve rehydration data "
+ "(%s/%s)", url.rpartition("/")[2], tries,
+ self._retries)
+ self.sleep(self._timeout, "retry")
+ if tries >= self._retries:
+ raise
def _extract_audio(self, post):
audio = post["music"]
From 685660bcb1c3c43ebba091e753ccc4a22418105f Mon Sep 17 00:00:00 2001
From: CasualYT31 <21147925+CasualYT31@users.noreply.github.com>
Date: Wed, 19 Mar 2025 19:21:33 +0000
Subject: [PATCH 5/5] [tiktok] Don't skip last retry
---
gallery_dl/extractor/tiktok.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
index 54320eb2..4c1da7ae 100644
--- a/gallery_dl/extractor/tiktok.py
+++ b/gallery_dl/extractor/tiktok.py
@@ -113,13 +113,13 @@ class TiktokExtractor(Extractor):
# We failed to retrieve rehydration data. This happens
# relatively frequently when making many requests, so
# retry.
+ if tries >= self._retries:
+ raise
tries += 1
self.log.warning("%s: Failed to retrieve rehydration data "
"(%s/%s)", url.rpartition("/")[2], tries,
self._retries)
self.sleep(self._timeout, "retry")
- if tries >= self._retries:
- raise
def _extract_audio(self, post):
audio = post["music"]