[tiktok] do not fail entire extraction if one post fails (#8962)

This commit is contained in:
CasualYouTuber31
2026-01-30 22:03:59 +00:00
committed by GitHub
parent fd5f5611f6
commit 01657caa15

View File

@@ -57,94 +57,102 @@ class TiktokExtractor(Extractor):
def items(self):
    """Yield download messages for every post returned by ``self.posts()``.

    For each post this yields one ``Message.Directory`` carrying the post
    metadata dict, followed by a ``Message.Url`` for every selected media
    file (photos, audio, video, covers, subtitles), or a single ``ytdl:``
    URL when audio/video handling is set to ``"ytdl"``.

    Each post is handled inside its own ``try`` block: any exception is
    logged (traceback plus a one-line error) and the loop moves on to the
    next post, so a single broken post does not abort the extraction.
    """
    for tiktok_url in self.posts():
        try:
            tiktok_url = self._sanitize_url(tiktok_url)
            data = self._extract_rehydration_data(tiktok_url)
            if "webapp.video-detail" not in data:
                # Only /video/ links result in the video-detail dict we
                # need. Try again using that form of link.
                tiktok_url = self._sanitize_url(
                    data["seo.abtest"]["canonical"])
                data = self._extract_rehydration_data(tiktok_url)
            video_detail = data["webapp.video-detail"]

            if not self._check_status_code(
                    video_detail, tiktok_url, "post"):
                continue

            post = video_detail["itemInfo"]["itemStruct"]
            # Empty string when the author entry or its uniqueId is
            # missing/falsy.
            author = post.get("author")
            post["user"] = (author["uniqueId"] or "") if author else ""
            post["date"] = self.parse_timestamp(post["createTime"])
            post["post_type"] = "image" if "imagePost" in post else "video"
            original_title = title = post["desc"]

            yield Message.Directory, "", post
            # Becomes "audio" or "video" when that media kind is handed
            # over to ytdl instead of being extracted here.
            ytdl_media = False

            if "imagePost" in post:
                if self.photo:
                    if not original_title:
                        title = f"TikTok photo #{post['id']}"
                    img_list = post["imagePost"]["images"]
                    for i, img in enumerate(img_list, 1):
                        url = img["imageURL"]["urlList"][0]
                        text.nameext_from_url(url, post)
                        post.update({
                            "type"   : "image",
                            "image"  : img,
                            "title"  : title,
                            "num"    : i,
                            "file_id": post["filename"].partition("~")[0],
                            "width"  : img["imageWidth"],
                            "height" : img["imageHeight"],
                        })
                        yield Message.Url, url, post

                if self.audio and "music" in post:
                    if self.audio == "ytdl":
                        ytdl_media = "audio"
                    elif url := self._extract_audio(post):
                        yield Message.Url, url, post

            elif "video" in post:
                if self.video == "ytdl":
                    ytdl_media = "video"
                elif self.video and (url := self._extract_video(post)):
                    yield Message.Url, url, post
                    # Drop the fallback entry before the dict is reused
                    # for covers/subtitles; tolerate it being absent so a
                    # missing key is not reported as a failed post.
                    post.pop("_fallback", None)
                if self.cover:
                    for url in self._extract_covers(post, "video"):
                        yield Message.Url, url, post
                        if self.cover != "all":
                            break
                if self.subtitles:
                    for url in self._extract_subtitles(post, "video"):
                        yield Message.Url, url, post
                    # remove the subtitle related fields for the next item
                    for key in ("subtitle_lang_id",
                                "subtitle_lang_codename",
                                "subtitle_format",
                                "subtitle_version",
                                "subtitle_source"):
                        post.pop(key, None)

            else:
                self.log.info("%s: Skipping post", tiktok_url)

            if ytdl_media:
                if not original_title:
                    title = f"TikTok {ytdl_media} #{post['id']}"
                post.update({
                    "type"      : ytdl_media,
                    "image"     : None,
                    "filename"  : "",
                    "extension" :
                        "mp3" if ytdl_media == "audio" else "mp4",
                    "title"     : title,
                    "num"       : 0,
                    "file_id"   : "",
                    "width"     : 0,
                    "height"    : 0,
                })
                yield Message.Url, "ytdl:" + tiktok_url, post

        except Exception as exc:
            # Deliberately broad: a failure in one post must not stop the
            # remaining posts from being processed.
            self.log.traceback(exc)
            self.log.error("%s: Failed to extract post (%s: %s)",
                           tiktok_url, exc.__class__.__name__, exc)
def _sanitize_url(self, url):
    """Return ``url`` with its first ``/photo/`` segment replaced by
    ``/video/``, passed through ``text.ensure_http_scheme``.
    """
    return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1))