[tiktok] do not fail entire extraction if one post fails (#8962)
This commit is contained in:
@@ -57,94 +57,102 @@ class TiktokExtractor(Extractor):
|
|||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
for tiktok_url in self.posts():
|
for tiktok_url in self.posts():
|
||||||
tiktok_url = self._sanitize_url(tiktok_url)
|
try:
|
||||||
|
tiktok_url = self._sanitize_url(tiktok_url)
|
||||||
|
|
||||||
data = self._extract_rehydration_data(tiktok_url)
|
|
||||||
if "webapp.video-detail" not in data:
|
|
||||||
# Only /video/ links result in the video-detail dict we need.
|
|
||||||
# Try again using that form of link.
|
|
||||||
tiktok_url = self._sanitize_url(
|
|
||||||
data["seo.abtest"]["canonical"])
|
|
||||||
data = self._extract_rehydration_data(tiktok_url)
|
data = self._extract_rehydration_data(tiktok_url)
|
||||||
video_detail = data["webapp.video-detail"]
|
if "webapp.video-detail" not in data:
|
||||||
if not self._check_status_code(video_detail, tiktok_url, "post"):
|
# Only /video/ links result in the video-detail dict we
|
||||||
continue
|
# need. Try again using that form of link.
|
||||||
post = video_detail["itemInfo"]["itemStruct"]
|
tiktok_url = self._sanitize_url(
|
||||||
|
data["seo.abtest"]["canonical"])
|
||||||
|
data = self._extract_rehydration_data(tiktok_url)
|
||||||
|
video_detail = data["webapp.video-detail"]
|
||||||
|
if not self._check_status_code(
|
||||||
|
video_detail, tiktok_url, "post"):
|
||||||
|
continue
|
||||||
|
post = video_detail["itemInfo"]["itemStruct"]
|
||||||
|
|
||||||
post["user"] = (a := post.get("author")) and a["uniqueId"] or ""
|
post["user"] = \
|
||||||
post["date"] = self.parse_timestamp(post["createTime"])
|
(a := post.get("author")) and a["uniqueId"] or ""
|
||||||
post["post_type"] = "image" if "imagePost" in post else "video"
|
post["date"] = self.parse_timestamp(post["createTime"])
|
||||||
original_title = title = post["desc"]
|
post["post_type"] = "image" if "imagePost" in post else "video"
|
||||||
|
original_title = title = post["desc"]
|
||||||
|
|
||||||
yield Message.Directory, "", post
|
yield Message.Directory, "", post
|
||||||
ytdl_media = False
|
ytdl_media = False
|
||||||
|
|
||||||
if "imagePost" in post:
|
if "imagePost" in post:
|
||||||
if self.photo:
|
if self.photo:
|
||||||
|
if not original_title:
|
||||||
|
title = f"TikTok photo #{post['id']}"
|
||||||
|
img_list = post["imagePost"]["images"]
|
||||||
|
for i, img in enumerate(img_list, 1):
|
||||||
|
url = img["imageURL"]["urlList"][0]
|
||||||
|
text.nameext_from_url(url, post)
|
||||||
|
post.update({
|
||||||
|
"type" : "image",
|
||||||
|
"image" : img,
|
||||||
|
"title" : title,
|
||||||
|
"num" : i,
|
||||||
|
"file_id": post["filename"].partition("~")[0],
|
||||||
|
"width" : img["imageWidth"],
|
||||||
|
"height" : img["imageHeight"],
|
||||||
|
})
|
||||||
|
yield Message.Url, url, post
|
||||||
|
|
||||||
|
if self.audio and "music" in post:
|
||||||
|
if self.audio == "ytdl":
|
||||||
|
ytdl_media = "audio"
|
||||||
|
elif url := self._extract_audio(post):
|
||||||
|
yield Message.Url, url, post
|
||||||
|
|
||||||
|
elif "video" in post:
|
||||||
|
if self.video == "ytdl":
|
||||||
|
ytdl_media = "video"
|
||||||
|
elif self.video and (url := self._extract_video(post)):
|
||||||
|
yield Message.Url, url, post
|
||||||
|
del post["_fallback"]
|
||||||
|
|
||||||
|
if self.cover:
|
||||||
|
for url in self._extract_covers(post, "video"):
|
||||||
|
yield Message.Url, url, post
|
||||||
|
if self.cover != "all":
|
||||||
|
break
|
||||||
|
|
||||||
|
if self.subtitles:
|
||||||
|
for url in self._extract_subtitles(post, "video"):
|
||||||
|
yield Message.Url, url, post
|
||||||
|
|
||||||
|
# remove the subtitle related fields for the next item
|
||||||
|
post.pop("subtitle_lang_id", None)
|
||||||
|
post.pop("subtitle_lang_codename", None)
|
||||||
|
post.pop("subtitle_format", None)
|
||||||
|
post.pop("subtitle_version", None)
|
||||||
|
post.pop("subtitle_source", None)
|
||||||
|
else:
|
||||||
|
self.log.info("%s: Skipping post", tiktok_url)
|
||||||
|
|
||||||
|
if ytdl_media:
|
||||||
if not original_title:
|
if not original_title:
|
||||||
title = f"TikTok photo #{post['id']}"
|
title = f"TikTok {ytdl_media} #{post['id']}"
|
||||||
img_list = post["imagePost"]["images"]
|
post.update({
|
||||||
for i, img in enumerate(img_list, 1):
|
"type" : ytdl_media,
|
||||||
url = img["imageURL"]["urlList"][0]
|
"image" : None,
|
||||||
text.nameext_from_url(url, post)
|
"filename" : "",
|
||||||
post.update({
|
"extension" :
|
||||||
"type" : "image",
|
"mp3" if ytdl_media == "audio" else "mp4",
|
||||||
"image" : img,
|
"title" : title,
|
||||||
"title" : title,
|
"num" : 0,
|
||||||
"num" : i,
|
"file_id" : "",
|
||||||
"file_id": post["filename"].partition("~")[0],
|
"width" : 0,
|
||||||
"width" : img["imageWidth"],
|
"height" : 0,
|
||||||
"height" : img["imageHeight"],
|
})
|
||||||
})
|
yield Message.Url, "ytdl:" + tiktok_url, post
|
||||||
yield Message.Url, url, post
|
except Exception as exc:
|
||||||
|
self.log.traceback(exc)
|
||||||
if self.audio and "music" in post:
|
self.log.error("%s: Failed to extract post (%s: %s)",
|
||||||
if self.audio == "ytdl":
|
tiktok_url, exc.__class__.__name__, exc)
|
||||||
ytdl_media = "audio"
|
|
||||||
elif url := self._extract_audio(post):
|
|
||||||
yield Message.Url, url, post
|
|
||||||
|
|
||||||
elif "video" in post:
|
|
||||||
if self.video == "ytdl":
|
|
||||||
ytdl_media = "video"
|
|
||||||
elif self.video and (url := self._extract_video(post)):
|
|
||||||
yield Message.Url, url, post
|
|
||||||
del post["_fallback"]
|
|
||||||
|
|
||||||
if self.cover:
|
|
||||||
for url in self._extract_covers(post, "video"):
|
|
||||||
yield Message.Url, url, post
|
|
||||||
if self.cover != "all":
|
|
||||||
break
|
|
||||||
|
|
||||||
if self.subtitles:
|
|
||||||
for url in self._extract_subtitles(post, "video"):
|
|
||||||
yield Message.Url, url, post
|
|
||||||
|
|
||||||
# remove the subtitle related fields for the next item
|
|
||||||
post.pop("subtitle_lang_id", None)
|
|
||||||
post.pop("subtitle_lang_codename", None)
|
|
||||||
post.pop("subtitle_format", None)
|
|
||||||
post.pop("subtitle_version", None)
|
|
||||||
post.pop("subtitle_source", None)
|
|
||||||
else:
|
|
||||||
self.log.info("%s: Skipping post", tiktok_url)
|
|
||||||
|
|
||||||
if ytdl_media:
|
|
||||||
if not original_title:
|
|
||||||
title = f"TikTok {ytdl_media} #{post['id']}"
|
|
||||||
post.update({
|
|
||||||
"type" : ytdl_media,
|
|
||||||
"image" : None,
|
|
||||||
"filename" : "",
|
|
||||||
"extension" : "mp3" if ytdl_media == "audio" else "mp4",
|
|
||||||
"title" : title,
|
|
||||||
"num" : 0,
|
|
||||||
"file_id" : "",
|
|
||||||
"width" : 0,
|
|
||||||
"height" : 0,
|
|
||||||
})
|
|
||||||
yield Message.Url, "ytdl:" + tiktok_url, post
|
|
||||||
|
|
||||||
def _sanitize_url(self, url):
|
def _sanitize_url(self, url):
|
||||||
return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1))
|
return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1))
|
||||||
|
|||||||
Reference in New Issue
Block a user