[tiktok] Restructure to allow user extractors to provide their own rehydration data (#8848)
This commit is contained in:
@@ -43,6 +43,7 @@ class TiktokExtractor(Extractor):
|
|||||||
def items(self):
|
def items(self):
|
||||||
for tiktok_url in self.posts():
|
for tiktok_url in self.posts():
|
||||||
tiktok_url = self._sanitize_url(tiktok_url)
|
tiktok_url = self._sanitize_url(tiktok_url)
|
||||||
|
|
||||||
data = self._extract_rehydration_data(tiktok_url)
|
data = self._extract_rehydration_data(tiktok_url)
|
||||||
if "webapp.video-detail" not in data:
|
if "webapp.video-detail" not in data:
|
||||||
# Only /video/ links result in the video-detail dict we need.
|
# Only /video/ links result in the video-detail dict we need.
|
||||||
@@ -51,11 +52,10 @@ class TiktokExtractor(Extractor):
|
|||||||
data["seo.abtest"]["canonical"])
|
data["seo.abtest"]["canonical"])
|
||||||
data = self._extract_rehydration_data(tiktok_url)
|
data = self._extract_rehydration_data(tiktok_url)
|
||||||
video_detail = data["webapp.video-detail"]
|
video_detail = data["webapp.video-detail"]
|
||||||
|
|
||||||
if not self._check_status_code(video_detail, tiktok_url, "post"):
|
if not self._check_status_code(video_detail, tiktok_url, "post"):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
post = video_detail["itemInfo"]["itemStruct"]
|
post = video_detail["itemInfo"]["itemStruct"]
|
||||||
|
|
||||||
post["user"] = (a := post.get("author")) and a["uniqueId"] or ""
|
post["user"] = (a := post.get("author")) and a["uniqueId"] or ""
|
||||||
post["date"] = self.parse_timestamp(post["createTime"])
|
post["date"] = self.parse_timestamp(post["createTime"])
|
||||||
post["post_type"] = "image" if "imagePost" in post else "video"
|
post["post_type"] = "image" if "imagePost" in post else "video"
|
||||||
@@ -414,7 +414,7 @@ class TiktokPostExtractor(TiktokExtractor):
|
|||||||
def posts(self):
|
def posts(self):
|
||||||
user, post_id = self.groups
|
user, post_id = self.groups
|
||||||
url = f"{self.root}/@{user or ''}/video/{post_id}"
|
url = f"{self.root}/@{user or ''}/video/{post_id}"
|
||||||
return (url,)
|
return {url: None}
|
||||||
|
|
||||||
|
|
||||||
class TiktokVmpostExtractor(TiktokExtractor):
|
class TiktokVmpostExtractor(TiktokExtractor):
|
||||||
@@ -709,13 +709,13 @@ class TiktokSavedExtractor(TiktokExtractor):
|
|||||||
self.audio)
|
self.audio)
|
||||||
|
|
||||||
|
|
||||||
class TiktokFollowingExtractor(TiktokUserExtractor):
|
class TiktokFollowingExtractor(TiktokExtractor):
|
||||||
"""Extract all of the stories of all of the users you follow"""
|
"""Extract all of the stories of all of the users you follow"""
|
||||||
subcategory = "following"
|
subcategory = "following"
|
||||||
pattern = rf"{BASE_PATTERN}/following"
|
pattern = rf"{BASE_PATTERN}/following"
|
||||||
example = "https://www.tiktok.com/following"
|
example = "https://www.tiktok.com/following"
|
||||||
|
|
||||||
def items(self):
|
def posts(self):
|
||||||
"""Attempt to extract all of the stories of all of the accounts
|
"""Attempt to extract all of the stories of all of the accounts
|
||||||
the user follows"""
|
the user follows"""
|
||||||
|
|
||||||
@@ -732,7 +732,7 @@ class TiktokFollowingExtractor(TiktokUserExtractor):
|
|||||||
self.log.warning("%s: No followers with stories could be "
|
self.log.warning("%s: No followers with stories could be "
|
||||||
"extracted", self.url)
|
"extracted", self.url)
|
||||||
|
|
||||||
entries = []
|
entries = {}
|
||||||
# Batch all of the users up into groups of at most ten and use the
|
# Batch all of the users up into groups of at most ten and use the
|
||||||
# batch endpoint to improve performance. The response to the story user
|
# batch endpoint to improve performance. The response to the story user
|
||||||
# list request may also include the user themselves, so skip them if
|
# list request may also include the user themselves, so skip them if
|
||||||
@@ -769,13 +769,11 @@ class TiktokFollowingExtractor(TiktokUserExtractor):
|
|||||||
request.execute(self, f"Batch {batch_number}", query_parameters)
|
request.execute(self, f"Batch {batch_number}", query_parameters)
|
||||||
# We technically don't need to have the correct user name in the
|
# We technically don't need to have the correct user name in the
|
||||||
# URL and it's easier to just ignore it here.
|
# URL and it's easier to just ignore it here.
|
||||||
entries += request.generate_urls("https://www.tiktok.com/@_",
|
entries.update(request.generate_urls("https://www.tiktok.com/@_",
|
||||||
self.video, self.photo,
|
self.video, self.photo,
|
||||||
self.audio)
|
self.audio))
|
||||||
|
|
||||||
for video in entries:
|
return entries
|
||||||
data = {"_extractor": TiktokPostExtractor}
|
|
||||||
yield Message.Queue, video, data
|
|
||||||
|
|
||||||
def _is_current_user(self, user_id):
|
def _is_current_user(self, user_id):
|
||||||
self._ensure_rehydration_data_app_context_cache_is_populated()
|
self._ensure_rehydration_data_app_context_cache_is_populated()
|
||||||
@@ -1084,9 +1082,10 @@ class TiktokPaginationRequest:
|
|||||||
|
|
||||||
Returns
|
Returns
|
||||||
-------
|
-------
|
||||||
list
|
dict
|
||||||
Ideally one URL for each item, although subclasses are
|
Ideally one URL for each item, that points to a video detail
|
||||||
permitted to return a list of any format they wish.
|
object, although subclasses are permitted to return a list
|
||||||
|
or dict of any format they wish.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return []
|
return []
|
||||||
@@ -1193,7 +1192,7 @@ class TiktokItemListRequest(TiktokPaginationRequest):
|
|||||||
return len(self.items) > max(r.stop for r in self.range_predicate) - 1
|
return len(self.items) > max(r.stop for r in self.range_predicate) - 1
|
||||||
|
|
||||||
def generate_urls(self, profile_url, video, photo, audio):
|
def generate_urls(self, profile_url, video, photo, audio):
|
||||||
urls = []
|
urls = {}
|
||||||
for index, id in enumerate(self.items.keys()):
|
for index, id in enumerate(self.items.keys()):
|
||||||
if not self._matches_filters(self.items.get(id), index + 1, video,
|
if not self._matches_filters(self.items.get(id), index + 1, video,
|
||||||
photo, audio):
|
photo, audio):
|
||||||
@@ -1207,7 +1206,7 @@ class TiktokItemListRequest(TiktokPaginationRequest):
|
|||||||
except KeyError:
|
except KeyError:
|
||||||
# Use the given profile URL as a back up.
|
# Use the given profile URL as a back up.
|
||||||
url = f"{profile_url}/video/{id}"
|
url = f"{profile_url}/video/{id}"
|
||||||
urls.append(url)
|
urls[url] = self.items.get(id)
|
||||||
return urls
|
return urls
|
||||||
|
|
||||||
def _matches_filters(self, item, index, video, photo, audio):
|
def _matches_filters(self, item, index, video, photo, audio):
|
||||||
|
|||||||
Reference in New Issue
Block a user