From fd5f5611f6b0b2f51b960d9d045bb1d3a4d49ff5 Mon Sep 17 00:00:00 2001 From: bassberry <5092217+bassberry@users.noreply.github.com> Date: Fri, 30 Jan 2026 21:01:06 +0100 Subject: [PATCH] [tiktok] extract subtitles and all cover types (#8805) * Make sure that `img_id`, `audio_id` and `cover_id` fields are always available. The values are set '' where they are not applicable. Having `img_id` is necessary for the default `archive_fmt`, the other fields are handled for consistency. * Allow downloading more than one cover. The previous behavior is kept as-is, but setting the "covers" option to "all" now grabs all available covers. * Add support for downloading subtitles Allows filtering subtitles by source type (ASR, MT) and language. * Ensure archive uniqueness for covers and subtitles. * Update the URL test pattern to include the `image` extension. Although Tiktok may serve the covers with jpeg content, the file ending can be `.image`. The test before 0c14b164 failed because the asserted URL did not match all cover types, but the now used pattern needs the mentioned file ending. * Add support for "creator_caption" subtitles in "LC" format. These subtitles have the keys "Format" set to "creator_caption" and "Source" to "LC". * Add "LC" (Local Captions) as a subtitle source type in the documentation * Code deduplication and renaming subtitle metadata Changed the item type from singular `subtitle` to `subtitles`. Removed the wrong descriptor `cover` from the subtitles fallback title. * Refactor subtitle filtering The filter is now prepared in `_init` to prevent parsing the same config parameter for every item. The `_extract_subtitles` function will still extract if either filter (source or language) matches. * Generate a `file_id` for subtitles Subtitles have multiple fields that determine the unique file, so these are simply concatenated. This is similar to the cover types, only with more variations. 
* Added tests for subtitles * fix docs entries * fix '"covers": "all"' * simplify some code * Fix fallback title for subtitles Added the missing "f" to the f-string and added "subtitle" to the title. The resulting title will look like "TikTok video subtitle #1234567" --- docs/configuration.rst | 56 ++++++++++++- docs/gallery-dl.conf | 9 ++- gallery_dl/extractor/tiktok.py | 138 ++++++++++++++++++++++++++------- test/results/tiktok.py | 55 ++++++++++++- 4 files changed, 225 insertions(+), 33 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index a0e13d3a..68b63d79 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -5914,12 +5914,25 @@ Description extractor.tiktok.covers ----------------------- Type - ``bool`` + * ``bool`` + * ``string`` Default ``false`` Description Download video covers. + ``true`` + Download the first cover found in the following order: + + * ``thumbnail`` + * ``cover`` + * ``originCover`` + * ``dynamicCover`` + ``false`` + Do not download covers + ``"all"`` + Download all available covers + extractor.tiktok.photos ----------------------- @@ -5931,6 +5944,47 @@ Description Download photos. +extractor.tiktok.subtitles +-------------------------- +Type + * ``bool`` + * ``string`` +Default + ``false`` +Example + * ``"all"`` + * ``"ASR,MT,LC"`` + * ``"ASR,eng-US"`` +Description + Download video subtitles. + The subtitles can be filtered by source or language. + The following source types can be filtered: + + * ``ASR`` - Automatic Speech Recognition + * ``MT`` - Machine Translation + * ``LC`` - Local Captions / Creator Captions + + If both source types and language codes are provided, + subtitles matching either one are downloaded. + + ``true`` + Download all subtitles tagged ``ASR`` + ``false`` + Do not download subtitles + ``"all"`` + Download all available subtitles. 
+ ``"ASR,MT,eng-US,cmn-Hans-CN"`` + Download all automatically recognized and machine translated + subtitles, plus all English and Simplified Chinese subtitles. + + The source types and languages can be listed in any order. +Note + It is not possible to download only subtitles that match + a source type and a language at the same time. + (e.g. only ASR subtitles that are also fra-FR) + For this, refer to `extractor.*.image-filter`_. + + extractor.tiktok.videos ----------------------- Type diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 1f1b1724..c71d4430 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -825,10 +825,11 @@ }, "tiktok": { - "audio" : true, - "covers": false, - "photos": true, - "videos": true, + "audio" : true, + "covers" : false, + "photos" : true, + "subtitles": false, + "videos" : true, "tiktok-range": "", "posts": { diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py index 3025cace..c94f7a48 100644 --- a/gallery_dl/extractor/tiktok.py +++ b/gallery_dl/extractor/tiktok.py @@ -36,10 +36,25 @@ class TiktokExtractor(Extractor): self.audio = self.config("audio", True) self.video = self.config("videos", True) self.cover = self.config("covers", False) + self.subtitles = self.config("subtitles", False) self.range = self.config("tiktok-range") or "" self.range_predicate = util.predicate_range_parse(self.range) + # If one of these fields is None, the filter for it is disabled. + # Therefore, if both fields are none, all subtitles are extracted. 
+ self.subtitle_sources = None + self.subtitle_langs = None + + if self.subtitles and self.subtitles != "all": + if self.subtitles is True or not isinstance(self.subtitles, str): + self.subtitles = "ASR" + + known_sources = {"ASR", "MT", "LC"} + filters = set(self.subtitles.split(",")) + self.subtitle_sources = known_sources.intersection(filters) or None + self.subtitle_langs = filters.difference(known_sources) or None + def items(self): for tiktok_url in self.posts(): tiktok_url = self._sanitize_url(tiktok_url) @@ -73,13 +88,13 @@ class TiktokExtractor(Extractor): url = img["imageURL"]["urlList"][0] text.nameext_from_url(url, post) post.update({ - "type" : "image", - "image" : img, - "title" : title, - "num" : i, + "type" : "image", + "image" : img, + "title" : title, + "num" : i, "file_id": post["filename"].partition("~")[0], - "width" : img["imageWidth"], - "height": img["imageHeight"], + "width" : img["imageWidth"], + "height" : img["imageHeight"], }) yield Message.Url, url, post @@ -95,9 +110,23 @@ class TiktokExtractor(Extractor): elif self.video and (url := self._extract_video(post)): yield Message.Url, url, post del post["_fallback"] - if self.cover and (url := self._extract_cover(post, "video")): - yield Message.Url, url, post + if self.cover: + for url in self._extract_covers(post, "video"): + yield Message.Url, url, post + if self.cover != "all": + break + + if self.subtitles: + for url in self._extract_subtitles(post, "video"): + yield Message.Url, url, post + + # remove the subtitle related fields for the next item + post.pop("subtitle_lang_id", None) + post.pop("subtitle_lang_codename", None) + post.pop("subtitle_format", None) + post.pop("subtitle_version", None) + post.pop("subtitle_source", None) else: self.log.info("%s: Skipping post", tiktok_url) @@ -277,7 +306,7 @@ class TiktokExtractor(Extractor): "title" : post["desc"] or f"TikTok video #{post['id']}", "duration" : video.get("duration"), "num" : 0, - "file_id" : video.get("id"), + "file_id" : 
"", "width" : video.get("width"), "height" : video.get("height"), }) @@ -334,28 +363,85 @@ class TiktokExtractor(Extractor): post["extension"] = "mp3" return url - def _extract_cover(self, post, type): + def _extract_covers(self, post, type): media = post[type] for cover_id in ("thumbnail", "cover", "originCover", "dynamicCover"): if url := media.get(cover_id): - break - else: - return + text.nameext_from_url(url, post) + post.update({ + "type" : "cover", + "extension": "jpg", + "image" : url, + "title" : post["desc"] or + f"TikTok {type} cover #{post['id']}", + "duration" : media.get("duration"), + "num" : 0, + "file_id" : cover_id, + "width" : 0, + "height" : 0, + }) + yield url - text.nameext_from_url(url, post) - post.update({ - "type" : "cover", - "extension": "jpg", - "image" : url, - "title" : post["desc"] or f"TikTok {type} cover #{post['id']}", - "duration" : media.get("duration"), - "num" : 0, - "file_id" : cover_id, - "width" : 0, - "height" : 0, - }) - return url + def _extract_subtitles(self, post, type): + media = post[type] + sources_filtered = self.subtitle_sources is not None + langs_filtered = self.subtitle_langs is not None + + for subtitle in media.get("subtitleInfos", ()): + sub_lang_id = subtitle.get("LanguageID") + sub_lang_codename = subtitle.get("LanguageCodeName") + sub_format = subtitle.get("Format") + sub_version = subtitle.get("Version") + sub_source = subtitle.get("Source") + + # guard the iterable access + sources_match = sources_filtered and \ + sub_source in self.subtitle_sources + langs_match = langs_filtered and \ + sub_lang_codename in self.subtitle_langs + + # Subtitles will be extracted when either filter matches. + if not sources_match and not langs_match and \ + (sources_filtered or langs_filtered): + continue + + if url := subtitle.get("Url"): + text.nameext_from_url(url, post) + + # subtitle urls may not specify a filename, + # so the metadata can be used to build one. 
+ if not post["filename"]: + post["filename"] = (f"{post['id']}_{sub_lang_codename}_" + f"{sub_version}_{sub_source}") + post["extension"] = sub_format.lower() + + # replace extensions for known formats + if post["extension"] == "webvtt": + post["extension"] = "vtt" + elif post["extension"] == "creator_caption": + post["extension"] = "json" + + post.update({ + "type" : "subtitle", + "image" : None, + "title" : + post["desc"] or + f"TikTok {type} subtitle #{post['id']}", + "duration" : media.get("duration"), + "num" : 0, + "file_id" : + f"{sub_lang_id}_{sub_lang_codename}_{sub_source}_" + f"{sub_version}_{sub_format}", + "subtitle_lang_id" : sub_lang_id, + "subtitle_lang_codename": sub_lang_codename, + "subtitle_format" : sub_format, + "subtitle_version" : sub_version, + "subtitle_source" : sub_source, + "width" : 0, + "height" : 0, + }) + yield url def _check_status_code(self, detail, url, type_of_url): status = detail.get("statusCode") diff --git a/test/results/tiktok.py b/test/results/tiktok.py index 3f8f0383..b601757f 100644 --- a/test/results/tiktok.py +++ b/test/results/tiktok.py @@ -6,12 +6,13 @@ from gallery_dl.extractor import tiktok -PATTERN = r"https://p1[69]-[^/?#.]+\.tiktokcdn[^/?#.]*\.com/[^/?#]+/\w+~.*\.jpe?g" +PATTERN = r"https://p1[69]-[^/?#.]+\.tiktokcdn[^/?#.]*\.com/[^/?#]+/\w+~.*\.(jpe?g|image)" PATTERN_WITH_AUDIO = r"(?:" + PATTERN + r"|https://v\d+m?\.tiktokcdn[^/?#.]*\.com/[^?#]+\?[^/?#]+)" VIDEO_PATTERN = r"https://v1[69]-webapp-prime.tiktok.com/video/tos/[^?#]+\?[^/?#]+" OLD_VIDEO_PATTERN = r"https://www.tiktok.com/aweme/v1/play/\?[^/?#]+" COMBINED_VIDEO_PATTERN = r"(?:" + VIDEO_PATTERN + r")|(?:" + OLD_VIDEO_PATTERN + r")" USER_PATTERN = r"(https://www.tiktok.com/@([\w_.-]+)/video/(\d+)|" + PATTERN + r")" +SUBTITLE_PATTERN = r"https://v1[69]-[^/?#.]+\.tiktokcdn[^/?#.]*\.com/[^/?#]+/.*" __tests__ = ( @@ -127,10 +128,22 @@ __tests__ = ( "#url" : "https://www.tiktok.com/@memezar/video/7449708266168274208", "#comment" : "video post cover 
image", "#class" : tiktok.TiktokPostExtractor, - "#pattern" : r"https://p19-common-sign-useastred.tiktokcdn-eu.com/tos-useast2a-p-0037-euttp/o4rVzhI1bABhooAaEqtCAYGi6nijIsDib8NGfC~tplv-tiktokx-origin.image\?dr=10395&x-expires=\d+&x-signature=.+", + "#pattern" : PATTERN, + "#count" : 1, "#options" : {"videos": False, "covers": True}, +}, + +{ + "#url" : "https://www.tiktok.com/@memezar/video/7449708266168274208", + "#comment" : "all video post cover images", + "#class" : tiktok.TiktokPostExtractor, + "#pattern" : PATTERN, + "#count" : 3, + "#options" : {"videos": False, "covers": "all"}, + + }, { @@ -211,6 +224,44 @@ __tests__ = ( "#options" : {"videos": "ytdl"}, }, +{ + "#url" : "https://www.tiktok.com/@memezar/video/7588916452304997635", + "#comment" : "default subtitles", + "#class" : tiktok.TiktokPostExtractor, + "#pattern" : SUBTITLE_PATTERN, + "#count" : 1, + "#options" : {"videos": False, "covers": False, "subtitles": True} +}, + +{ + "#url" : "https://www.tiktok.com/@memezar/video/7588916452304997635", + "#comment" : "english subtitles", + "#class" : tiktok.TiktokPostExtractor, + "#pattern" : SUBTITLE_PATTERN, + "#count" : 1, + "#options" : {"videos": False, "covers": False, "subtitles": "eng-US"} +}, + +# This test is prone to break when more translation agents are added! +{ + "#url" : "https://www.tiktok.com/@memezar/video/7588916452304997635", + "#comment" : "combined subtitle filter", + "#class" : tiktok.TiktokPostExtractor, + "#pattern" : SUBTITLE_PATTERN, + "#count" : 6, + "#options" : {"videos": False, "covers": False, "subtitles": "ASR,deu-DE"} +}, + +# This test is prone to break when new languages or more translation agents are added! 
+{ + "#url" : "https://www.tiktok.com/@memezar/video/7588916452304997635", + "#comment" : "all subtitles", + "#class" : tiktok.TiktokPostExtractor, + "#pattern" : SUBTITLE_PATTERN, + "#count" : 64, + "#options" : {"videos": False, "covers": False, "subtitles": "all"} +}, + { "#url" : "https://vm.tiktok.com/ZGdh4WUhr/", "#comment" : "vm.tiktok.com link: many photos",