From fd5f5611f6b0b2f51b960d9d045bb1d3a4d49ff5 Mon Sep 17 00:00:00 2001
From: bassberry <5092217+bassberry@users.noreply.github.com>
Date: Fri, 30 Jan 2026 21:01:06 +0100
Subject: [PATCH] [tiktok] extract subtitles and all cover types (#8805)

* Make sure that `img_id`, `audio_id` and `cover_id` fields are always available.
    The values are set to '' where they are not applicable.
    Having `img_id` is necessary for the default `archive_fmt`, the other fields are handled for consistency.
* Allow downloading more than one cover.
    The previous behavior is kept as-is, but setting the "covers" option to "all" now grabs all available covers.
* Add support for downloading subtitles
    Allows filtering subtitles by source type (ASR, MT) and language.
* Ensure archive uniqueness for covers and subtitles.
* Update the URL test pattern to include the `image` extension.
    Although TikTok may serve the covers with JPEG content, the file extension can be `.image`.
    The test before 0c14b164 failed because the asserted URL did not match all cover types; the pattern now in use therefore has to accept the mentioned file extension as well.
* Add support for "creator_caption" subtitles in "LC" format.
    These subtitles have the keys "Format" set to "creator_caption" and "Source" to "LC".
* Add "LC" (Local Captions) as a subtitle source type in the documentation
* Code deduplication and renaming subtitle metadata
    Changed the item type from singular `subtitle` to `subtitles`.
    Removed the wrong descriptor `cover` from the subtitles fallback title.
* Refactor subtitle filtering
    The filter is now prepared in `_init` to prevent parsing the same config parameter for every item.
    The `_extract_subtitles` function will still extract if either filter (source or language) matches.
* Generate a `file_id` for subtitles
    Subtitles have multiple fields that determine the unique file, so these are simply concatenated.
    This is similar to the cover types, only with more variations.
* Added tests for subtitles
* fix docs entries
* fix '"covers": "all"'
* simplify some code
* Fix fallback title for subtitles
    Added the missing "f" to the f-string and added "subtitle" to the title.
    The resulting title will look like "TikTok video subtitle #1234567"
---
 docs/configuration.rst         |  56 ++++++++++++-
 docs/gallery-dl.conf           |   9 ++-
 gallery_dl/extractor/tiktok.py | 138 ++++++++++++++++++++++++++-------
 test/results/tiktok.py         |  55 ++++++++++++-
 4 files changed, 225 insertions(+), 33 deletions(-)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index a0e13d3a..68b63d79 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -5914,12 +5914,25 @@ Description
 extractor.tiktok.covers
 -----------------------
 Type
-    ``bool``
+    * ``bool``
+    * ``string``
 Default
     ``false``
 Description
     Download video covers.
 
+    ``true``
+        Download the first cover found in the following order:
+
+        * ``thumbnail``
+        * ``cover``
+        * ``originCover``
+        * ``dynamicCover``
+    ``false``
+        Do not download covers
+    ``"all"``
+        Download all available covers
+
 
 extractor.tiktok.photos
 -----------------------
@@ -5931,6 +5944,47 @@ Description
     Download photos.
 
 
+extractor.tiktok.subtitles
+--------------------------
+Type
+    * ``bool``
+    * ``string``
+Default
+    ``false``
+Example
+    * ``"all"``
+    * ``"ASR,MT,LC"``
+    * ``"ASR,eng-US"``
+Description
+    Download video subtitles.
+    The subtitles can be filtered by source or language.
+    The following source types can be filtered:
+
+    * ``ASR`` - Automatic Speech Recognition
+    * ``MT`` - Machine Translation
+    * ``LC`` - Local Captions / Creator Captions
+
+    If both source types and language codes are provided,
+    subtitles matching any listed source type or language are downloaded.
+
+    ``true``
+        Download all subtitles tagged ``ASR``
+    ``false``
+        Do not download subtitles
+    ``"all"``
+        Download all available subtitles.
+    ``"ASR,MT,eng-US,cmn-Hans-CN"``
+        Download all automatically recognized and machine translated
+        subtitles, plus all English and Simplified Chinese subtitles.
+
+        The source types and languages can be listed in any order.
+Note
+    It is not possible to restrict a language filter to a specific source
+    type, since a subtitle matching either filter is downloaded.
+    (e.g. only fra-FR subtitles of source type MT)
+    For this, refer to `extractor.*.image-filter`_.
+
+
 extractor.tiktok.videos
 -----------------------
 Type
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 1f1b1724..c71d4430 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -825,10 +825,11 @@
         },
         "tiktok":
         {
-            "audio" : true,
-            "covers": false,
-            "photos": true,
-            "videos": true,
+            "audio"    : true,
+            "covers"   : false,
+            "photos"   : true,
+            "subtitles": false,
+            "videos"   : true,
             "tiktok-range": "",
 
             "posts": {
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
index 3025cace..c94f7a48 100644
--- a/gallery_dl/extractor/tiktok.py
+++ b/gallery_dl/extractor/tiktok.py
@@ -36,10 +36,25 @@ class TiktokExtractor(Extractor):
         self.audio = self.config("audio", True)
         self.video = self.config("videos", True)
         self.cover = self.config("covers", False)
+        self.subtitles = self.config("subtitles", False)
 
         self.range = self.config("tiktok-range") or ""
         self.range_predicate = util.predicate_range_parse(self.range)
 
+        # If one of these fields is None, the filter for it is disabled.
+        # Therefore, if both fields are none, all subtitles are extracted.
+        self.subtitle_sources = None
+        self.subtitle_langs = None
+
+        if self.subtitles and self.subtitles != "all":
+            if self.subtitles is True or not isinstance(self.subtitles, str):
+                self.subtitles = "ASR"
+
+            known_sources = {"ASR", "MT", "LC"}
+            filters = set(self.subtitles.split(","))
+            self.subtitle_sources = known_sources.intersection(filters) or None
+            self.subtitle_langs = filters.difference(known_sources) or None
+
     def items(self):
         for tiktok_url in self.posts():
             tiktok_url = self._sanitize_url(tiktok_url)
@@ -73,13 +88,13 @@ class TiktokExtractor(Extractor):
                         url = img["imageURL"]["urlList"][0]
                         text.nameext_from_url(url, post)
                         post.update({
-                            "type"  : "image",
-                            "image" : img,
-                            "title" : title,
-                            "num"   : i,
+                            "type"   : "image",
+                            "image"  : img,
+                            "title"  : title,
+                            "num"    : i,
                             "file_id": post["filename"].partition("~")[0],
-                            "width" : img["imageWidth"],
-                            "height": img["imageHeight"],
+                            "width"  : img["imageWidth"],
+                            "height" : img["imageHeight"],
                         })
                         yield Message.Url, url, post
 
@@ -95,9 +110,23 @@ class TiktokExtractor(Extractor):
                 elif self.video and (url := self._extract_video(post)):
                     yield Message.Url, url, post
                     del post["_fallback"]
-                if self.cover and (url := self._extract_cover(post, "video")):
-                    yield Message.Url, url, post
 
+                if self.cover:
+                    for url in self._extract_covers(post, "video"):
+                        yield Message.Url, url, post
+                        if self.cover != "all":
+                            break
+
+                if self.subtitles:
+                    for url in self._extract_subtitles(post, "video"):
+                        yield Message.Url, url, post
+
+                    # remove the subtitle related fields for the next item
+                    post.pop("subtitle_lang_id", None)
+                    post.pop("subtitle_lang_codename", None)
+                    post.pop("subtitle_format", None)
+                    post.pop("subtitle_version", None)
+                    post.pop("subtitle_source", None)
             else:
                 self.log.info("%s: Skipping post", tiktok_url)
 
@@ -277,7 +306,7 @@ class TiktokExtractor(Extractor):
             "title"    : post["desc"] or f"TikTok video #{post['id']}",
             "duration" : video.get("duration"),
             "num"      : 0,
-            "file_id"  : video.get("id"),
+            "file_id"  : "",
             "width"    : video.get("width"),
             "height"   : video.get("height"),
         })
@@ -334,28 +363,85 @@ class TiktokExtractor(Extractor):
             post["extension"] = "mp3"
         return url
 
-    def _extract_cover(self, post, type):
+    def _extract_covers(self, post, type):
         media = post[type]
 
         for cover_id in ("thumbnail", "cover", "originCover", "dynamicCover"):
             if url := media.get(cover_id):
-                break
-        else:
-            return
+                text.nameext_from_url(url, post)
+                post.update({
+                    "type"     : "cover",
+                    "extension": "jpg",
+                    "image"    : url,
+                    "title"    : post["desc"] or
+                                 f"TikTok {type} cover #{post['id']}",
+                    "duration" : media.get("duration"),
+                    "num"      : 0,
+                    "file_id"  : cover_id,
+                    "width"    : 0,
+                    "height"   : 0,
+                })
+                yield url
 
-        text.nameext_from_url(url, post)
-        post.update({
-            "type"     : "cover",
-            "extension": "jpg",
-            "image"    : url,
-            "title"    : post["desc"] or f"TikTok {type} cover #{post['id']}",
-            "duration" : media.get("duration"),
-            "num"      : 0,
-            "file_id"  : cover_id,
-            "width"    : 0,
-            "height"   : 0,
-        })
-        return url
+    def _extract_subtitles(self, post, type):
+        media = post[type]
+        sources_filtered = self.subtitle_sources is not None
+        langs_filtered = self.subtitle_langs is not None
+
+        for subtitle in media.get("subtitleInfos", ()):
+            sub_lang_id = subtitle.get("LanguageID")
+            sub_lang_codename = subtitle.get("LanguageCodeName")
+            sub_format = subtitle.get("Format")
+            sub_version = subtitle.get("Version")
+            sub_source = subtitle.get("Source")
+
+            # guard the iterable access
+            sources_match = sources_filtered and \
+                sub_source in self.subtitle_sources
+            langs_match = langs_filtered and \
+                sub_lang_codename in self.subtitle_langs
+
+            # Subtitles will be extracted when either filter matches.
+            if not sources_match and not langs_match and \
+                    (sources_filtered or langs_filtered):
+                continue
+
+            if url := subtitle.get("Url"):
+                text.nameext_from_url(url, post)
+
+                # subtitle urls may not specify a filename,
+                # so the metadata can be used to build one.
+                if not post["filename"]:
+                    post["filename"] = (f"{post['id']}_{sub_lang_codename}_"
+                                        f"{sub_version}_{sub_source}")
+                    post["extension"] = sub_format.lower()
+
+                    # replace extensions for known formats
+                    if post["extension"] == "webvtt":
+                        post["extension"] = "vtt"
+                    elif post["extension"] == "creator_caption":
+                        post["extension"] = "json"
+
+                post.update({
+                    "type"                  : "subtitle",
+                    "image"                 : None,
+                    "title"                 :
+                        post["desc"] or
+                        f"TikTok {type} subtitle #{post['id']}",
+                    "duration"              : media.get("duration"),
+                    "num"                   : 0,
+                    "file_id"               :
+                        f"{sub_lang_id}_{sub_lang_codename}_{sub_source}_"
+                        f"{sub_version}_{sub_format}",
+                    "subtitle_lang_id"      : sub_lang_id,
+                    "subtitle_lang_codename": sub_lang_codename,
+                    "subtitle_format"       : sub_format,
+                    "subtitle_version"      : sub_version,
+                    "subtitle_source"       : sub_source,
+                    "width"                 : 0,
+                    "height"                : 0,
+                })
+                yield url
 
     def _check_status_code(self, detail, url, type_of_url):
         status = detail.get("statusCode")
diff --git a/test/results/tiktok.py b/test/results/tiktok.py
index 3f8f0383..b601757f 100644
--- a/test/results/tiktok.py
+++ b/test/results/tiktok.py
@@ -6,12 +6,13 @@
 
 from gallery_dl.extractor import tiktok
 
-PATTERN = r"https://p1[69]-[^/?#.]+\.tiktokcdn[^/?#.]*\.com/[^/?#]+/\w+~.*\.jpe?g"
+PATTERN = r"https://p1[69]-[^/?#.]+\.tiktokcdn[^/?#.]*\.com/[^/?#]+/\w+~.*\.(jpe?g|image)"
 PATTERN_WITH_AUDIO = r"(?:" + PATTERN + r"|https://v\d+m?\.tiktokcdn[^/?#.]*\.com/[^?#]+\?[^/?#]+)"
 VIDEO_PATTERN = r"https://v1[69]-webapp-prime.tiktok.com/video/tos/[^?#]+\?[^/?#]+"
 OLD_VIDEO_PATTERN = r"https://www.tiktok.com/aweme/v1/play/\?[^/?#]+"
 COMBINED_VIDEO_PATTERN = r"(?:" + VIDEO_PATTERN + r")|(?:" + OLD_VIDEO_PATTERN + r")"
 USER_PATTERN = r"(https://www.tiktok.com/@([\w_.-]+)/video/(\d+)|" + PATTERN + r")"
+SUBTITLE_PATTERN = r"https://v1[69]-[^/?#.]+\.tiktokcdn[^/?#.]*\.com/[^/?#]+/.*"
 
 
 __tests__ = (
@@ -127,10 +128,22 @@ __tests__ = (
     "#url"      : "https://www.tiktok.com/@memezar/video/7449708266168274208",
     "#comment"  : "video post cover image",
     "#class"    : tiktok.TiktokPostExtractor,
-    "#pattern"  : r"https://p19-common-sign-useastred.tiktokcdn-eu.com/tos-useast2a-p-0037-euttp/o4rVzhI1bABhooAaEqtCAYGi6nijIsDib8NGfC~tplv-tiktokx-origin.image\?dr=10395&x-expires=\d+&x-signature=.+",
+    "#pattern"  : PATTERN,
+    "#count"    : 1,
     "#options"  : {"videos": False, "covers": True},
 
 
+},
+
+{
+    "#url"      : "https://www.tiktok.com/@memezar/video/7449708266168274208",
+    "#comment"  : "all video post cover images",
+    "#class"    : tiktok.TiktokPostExtractor,
+    "#pattern"  : PATTERN,
+    "#count"    : 3,
+    "#options"  : {"videos": False, "covers": "all"},
+
+
 },
 
 {
@@ -211,6 +224,44 @@ __tests__ = (
     "#options" : {"videos": "ytdl"},
 },
 
+{
+    "#url"     : "https://www.tiktok.com/@memezar/video/7588916452304997635",
+    "#comment" : "default subtitles",
+    "#class"   : tiktok.TiktokPostExtractor,
+    "#pattern" : SUBTITLE_PATTERN,
+    "#count"   : 1,
+    "#options" : {"videos": False, "covers": False, "subtitles": True}
+},
+
+{
+    "#url"     : "https://www.tiktok.com/@memezar/video/7588916452304997635",
+    "#comment" : "english subtitles",
+    "#class"   : tiktok.TiktokPostExtractor,
+    "#pattern" : SUBTITLE_PATTERN,
+    "#count"   : 1,
+    "#options" : {"videos": False, "covers": False, "subtitles": "eng-US"}
+},
+
+# This test is prone to break when more translation agents are added!
+{
+    "#url"     : "https://www.tiktok.com/@memezar/video/7588916452304997635",
+    "#comment" : "combined subtitle filter",
+    "#class"   : tiktok.TiktokPostExtractor,
+    "#pattern" : SUBTITLE_PATTERN,
+    "#count"   : 6,
+    "#options" : {"videos": False, "covers": False, "subtitles": "ASR,deu-DE"}
+},
+
+# This test is prone to break when new languages or more translation agents are added!
+{
+    "#url"     : "https://www.tiktok.com/@memezar/video/7588916452304997635",
+    "#comment" : "all subtitles",
+    "#class"   : tiktok.TiktokPostExtractor,
+    "#pattern" : SUBTITLE_PATTERN,
+    "#count"   : 64,
+    "#options" : {"videos": False, "covers": False, "subtitles": "all"}
+},
+
 {
     "#url"      : "https://vm.tiktok.com/ZGdh4WUhr/",
     "#comment"  : "vm.tiktok.com link: many photos",