Subtitle extraction from streaming media manifests #247
Authored by fstirlitz Modified from: https://github.com/ytdl-org/youtube-dl/pull/6144 Closes: #73 Fixes: https://github.com/ytdl-org/youtube-dl/issues/6106 https://github.com/ytdl-org/youtube-dl/issues/14977 https://github.com/ytdl-org/youtube-dl/issues/21438 https://github.com/ytdl-org/youtube-dl/issues/23609 https://github.com/ytdl-org/youtube-dl/issues/28132 Might also fix (untested): https://github.com/ytdl-org/youtube-dl/issues/15424 https://github.com/ytdl-org/youtube-dl/issues/18267 https://github.com/ytdl-org/youtube-dl/issues/23899 https://github.com/ytdl-org/youtube-dl/issues/24375 https://github.com/ytdl-org/youtube-dl/issues/24595 https://github.com/ytdl-org/youtube-dl/issues/27899 Related: https://github.com/ytdl-org/youtube-dl/issues/22379 https://github.com/ytdl-org/youtube-dl/pull/24517 https://github.com/ytdl-org/youtube-dl/pull/24886 https://github.com/ytdl-org/youtube-dl/pull/27215 Notes: * The functions `extractor.common._extract_..._formats` are still kept for compatibility * Only some extractors have currently been moved to using `_extract_..._formats_and_subtitles` * Direct subtitle manifests (without a master) are not supported and are wrongly identified as containing video formats * AES support is untested * The fragmented TTML subtitles extracted from DASH/ISM are valid, but are unsupported by `ffmpeg` and most video players * Their XML fragments can be dumped using `ffmpeg -i in.mp4 -f data -map 0 -c copy out.ttml`. Once the unnecessary headers are stripped out of this, it becomes a valid self-contained ttml file * The ttml subs downloaded from DASH manifests can also be directly opened with <https://github.com/SubtitleEdit> * Fragmented WebVTT files extracted from DASH/ISM are also unsupported by most tools * Unlike the ttml files, the XML fragments of these cannot be dumped using `ffmpeg` * The webtt subs extracted from DASH can be parsed by <https://github.com/gpac/gpac> * But validity of the those extracted from ISM are untested
This commit is contained in:
@@ -86,18 +86,19 @@ class AtresPlayerIE(InfoExtractor):
|
||||
title = episode['titulo']
|
||||
|
||||
formats = []
|
||||
subtitles = {}
|
||||
for source in episode.get('sources', []):
|
||||
src = source.get('src')
|
||||
if not src:
|
||||
continue
|
||||
src_type = source.get('type')
|
||||
if src_type == 'application/vnd.apple.mpegurl':
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
formats, subtitles = self._extract_m3u8_formats(
|
||||
src, video_id, 'mp4', 'm3u8_native',
|
||||
m3u8_id='hls', fatal=False))
|
||||
m3u8_id='hls', fatal=False)
|
||||
elif src_type == 'application/dash+xml':
|
||||
formats.extend(self._extract_mpd_formats(
|
||||
src, video_id, mpd_id='dash', fatal=False))
|
||||
formats, subtitles = self._extract_mpd_formats(
|
||||
src, video_id, mpd_id='dash', fatal=False)
|
||||
self._sort_formats(formats)
|
||||
|
||||
heartbeat = episode.get('heartbeat') or {}
|
||||
@@ -115,4 +116,5 @@ class AtresPlayerIE(InfoExtractor):
|
||||
'channel': get_meta('channel'),
|
||||
'season': get_meta('season'),
|
||||
'episode_number': int_or_none(get_meta('episodeNumber')),
|
||||
'subtitles': subtitles,
|
||||
}
|
||||
|
||||
@@ -82,6 +82,7 @@ class BYUtvIE(InfoExtractor):
|
||||
|
||||
info = {}
|
||||
formats = []
|
||||
subtitles = {}
|
||||
for format_id, ep in video.items():
|
||||
if not isinstance(ep, dict):
|
||||
continue
|
||||
@@ -90,12 +91,16 @@ class BYUtvIE(InfoExtractor):
|
||||
continue
|
||||
ext = determine_ext(video_url)
|
||||
if ext == 'm3u8':
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
|
||||
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
|
||||
m3u8_id='hls', fatal=False))
|
||||
m3u8_id='hls', fatal=False)
|
||||
formats.extend(m3u8_fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, m3u8_subs)
|
||||
elif ext == 'mpd':
|
||||
formats.extend(self._extract_mpd_formats(
|
||||
video_url, video_id, mpd_id='dash', fatal=False))
|
||||
mpd_fmts, mpd_subs = self._extract_mpd_formats_and_subtitles(
|
||||
video_url, video_id, mpd_id='dash', fatal=False)
|
||||
formats.extend(mpd_fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, mpd_subs)
|
||||
else:
|
||||
formats.append({
|
||||
'url': video_url,
|
||||
@@ -114,4 +119,5 @@ class BYUtvIE(InfoExtractor):
|
||||
'display_id': display_id,
|
||||
'title': display_id,
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
})
|
||||
|
||||
@@ -83,24 +83,31 @@ class CanvasIE(InfoExtractor):
|
||||
description = data.get('description')
|
||||
|
||||
formats = []
|
||||
subtitles = {}
|
||||
for target in data['targetUrls']:
|
||||
format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type'))
|
||||
if not format_url or not format_type:
|
||||
continue
|
||||
format_type = format_type.upper()
|
||||
if format_type in self._HLS_ENTRY_PROTOCOLS_MAP:
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
fmts, subs = self._extract_m3u8_formats_and_subtitles(
|
||||
format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type],
|
||||
m3u8_id=format_type, fatal=False))
|
||||
m3u8_id=format_type, fatal=False)
|
||||
formats.extend(fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, subs)
|
||||
elif format_type == 'HDS':
|
||||
formats.extend(self._extract_f4m_formats(
|
||||
format_url, video_id, f4m_id=format_type, fatal=False))
|
||||
elif format_type == 'MPEG_DASH':
|
||||
formats.extend(self._extract_mpd_formats(
|
||||
format_url, video_id, mpd_id=format_type, fatal=False))
|
||||
fmts, subs = self._extract_mpd_formats_and_subtitles(
|
||||
format_url, video_id, mpd_id=format_type, fatal=False)
|
||||
formats.extend(fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, subs)
|
||||
elif format_type == 'HSS':
|
||||
formats.extend(self._extract_ism_formats(
|
||||
format_url, video_id, ism_id='mss', fatal=False))
|
||||
fmts, subs = self._extract_ism_formats_and_subtitles(
|
||||
format_url, video_id, ism_id='mss', fatal=False)
|
||||
formats.extend(fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, subs)
|
||||
else:
|
||||
formats.append({
|
||||
'format_id': format_type,
|
||||
@@ -108,7 +115,6 @@ class CanvasIE(InfoExtractor):
|
||||
})
|
||||
self._sort_formats(formats)
|
||||
|
||||
subtitles = {}
|
||||
subtitle_urls = data.get('subtitleUrls')
|
||||
if isinstance(subtitle_urls, list):
|
||||
for subtitle in subtitle_urls:
|
||||
|
||||
@@ -1879,11 +1879,21 @@ class InfoExtractor(object):
|
||||
'format_note': 'Quality selection URL',
|
||||
}
|
||||
|
||||
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
|
||||
entry_protocol='m3u8', preference=None, quality=None,
|
||||
m3u8_id=None, note=None, errnote=None,
|
||||
fatal=True, live=False, data=None, headers={},
|
||||
query={}):
|
||||
def _extract_m3u8_formats(self, *args, **kwargs):
|
||||
fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
|
||||
if subs:
|
||||
self.report_warning(bug_reports_message(
|
||||
"Ignoring subtitle tracks found in the HLS manifest; "
|
||||
"if any subtitle tracks are missing,"
|
||||
))
|
||||
return fmts
|
||||
|
||||
def _extract_m3u8_formats_and_subtitles(
|
||||
self, m3u8_url, video_id, ext=None, entry_protocol='m3u8',
|
||||
preference=None, quality=None, m3u8_id=None, note=None,
|
||||
errnote=None, fatal=True, live=False, data=None, headers={},
|
||||
query={}):
|
||||
|
||||
res = self._download_webpage_handle(
|
||||
m3u8_url, video_id,
|
||||
note=note or 'Downloading m3u8 information',
|
||||
@@ -1891,30 +1901,34 @@ class InfoExtractor(object):
|
||||
fatal=fatal, data=data, headers=headers, query=query)
|
||||
|
||||
if res is False:
|
||||
return []
|
||||
return [], {}
|
||||
|
||||
m3u8_doc, urlh = res
|
||||
m3u8_url = urlh.geturl()
|
||||
|
||||
return self._parse_m3u8_formats(
|
||||
return self._parse_m3u8_formats_and_subtitles(
|
||||
m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
|
||||
preference=preference, quality=quality, m3u8_id=m3u8_id,
|
||||
note=note, errnote=errnote, fatal=fatal, live=live, data=data,
|
||||
headers=headers, query=query, video_id=video_id)
|
||||
|
||||
def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
|
||||
entry_protocol='m3u8', preference=None, quality=None,
|
||||
m3u8_id=None, live=False, note=None, errnote=None,
|
||||
fatal=True, data=None, headers={}, query={}, video_id=None):
|
||||
def _parse_m3u8_formats_and_subtitles(
|
||||
self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8',
|
||||
preference=None, quality=None, m3u8_id=None, live=False, note=None,
|
||||
errnote=None, fatal=True, data=None, headers={}, query={},
|
||||
video_id=None):
|
||||
|
||||
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
|
||||
return []
|
||||
return [], {}
|
||||
|
||||
if (not self._downloader.params.get('allow_unplayable_formats')
|
||||
and re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc)): # Apple FairPlay
|
||||
return []
|
||||
return [], {}
|
||||
|
||||
formats = []
|
||||
|
||||
subtitles = {}
|
||||
|
||||
format_url = lambda u: (
|
||||
u
|
||||
if re.match(r'^https?://', u)
|
||||
@@ -2001,7 +2015,7 @@ class InfoExtractor(object):
|
||||
}
|
||||
formats.append(f)
|
||||
|
||||
return formats
|
||||
return formats, subtitles
|
||||
|
||||
groups = {}
|
||||
last_stream_inf = {}
|
||||
@@ -2013,6 +2027,21 @@ class InfoExtractor(object):
|
||||
if not (media_type and group_id and name):
|
||||
return
|
||||
groups.setdefault(group_id, []).append(media)
|
||||
# <https://tools.ietf.org/html/rfc8216#section-4.3.4.1>
|
||||
if media_type == 'SUBTITLES':
|
||||
lang = media['LANGUAGE'] # XXX: normalise?
|
||||
url = format_url(media['URI'])
|
||||
sub_info = {
|
||||
'url': url,
|
||||
'ext': determine_ext(url),
|
||||
}
|
||||
if sub_info['ext'] == 'm3u8':
|
||||
# Per RFC 8216 §3.1, the only possible subtitle format m3u8
|
||||
# files may contain is WebVTT:
|
||||
# <https://tools.ietf.org/html/rfc8216#section-3.1>
|
||||
sub_info['ext'] = 'vtt'
|
||||
sub_info['protocol'] = 'm3u8_native'
|
||||
subtitles.setdefault(lang, []).append(sub_info)
|
||||
if media_type not in ('VIDEO', 'AUDIO'):
|
||||
return
|
||||
media_url = media.get('URI')
|
||||
@@ -2160,7 +2189,7 @@ class InfoExtractor(object):
|
||||
formats.append(http_f)
|
||||
|
||||
last_stream_inf = {}
|
||||
return formats
|
||||
return formats, subtitles
|
||||
|
||||
@staticmethod
|
||||
def _xpath_ns(path, namespace=None):
|
||||
@@ -2403,23 +2432,44 @@ class InfoExtractor(object):
|
||||
})
|
||||
return entries
|
||||
|
||||
def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
|
||||
def _extract_mpd_formats(self, *args, **kwargs):
|
||||
fmts, subs = self._extract_mpd_formats_and_subtitles(*args, **kwargs)
|
||||
if subs:
|
||||
self.report_warning(bug_reports_message(
|
||||
"Ignoring subtitle tracks found in the DASH manifest; "
|
||||
"if any subtitle tracks are missing,"
|
||||
))
|
||||
return fmts
|
||||
|
||||
def _extract_mpd_formats_and_subtitles(
|
||||
self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
|
||||
fatal=True, data=None, headers={}, query={}):
|
||||
res = self._download_xml_handle(
|
||||
mpd_url, video_id,
|
||||
note=note or 'Downloading MPD manifest',
|
||||
errnote=errnote or 'Failed to download MPD manifest',
|
||||
fatal=fatal, data=data, headers=headers, query=query)
|
||||
if res is False:
|
||||
return []
|
||||
return [], {}
|
||||
mpd_doc, urlh = res
|
||||
if mpd_doc is None:
|
||||
return []
|
||||
return [], {}
|
||||
mpd_base_url = base_url(urlh.geturl())
|
||||
|
||||
return self._parse_mpd_formats(
|
||||
return self._parse_mpd_formats_and_subtitles(
|
||||
mpd_doc, mpd_id, mpd_base_url, mpd_url)
|
||||
|
||||
def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
|
||||
def _parse_mpd_formats(self, *args, **kwargs):
|
||||
fmts, subs = self._parse_mpd_formats_and_subtitles(*args, **kwargs)
|
||||
if subs:
|
||||
self.report_warning(bug_reports_message(
|
||||
"Ignoring subtitle tracks found in the DASH manifest; "
|
||||
"if any subtitle tracks are missing,"
|
||||
))
|
||||
return fmts
|
||||
|
||||
def _parse_mpd_formats_and_subtitles(
|
||||
self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
|
||||
"""
|
||||
Parse formats from MPD manifest.
|
||||
References:
|
||||
@@ -2429,7 +2479,7 @@ class InfoExtractor(object):
|
||||
"""
|
||||
if not self._downloader.params.get('dynamic_mpd', True):
|
||||
if mpd_doc.get('type') == 'dynamic':
|
||||
return []
|
||||
return [], {}
|
||||
|
||||
namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
|
||||
|
||||
@@ -2501,6 +2551,7 @@ class InfoExtractor(object):
|
||||
|
||||
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
|
||||
formats = []
|
||||
subtitles = {}
|
||||
for period in mpd_doc.findall(_add_ns('Period')):
|
||||
period_duration = parse_duration(period.get('duration')) or mpd_duration
|
||||
period_ms_info = extract_multisegment_info(period, {
|
||||
@@ -2518,11 +2569,9 @@ class InfoExtractor(object):
|
||||
representation_attrib.update(representation.attrib)
|
||||
# According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
|
||||
mime_type = representation_attrib['mimeType']
|
||||
content_type = mime_type.split('/')[0]
|
||||
if content_type == 'text':
|
||||
# TODO implement WebVTT downloading
|
||||
pass
|
||||
elif content_type in ('video', 'audio'):
|
||||
content_type = representation_attrib.get('contentType', mime_type.split('/')[0])
|
||||
|
||||
if content_type in ('video', 'audio', 'text'):
|
||||
base_url = ''
|
||||
for element in (representation, adaptation_set, period, mpd_doc):
|
||||
base_url_e = element.find(_add_ns('BaseURL'))
|
||||
@@ -2539,21 +2588,28 @@ class InfoExtractor(object):
|
||||
url_el = representation.find(_add_ns('BaseURL'))
|
||||
filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
|
||||
bandwidth = int_or_none(representation_attrib.get('bandwidth'))
|
||||
f = {
|
||||
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
|
||||
'manifest_url': mpd_url,
|
||||
'ext': mimetype2ext(mime_type),
|
||||
'width': int_or_none(representation_attrib.get('width')),
|
||||
'height': int_or_none(representation_attrib.get('height')),
|
||||
'tbr': float_or_none(bandwidth, 1000),
|
||||
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
|
||||
'fps': int_or_none(representation_attrib.get('frameRate')),
|
||||
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
|
||||
'format_note': 'DASH %s' % content_type,
|
||||
'filesize': filesize,
|
||||
'container': mimetype2ext(mime_type) + '_dash',
|
||||
}
|
||||
f.update(parse_codecs(representation_attrib.get('codecs')))
|
||||
if content_type in ('video', 'audio'):
|
||||
f = {
|
||||
'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
|
||||
'manifest_url': mpd_url,
|
||||
'ext': mimetype2ext(mime_type),
|
||||
'width': int_or_none(representation_attrib.get('width')),
|
||||
'height': int_or_none(representation_attrib.get('height')),
|
||||
'tbr': float_or_none(bandwidth, 1000),
|
||||
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
|
||||
'fps': int_or_none(representation_attrib.get('frameRate')),
|
||||
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
|
||||
'format_note': 'DASH %s' % content_type,
|
||||
'filesize': filesize,
|
||||
'container': mimetype2ext(mime_type) + '_dash',
|
||||
}
|
||||
f.update(parse_codecs(representation_attrib.get('codecs')))
|
||||
elif content_type == 'text':
|
||||
f = {
|
||||
'ext': mimetype2ext(mime_type),
|
||||
'manifest_url': mpd_url,
|
||||
'filesize': filesize,
|
||||
}
|
||||
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
|
||||
|
||||
def prepare_template(template_name, identifiers):
|
||||
@@ -2700,26 +2756,38 @@ class InfoExtractor(object):
|
||||
else:
|
||||
# Assuming direct URL to unfragmented media.
|
||||
f['url'] = base_url
|
||||
formats.append(f)
|
||||
if content_type in ('video', 'audio'):
|
||||
formats.append(f)
|
||||
elif content_type == 'text':
|
||||
subtitles.setdefault(lang or 'und', []).append(f)
|
||||
else:
|
||||
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
|
||||
return formats
|
||||
return formats, subtitles
|
||||
|
||||
def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
|
||||
def _extract_ism_formats(self, *args, **kwargs):
|
||||
fmts, subs = self._extract_ism_formats_and_subtitles(*args, **kwargs)
|
||||
if subs:
|
||||
self.report_warning(bug_reports_message(
|
||||
"Ignoring subtitle tracks found in the ISM manifest; "
|
||||
"if any subtitle tracks are missing,"
|
||||
))
|
||||
return fmts
|
||||
|
||||
def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
|
||||
res = self._download_xml_handle(
|
||||
ism_url, video_id,
|
||||
note=note or 'Downloading ISM manifest',
|
||||
errnote=errnote or 'Failed to download ISM manifest',
|
||||
fatal=fatal, data=data, headers=headers, query=query)
|
||||
if res is False:
|
||||
return []
|
||||
return [], {}
|
||||
ism_doc, urlh = res
|
||||
if ism_doc is None:
|
||||
return []
|
||||
return [], {}
|
||||
|
||||
return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
|
||||
return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
|
||||
|
||||
def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
|
||||
def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
|
||||
"""
|
||||
Parse formats from ISM manifest.
|
||||
References:
|
||||
@@ -2727,26 +2795,28 @@ class InfoExtractor(object):
|
||||
https://msdn.microsoft.com/en-us/library/ff469518.aspx
|
||||
"""
|
||||
if ism_doc.get('IsLive') == 'TRUE':
|
||||
return []
|
||||
return [], {}
|
||||
if (not self._downloader.params.get('allow_unplayable_formats')
|
||||
and ism_doc.find('Protection') is not None):
|
||||
return []
|
||||
return [], {}
|
||||
|
||||
duration = int(ism_doc.attrib['Duration'])
|
||||
timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
|
||||
|
||||
formats = []
|
||||
subtitles = {}
|
||||
for stream in ism_doc.findall('StreamIndex'):
|
||||
stream_type = stream.get('Type')
|
||||
if stream_type not in ('video', 'audio'):
|
||||
if stream_type not in ('video', 'audio', 'text'):
|
||||
continue
|
||||
url_pattern = stream.attrib['Url']
|
||||
stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
|
||||
stream_name = stream.get('Name')
|
||||
stream_language = stream.get('Language', 'und')
|
||||
for track in stream.findall('QualityLevel'):
|
||||
fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
|
||||
# TODO: add support for WVC1 and WMAP
|
||||
if fourcc not in ('H264', 'AVC1', 'AACL'):
|
||||
if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'):
|
||||
self.report_warning('%s is not a supported codec' % fourcc)
|
||||
continue
|
||||
tbr = int(track.attrib['Bitrate']) // 1000
|
||||
@@ -2789,33 +2859,52 @@ class InfoExtractor(object):
|
||||
format_id.append(stream_name)
|
||||
format_id.append(compat_str(tbr))
|
||||
|
||||
formats.append({
|
||||
'format_id': '-'.join(format_id),
|
||||
'url': ism_url,
|
||||
'manifest_url': ism_url,
|
||||
'ext': 'ismv' if stream_type == 'video' else 'isma',
|
||||
'width': width,
|
||||
'height': height,
|
||||
'tbr': tbr,
|
||||
'asr': sampling_rate,
|
||||
'vcodec': 'none' if stream_type == 'audio' else fourcc,
|
||||
'acodec': 'none' if stream_type == 'video' else fourcc,
|
||||
'protocol': 'ism',
|
||||
'fragments': fragments,
|
||||
'_download_params': {
|
||||
'duration': duration,
|
||||
'timescale': stream_timescale,
|
||||
'width': width or 0,
|
||||
'height': height or 0,
|
||||
'fourcc': fourcc,
|
||||
'codec_private_data': track.get('CodecPrivateData'),
|
||||
'sampling_rate': sampling_rate,
|
||||
'channels': int_or_none(track.get('Channels', 2)),
|
||||
'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
|
||||
'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
|
||||
},
|
||||
})
|
||||
return formats
|
||||
if stream_type == 'text':
|
||||
subtitles.setdefault(stream_language, []).append({
|
||||
'ext': 'ismt',
|
||||
'protocol': 'ism',
|
||||
'url': ism_url,
|
||||
'manifest_url': ism_url,
|
||||
'fragments': fragments,
|
||||
'_download_params': {
|
||||
'stream_type': stream_type,
|
||||
'duration': duration,
|
||||
'timescale': stream_timescale,
|
||||
'fourcc': fourcc,
|
||||
'language': stream_language,
|
||||
'codec_private_data': track.get('CodecPrivateData'),
|
||||
}
|
||||
})
|
||||
elif stream_type in ('video', 'audio'):
|
||||
formats.append({
|
||||
'format_id': '-'.join(format_id),
|
||||
'url': ism_url,
|
||||
'manifest_url': ism_url,
|
||||
'ext': 'ismv' if stream_type == 'video' else 'isma',
|
||||
'width': width,
|
||||
'height': height,
|
||||
'tbr': tbr,
|
||||
'asr': sampling_rate,
|
||||
'vcodec': 'none' if stream_type == 'audio' else fourcc,
|
||||
'acodec': 'none' if stream_type == 'video' else fourcc,
|
||||
'protocol': 'ism',
|
||||
'fragments': fragments,
|
||||
'_download_params': {
|
||||
'stream_type': stream_type,
|
||||
'duration': duration,
|
||||
'timescale': stream_timescale,
|
||||
'width': width or 0,
|
||||
'height': height or 0,
|
||||
'fourcc': fourcc,
|
||||
'language': stream_language,
|
||||
'codec_private_data': track.get('CodecPrivateData'),
|
||||
'sampling_rate': sampling_rate,
|
||||
'channels': int_or_none(track.get('Channels', 2)),
|
||||
'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
|
||||
'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
|
||||
},
|
||||
})
|
||||
return formats, subtitles
|
||||
|
||||
def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
|
||||
def absolute_url(item_url):
|
||||
@@ -2940,7 +3029,16 @@ class InfoExtractor(object):
|
||||
entries.append(media_info)
|
||||
return entries
|
||||
|
||||
def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
|
||||
def _extract_akamai_formats(self, *args, **kwargs):
|
||||
fmts, subs = self._extract_akamai_formats_and_subtitles(*args, **kwargs)
|
||||
if subs:
|
||||
self.report_warning(bug_reports_message(
|
||||
"Ignoring subtitle tracks found in the manifests; "
|
||||
"if any subtitle tracks are missing,"
|
||||
))
|
||||
return fmts
|
||||
|
||||
def _extract_akamai_formats_and_subtitles(self, manifest_url, video_id, hosts={}):
|
||||
signed = 'hdnea=' in manifest_url
|
||||
if not signed:
|
||||
# https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
|
||||
@@ -2949,6 +3047,7 @@ class InfoExtractor(object):
|
||||
'', manifest_url).strip('?')
|
||||
|
||||
formats = []
|
||||
subtitles = {}
|
||||
|
||||
hdcore_sign = 'hdcore=3.7.0'
|
||||
f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
|
||||
@@ -2967,10 +3066,11 @@ class InfoExtractor(object):
|
||||
hls_host = hosts.get('hls')
|
||||
if hls_host:
|
||||
m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
|
||||
m3u8_formats = self._extract_m3u8_formats(
|
||||
m3u8_formats, m3u8_subtitles = self._extract_m3u8_formats_and_subtitles(
|
||||
m3u8_url, video_id, 'mp4', 'm3u8_native',
|
||||
m3u8_id='hls', fatal=False)
|
||||
formats.extend(m3u8_formats)
|
||||
subtitles = self._merge_subtitles(subtitles, m3u8_subtitles)
|
||||
|
||||
http_host = hosts.get('http')
|
||||
if http_host and m3u8_formats and not signed:
|
||||
@@ -2994,7 +3094,7 @@ class InfoExtractor(object):
|
||||
formats.append(http_f)
|
||||
i += 1
|
||||
|
||||
return formats
|
||||
return formats, subtitles
|
||||
|
||||
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
|
||||
query = compat_urlparse.urlparse(url).query
|
||||
@@ -3319,12 +3419,22 @@ class InfoExtractor(object):
|
||||
return ret
|
||||
|
||||
@classmethod
|
||||
def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
|
||||
""" Merge two subtitle dictionaries, language by language. """
|
||||
ret = dict(subtitle_dict1)
|
||||
for lang in subtitle_dict2:
|
||||
ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
|
||||
return ret
|
||||
def _merge_subtitles(cls, *dicts, **kwargs):
|
||||
""" Merge subtitle dictionaries, language by language. """
|
||||
|
||||
target = (lambda target=None: target)(**kwargs)
|
||||
# The above lambda extracts the keyword argument 'target' from kwargs
|
||||
# while ensuring there are no stray ones. When Python 2 support
|
||||
# is dropped, remove it and change the function signature to:
|
||||
#
|
||||
# def _merge_subtitles(cls, *dicts, target=None):
|
||||
|
||||
if target is None:
|
||||
target = {}
|
||||
for d in dicts:
|
||||
for lang, subs in d.items():
|
||||
target[lang] = cls._merge_subtitle_items(target.get(lang, []), subs)
|
||||
return target
|
||||
|
||||
def extract_automatic_captions(self, *args, **kwargs):
|
||||
if (self._downloader.params.get('writeautomaticsub', False)
|
||||
|
||||
@@ -1,9 +1,7 @@
|
||||
# coding: utf-8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
import re
|
||||
import tempfile
|
||||
|
||||
from .common import InfoExtractor
|
||||
from ..utils import (
|
||||
@@ -12,12 +10,12 @@ from ..utils import (
|
||||
try_get,
|
||||
)
|
||||
from ..compat import compat_str
|
||||
from ..downloader.hls import HlsFD
|
||||
|
||||
|
||||
class ElonetIE(InfoExtractor):
|
||||
_VALID_URL = r'https?://elonet\.finna\.fi/Record/kavi\.elonet_elokuva_(?P<id>[0-9]+)'
|
||||
_TEST = {
|
||||
_TESTS = [{
|
||||
# m3u8 with subtitles
|
||||
'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_107867',
|
||||
'md5': '8efc954b96c543711707f87de757caea',
|
||||
'info_dict': {
|
||||
@@ -27,62 +25,17 @@ class ElonetIE(InfoExtractor):
|
||||
'description': 'Valkoinen peura (1952) on Erik Blombergin ohjaama ja yhdessä Mirjami Kuosmasen kanssa käsikirjoittama tarunomainen kertomus valkoisen peuran hahmossa lii...',
|
||||
'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_107867&index=0&size=large',
|
||||
},
|
||||
}
|
||||
|
||||
def _download_m3u8_chunked_subtitle(self, chunklist_url):
|
||||
"""
|
||||
Download VTT subtitles from pieces in manifest URL.
|
||||
Return a string containing joined chunks with extra headers removed.
|
||||
"""
|
||||
with tempfile.NamedTemporaryFile(delete=True) as outfile:
|
||||
fname = outfile.name
|
||||
hlsdl = HlsFD(self._downloader, {})
|
||||
hlsdl.download(compat_str(fname), {"url": chunklist_url})
|
||||
with open(fname, 'r') as fin:
|
||||
# Remove (some) headers
|
||||
fdata = re.sub(r'X-TIMESTAMP-MAP.*\n+|WEBVTT\n+', '', fin.read())
|
||||
os.remove(fname)
|
||||
return "WEBVTT\n\n" + fdata
|
||||
|
||||
def _parse_m3u8_subtitles(self, m3u8_doc, m3u8_url):
|
||||
"""
|
||||
Parse subtitles from HLS / m3u8 manifest.
|
||||
"""
|
||||
subtitles = {}
|
||||
baseurl = m3u8_url[:m3u8_url.rindex('/') + 1]
|
||||
for line in m3u8_doc.split('\n'):
|
||||
if 'EXT-X-MEDIA:TYPE=SUBTITLES' in line:
|
||||
lang = self._search_regex(
|
||||
r'LANGUAGE="(.+?)"', line, 'lang', default=False)
|
||||
uri = self._search_regex(
|
||||
r'URI="(.+?)"', line, 'uri', default=False)
|
||||
if lang and uri:
|
||||
data = self._download_m3u8_chunked_subtitle(baseurl + uri)
|
||||
subtitles[lang] = [{'ext': 'vtt', 'data': data}]
|
||||
return subtitles
|
||||
|
||||
def _parse_mpd_subtitles(self, mpd_doc):
|
||||
"""
|
||||
Parse subtitles from MPD manifest.
|
||||
"""
|
||||
ns = '{urn:mpeg:dash:schema:mpd:2011}'
|
||||
subtitles = {}
|
||||
for aset in mpd_doc.findall(".//%sAdaptationSet[@mimeType='text/vtt']" % (ns)):
|
||||
lang = aset.attrib.get('lang', 'unk')
|
||||
url = aset.find("./%sRepresentation/%sBaseURL" % (ns, ns)).text
|
||||
subtitles[lang] = [{'ext': 'vtt', 'url': url}]
|
||||
return subtitles
|
||||
|
||||
def _get_subtitles(self, fmt, doc, url):
|
||||
if fmt == 'm3u8':
|
||||
subs = self._parse_m3u8_subtitles(doc, url)
|
||||
elif fmt == 'mpd':
|
||||
subs = self._parse_mpd_subtitles(doc)
|
||||
else:
|
||||
self.report_warning(
|
||||
"Cannot download subtitles from '%s' streams." % (fmt))
|
||||
subs = {}
|
||||
return subs
|
||||
}, {
|
||||
# DASH with subtitles
|
||||
'url': 'https://elonet.finna.fi/Record/kavi.elonet_elokuva_116539',
|
||||
'info_dict': {
|
||||
'id': '116539',
|
||||
'ext': 'mp4',
|
||||
'title': 'Minulla on tiikeri',
|
||||
'description': 'Pienellä pojalla, joka asuu kerrostalossa, on kotieläimenä tiikeri. Se on kuitenkin salaisuus. Kerrostalon räpätäti on Kotilaisen täti, joka on aina vali...',
|
||||
'thumbnail': 'https://elonet.finna.fi/Cover/Show?id=kavi.elonet_elokuva_116539&index=0&size=large&source=Solr',
|
||||
}
|
||||
}]
|
||||
|
||||
def _real_extract(self, url):
|
||||
video_id = self._match_id(url)
|
||||
@@ -101,8 +54,8 @@ class ElonetIE(InfoExtractor):
|
||||
self._parse_json(json_s, video_id),
|
||||
lambda x: x[0]["src"], compat_str)
|
||||
formats = []
|
||||
subtitles = {}
|
||||
if re.search(r'\.m3u8\??', src):
|
||||
fmt = 'm3u8'
|
||||
res = self._download_webpage_handle(
|
||||
# elonet servers have certificate problems
|
||||
src.replace('https:', 'http:'), video_id,
|
||||
@@ -111,11 +64,10 @@ class ElonetIE(InfoExtractor):
|
||||
if res:
|
||||
doc, urlh = res
|
||||
url = urlh.geturl()
|
||||
formats = self._parse_m3u8_formats(doc, url)
|
||||
formats, subtitles = self._parse_m3u8_formats_and_subtitles(doc, url)
|
||||
for f in formats:
|
||||
f['ext'] = 'mp4'
|
||||
elif re.search(r'\.mpd\??', src):
|
||||
fmt = 'mpd'
|
||||
res = self._download_xml_handle(
|
||||
src, video_id,
|
||||
note='Downloading MPD manifest',
|
||||
@@ -123,7 +75,7 @@ class ElonetIE(InfoExtractor):
|
||||
if res:
|
||||
doc, urlh = res
|
||||
url = base_url(urlh.geturl())
|
||||
formats = self._parse_mpd_formats(doc, mpd_base_url=url)
|
||||
formats, subtitles = self._parse_mpd_formats_and_subtitles(doc, mpd_base_url=url)
|
||||
else:
|
||||
raise ExtractorError("Unknown streaming format")
|
||||
|
||||
@@ -133,5 +85,5 @@ class ElonetIE(InfoExtractor):
|
||||
'description': description,
|
||||
'thumbnail': thumbnail,
|
||||
'formats': formats,
|
||||
'subtitles': self.extract_subtitles(fmt, doc, url),
|
||||
'subtitles': subtitles,
|
||||
}
|
||||
|
||||
@@ -151,6 +151,7 @@ class FranceTVIE(InfoExtractor):
|
||||
videos.append(fallback_info['video'])
|
||||
|
||||
formats = []
|
||||
subtitles = {}
|
||||
for video in videos:
|
||||
video_url = video.get('url')
|
||||
if not video_url:
|
||||
@@ -171,10 +172,12 @@ class FranceTVIE(InfoExtractor):
|
||||
sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44',
|
||||
video_id, f4m_id=format_id, fatal=False))
|
||||
elif ext == 'm3u8':
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
|
||||
sign(video_url, format_id), video_id, 'mp4',
|
||||
entry_protocol='m3u8_native', m3u8_id=format_id,
|
||||
fatal=False))
|
||||
fatal=False)
|
||||
formats.extend(m3u8_fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, m3u8_subs)
|
||||
elif ext == 'mpd':
|
||||
formats.extend(self._extract_mpd_formats(
|
||||
sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False))
|
||||
@@ -199,13 +202,12 @@ class FranceTVIE(InfoExtractor):
|
||||
title += ' - %s' % subtitle
|
||||
title = title.strip()
|
||||
|
||||
subtitles = {}
|
||||
subtitles_list = [{
|
||||
'url': subformat['url'],
|
||||
'ext': subformat.get('format'),
|
||||
} for subformat in info.get('subtitles', []) if subformat.get('url')]
|
||||
if subtitles_list:
|
||||
subtitles['fr'] = subtitles_list
|
||||
subtitles.setdefault('fr', []).extend(
|
||||
[{
|
||||
'url': subformat['url'],
|
||||
'ext': subformat.get('format'),
|
||||
} for subformat in info.get('subtitles', []) if subformat.get('url')]
|
||||
)
|
||||
|
||||
return {
|
||||
'id': video_id,
|
||||
|
||||
@@ -2444,8 +2444,9 @@ class GenericIE(InfoExtractor):
|
||||
m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
|
||||
if m:
|
||||
format_id = compat_str(m.group('format_id'))
|
||||
subtitles = {}
|
||||
if format_id.endswith('mpegurl'):
|
||||
formats = self._extract_m3u8_formats(url, video_id, 'mp4')
|
||||
formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
|
||||
elif format_id == 'f4m':
|
||||
formats = self._extract_f4m_formats(url, video_id)
|
||||
else:
|
||||
@@ -2457,6 +2458,7 @@ class GenericIE(InfoExtractor):
|
||||
info_dict['direct'] = True
|
||||
self._sort_formats(formats)
|
||||
info_dict['formats'] = formats
|
||||
info_dict['subtitles'] = subtitles
|
||||
return info_dict
|
||||
|
||||
if not self._downloader.params.get('test', False) and not is_intentional:
|
||||
@@ -2510,7 +2512,7 @@ class GenericIE(InfoExtractor):
|
||||
if doc.tag == 'rss':
|
||||
return self._extract_rss(url, video_id, doc)
|
||||
elif doc.tag == 'SmoothStreamingMedia':
|
||||
info_dict['formats'] = self._parse_ism_formats(doc, url)
|
||||
info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url)
|
||||
self._sort_formats(info_dict['formats'])
|
||||
return info_dict
|
||||
elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
|
||||
@@ -2524,7 +2526,7 @@ class GenericIE(InfoExtractor):
|
||||
xspf_base_url=full_response.geturl()),
|
||||
video_id)
|
||||
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
|
||||
info_dict['formats'] = self._parse_mpd_formats(
|
||||
info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles(
|
||||
doc,
|
||||
mpd_base_url=full_response.geturl().rpartition('/')[0],
|
||||
mpd_url=url)
|
||||
|
||||
@@ -46,6 +46,7 @@ class NYTimesBaseIE(InfoExtractor):
|
||||
|
||||
urls = []
|
||||
formats = []
|
||||
subtitles = {}
|
||||
for video in video_data.get('renditions', []):
|
||||
video_url = video.get('url')
|
||||
format_id = video.get('type')
|
||||
@@ -54,9 +55,11 @@ class NYTimesBaseIE(InfoExtractor):
|
||||
urls.append(video_url)
|
||||
ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url)
|
||||
if ext == 'm3u8':
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
|
||||
video_url, video_id, 'mp4', 'm3u8_native',
|
||||
m3u8_id=format_id or 'hls', fatal=False))
|
||||
m3u8_id=format_id or 'hls', fatal=False)
|
||||
formats.extend(m3u8_fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, m3u8_subs)
|
||||
elif ext == 'mpd':
|
||||
continue
|
||||
# formats.extend(self._extract_mpd_formats(
|
||||
@@ -96,6 +99,7 @@ class NYTimesBaseIE(InfoExtractor):
|
||||
'uploader': video_data.get('byline'),
|
||||
'duration': float_or_none(video_data.get('duration'), 1000),
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
'thumbnails': thumbnails,
|
||||
}
|
||||
|
||||
|
||||
@@ -103,7 +103,7 @@ class RoosterTeethIE(InfoExtractor):
|
||||
api_episode_url + '/videos', display_id,
|
||||
'Downloading video JSON metadata')['data'][0]
|
||||
m3u8_url = video_data['attributes']['url']
|
||||
subtitle_m3u8_url = video_data['links']['download']
|
||||
# XXX: additional URL at video_data['links']['download']
|
||||
except ExtractorError as e:
|
||||
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
|
||||
if self._parse_json(e.cause.read().decode(), display_id).get('access') is False:
|
||||
@@ -111,7 +111,7 @@ class RoosterTeethIE(InfoExtractor):
|
||||
'%s is only available for FIRST members' % display_id)
|
||||
raise
|
||||
|
||||
formats = self._extract_m3u8_formats(
|
||||
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
|
||||
m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls')
|
||||
self._sort_formats(formats)
|
||||
|
||||
@@ -134,33 +134,6 @@ class RoosterTeethIE(InfoExtractor):
|
||||
'url': img_url,
|
||||
})
|
||||
|
||||
subtitles = {}
|
||||
res = self._download_webpage_handle(
|
||||
subtitle_m3u8_url, display_id,
|
||||
'Downloading m3u8 information',
|
||||
'Failed to download m3u8 information',
|
||||
fatal=True, data=None, headers={}, query={})
|
||||
if res is not False:
|
||||
subtitle_m3u8_doc, _ = res
|
||||
for line in subtitle_m3u8_doc.split('\n'):
|
||||
if 'EXT-X-MEDIA:TYPE=SUBTITLES' in line:
|
||||
parts = line.split(',')
|
||||
for part in parts:
|
||||
if 'LANGUAGE' in part:
|
||||
lang = part[part.index('=') + 2:-1]
|
||||
elif 'URI' in part:
|
||||
uri = part[part.index('=') + 2:-1]
|
||||
res = self._download_webpage_handle(
|
||||
uri, display_id,
|
||||
'Downloading m3u8 information',
|
||||
'Failed to download m3u8 information',
|
||||
fatal=True, data=None, headers={}, query={})
|
||||
doc, _ = res
|
||||
for l in doc.split('\n'):
|
||||
if not l.startswith('#'):
|
||||
subtitles[lang] = [{'url': uri[:-uri[::-1].index('/')] + l}]
|
||||
break
|
||||
|
||||
return {
|
||||
'id': video_id,
|
||||
'display_id': display_id,
|
||||
|
||||
@@ -87,6 +87,7 @@ class SRGSSRIE(InfoExtractor):
|
||||
title = media_data['title']
|
||||
|
||||
formats = []
|
||||
subtitles = {}
|
||||
q = qualities(['SD', 'HD'])
|
||||
for source in (media_data.get('resourceList') or []):
|
||||
format_url = source.get('url')
|
||||
@@ -104,12 +105,16 @@ class SRGSSRIE(InfoExtractor):
|
||||
if source.get('tokenType') == 'AKAMAI':
|
||||
format_url = self._get_tokenized_src(
|
||||
format_url, media_id, format_id)
|
||||
formats.extend(self._extract_akamai_formats(
|
||||
format_url, media_id))
|
||||
fmts, subs = self._extract_akamai_formats_and_subtitles(
|
||||
format_url, media_id)
|
||||
formats.extend(fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, subs)
|
||||
elif protocol == 'HLS':
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
|
||||
format_url, media_id, 'mp4', 'm3u8_native',
|
||||
m3u8_id=format_id, fatal=False))
|
||||
m3u8_id=format_id, fatal=False)
|
||||
formats.extend(m3u8_fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, m3u8_subs)
|
||||
elif protocol in ('HTTP', 'HTTPS'):
|
||||
formats.append({
|
||||
'format_id': format_id,
|
||||
@@ -133,7 +138,6 @@ class SRGSSRIE(InfoExtractor):
|
||||
})
|
||||
self._sort_formats(formats)
|
||||
|
||||
subtitles = {}
|
||||
if media_type == 'video':
|
||||
for sub in (media_data.get('subtitleList') or []):
|
||||
sub_url = sub.get('url')
|
||||
|
||||
@@ -99,16 +99,21 @@ class ThreeQSDNIE(InfoExtractor):
|
||||
aspect = float_or_none(config.get('aspect'))
|
||||
|
||||
formats = []
|
||||
subtitles = {}
|
||||
for source_type, source in (config.get('sources') or {}).items():
|
||||
if not source:
|
||||
continue
|
||||
if source_type == 'dash':
|
||||
formats.extend(self._extract_mpd_formats(
|
||||
source, video_id, mpd_id='mpd', fatal=False))
|
||||
fmts, subs = self._extract_mpd_formats_and_subtitles(
|
||||
source, video_id, mpd_id='mpd', fatal=False)
|
||||
formats.extend(fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, subs)
|
||||
elif source_type == 'hls':
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
fmts, subs = self._extract_m3u8_formats_and_subtitles(
|
||||
source, video_id, 'mp4', 'm3u8' if live else 'm3u8_native',
|
||||
m3u8_id='hls', fatal=False))
|
||||
m3u8_id='hls', fatal=False)
|
||||
formats.extend(fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, subs)
|
||||
elif source_type == 'progressive':
|
||||
for s in source:
|
||||
src = s.get('src')
|
||||
@@ -138,7 +143,6 @@ class ThreeQSDNIE(InfoExtractor):
|
||||
# behaviour is being kept as-is
|
||||
self._sort_formats(formats, ('res', 'source_preference'))
|
||||
|
||||
subtitles = {}
|
||||
for subtitle in (config.get('subtitles') or []):
|
||||
src = subtitle.get('src')
|
||||
if not src:
|
||||
|
||||
@@ -93,18 +93,31 @@ class TV4IE(InfoExtractor):
|
||||
'device': 'browser',
|
||||
'protocol': 'hls',
|
||||
})['playbackItem']['manifestUrl']
|
||||
formats = self._extract_m3u8_formats(
|
||||
formats = []
|
||||
subtitles = {}
|
||||
|
||||
fmts, subs = self._extract_m3u8_formats_and_subtitles(
|
||||
manifest_url, video_id, 'mp4',
|
||||
'm3u8_native', m3u8_id='hls', fatal=False)
|
||||
formats.extend(self._extract_mpd_formats(
|
||||
formats.extend(fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, subs)
|
||||
|
||||
fmts, subs = self._extract_mpd_formats_and_subtitles(
|
||||
manifest_url.replace('.m3u8', '.mpd'),
|
||||
video_id, mpd_id='dash', fatal=False))
|
||||
formats.extend(self._extract_f4m_formats(
|
||||
video_id, mpd_id='dash', fatal=False)
|
||||
formats.extend(fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, subs)
|
||||
|
||||
fmts = self._extract_f4m_formats(
|
||||
manifest_url.replace('.m3u8', '.f4m'),
|
||||
video_id, f4m_id='hds', fatal=False))
|
||||
formats.extend(self._extract_ism_formats(
|
||||
video_id, f4m_id='hds', fatal=False)
|
||||
formats.extend(fmts)
|
||||
|
||||
fmts, subs = self._extract_ism_formats_and_subtitles(
|
||||
re.sub(r'\.ism/.*?\.m3u8', r'.ism/Manifest', manifest_url),
|
||||
video_id, ism_id='mss', fatal=False))
|
||||
video_id, ism_id='mss', fatal=False)
|
||||
formats.extend(fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, subs)
|
||||
|
||||
if not formats and info.get('is_geo_restricted'):
|
||||
self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
|
||||
@@ -115,7 +128,7 @@ class TV4IE(InfoExtractor):
|
||||
'id': video_id,
|
||||
'title': title,
|
||||
'formats': formats,
|
||||
# 'subtitles': subtitles,
|
||||
'subtitles': subtitles,
|
||||
'description': info.get('description'),
|
||||
'timestamp': parse_iso8601(info.get('broadcast_date_time')),
|
||||
'duration': int_or_none(info.get('duration')),
|
||||
|
||||
@@ -36,9 +36,9 @@ class TwitterBaseIE(InfoExtractor):
|
||||
def _extract_variant_formats(self, variant, video_id):
|
||||
variant_url = variant.get('url')
|
||||
if not variant_url:
|
||||
return []
|
||||
return [], {}
|
||||
elif '.m3u8' in variant_url:
|
||||
return self._extract_m3u8_formats(
|
||||
return self._extract_m3u8_formats_and_subtitles(
|
||||
variant_url, video_id, 'mp4', 'm3u8_native',
|
||||
m3u8_id='hls', fatal=False)
|
||||
else:
|
||||
@@ -49,22 +49,27 @@ class TwitterBaseIE(InfoExtractor):
|
||||
'tbr': tbr,
|
||||
}
|
||||
self._search_dimensions_in_video_url(f, variant_url)
|
||||
return [f]
|
||||
return [f], {}
|
||||
|
||||
def _extract_formats_from_vmap_url(self, vmap_url, video_id):
|
||||
vmap_data = self._download_xml(vmap_url, video_id)
|
||||
formats = []
|
||||
subtitles = {}
|
||||
urls = []
|
||||
for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'):
|
||||
video_variant.attrib['url'] = compat_urllib_parse_unquote(
|
||||
video_variant.attrib['url'])
|
||||
urls.append(video_variant.attrib['url'])
|
||||
formats.extend(self._extract_variant_formats(
|
||||
video_variant.attrib, video_id))
|
||||
fmts, subs = self._extract_variant_formats(
|
||||
video_variant.attrib, video_id)
|
||||
formats.extend(fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, subs)
|
||||
video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile'))
|
||||
if video_url not in urls:
|
||||
formats.extend(self._extract_variant_formats({'url': video_url}, video_id))
|
||||
return formats
|
||||
fmts, subs = self._extract_variant_formats({'url': video_url}, video_id)
|
||||
formats.extend(fmts)
|
||||
subtitles = self._merge_subtitles(subtitles, subs)
|
||||
return formats, subtitles
|
||||
|
||||
@staticmethod
|
||||
def _search_dimensions_in_video_url(a_format, video_url):
|
||||
@@ -471,8 +476,11 @@ class TwitterIE(TwitterBaseIE):
|
||||
video_info = media.get('video_info') or {}
|
||||
|
||||
formats = []
|
||||
subtitles = {}
|
||||
for variant in video_info.get('variants', []):
|
||||
formats.extend(self._extract_variant_formats(variant, twid))
|
||||
fmts, subs = self._extract_variant_formats(variant, twid)
|
||||
subtitles = self._merge_subtitles(subtitles, subs)
|
||||
formats.extend(fmts)
|
||||
self._sort_formats(formats)
|
||||
|
||||
thumbnails = []
|
||||
@@ -491,6 +499,7 @@ class TwitterIE(TwitterBaseIE):
|
||||
|
||||
info.update({
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
'thumbnails': thumbnails,
|
||||
'duration': float_or_none(video_info.get('duration_millis'), 1000),
|
||||
})
|
||||
@@ -540,7 +549,7 @@ class TwitterIE(TwitterBaseIE):
|
||||
is_amplify = card_name == 'amplify'
|
||||
vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url')
|
||||
content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player'))
|
||||
formats = self._extract_formats_from_vmap_url(vmap_url, content_id or twid)
|
||||
formats, subtitles = self._extract_formats_from_vmap_url(vmap_url, content_id or twid)
|
||||
self._sort_formats(formats)
|
||||
|
||||
thumbnails = []
|
||||
@@ -558,6 +567,7 @@ class TwitterIE(TwitterBaseIE):
|
||||
|
||||
info.update({
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
'thumbnails': thumbnails,
|
||||
'duration': int_or_none(get_binding_value(
|
||||
'content_duration_seconds')),
|
||||
|
||||
@@ -30,7 +30,7 @@ class UplynkIE(InfoExtractor):
|
||||
def _extract_uplynk_info(self, uplynk_content_url):
|
||||
path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups()
|
||||
display_id = video_id or external_id
|
||||
formats = self._extract_m3u8_formats(
|
||||
formats, subtitles = self._extract_m3u8_formats_and_subtitles(
|
||||
'http://content.uplynk.com/%s.m3u8' % path,
|
||||
display_id, 'mp4', 'm3u8_native')
|
||||
if session_id:
|
||||
@@ -48,6 +48,7 @@ class UplynkIE(InfoExtractor):
|
||||
'duration': float_or_none(asset.get('duration')),
|
||||
'uploader_id': asset.get('owner'),
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
}
|
||||
|
||||
def _real_extract(self, url):
|
||||
|
||||
@@ -69,19 +69,24 @@ class WatIE(InfoExtractor):
|
||||
title = video_info['title']
|
||||
|
||||
formats = []
|
||||
subtitles = {}
|
||||
|
||||
def extract_formats(manifest_urls):
|
||||
for f, f_url in manifest_urls.items():
|
||||
if not f_url:
|
||||
continue
|
||||
if f in ('dash', 'mpd'):
|
||||
formats.extend(self._extract_mpd_formats(
|
||||
fmts, subs = self._extract_mpd_formats_and_subtitles(
|
||||
f_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'),
|
||||
video_id, mpd_id='dash', fatal=False))
|
||||
video_id, mpd_id='dash', fatal=False)
|
||||
elif f == 'hls':
|
||||
formats.extend(self._extract_m3u8_formats(
|
||||
fmts, subs = self._extract_m3u8_formats_and_subtitles(
|
||||
f_url, video_id, 'mp4',
|
||||
'm3u8_native', m3u8_id='hls', fatal=False))
|
||||
'm3u8_native', m3u8_id='hls', fatal=False)
|
||||
else:
|
||||
continue
|
||||
formats.extend(fmts)
|
||||
self._merge_subtitles(subs, target=subtitles)
|
||||
|
||||
delivery = video_data.get('delivery') or {}
|
||||
extract_formats({delivery.get('format'): delivery.get('url')})
|
||||
@@ -103,4 +108,5 @@ class WatIE(InfoExtractor):
|
||||
video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])),
|
||||
'duration': int_or_none(video_info.get('duration')),
|
||||
'formats': formats,
|
||||
'subtitles': subtitles,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user