From 84412f41fa461f21e42f1402891e4539bc5fa241 Mon Sep 17 00:00:00 2001 From: Lauren Liberda Date: Tue, 16 Mar 2021 21:34:15 +0100 Subject: [PATCH] support for vtt subtitles in m3u8 manifests --- haruhi_dl/HaruhiDL.py | 26 +++++++++++++++++----- haruhi_dl/downloader/external.py | 4 +++- haruhi_dl/extractor/common.py | 38 ++++++++++++++++++++++---------- 3 files changed, 50 insertions(+), 18 deletions(-) diff --git a/haruhi_dl/HaruhiDL.py b/haruhi_dl/HaruhiDL.py index 6692fb89e..ea7dede6a 100755 --- a/haruhi_dl/HaruhiDL.py +++ b/haruhi_dl/HaruhiDL.py @@ -1531,6 +1531,19 @@ class HaruhiDL(object): if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) + # Some fragmented media manifests like m3u8 allow embedding subtitles + # This is a weird hack to provide these subtitles to users without a very huge refactor of extractors + if 'formats' in info_dict: + formats_subtitles = list(filter(lambda x: x.get('_subtitle'), info_dict['formats'])) + if formats_subtitles: + info_dict.setdefault('subtitles', {}) + for sub in formats_subtitles: + if sub['_key'] not in info_dict['subtitles']: + info_dict['subtitles'][sub['_key']] = [] + info_dict['subtitles'][sub['_key']].append(sub['_subtitle']) + # remove these subtitles from formats now + info_dict['formats'] = list(filter(lambda x: '_subtitle' not in x, info_dict['formats'])) + for cc_kind in ('subtitles', 'automatic_captions'): cc = info_dict.get(cc_kind) if cc: @@ -1538,6 +1551,12 @@ class HaruhiDL(object): for subtitle_format in subtitle: if subtitle_format.get('url'): subtitle_format['url'] = sanitize_url(subtitle_format['url']) + if subtitle_format.get('protocol') is None: + subtitle_format['protocol'] = determine_protocol(subtitle_format['url']) + if subtitle_format.get('http_headers') is None: + full_info = info_dict.copy() + full_info.update(subtitle_format) + subtitle_format['http_headers'] = self._calc_headers(full_info) if subtitle_format.get('ext') is None: subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() @@ -1854,7 +1873,6 @@ class HaruhiDL(object): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE subtitles = info_dict['requested_subtitles'] - ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) @@ -1873,10 +1891,8 @@ class HaruhiDL(object): return else: try: - sub_data = ie._request_webpage( - sub_info['url'], info_dict['id'], note=False).read() - with io.open(encodeFilename(sub_filename), 'wb') as subfile: - subfile.write(sub_data) + subd = get_suitable_downloader(sub_info, self.params)(self, self.params) + subd.download(sub_filename, sub_info) except (ExtractorError, IOError, OSError, ValueError) as err: self.report_warning('Unable to download subtitle for "%s": %s' % (sub_lang, error_to_compat_str(err))) diff --git a/haruhi_dl/downloader/external.py b/haruhi_dl/downloader/external.py index b74e95bf1..ae6484c22 100644 --- a/haruhi_dl/downloader/external.py +++ b/haruhi_dl/downloader/external.py @@ -318,7 +318,9 @@ class FFmpegFD(ExternalFD): args += ['-fs', compat_str(self._TEST_FILE_SIZE)] if protocol in ('m3u8', 'm3u8_native'): - if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': + if info_dict['ext'] == 'vtt': + args += ['-f', 'webvtt'] + elif self.params.get('hls_use_mpegts', False) or tmpfilename == '-': args += ['-f', 'mpegts'] else: args += ['-f', 'mp4'] diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py index 5b4ad411c..3d07978bb 100644 --- a/haruhi_dl/extractor/common.py +++ b/haruhi_dl/extractor/common.py @@ -1391,6 +1391,10 @@ class InfoExtractor(object): f['tbr'] = f['abr'] + f['vbr'] def _formats_key(f): + # manifest subtitle workaround + if '_subtitle' in f: + return (-1,) + # TODO remove the following workaround from ..utils import determine_ext if not f.get('ext') and 'url' in f: @@ -1726,7 +1730,7 @@ class InfoExtractor(object): if not (media_type and group_id and name): return groups.setdefault(group_id, []).append(media) - if media_type not in ('VIDEO', 'AUDIO'): + if media_type not in ('VIDEO', 'AUDIO', 'SUBTITLES'): return media_url = media.get('URI') if media_url: @@ -1734,17 +1738,27 @@ class InfoExtractor(object): for v in (m3u8_id, group_id, name): if v: format_id.append(v) - f = { - 'format_id': '-'.join(format_id), - 'url': format_url(media_url), - 'manifest_url': m3u8_url, - 'language': media.get('LANGUAGE'), - 'ext': ext, - 'protocol': entry_protocol, - 'preference': preference, - } - if media_type == 'AUDIO': - f['vcodec'] = 'none' + if media_type == 'SUBTITLES': + f = { + '_subtitle': { + 'url': format_url(media_url), + 'ext': 'vtt', + 'protocol': entry_protocol, + }, + '_key': media.get('LANGUAGE'), + } + else: + f = { + 'format_id': '-'.join(format_id), + 'url': format_url(media_url), + 'manifest_url': m3u8_url, + 'language': media.get('LANGUAGE'), + 'ext': ext, + 'protocol': entry_protocol, + 'preference': preference, + } + if media_type == 'AUDIO': + f['vcodec'] = 'none' formats.append(f) def build_stream_name():