From 84412f41fa461f21e42f1402891e4539bc5fa241 Mon Sep 17 00:00:00 2001
From: Lauren Liberda <laura@selfisekai.rocks>
Date: Tue, 16 Mar 2021 21:34:15 +0100
Subject: [PATCH] support for vtt subtitles in m3u8 manifests

---
 haruhi_dl/HaruhiDL.py            | 26 +++++++++++++++++-----
 haruhi_dl/downloader/external.py |  4 +++-
 haruhi_dl/extractor/common.py    | 38 ++++++++++++++++++++++----------
 3 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/haruhi_dl/HaruhiDL.py b/haruhi_dl/HaruhiDL.py
index 6692fb89e..ea7dede6a 100755
--- a/haruhi_dl/HaruhiDL.py
+++ b/haruhi_dl/HaruhiDL.py
@@ -1531,6 +1531,19 @@ class HaruhiDL(object):
             if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                 info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
 
+        # Some fragmented media manifests, such as m3u8, can embed subtitle tracks.
+        # This is a workaround that exposes those subtitles to users without a major refactor of the extractors
+        if 'formats' in info_dict:
+            formats_subtitles = list(filter(lambda x: x.get('_subtitle'), info_dict['formats']))
+            if formats_subtitles:
+                info_dict.setdefault('subtitles', {})
+                for sub in formats_subtitles:
+                    if sub['_key'] not in info_dict['subtitles']:
+                        info_dict['subtitles'][sub['_key']] = []
+                    info_dict['subtitles'][sub['_key']].append(sub['_subtitle'])
+                # remove these subtitles from formats now
+                info_dict['formats'] = list(filter(lambda x: '_subtitle' not in x, info_dict['formats']))
+
         for cc_kind in ('subtitles', 'automatic_captions'):
             cc = info_dict.get(cc_kind)
             if cc:
@@ -1538,6 +1551,12 @@ class HaruhiDL(object):
                     for subtitle_format in subtitle:
                         if subtitle_format.get('url'):
                             subtitle_format['url'] = sanitize_url(subtitle_format['url'])
+                            if subtitle_format.get('protocol') is None:
+                                subtitle_format['protocol'] = determine_protocol(subtitle_format['url'])
+                            if subtitle_format.get('http_headers') is None:
+                                full_info = info_dict.copy()
+                                full_info.update(subtitle_format)
+                                subtitle_format['http_headers'] = self._calc_headers(full_info)
                         if subtitle_format.get('ext') is None:
                             subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
 
@@ -1854,7 +1873,6 @@ class HaruhiDL(object):
             # subtitles download errors are already managed as troubles in relevant IE
             # that way it will silently go on when used with unsupporting IE
             subtitles = info_dict['requested_subtitles']
-            ie = self.get_info_extractor(info_dict['extractor_key'])
             for sub_lang, sub_info in subtitles.items():
                 sub_format = sub_info['ext']
                 sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
@@ -1873,10 +1891,8 @@ class HaruhiDL(object):
                             return
                     else:
                         try:
-                            sub_data = ie._request_webpage(
-                                sub_info['url'], info_dict['id'], note=False).read()
-                            with io.open(encodeFilename(sub_filename), 'wb') as subfile:
-                                subfile.write(sub_data)
+                            subd = get_suitable_downloader(sub_info, self.params)(self, self.params)
+                            subd.download(sub_filename, sub_info)
                         except (ExtractorError, IOError, OSError, ValueError) as err:
                             self.report_warning('Unable to download subtitle for "%s": %s' %
                                                 (sub_lang, error_to_compat_str(err)))
diff --git a/haruhi_dl/downloader/external.py b/haruhi_dl/downloader/external.py
index b74e95bf1..ae6484c22 100644
--- a/haruhi_dl/downloader/external.py
+++ b/haruhi_dl/downloader/external.py
@@ -318,7 +318,9 @@ class FFmpegFD(ExternalFD):
             args += ['-fs', compat_str(self._TEST_FILE_SIZE)]
 
         if protocol in ('m3u8', 'm3u8_native'):
-            if self.params.get('hls_use_mpegts', False) or tmpfilename == '-':
+            if info_dict['ext'] == 'vtt':
+                args += ['-f', 'webvtt']
+            elif self.params.get('hls_use_mpegts', False) or tmpfilename == '-':
                 args += ['-f', 'mpegts']
             else:
                 args += ['-f', 'mp4']
diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py
index 5b4ad411c..3d07978bb 100644
--- a/haruhi_dl/extractor/common.py
+++ b/haruhi_dl/extractor/common.py
@@ -1391,6 +1391,10 @@ class InfoExtractor(object):
                 f['tbr'] = f['abr'] + f['vbr']
 
         def _formats_key(f):
+            # manifest subtitle workaround: entries carrying '_subtitle' get a fixed sort key
+            if '_subtitle' in f:
+                return (-1,)
+
             # TODO remove the following workaround
             from ..utils import determine_ext
             if not f.get('ext') and 'url' in f:
@@ -1726,7 +1730,7 @@ class InfoExtractor(object):
             if not (media_type and group_id and name):
                 return
             groups.setdefault(group_id, []).append(media)
-            if media_type not in ('VIDEO', 'AUDIO'):
+            if media_type not in ('VIDEO', 'AUDIO', 'SUBTITLES'):
                 return
             media_url = media.get('URI')
             if media_url:
@@ -1734,17 +1738,27 @@ class InfoExtractor(object):
                 for v in (m3u8_id, group_id, name):
                     if v:
                         format_id.append(v)
-                f = {
-                    'format_id': '-'.join(format_id),
-                    'url': format_url(media_url),
-                    'manifest_url': m3u8_url,
-                    'language': media.get('LANGUAGE'),
-                    'ext': ext,
-                    'protocol': entry_protocol,
-                    'preference': preference,
-                }
-                if media_type == 'AUDIO':
-                    f['vcodec'] = 'none'
+                if media_type == 'SUBTITLES':
+                    f = {
+                        '_subtitle': {
+                            'url': format_url(media_url),
+                            'ext': 'vtt',
+                            'protocol': entry_protocol,
+                        },
+                        '_key': media.get('LANGUAGE'),
+                    }
+                else:
+                    f = {
+                        'format_id': '-'.join(format_id),
+                        'url': format_url(media_url),
+                        'manifest_url': m3u8_url,
+                        'language': media.get('LANGUAGE'),
+                        'ext': ext,
+                        'protocol': entry_protocol,
+                        'preference': preference,
+                    }
+                    if media_type == 'AUDIO':
+                        f['vcodec'] = 'none'
                 formats.append(f)
 
         def build_stream_name():