ytdl-org · fstirlitz · Nov 7, 2016 · Nov 7, 2016
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
@@ -438,7 +438,14 @@ def test_parse_m3u8_formats(self):
                     'tbr': 1467,
                     'width': 1024,
                     'height': 576,
-                }]
+                }],
+                {
+                    'fra': [{
+                        'url': 'http://replayftv-pmd.francetv.fr/subtitles/2017/16/156589847-1492488987.m3u8',
+                        'ext': 'vtt',
+                        'protocol': 'm3u8_webvtt'
+                    }]
+                },
             ),
             (
                 # https://github.com/ytdl-org/youtube-dl/issues/11995
@@ -512,7 +519,8 @@ def test_parse_m3u8_formats(self):
                     'tbr': 2374,
                     'width': 1024,
                     'height': 576,
-                }]
+                }],
+                {},
             ),
             (
                 # https://github.com/ytdl-org/youtube-dl/issues/12211
@@ -571,7 +579,8 @@ def test_parse_m3u8_formats(self):
                     'tbr': 1396.736,
                     'width': 854,
                     'height': 480,
-                }]
+                }],
+                {},
             ),
             (
                 # http://www.twitch.tv/riotgames/v/6528877
@@ -641,7 +650,8 @@ def test_parse_m3u8_formats(self):
                     'tbr': 3214.134,
                     'width': 1280,
                     'height': 720,
-                }]
+                }],
+                {},
             ),
             (
                 # http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
@@ -676,7 +686,8 @@ def test_parse_m3u8_formats(self):
                     'tbr': 1200,
                     'width': 1280,
                     'height': 720,
-                }]
+                }],
+                {}
             ),
             (
                 # https://github.com/ytdl-org/youtube-dl/issues/18923
@@ -733,17 +744,19 @@ def test_parse_m3u8_formats(self):
                     'acodec': 'none',
                     'width': 1280,
                     'height': 720,
-                }]
+                }],
+                {}
             ),
         ]
 
-        for m3u8_file, m3u8_url, expected_formats in _TEST_CASES:
+        for m3u8_file, m3u8_url, expected_formats, expected_subs in _TEST_CASES:
             with io.open('./test/testdata/m3u8/%s.m3u8' % m3u8_file,
                          mode='r', encoding='utf-8') as f:
-                formats = self.ie._parse_m3u8_formats(
+                formats, subs = self.ie._parse_m3u8_formats_and_subtitles(
                     f.read(), m3u8_url, ext='mp4')
                 self.ie._sort_formats(formats)
                 expect_value(self, formats, expected_formats, None)
+                expect_value(self, subs, expected_subs, None)
 
     def test_parse_mpd_formats(self):
         _TEST_CASES = [

diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
@@ -1830,10 +1830,19 @@ def ensure_dir_exists(path):
                             return
                     else:
                         try:
-                            sub_data = ie._request_webpage(
-                                sub_info['url'], info_dict['id'], note=False).read()
-                            with io.open(encodeFilename(sub_filename), 'wb') as subfile:
-                                subfile.write(sub_data)
+                            if sub_info.get('protocol') is not None:
+                                sub_info_dict = {
+                                    'id': info_dict['id'],
+                                    'protocol': sub_info['protocol'],
+                                    'url': sub_info['url']
+                                }
+                                sub_fd = get_suitable_downloader(sub_info_dict)(self, self.params)
+                                sub_fd.download(sub_filename, sub_info_dict)
+                            else:
+                                sub_data = ie._request_webpage(
+                                    sub_info['url'], info_dict['id'], note=False).read()
+                                with io.open(encodeFilename(sub_filename), 'wb') as subfile:
+                                    subfile.write(sub_data)
                         except (ExtractorError, IOError, OSError, ValueError) as err:
                             self.report_warning('Unable to download subtitle for "%s": %s' %
                                                 (sub_lang, error_to_compat_str(err)))

diff --git a/youtube_dl/downloader/__init__.py b/youtube_dl/downloader/__init__.py
@@ -3,6 +3,7 @@
 from .common import FileDownloader
 from .f4m import F4mFD
 from .hls import HlsFD
+from .hls import WebVttHlsFD
 from .http import HttpFD
 from .rtmp import RtmpFD
 from .dash import DashSegmentsFD
@@ -20,6 +21,7 @@
 PROTOCOL_MAP = {
     'rtmp': RtmpFD,
     'm3u8_native': HlsFD,
+    'm3u8_webvtt': WebVttHlsFD,
     'm3u8': FFmpegFD,
     'mms': RtspFD,
     'rtsp': RtspFD,

diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
@@ -208,3 +208,191 @@ def is_ad_fragment_end(s):
         self._finish_frag_download(ctx)
 
         return True
+
+
+class WebVttHlsFD(FragmentFD):
+    """
+    A downloader for HLS WebVTT subtitles.
+
+    Every segment listed in the media playlist is downloaded, its cue timings
+    are shifted by the offset taken from its X-TIMESTAMP-MAP header, and the
+    cues of all segments are written, ordered by start time, to a single file.
+    """
+    FD_NAME = 'hlswebvtt'
+
+    @staticmethod
+    def _parse_ts(ts):
+        """
+        Convert a WebVTT timestamp ([[hh:]mm:]ss[.fff]) to units of 1/90000 s,
+        the unit in which it is combined with the MPEGTS value.
+
+        Raises ValueError if ts does not start with a timestamp.
+        """
+        m = re.match(r'(?:(?:([0-9]+):)?([0-9]+):)?([0-9]+)(?:\.([0-9]+))?', ts)
+        if m is None:
+            raise ValueError('Invalid WebVTT timestamp: %r' % ts)
+        hrs, mins, secs, frac = m.groups()
+        # Scale the fraction to milliseconds whatever its number of digits
+        # ('.5' means 500 ms, not 5 ms)
+        msecs = int(((frac or '') + '000')[:3])
+        return 90 * (
+            int(hrs or 0) * 3600000 +
+            int(mins or 0) * 60000 +
+            int(secs) * 1000 +
+            msecs)
+
+    @staticmethod
+    def _format_ts(ts):
+        """ Format a time given in units of 1/90000 s as hh:mm:ss.fff """
+        # divmod() keeps everything integral; '/' yields floats on Python 3
+        secs, msecs = divmod(int(ts) // 90, 1000)
+        mins, secs = divmod(secs, 60)
+        hrs, mins = divmod(mins, 60)
+        return '%02u:%02u:%02u.%03u' % (hrs, mins, secs, msecs)
+
+    def _parse_segment(self, lines, header, cues):
+        """
+        Parse the lines of one WebVTT segment.
+
+        Header lines not yet present in header are appended to it. Each cue is
+        appended to cues as a dict with the keys 'start' (start time in units
+        of 1/90000 s, used for sorting), 'start_ts', 'end_ts', 'style', 'text'
+        and, if the cue has an identifier, 'id'.
+
+        Returns False if the first line is not a WebVTT signature.
+        """
+        line_iter = iter(lines)
+        # An empty segment has no first line at all
+        signature = next(line_iter, '')
+        if signature != 'WEBVTT' and not signature.startswith(('WEBVTT ', 'WEBVTT\t')):
+            return False
+
+        try:
+            # read header: everything up to the first blank or cue timing line
+            tsadj = 0
+            while True:
+                line = next(line_iter)
+                if line == '' or '-->' in line:
+                    break
+
+                if line.startswith('X-TIMESTAMP-MAP='):
+                    locl = re.search(r'LOCAL:([0-9][0-9:.]*)', line)
+                    mpeg = re.search(r'MPEGTS:([0-9]+)', line)
+                    # Both values are needed to compute the offset
+                    if locl and mpeg:
+                        tsadj = int(mpeg.group(1)) - self._parse_ts(locl.group(1))
+                elif line not in header:
+                    # Segments sharing a header line contribute it only once
+                    header.append(line)
+
+            # read cues
+            while True:
+                while line == '':
+                    line = next(line_iter)
+                cue = {}
+
+                if '-->' not in line:
+                    # a line preceding the timings is taken as the cue identifier
+                    cue['id'] = line
+                    line = next(line_iter)
+                    if line == '':
+                        continue
+
+                m = re.match(
+                    r'^([0-9][0-9:.]*\s*)-->\s*([0-9][0-9:.]*)(\s+.*)?', line)
+                if not m:
+                    # A line with an arrow that is not a valid timing line must
+                    # be skipped here, or the loop would examine it forever;
+                    # any other line is looked at again as a cue identifier
+                    if '-->' in line:
+                        line = next(line_iter)
+                    continue
+
+                cue['start'] = self._parse_ts(m.group(1)) + tsadj
+                cue['start_ts'] = self._format_ts(cue['start'])
+                cue['end_ts'] = self._format_ts(self._parse_ts(m.group(2)) + tsadj)
+                cue['style'] = m.group(3) or ''
+                cue['text'] = ''
+
+                line = next(line_iter)
+                # The cue is stored before its text is read so that it is kept
+                # when the segment ends in the middle of the text
+                cues.append(cue)
+                while line != '' and '-->' not in line:
+                    cue['text'] += line + '\n'
+                    line = next(line_iter)
+        except StopIteration:
+            # end of the segment
+            pass
+        return True
+
+    def real_download(self, filename, info_dict):
+        """
+        Download the segments of the playlist at info_dict['url'] and write
+        their merged cues to filename.
+
+        Returns True on success and False if a segment cannot be downloaded
+        or is not a WebVTT file.
+        """
+        # Imported locally so that the class does not depend on the import
+        # block of the module, which this change leaves untouched
+        import os
+        from ..utils import encodeFilename, sanitize_open
+
+        url = info_dict['url']
+        self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
+        manifest = self.ydl.urlopen(url).read().decode('utf-8', 'ignore')
+        segment_urls = []
+        for line in manifest.splitlines():
+            line = line.strip()
+            if line and not line.startswith('#'):
+                segment_urls.append(
+                    line if re.match(r'^https?://', line)
+                    else compat_urlparse.urljoin(url, line))
+
+        ctx = {
+            'filename': filename,
+            'total_frags': len(segment_urls),
+        }
+
+        self._prepare_and_start_frag_download(ctx)
+
+        cues = []
+        header = []
+        for i, frag_url in enumerate(segment_urls):
+            frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i)
+            if not ctx['dl'].download(frag_filename, {'url': frag_url}):
+                return False
+
+            down, frag_sanitized = sanitize_open(frag_filename, 'rb')
+            try:
+                # utf-8-sig drops the byte order mark a WebVTT file may start with
+                lines = down.read().decode('utf-8-sig', 'ignore').splitlines()
+            finally:
+                down.close()
+                # The segment is not needed on disk any longer; removing it now
+                # leaves no stray files behind when a later segment fails
+                os.remove(encodeFilename(frag_sanitized))
+
+            if not self._parse_segment(lines, header, cues):
+                self.report_error('Not a valid WebVTT subtitles segment')
+                return False
+
+        # Sort on the numeric start time; the formatted one would sort
+        # incorrectly once the hours field outgrows two digits
+        cues.sort(key=lambda cue: cue['start'])
+        with ctx['dest_stream'] as outf:
+            outf.write(b'WEBVTT\n')
+            for item in header:
+                outf.write(('%s\n' % item).encode('utf-8'))
+            for cue in cues:
+                outf.write(b'\n')
+                if cue.get('id'):
+                    outf.write(('%s\n' % cue['id']).encode('utf-8'))
+                timing = '%s --> %s%s\n' % (cue['start_ts'], cue['end_ts'], cue['style'])
+                outf.write(timing.encode('utf-8'))
+                outf.write(cue['text'].encode('utf-8'))
+
+        self._finish_frag_download(ctx)
+
+        return True
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
@@ -1334,7 +1334,6 @@ def _sort_formats(self, formats, field_preference=None):
 
         def _formats_key(f):
             # TODO remove the following workaround
-            from ..utils import determine_ext
             if not f.get('ext') and 'url' in f:
                 f['ext'] = determine_ext(f['url'])
 
@@ -1583,7 +1582,11 @@ def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
             'format_note': 'Quality selection URL',
         }
 
-    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
+    def _extract_m3u8_formats(self, *args, **kwargs):  # formats only, subtitles dropped
+        formats, _subtitles = self._extract_m3u8_formats_and_subtitles(*args, **kwargs)
+        return formats
+
+    def _extract_m3u8_formats_and_subtitles(self, m3u8_url, video_id, ext=None,
                               entry_protocol='m3u8', preference=None,
                               m3u8_id=None, note=None, errnote=None,
                               fatal=True, live=False, data=None, headers={},
@@ -1595,26 +1598,28 @@ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
             fatal=fatal, data=data, headers=headers, query=query)
 
         if res is False:
-            return []
+            return [], {}
 
         m3u8_doc, urlh = res
         m3u8_url = urlh.geturl()
 
-        return self._parse_m3u8_formats(
+        return self._parse_m3u8_formats_and_subtitles(
             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
             preference=preference, m3u8_id=m3u8_id, live=live)
 
-    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
+    def _parse_m3u8_formats_and_subtitles(self, m3u8_doc, m3u8_url, ext=None,
                             entry_protocol='m3u8', preference=None,
                             m3u8_id=None, live=False):
         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
-            return []
+            return [], {}
 
         if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
-            return []
+            return [], {}
 
         formats = []
 
+        subtitles = {}
+
         format_url = lambda u: (
             u
             if re.match(r'^https?://', u)
@@ -1655,6 +1660,17 @@ def extract_media(x_media_line):
             if not (media_type and group_id and name):
                 return
             groups.setdefault(group_id, []).append(media)
+            # <https://tools.ietf.org/html/draft-pantos-http-live-streaming-13#section-3.4.9>
+            # LANGUAGE is an optional attribute; an entry without URI cannot be fetched
+            if media_type == 'SUBTITLES' and media.get('URI'):
+                sub_url = format_url(media['URI'])
+                sub_info = {
+                    'url': sub_url,
+                    'ext': determine_ext(sub_url),
+                }
+                if sub_info['ext'] == 'm3u8':  # assumed to be a playlist of WebVTT segments
+                    sub_info.update({'ext': 'vtt', 'protocol': 'm3u8_webvtt'})
+                subtitles.setdefault(media.get('LANGUAGE') or 'und', []).append(sub_info)
             if media_type not in ('VIDEO', 'AUDIO'):
                 return
             media_url = media.get('URI')
@@ -1780,7 +1796,7 @@ def build_stream_name():
                     formats.append(http_f)
 
                 last_stream_inf = {}
-        return formats
+        return formats, subtitles
 
     @staticmethod
     def _xpath_ns(path, namespace=None):