From c5e61e32dac1a4f0a324b656e0c9ba45bfae9465 Mon Sep 17 00:00:00 2001 From: felix Date: Mon, 7 Nov 2016 15:45:42 +0100 Subject: [PATCH 1/2] [common] extract subtitles info from m3u8 media _extract_m3u8_formats is renamed to _extract_m3u8_formats_and_subtitles and extended to properly handle subtitle references; a wrapper with the old name is provided for compatibility. _parse_m3u8_formats is likewise renamed and extended, but without adding the compatibility wrapper; the test suite is adjusted to test the enhanced method instead. --- test/test_InfoExtractor.py | 29 +++++++++++++++++++++-------- youtube_dl/extractor/common.py | 27 ++++++++++++++++++++------- 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 71f6608feae..d4f12848c77 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -438,7 +438,14 @@ def test_parse_m3u8_formats(self): 'tbr': 1467, 'width': 1024, 'height': 576, - }] + }], + { + 'fra': [{ + 'url': 'http://replayftv-pmd.francetv.fr/subtitles/2017/16/156589847-1492488987.m3u8', + 'ext': 'vtt', + 'protocol': 'm3u8_webvtt' + }] + }, ), ( # https://github.com/ytdl-org/youtube-dl/issues/11995 @@ -512,7 +519,8 @@ def test_parse_m3u8_formats(self): 'tbr': 2374, 'width': 1024, 'height': 576, - }] + }], + {}, ), ( # https://github.com/ytdl-org/youtube-dl/issues/12211 @@ -571,7 +579,8 @@ def test_parse_m3u8_formats(self): 'tbr': 1396.736, 'width': 854, 'height': 480, - }] + }], + {}, ), ( # http://www.twitch.tv/riotgames/v/6528877 @@ -641,7 +650,8 @@ def test_parse_m3u8_formats(self): 'tbr': 3214.134, 'width': 1280, 'height': 720, - }] + }], + {}, ), ( # http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015 @@ -676,7 +686,8 @@ def test_parse_m3u8_formats(self): 'tbr': 1200, 'width': 1280, 'height': 720, - }] + }], + {} ), ( # https://github.com/ytdl-org/youtube-dl/issues/18923 @@ -733,17 +744,19 @@ def test_parse_m3u8_formats(self): 'acodec': 'none', 
'width': 1280, 'height': 720, - }] + }], + {} ), ] - for m3u8_file, m3u8_url, expected_formats in _TEST_CASES: + for m3u8_file, m3u8_url, expected_formats, expected_subs in _TEST_CASES: with io.open('./test/testdata/m3u8/%s.m3u8' % m3u8_file, mode='r', encoding='utf-8') as f: - formats = self.ie._parse_m3u8_formats( + formats, subs = self.ie._parse_m3u8_formats_and_subtitles( f.read(), m3u8_url, ext='mp4') self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) + expect_value(self, subs, expected_subs, None) def test_parse_mpd_formats(self): _TEST_CASES = [ diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eaae5e484f9..824773c6be2 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1334,7 +1334,6 @@ def _sort_formats(self, formats, field_preference=None): def _formats_key(f): # TODO remove the following workaround - from ..utils import determine_ext if not f.get('ext') and 'url' in f: f['ext'] = determine_ext(f['url']) @@ -1583,7 +1582,11 @@ def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None): 'format_note': 'Quality selection URL', } - def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, + def _extract_m3u8_formats(self, *args, **kwargs): + fmts, subs = self._extract_m3u8_formats_and_subtitles(*args, **kwargs) + return fmts + + def _extract_m3u8_formats_and_subtitles(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, fatal=True, live=False, data=None, headers={}, @@ -1595,26 +1598,28 @@ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, fatal=fatal, data=data, headers=headers, query=query) if res is False: - return [] + return [], {} m3u8_doc, urlh = res m3u8_url = urlh.geturl() - return self._parse_m3u8_formats( + return self._parse_m3u8_formats_and_subtitles( m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, preference=preference, m3u8_id=m3u8_id, 
live=live) - def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None, + def _parse_m3u8_formats_and_subtitles(self, m3u8_doc, m3u8_url, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, live=False): if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access - return [] + return [], {} if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay return [] formats = [] + subtitles = {} + format_url = lambda u: ( u if re.match(r'^https?://', u) @@ -1655,6 +1660,14 @@ def extract_media(x_media_line): if not (media_type and group_id and name): return groups.setdefault(group_id, []).append(media) + # + if media_type == 'SUBTITLES': + lang = media['LANGUAGE'] # XXX: normalise? + sub_info = { + 'url': media['URI'], + 'ext': determine_ext(media['URI']) + } + subtitles.setdefault(lang, []).append(sub_info) if media_type not in ('VIDEO', 'AUDIO'): return media_url = media.get('URI') @@ -1780,7 +1793,7 @@ def build_stream_name(): formats.append(http_f) last_stream_inf = {} - return formats + return formats, subtitles @staticmethod def _xpath_ns(path, namespace=None): From f32961f2017593ee9303c250c5f2532c3a4f8bd2 Mon Sep 17 00:00:00 2001 From: felix Date: Mon, 7 Nov 2016 15:45:43 +0100 Subject: [PATCH 2/2] [hls] add HLS WebVTT downloader; also, enable subtitle downloading with it --- youtube_dl/YoutubeDL.py | 17 +++- youtube_dl/downloader/__init__.py | 2 + youtube_dl/downloader/hls.py | 150 ++++++++++++++++++++++++++++++ youtube_dl/extractor/common.py | 3 + 4 files changed, 168 insertions(+), 4 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index f5cb4630819..09b6e5e3c15 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1830,10 +1830,19 @@ def ensure_dir_exists(path): return else: try: - sub_data = ie._request_webpage( - sub_info['url'], info_dict['id'], note=False).read() - with io.open(encodeFilename(sub_filename), 'wb') as subfile: - subfile.write(sub_data) + if sub_info.get('protocol') is not 
class WebVttHlsFD(FragmentFD):
    """Download WebVTT subtitles that are published as an HLS media playlist.

    The playlist at ``info_dict['url']`` is fetched, every segment it lists is
    downloaded, the cues of all segments are re-timed according to each
    segment's ``X-TIMESTAMP-MAP`` header and the result is written out as one
    plain WebVTT file.

    The accompanying downloader registry maps the ``m3u8_webvtt`` protocol to
    this class.
    """

    FD_NAME = 'hlswebvtt'

    @staticmethod
    def _parse_ts(ts):
        """Convert a WebVTT timestamp (``[[hh:]mm:]ss[.fff]``) to 90 kHz ticks.

        The fractional part is interpreted as a decimal fraction of a second
        (``.5`` is 500 ms) and truncated to millisecond precision.

        Raises ValueError if *ts* does not start with a timestamp.
        """
        m = re.match(
            r'(?:(?:([0-9]+):)?([0-9]+):)?([0-9]+)(?:\.([0-9]+))?', ts)
        if m is None:
            raise ValueError('Invalid WebVTT timestamp: %r' % (ts,))
        hrs, mins, secs, frac = m.groups()
        # Pad/truncate to exactly three digits so that the value really is
        # a number of milliseconds whatever precision the source used.
        msecs = int((frac or '').ljust(3, '0')[:3])
        # 90 ticks per millisecond == the 90 kHz clock used by MPEGTS values
        return 90 * (
            int(hrs or 0) * 3600000
            + int(mins or 0) * 60000
            + int(secs) * 1000
            + msecs
        )

    @classmethod
    def _parse_ts_or_none(cls, ts):
        """Like _parse_ts(), but return None for a malformed timestamp."""
        try:
            return cls._parse_ts(ts)
        except ValueError:
            return None

    @staticmethod
    def _format_ts(ts):
        """Format a 90 kHz tick count as a WebVTT ``hh:mm:ss.fff`` timestamp.

        Negative values (possible when a segment's timestamp map moves a cue
        before zero) are clamped to ``00:00:00.000``.
        """
        msecs = max(0, int(ts) // 90)
        hrs, msecs = divmod(msecs, 3600000)
        mins, msecs = divmod(msecs, 60000)
        secs, msecs = divmod(msecs, 1000)
        return '%02u:%02u:%02u.%03u' % (hrs, mins, secs, msecs)

    @staticmethod
    def _segment_urls(manifest, manifest_url):
        """Return the absolute URLs of all segments listed in *manifest*."""
        urls = []
        for line in manifest.splitlines():
            line = line.strip()
            # Everything that is neither blank nor a tag/comment is a URI
            if not line or line.startswith('#'):
                continue
            urls.append(
                line if re.match(r'^https?://', line)
                else compat_urlparse.urljoin(manifest_url, line))
        return urls

    def _parse_fragment(self, lines, header, cues):
        """Parse one WebVTT segment given as a list of text lines.

        Header lines other than X-TIMESTAMP-MAP are appended to *header*
        (each distinct line once); parsed cues are appended to *cues* as
        dicts with the keys 'start', 'start_ts', 'end_ts', 'style', 'text'
        and optionally 'id'.

        Returns False if the segment does not begin with a WebVTT signature
        line, True otherwise.
        """
        line_iter = iter(lines)
        # An empty segment has no first line at all; treat it as invalid
        # rather than letting StopIteration escape.
        signature = next(line_iter, '')
        if not (signature == 'WEBVTT'
                or signature.startswith(('WEBVTT ', 'WEBVTT\t'))):
            return False

        try:
            # Header block: runs until a blank line or the first timing line
            tsadj = 0
            while True:
                line = next(line_iter)
                if line == '' or '-->' in line:
                    break
                if line.startswith('X-TIMESTAMP-MAP='):
                    local = re.search(r'LOCAL:([0-9:.]+)', line)
                    mpegts = re.search(r'MPEGTS:([0-9]+)', line)
                    local_ts = (
                        self._parse_ts_or_none(local.group(1))
                        if local else None)
                    # An incomplete or malformed map leaves cues unshifted
                    if local_ts is not None and mpegts:
                        tsadj = int(mpegts.group(1)) - local_ts
                elif line not in header:
                    header.append(line)

            # Cue blocks
            while True:
                while line == '':
                    line = next(line_iter)
                cue = {}

                if '-->' not in line:
                    # Candidate cue identifier; the timing line must follow
                    cue['id'] = line
                    line = next(line_iter)
                    if line == '':
                        continue

                m = re.match(
                    r'^([0-9:.]+\s*)-->\s*([0-9:.]+)(\s+.*)?', line)
                ts_start = self._parse_ts_or_none(m.group(1)) if m else None
                ts_end = self._parse_ts_or_none(m.group(2)) if m else None
                if ts_start is None or ts_end is None:
                    if '-->' in line:
                        # Malformed timing line: it would be offered to the
                        # same test again forever, so step over it.
                        line = next(line_iter)
                    # Otherwise the line is re-examined as an identifier
                    continue

                ts_start += tsadj
                ts_end += tsadj
                cue['start'] = ts_start
                cue['start_ts'] = self._format_ts(ts_start)
                cue['end_ts'] = self._format_ts(ts_end)
                cue['style'] = m.group(3) or ''

                line = next(line_iter)

                text_lines = []
                try:
                    # Payload ends at a blank line or at the next timing line
                    while line != '' and '-->' not in line:
                        text_lines.append(line)
                        line = next(line_iter)
                finally:
                    # Keep the cue even if the segment ends mid-payload
                    cue['text'] = ''.join(
                        '%s\n' % text_line for text_line in text_lines)
                    cues.append(cue)
        except StopIteration:
            # End of segment
            pass
        return True

    def real_download(self, filename, info_dict):
        """Fetch the playlist, download and merge all WebVTT segments.

        filename  -- destination file name
        info_dict -- must contain 'url', the URL of the subtitles playlist

        Returns True on success and False if a segment could not be
        downloaded or is not a WebVTT document.
        """
        url = info_dict['url']
        self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
        manifest = self.ydl.urlopen(url).read().decode('utf-8', 'ignore')
        segment_urls = self._segment_urls(manifest, url)

        ctx = {
            'filename': filename,
            'total_frags': len(segment_urls),
        }

        # The 'tmpfilename', 'dl' and 'dest_stream' entries read below are
        # expected to be filled in by this call.
        self._prepare_and_start_frag_download(ctx)

        cues = []
        header = []
        frags_filenames = []
        try:
            for i, frag_url in enumerate(segment_urls):
                frag_name = 'Frag%d' % i
                frag_filename = '%s-%s' % (ctx['tmpfilename'], frag_name)

                if not ctx['dl'].download(frag_filename, {'url': frag_url}):
                    return False
                down, frag_sanitized = sanitize_open(frag_filename, 'rb')
                try:
                    lines = down.read().decode('utf-8', 'ignore').splitlines()
                finally:
                    down.close()
                frags_filenames.append(frag_sanitized)

                if not self._parse_fragment(lines, header, cues):
                    self.report_error('Not a valid WebVTT subtitles segment')
                    return False

            # Order by the numeric start time; the formatted string would
            # mis-sort once the hour field outgrows two digits.  The sort is
            # stable, so simultaneous cues keep their playlist order.
            cues.sort(key=lambda cue: cue['start'])
            with ctx['dest_stream'] as outf:
                outf.write(b'WEBVTT\n')
                for item in header:
                    outf.write(('%s\n' % item).encode('utf-8'))
                for cue in cues:
                    outf.write(b'\n')
                    if cue.get('id'):
                        outf.write(('%s\n' % cue['id']).encode('utf-8'))
                    outf.write(
                        ('%s --> %s%s\n' % (
                            cue['start_ts'], cue['end_ts'], cue['style']))
                        .encode('utf-8')
                    )
                    outf.write(cue['text'].encode('utf-8'))

            self._finish_frag_download(ctx)
        finally:
            # Do not leave segment files behind, whatever the outcome
            for frag_file in frags_filenames:
                os.remove(encodeFilename(frag_file))

        return True
[]).append(sub_info) if media_type not in ('VIDEO', 'AUDIO'): return