Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 21 additions & 8 deletions test/test_InfoExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,14 @@ def test_parse_m3u8_formats(self):
'tbr': 1467,
'width': 1024,
'height': 576,
}]
}],
{
'fra': [{
'url': 'http://replayftv-pmd.francetv.fr/subtitles/2017/16/156589847-1492488987.m3u8',
'ext': 'vtt',
'protocol': 'm3u8_webvtt'
}]
},
),
(
# https://github.com/ytdl-org/youtube-dl/issues/11995
Expand Down Expand Up @@ -512,7 +519,8 @@ def test_parse_m3u8_formats(self):
'tbr': 2374,
'width': 1024,
'height': 576,
}]
}],
{},
),
(
# https://github.com/ytdl-org/youtube-dl/issues/12211
Expand Down Expand Up @@ -571,7 +579,8 @@ def test_parse_m3u8_formats(self):
'tbr': 1396.736,
'width': 854,
'height': 480,
}]
}],
{},
),
(
# http://www.twitch.tv/riotgames/v/6528877
Expand Down Expand Up @@ -641,7 +650,8 @@ def test_parse_m3u8_formats(self):
'tbr': 3214.134,
'width': 1280,
'height': 720,
}]
}],
{},
),
(
# http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
Expand Down Expand Up @@ -676,7 +686,8 @@ def test_parse_m3u8_formats(self):
'tbr': 1200,
'width': 1280,
'height': 720,
}]
}],
{}
),
(
# https://github.com/ytdl-org/youtube-dl/issues/18923
Expand Down Expand Up @@ -733,17 +744,19 @@ def test_parse_m3u8_formats(self):
'acodec': 'none',
'width': 1280,
'height': 720,
}]
}],
{}
),
]

for m3u8_file, m3u8_url, expected_formats in _TEST_CASES:
for m3u8_file, m3u8_url, expected_formats, expected_subs in _TEST_CASES:
with io.open('./test/testdata/m3u8/%s.m3u8' % m3u8_file,
mode='r', encoding='utf-8') as f:
formats = self.ie._parse_m3u8_formats(
formats, subs = self.ie._parse_m3u8_formats_and_subtitles(
f.read(), m3u8_url, ext='mp4')
self.ie._sort_formats(formats)
expect_value(self, formats, expected_formats, None)
expect_value(self, subs, expected_subs, None)

def test_parse_mpd_formats(self):
_TEST_CASES = [
Expand Down
17 changes: 13 additions & 4 deletions youtube_dl/YoutubeDL.py
Original file line number Diff line number Diff line change
Expand Up @@ -1830,10 +1830,19 @@ def ensure_dir_exists(path):
return
else:
try:
sub_data = ie._request_webpage(
sub_info['url'], info_dict['id'], note=False).read()
with io.open(encodeFilename(sub_filename), 'wb') as subfile:
subfile.write(sub_data)
if sub_info.get('protocol') is not None:
sub_info_dict = {
'id': info_dict['id'],
'protocol': sub_info['protocol'],
'url': sub_info['url']
}
sub_fd = get_suitable_downloader(sub_info_dict)(self, self.params)
sub_fd.download(sub_filename, sub_info_dict)
else:
sub_data = ie._request_webpage(
sub_info['url'], info_dict['id'], note=False).read()
with io.open(encodeFilename(sub_filename), 'wb') as subfile:
subfile.write(sub_data)
except (ExtractorError, IOError, OSError, ValueError) as err:
self.report_warning('Unable to download subtitle for "%s": %s' %
(sub_lang, error_to_compat_str(err)))
Expand Down
2 changes: 2 additions & 0 deletions youtube_dl/downloader/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .common import FileDownloader
from .f4m import F4mFD
from .hls import HlsFD
from .hls import WebVttHlsFD
from .http import HttpFD
from .rtmp import RtmpFD
from .dash import DashSegmentsFD
Expand All @@ -20,6 +21,7 @@
PROTOCOL_MAP = {
'rtmp': RtmpFD,
'm3u8_native': HlsFD,
'm3u8_webvtt': WebVttHlsFD,
'm3u8': FFmpegFD,
'mms': RtspFD,
'rtsp': RtspFD,
Expand Down
150 changes: 150 additions & 0 deletions youtube_dl/downloader/hls.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,3 +208,153 @@ def is_ad_fragment_end(s):
self._finish_frag_download(ctx)

return True


class WebVttHlsFD(FragmentFD):
    """ A downloader for HLS WebVTT subtitles. """
    FD_NAME = 'hlswebvtt'

    @staticmethod
    def _parse_ts(ts):
        """Convert a WebVTT timestamp string into 90 kHz ticks.

        Accepts ``[[hh:]mm:]ss[.fff]``.  The fractional part is normalised
        to milliseconds (padded or truncated to three digits), so ``.5``
        means 500 ms rather than 5 ms.

        Raises ValueError if *ts* does not start with a timestamp.
        """
        m = re.match(r'(?:(?:([0-9]+):)?([0-9]+):)?([0-9]+)(?:\.([0-9]+))?', ts)
        if m is None:
            raise ValueError('Invalid WebVTT timestamp: %r' % (ts,))
        hours, minutes, seconds, fraction = m.groups()
        millis = int((fraction or '0')[:3].ljust(3, '0'))
        return 90 * (
            int(hours or 0) * 3600000
            + int(minutes or 0) * 60000
            + int(seconds or 0) * 1000
            + millis
        )

    @staticmethod
    def _format_ts(ts):
        """Format 90 kHz ticks as a WebVTT ``hh:mm:ss.fff`` timestamp.

        Uses integer arithmetic throughout.  Negative values cannot be
        represented in WebVTT and are clamped to zero.
        """
        millis = max(0, int(ts) // 90)
        hours, millis = divmod(millis, 3600000)
        minutes, millis = divmod(millis, 60000)
        seconds, millis = divmod(millis, 1000)
        return '%02u:%02u:%02u.%03u' % (hours, minutes, seconds, millis)

    def _parse_cue_timing(self, line):
        """Parse a cue timing line.

        Returns ``(start_ticks, end_ticks, settings)`` or None when the
        line is not a well-formed timing line.
        """
        m = re.match(r'^([0-9:.]+\s*)-->\s*([0-9:.]+)(\s+.*)?', line)
        if m is None:
            return None
        try:
            return (
                self._parse_ts(m.group(1)),
                self._parse_ts(m.group(2)),
                m.group(3) or '',
            )
        except ValueError:
            # e.g. '::' matches the character class but is not a timestamp
            return None

    def _parse_segment(self, lines, header, cues):
        """Parse one WebVTT segment, extending *header* and *cues* in place.

        *lines* is the segment text already split into lines.  Header lines
        other than X-TIMESTAMP-MAP are collected once (duplicates repeated
        by later segments are dropped).  Cue times are shifted by the
        segment's X-TIMESTAMP-MAP offset (MPEGTS - LOCAL).

        Returns False if the segment lacks a valid ``WEBVTT`` signature
        line, True otherwise.
        """
        line_iter = iter(lines)
        signature = next(line_iter, '')
        if not (signature == 'WEBVTT'
                or signature.startswith(('WEBVTT ', 'WEBVTT\t'))):
            return False

        # read header: runs until the first blank line or cue timing line
        tsadj = 0
        line = ''
        for line in line_iter:
            if line == '' or '-->' in line:
                break
            if line.startswith('X-TIMESTAMP-MAP='):
                local = re.search(r'LOCAL:([0-9:.]+)', line)
                mpegts = re.search(r'MPEGTS:([0-9]+)', line)
                if local and mpegts:
                    try:
                        tsadj = int(mpegts.group(1)) - self._parse_ts(local.group(1))
                    except ValueError:
                        pass
            elif line not in header:
                header.append(line)
        else:
            # segment ended inside the header: no cues
            return True

        try:
            while True:
                while line == '':
                    line = next(line_iter)
                cue = {}

                if '-->' not in line:
                    # a line before the timing line is the cue identifier
                    cue['id'] = line
                    line = next(line_iter)
                    if line == '':
                        continue

                timing = self._parse_cue_timing(line)
                if timing is None:
                    if '-->' in line:
                        # Malformed timing line: it must be skipped here,
                        # otherwise it is re-examined forever.
                        line = next(line_iter)
                    continue

                start, end, cue['style'] = timing
                start += tsadj
                end += tsadj
                cue['start'] = start
                cue['start_ts'] = self._format_ts(start)
                cue['end_ts'] = self._format_ts(end)

                line = next(line_iter)

                text = []
                try:
                    while line != '' and '-->' not in line:
                        text.append(line + '\n')
                        line = next(line_iter)
                finally:
                    # keep the cue even if the segment ends mid-text
                    cue['text'] = ''.join(text)
                    cues.append(cue)
        except StopIteration:
            pass
        return True

    def real_download(self, filename, info_dict):
        """Download every segment of a WebVTT media playlist and merge them.

        Fetches the playlist at ``info_dict['url']``, downloads each listed
        segment, merges all cues (sorted by start time) and writes a single
        WebVTT document to the destination stream.

        Returns True on success, False if a segment could not be downloaded
        or is not a WebVTT document.
        """
        url = info_dict['url']
        self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
        manifest = self.ydl.urlopen(url).read().decode('utf-8', 'ignore')
        segment_urls = []
        for line in manifest.splitlines():
            line = line.strip()
            if line and not line.startswith('#'):
                segment_urls.append(
                    line if re.match(r'^https?://', line)
                    else compat_urlparse.urljoin(url, line))

        ctx = {
            'filename': filename,
            'total_frags': len(segment_urls),
        }

        self._prepare_and_start_frag_download(ctx)

        cues = []
        header = []
        for i, frag_url in enumerate(segment_urls):
            frag_name = 'Frag%d' % i
            frag_filename = '%s-%s' % (ctx['tmpfilename'], frag_name)

            if not ctx['dl'].download(frag_filename, {'url': frag_url}):
                return False
            down, frag_sanitized = sanitize_open(frag_filename, 'rb')
            try:
                lines = down.read().decode('utf-8', 'ignore').splitlines()
            finally:
                down.close()
            # The content is in memory now; remove the temporary file right
            # away so that an early return below does not leave it behind.
            os.remove(encodeFilename(frag_sanitized))

            if not self._parse_segment(lines, header, cues):
                self.report_error('Not a valid WebVTT subtitles segment')
                return False

        # Sort numerically: the formatted strings misorder at >= 100 hours.
        cues.sort(key=lambda cue: cue['start'])
        with ctx['dest_stream'] as outf:
            outf.write(b'WEBVTT\n')
            for item in header:
                outf.write(('%s\n' % item).encode('utf-8'))
            for cue in cues:
                outf.write(b'\n')
                if cue.get('id'):
                    outf.write(('%s\n' % cue['id']).encode('utf-8'))
                outf.write(
                    ('%s --> %s%s\n' % (cue['start_ts'], cue['end_ts'], cue['style']))
                    .encode('utf-8')
                )
                outf.write(cue['text'].encode('utf-8'))

        self._finish_frag_download(ctx)

        return True
30 changes: 23 additions & 7 deletions youtube_dl/extractor/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1334,7 +1334,6 @@ def _sort_formats(self, formats, field_preference=None):

def _formats_key(f):
# TODO remove the following workaround
from ..utils import determine_ext
if not f.get('ext') and 'url' in f:
f['ext'] = determine_ext(f['url'])

Expand Down Expand Up @@ -1583,7 +1582,11 @@ def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
'format_note': 'Quality selection URL',
}

def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
def _extract_m3u8_formats(self, *args, **kwargs):
    """Return only the formats from `_extract_m3u8_formats_and_subtitles`.

    All positional and keyword arguments are forwarded unchanged; the
    subtitles half of the returned pair is discarded.
    """
    formats, _subtitles = self._extract_m3u8_formats_and_subtitles(
        *args, **kwargs)
    return formats

def _extract_m3u8_formats_and_subtitles(self, m3u8_url, video_id, ext=None,
entry_protocol='m3u8', preference=None,
m3u8_id=None, note=None, errnote=None,
fatal=True, live=False, data=None, headers={},
Expand All @@ -1595,26 +1598,28 @@ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
fatal=fatal, data=data, headers=headers, query=query)

if res is False:
return []
return [], {}

m3u8_doc, urlh = res
m3u8_url = urlh.geturl()

return self._parse_m3u8_formats(
return self._parse_m3u8_formats_and_subtitles(
m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
preference=preference, m3u8_id=m3u8_id, live=live)

def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
def _parse_m3u8_formats_and_subtitles(self, m3u8_doc, m3u8_url, ext=None,
entry_protocol='m3u8', preference=None,
m3u8_id=None, live=False):
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
return []
return [], {}

if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
return []

formats = []

subtitles = {}

format_url = lambda u: (
u
if re.match(r'^https?://', u)
Expand Down Expand Up @@ -1655,6 +1660,17 @@ def extract_media(x_media_line):
if not (media_type and group_id and name):
return
groups.setdefault(group_id, []).append(media)
# <https://tools.ietf.org/html/draft-pantos-http-live-streaming-13#section-3.4.9>
if media_type == 'SUBTITLES':
lang = media['LANGUAGE'] # XXX: normalise?
sub_info = {
'url': media['URI'],
'ext': determine_ext(media['URI'])
}
if sub_info['ext'] == 'm3u8': # XXX
sub_info['ext'] = 'vtt'
sub_info['protocol'] = 'm3u8_webvtt'
subtitles.setdefault(lang, []).append(sub_info)
if media_type not in ('VIDEO', 'AUDIO'):
return
media_url = media.get('URI')
Expand Down Expand Up @@ -1780,7 +1796,7 @@ def build_stream_name():
formats.append(http_f)

last_stream_inf = {}
return formats
return formats, subtitles

@staticmethod
def _xpath_ns(path, namespace=None):
Expand Down