[youtube] Quick extraction tempfix (closes #22367, closes #22163)

This commit is contained in:
Sergey M․ 2019-09-11 22:44:47 +07:00
parent bff90fc518
commit bf1317d257
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@ -1915,6 +1915,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return int_or_none(self._search_regex( return int_or_none(self._search_regex(
r'\bclen[=/](\d+)', media_url, 'filesize', default=None)) r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) or []
streaming_formats.extend(try_get(player_response, lambda x: x['streamingData']['adaptiveFormats'], list) or [])
if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
self.report_rtmp_download() self.report_rtmp_download()
formats = [{ formats = [{
@ -1923,10 +1926,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': video_info['conn'][0], 'url': video_info['conn'][0],
'player_url': player_url, 'player_url': player_url,
}] }]
elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1):
encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
if 'rtmpe%3Dyes' in encoded_url_map: if 'rtmpe%3Dyes' in encoded_url_map:
raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True)
formats = []
formats_spec = {} formats_spec = {}
fmt_list = video_info.get('fmt_list', [''])[0] fmt_list = video_info.get('fmt_list', [''])[0]
if fmt_list: if fmt_list:
@ -1941,90 +1945,105 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'height': int_or_none(width_height[1]), 'height': int_or_none(width_height[1]),
} }
q = qualities(['small', 'medium', 'hd720']) q = qualities(['small', 'medium', 'hd720'])
streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) for fmt in streaming_formats:
if streaming_formats: itag = str_or_none(fmt.get('itag'))
for fmt in streaming_formats: if not itag:
itag = str_or_none(fmt.get('itag'))
if not itag:
continue
quality = fmt.get('quality')
quality_label = fmt.get('qualityLabel') or quality
formats_spec[itag] = {
'asr': int_or_none(fmt.get('audioSampleRate')),
'filesize': int_or_none(fmt.get('contentLength')),
'format_note': quality_label,
'fps': int_or_none(fmt.get('fps')),
'height': int_or_none(fmt.get('height')),
'quality': q(quality),
# bitrate for itag 43 is always 2147483647
'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
'width': int_or_none(fmt.get('width')),
}
formats = []
for url_data_str in encoded_url_map.split(','):
url_data = compat_parse_qs(url_data_str)
if 'itag' not in url_data or 'url' not in url_data or url_data.get('drm_families'):
continue continue
quality = fmt.get('quality')
quality_label = fmt.get('qualityLabel') or quality
formats_spec[itag] = {
'asr': int_or_none(fmt.get('audioSampleRate')),
'filesize': int_or_none(fmt.get('contentLength')),
'format_note': quality_label,
'fps': int_or_none(fmt.get('fps')),
'height': int_or_none(fmt.get('height')),
'quality': q(quality),
# bitrate for itag 43 is always 2147483647
'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
'width': int_or_none(fmt.get('width')),
}
for fmt in streaming_formats:
if fmt.get('drm_families'):
continue
url = url_or_none(fmt.get('url'))
if not url:
cipher = fmt.get('cipher')
if not cipher:
continue
url_data = compat_parse_qs(cipher)
url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str))
if not url:
continue
else:
cipher = None
url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
# Unsupported FORMAT_STREAM_TYPE_OTF # Unsupported FORMAT_STREAM_TYPE_OTF
if stream_type == 3: if stream_type == 3:
continue continue
format_id = url_data['itag'][0]
url = url_data['url'][0]
if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): format_id = fmt.get('itag') or url_data['itag'][0]
ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' if not format_id:
jsplayer_url_json = self._search_regex( continue
ASSETS_RE, format_id = compat_str(format_id)
embed_webpage if age_gate else video_webpage,
'JS player URL (1)', default=None) if cipher:
if not jsplayer_url_json and not age_gate: if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
# We need the embed website after all ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
if embed_webpage is None:
embed_url = proto + '://www.youtube.com/embed/%s' % video_id
embed_webpage = self._download_webpage(
embed_url, video_id, 'Downloading embed webpage')
jsplayer_url_json = self._search_regex( jsplayer_url_json = self._search_regex(
ASSETS_RE, embed_webpage, 'JS player URL') ASSETS_RE,
embed_webpage if age_gate else video_webpage,
'JS player URL (1)', default=None)
if not jsplayer_url_json and not age_gate:
# We need the embed website after all
if embed_webpage is None:
embed_url = proto + '://www.youtube.com/embed/%s' % video_id
embed_webpage = self._download_webpage(
embed_url, video_id, 'Downloading embed webpage')
jsplayer_url_json = self._search_regex(
ASSETS_RE, embed_webpage, 'JS player URL')
player_url = json.loads(jsplayer_url_json) player_url = json.loads(jsplayer_url_json)
if player_url is None:
player_url_json = self._search_regex(
r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
video_webpage, 'age gate player URL')
player_url = json.loads(player_url_json)
if 'sig' in url_data:
url += '&signature=' + url_data['sig'][0]
elif 's' in url_data:
encrypted_sig = url_data['s'][0]
if self._downloader.params.get('verbose'):
if player_url is None: if player_url is None:
player_version = 'unknown' player_url_json = self._search_regex(
player_desc = 'unknown' r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
else: video_webpage, 'age gate player URL')
if player_url.endswith('swf'): player_url = json.loads(player_url_json)
player_version = self._search_regex(
r'-(.+?)(?:/watch_as3)?\.swf$', player_url, if 'sig' in url_data:
'flash player', fatal=False) url += '&signature=' + url_data['sig'][0]
player_desc = 'flash player %s' % player_version elif 's' in url_data:
encrypted_sig = url_data['s'][0]
if self._downloader.params.get('verbose'):
if player_url is None:
player_version = 'unknown'
player_desc = 'unknown'
else: else:
player_version = self._search_regex( if player_url.endswith('swf'):
[r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', player_version = self._search_regex(
r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
player_url, 'flash player', fatal=False)
'html5 player', fatal=False) player_desc = 'flash player %s' % player_version
player_desc = 'html5 player %s' % player_version else:
player_version = self._search_regex(
[r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],
player_url,
'html5 player', fatal=False)
player_desc = 'html5 player %s' % player_version
parts_sizes = self._signature_cache_id(encrypted_sig) parts_sizes = self._signature_cache_id(encrypted_sig)
self.to_screen('{%s} signature length %s, %s' % self.to_screen('{%s} signature length %s, %s' %
(format_id, parts_sizes, player_desc)) (format_id, parts_sizes, player_desc))
signature = self._decrypt_signature( signature = self._decrypt_signature(
encrypted_sig, video_id, player_url, age_gate) encrypted_sig, video_id, player_url, age_gate)
sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature'
url += '&%s=%s' % (sp, signature) url += '&%s=%s' % (sp, signature)
if 'ratebypass' not in url: if 'ratebypass' not in url:
url += '&ratebypass=yes' url += '&ratebypass=yes'
@ -2044,24 +2063,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0]) mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
if width is None:
width = int_or_none(fmt.get('width'))
if height is None:
height = int_or_none(fmt.get('height'))
filesize = int_or_none(url_data.get( filesize = int_or_none(url_data.get(
'clen', [None])[0]) or _extract_filesize(url) 'clen', [None])[0]) or _extract_filesize(url)
quality = url_data.get('quality', [None])[0] quality = url_data.get('quality', [None])[0] or fmt.get('quality')
quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel')
tbr = float_or_none(url_data.get('bitrate', [None])[0], 1000) or float_or_none(fmt.get('bitrate'), 1000)
fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps'))
more_fields = { more_fields = {
'filesize': filesize, 'filesize': filesize,
'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), 'tbr': tbr,
'width': width, 'width': width,
'height': height, 'height': height,
'fps': int_or_none(url_data.get('fps', [None])[0]), 'fps': fps,
'format_note': url_data.get('quality_label', [None])[0] or quality, 'format_note': quality_label or quality,
'quality': q(quality), 'quality': q(quality),
} }
for key, value in more_fields.items(): for key, value in more_fields.items():
if value: if value:
dct[key] = value dct[key] = value
type_ = url_data.get('type', [None])[0] type_ = url_data.get('type', [None])[0] or fmt.get('mimeType')
if type_: if type_:
type_split = type_.split(';') type_split = type_.split(';')
kind_ext = type_split[0].split('/') kind_ext = type_split[0].split('/')