diff --git a/devscripts/ExtractCryptoYT.sh b/devscripts/ExtractCryptoYT.sh index 6772de75e..ff4a2aabf 100755 --- a/devscripts/ExtractCryptoYT.sh +++ b/devscripts/ExtractCryptoYT.sh @@ -19,11 +19,11 @@ IFS=';' for i in $code; do num="$(echo "$i" | grep -Poh ',[0-9]+' | grep -Poh '[0-9]+')" if [[ "$i" == *"$splice"* ]]; then - echo "a=a[$num:]" + echo "a = a[$num:]" elif [[ "$i" == *"$rev"* ]]; then echo "a.reverse()" elif [[ "$i" == *"$mess"* ]]; then - echo "a=self.mess(a,$num)" + echo "a = self.mess(a, $num)" else echo "UNKNOWN????" fi diff --git a/haruhi_dl/extractor/youtube.py b/haruhi_dl/extractor/youtube.py index 44336a38e..d9e91fb66 100644 --- a/haruhi_dl/extractor/youtube.py +++ b/haruhi_dl/extractor/youtube.py @@ -7,7 +7,6 @@ import os.path import random import re import time -import traceback from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( @@ -50,6 +49,7 @@ from ..utils import ( urlencode_postdata, ) + class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' @@ -1149,6 +1149,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, ] + _VALID_SIG_VALUE_RE = r'^AO[a-zA-Z0-9_-]+=*$' + def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) self._player_cache = {} @@ -1181,35 +1183,91 @@ class YoutubeIE(YoutubeBaseInfoExtractor): break else: raise ExtractorError('Cannot identify player %r' % player_url) - return id_m.group('ext'), id_m.group('id') + return id_m.group('id') def _extract_signature_function(self, video_id, player_url, example_sig): - player_type, player_id = self._extract_player_info(player_url) + player_id = self._extract_player_info(player_url) # Read from filesystem cache - func_id = '%s_%s_%s' % ( - player_type, player_id, self._signature_cache_id(example_sig)) + func_id = '%s_%s' % ( + player_id, self._signature_cache_id(example_sig)) assert os.path.basename(func_id) == func_id + """ cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) + """ + if not player_url.startswith('http'): + player_url = 'https://www.youtube.com' + player_url download_note = ( 'Downloading player %s' % player_url if self._downloader.params.get('verbose') else - 'Downloading %s player %s' % (player_type, player_id) + 'Downloading js player %s' % player_id ) + code = self._download_webpage( + player_url, video_id, + note=download_note, + errnote='Download of js player %s failed' % player_url) + res = self._parse_sig_js(code) - - + """ test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = res(test_string) + cache_res = self._do_decrypt_signature(test_string, res) cache_spec = [ord(c) for c in cache_res] self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) + """ return res + def _parse_sig_js(self, js_player): + shit_parser = re.search(r'[a-z]\=a\.split\((?:""|\'\')\);(([a-zA-Z_][a-zA-Z\d_]+).*);return a\.join', js_player) + if not shit_parser: + raise ExtractorError('Signature decryption code not found') + func, obfuscated_name = shit_parser.group(1, 2) + obfuscated_func = re.search(r'%s\s*=\s*{([\s\w(){}[\].,:;=%s]*?})};' % (re.escape(obfuscated_name), '%'), + js_player) + if not obfuscated_func: + raise ExtractorError('Signature decrypting deobfuscated functions not found') + obfuscated_stack = obfuscated_func.group(1) + obf_map = {} + for obffun in re.finditer(r'([a-zA-Z_][a-zA-Z\d_]+):function\(a(?:,b)?\){(.*?)}', obfuscated_stack): + obfname, obfval = obffun.group(1, 2) + if 'splice' in obfval: + obf_map[obfname] = 'splice' + elif 'reverse' in obfval: + obf_map[obfname] = 'reverse' + elif 'var' in obfval and 'length' in obfval: + obf_map[obfname] = 'mess' + else: + raise ExtractorError('Unknown obfuscation function type: %s.%s' % (obfuscated_name, obfname)) + decryptor_stack = [] + for instruction in re.finditer(r'%s\.([a-zA-Z_][a-zA-Z\d_]+)\(a,(\d+)\);?' % re.escape(obfuscated_name), + func): + obf_name, obf_arg = instruction.group(1, 2) + inst = obf_map.get(obf_name) + if self._downloader.params.get('verbose', True): + self.to_screen('sig %s %s %s' % (obf_name, inst, obf_arg)) + if inst: + decryptor_stack.append((inst, int(obf_arg) if inst != 'reverse' else None)) + else: + raise ExtractorError('Unknown obfuscation function: %s.%s' % (obfuscated_name, obf_name)) + return decryptor_stack + + def _do_decrypt_signature(self, sig, stack): + a = list(sig) + for fun in stack: + if fun[0] == 'splice': + a = a[fun[1]:] + elif fun[0] == 'reverse': + a.reverse() + elif fun[0] == 'mess': + a = self.mess(a, fun[1]) + else: + raise ExtractorError('Unknown stack action: %s' % (fun[0])) + return ''.join(a) + def _print_sig_code(self, func, example_sig): def gen_sig_code(idxs): def _genslice(start, end, step): @@ -1249,39 +1307,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ' return %s\n') % (signature_id_tuple, expr_code) self.to_screen('Extracted signature function:\n' + code) - def mess(self,a,b): - c=a[0] - a[0]=a[b%len(a)] - a[b%len(a)]=c + def mess(self, a, b): + c = a[0] + a[0] = a[b % len(a)] + a[b % len(a)] = c return a - def _decrypt_signature(self, s): - """Turn the encrypted s field into a working signature - YouTube ignores this? It only matters on protected videos...""" - a=[char for char in s] - a=self.mess(a,67) - a=a[1:] - a=self.mess(a,49) - a=a[3:] - a=self.mess(a,52) + def _decrypt_signature_protected(self, s): + a = list(s) + a = self.mess(a, 69) a.reverse() - a=a[1:] - a=self.mess(a,43) + a = a[2:] + a = self.mess(a, 56) + a = a[1:] + a.reverse() + a = a[3:] a.reverse() return "".join(a) - def _decrypt_signature_protected(self, s): - a=[char for char in s] - a=self.mess(a,69) - a.reverse() - a=a[2:] - a=self.mess(a,56) - a=a[1:] - a.reverse() - a=a[3:] - a.reverse() - return "".join(a) - def _get_subtitles(self, video_id, webpage): try: subs_doc = self._download_xml( @@ -1682,8 +1725,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): data = compat_urllib_parse_urlencode({ 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, -# 'sts': self._search_regex( - # r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), + # 'sts': self._search_regex( + # r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data try: @@ -1889,6 +1932,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'width': int_or_none(fmt.get('width')), } + sig_decrypt_stack = None for fmt in streaming_formats: if fmt.get('drmFamilies') or fmt.get('drm_families'): continue @@ -1919,11 +1963,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if cipher: if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): ASSETS_RE = r'"jsUrl":"(/s/player/.*?/player_ias.vflset/.*?/base.js)' - + player_url = self._search_regex( ASSETS_RE, embed_webpage if age_gate else video_webpage, '', default=None) - + if not player_url and not age_gate: # We need the embed website after all if embed_webpage is None: @@ -1933,7 +1977,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_url = self._search_regex( ASSETS_RE, embed_webpage, 'JS player URL') - #if player_url is None: + # if player_url is None: # player_url_json = self._search_regex( # r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', # video_webpage, 'age gate player URL') @@ -1948,14 +1992,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_url is None: player_desc = 'unknown' else: - player_type, player_version = self._extract_player_info(player_url) + player_version = self._extract_player_info(player_url) player_desc = 'html5 player %s' % player_version parts_sizes = self._signature_cache_id(encrypted_sig) self.to_screen('{%s} signature length %s, %s' % (format_id, parts_sizes, player_desc)) signature = self._decrypt_signature_protected(encrypted_sig) - + if not re.match(self._VALID_SIG_VALUE_RE, signature): + if not sig_decrypt_stack: + if self._downloader.params.get('verbose'): + self.to_screen("Built-in signature decryption failed, trying dynamic") + sig_decrypt_stack = self._extract_signature_function(video_id, player_url, encrypted_sig) + signature = self._do_decrypt_signature(encrypted_sig, sig_decrypt_stack) + sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' url += '&%s=%s' % (sp, signature) if 'ratebypass' not in url: