From 5f8b81c6e753319fbcc691d497994cb36100743d Mon Sep 17 00:00:00 2001 From: Lauren Liberda Date: Mon, 21 Jun 2021 23:01:02 +0200 Subject: [PATCH] prerelease artifact generator, for youtube sig --- .gitignore | 1 + devscripts/prerelease_codegen.py | 31 +++++++++++++++++++++ haruhi_dl/extractor/youtube.py | 46 ++++++++++++++++++++++---------- 3 files changed, 64 insertions(+), 14 deletions(-) create mode 100644 devscripts/prerelease_codegen.py diff --git a/.gitignore b/.gitignore index 09e127d85..482e0f823 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ haruhi-dl.1 haruhi-dl.bash-completion haruhi-dl.fish haruhi_dl/extractor/lazy_extractors.py +haruhi_dl/extractor_artifacts/ haruhi-dl haruhi-dl.exe haruhi-dl.tar.gz diff --git a/devscripts/prerelease_codegen.py b/devscripts/prerelease_codegen.py new file mode 100644 index 000000000..3dc8ff770 --- /dev/null +++ b/devscripts/prerelease_codegen.py @@ -0,0 +1,31 @@ +# this is intended to speed-up some extractors, +# which sometimes need to extract some data that doesn't change very much often, +# but it does on random times, like youtube's signature "crypto" or soundcloud's client id + +import os +from os.path import dirname as dirn +import sys +sys.path.insert(0, dirn(dirn((os.path.abspath(__file__))))) + +from haruhi_dl import HaruhiDL +from haruhi_dl.utils import ( + ExtractorError, +) + +hdl = HaruhiDL(params={ + 'quiet': True, +}) +artifact_dir = os.path.join(dirn(dirn((os.path.abspath(__file__)))), 'haruhi_dl', 'extractor_artifacts') +if not os.path.exists(artifact_dir): + os.mkdir(artifact_dir) + +for ie_name in ( + 'Youtube', +): + ie = hdl.get_info_extractor(ie_name) + try: + file_contents = ie._generate_prerelease_file() + with open(os.path.join(artifact_dir, ie_name.lower() + '.py'), 'w') as file: + file.write(file_contents) + except ExtractorError as err: + print(err) diff --git a/haruhi_dl/extractor/youtube.py b/haruhi_dl/extractor/youtube.py index 69223aca8..6adeb2555 100644 --- a/haruhi_dl/extractor/youtube.py +++ b/haruhi_dl/extractor/youtube.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from datetime import datetime import json import hashlib +from inspect import getsource import random import re import time @@ -45,6 +46,10 @@ from ..utils import ( urlencode_postdata, GeoRestrictedError, ) +try: + from ..extractor_artifacts.youtube import _decrypt_signature_protected +except ImportError: + _decrypt_signature_protected = None class YoutubeBaseInfoExtractor(InfoExtractor): @@ -901,7 +906,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError('Cannot identify player %r' % player_url) return id_m.group('id') - def _extract_signature_function(self, video_id, player_url, example_sig): + def _extract_signature_function(self, video_id, player_url): player_id = self._extract_player_info(player_url) # Read from filesystem cache @@ -1012,31 +1017,44 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ' return %s\n') % (signature_id_tuple, expr_code) self.to_screen('Extracted signature function:\n' + code) - def mess(self, a, b): + @staticmethod + def mess(a, b): c = a[0] a[0] = a[b % len(a)] a[b % len(a)] = c return a - def _decrypt_signature_protected(self, s): - a = list(s) - a = self.mess(a, 49) - a = self.mess(a, 26) - a.reverse() - a = self.mess(a, 62) - a.reverse() - a = a[2:] - return "".join(a) - def _full_signature_handling(self, sig, player_url, video_id): - signature = self._decrypt_signature_protected(sig) + signature = _decrypt_signature_protected(sig) if re.match(self._VALID_SIG_VALUE_RE, signature): return signature if self._downloader.params.get('verbose'): self.to_screen("Built-in signature decryption failed, trying dynamic") - sig_decrypt_stack = self._extract_signature_function(video_id, player_url, sig) + sig_decrypt_stack = self._extract_signature_function(video_id, player_url) return self._do_decrypt_signature(sig, sig_decrypt_stack) + def _generate_prerelease_file(self): + # It's Monday, so I'm in a bad mood, but at least my sailor uniform is super cute! + video_id = 'ieQ1rAIjzXc' + self._set_consent() + webpage = self._download_webpage('https://www.youtube.com/watch?v=%s' % video_id, video_id) + player_url = self._search_regex(r'"jsUrl":"(/s/player/.*?/player_ias.vflset/.*?/base.js)', webpage, 'player url') + sig_decrypt_stack = self._extract_signature_function(video_id, player_url) + func = re.sub(r'(?m)^ ', '', getsource(self.mess).replace('@staticmethod', '')) + func += '\n\ndef _decrypt_signature_protected(sig):\n' + stack = ['a = list(sig)'] + for fun in sig_decrypt_stack: + if fun[0] == 'splice': + stack.append(f'a = a[{fun[1]}:]') + elif fun[0] == 'reverse': + stack.append('a.reverse()') + elif fun[0] == 'mess': + stack.append(f'a = mess(a, {fun[1]})') + else: + raise ExtractorError('Unknown stack action: %s' % (fun[0])) + stack.append("return ''.join(a)") + return func + '\n'.join(map(lambda x: ' ' * 4 + x, stack)) + '\n' + def _get_subtitles(self, video_id, webpage): try: subs_doc = self._download_xml(