Add a YouTube search extractor (search key "ytsearch") on top of the
youtubei/v1 endpoints.

  * extractors.py: import YoutubeSearchIE.
  * youtube.py: move the first-page download of
    YoutubeBaseListInfoExtractor._real_extract into _download_first_data,
    give _real_extract optional "results" and "query" arguments, add
    YoutubeYti1ListInfoExtractor and YoutubeSearchIE.

NOTE(review): this patch was rebuilt from a copy whose line breaks and
indentation were lost. Hunk line counts were re-derived from the hunk
headers, but leading whitespace of context lines is a best guess, so apply
with whitespace tolerance (git apply --ignore-whitespace, or patch -l).
The named groups in the YoutubeChannelIE._VALID_URL context line are
presumed to be <type> and <id>; confirm against the target tree. The
"index" blob ids are those of the original patch.

diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index 1eeb17a93..568c77ef3 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -1484,6 +1484,7 @@ from .youtube import (
     YoutubeIE,
     YoutubeChannelIE,
     YoutubePlaylistIE,
+    YoutubeSearchIE,
     YoutubeTruncatedIDIE,
     YoutubeTruncatedURLIE,
 )
diff --git a/haruhi_dl/extractor/youtube.py b/haruhi_dl/extractor/youtube.py
index 54bf59410..6c8bd4abb 100644
--- a/haruhi_dl/extractor/youtube.py
+++ b/haruhi_dl/extractor/youtube.py
@@ -7,7 +7,7 @@ import random
 import re
 import time
 
-from .common import InfoExtractor
+from .common import InfoExtractor, SearchInfoExtractor
 from ..compat import (
     compat_chr,
     compat_kwargs,
@@ -65,6 +65,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
         'x-youtube-client-version': '2.20201112.04.01',
     }
 
+    _YOUTUBE_API_KEY = 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
+
     def _set_language(self):
         self._set_cookie(
             '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
@@ -2396,6 +2398,7 @@ class YoutubeBaseListInfoExtractor(YoutubeBaseInfoExtractor):
             ], expected_type=compat_str),
             'channel_url': try_get(entry, [
                 lambda x: 'https://www.youtube.com' + x['shortBylineText']['runs'][0]['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'],
+                lambda x: 'https://www.youtube.com/channel/' + x['shortBylineText']['runs'][0]['navigationEndpoint']['browseEndpoint']['browseId'],
             ], expected_type=compat_str) or try_get(full_data, [
                 lambda x: x['metadata']['channelMetadataRenderer']['ownerUrls'][0],
                 lambda x: x['metadata']['channelMetadataRenderer']['vanityChannelUrl'],
@@ -2403,22 +2406,27 @@ class YoutubeBaseListInfoExtractor(YoutubeBaseInfoExtractor):
             ]),
         }
 
-    def _real_extract(self, url):
-        list_id = self._match_id(url)
-        if self._handle_url:
-            url = self._handle_url(url)
+    def _download_first_data(self, url, list_id, query=None):
+        """Download the first list page; return (parsed initial data, webpage)."""
         webpage = self._download_webpage(url, list_id,
                                          note='Downloading %s page #1 (webpage)' % (self._LIST_NAME))
-        data = self._parse_json(
+        return self._parse_json(
             self._search_regex(
                 r'(?:window(?:\["|\.)|var )ytInitialData(?:"])?\s*=\s*({.+});',
-                webpage, 'initial data JSON'), 'initial data JSON')
+                webpage, 'initial data JSON'), 'initial data JSON'), webpage
+
+    def _real_extract(self, url, results=None, query=None):
+        is_search = bool(query)
+        list_id = query or self._match_id(url)
+        if self._handle_url:
+            url = self._handle_url(url)
+        data, webpage = self._download_first_data(url, list_id, query=query)
         videos = self._parse_init_video_list(data)
         entries = videos['entries']
         continuation_token = videos['continuation']
-        if continuation_token:
+        if continuation_token and (not is_search or results):
             page_no = 2
-            while continuation_token is not None:
+            while continuation_token is not None and (not results or len(entries) < results):
                 cont_res = self._download_continuation(continuation_token, list_id, page_no)
                 cont_parser = self._parse_continuation_video_list
                 if not cont_parser:
@@ -2437,7 +2445,10 @@ class YoutubeBaseListInfoExtractor(YoutubeBaseInfoExtractor):
         if 'info_dict' in videos:
             info_dict.update(videos['info_dict'])
         if 'title' not in info_dict:
-            info_dict['title'] = self._og_search_title(webpage)
+            if is_search:
+                info_dict['title'] = list_id
+            else:
+                info_dict['title'] = self._og_search_title(webpage)
         info_dict['entries'] = []
 
         for _entry in entries:
@@ -2449,6 +2460,8 @@ class YoutubeBaseListInfoExtractor(YoutubeBaseInfoExtractor):
             }
             entry.update(_entry)
             info_dict['entries'].append(entry)
+            if results and len(info_dict['entries']) >= results:
+                break
 
         return info_dict
 
@@ -2462,6 +2475,31 @@ class YoutubeAjaxListInfoExtractor(YoutubeBaseListInfoExtractor):
         })
 
 
+class YoutubeYti1ListInfoExtractor(YoutubeBaseListInfoExtractor):
+    # /youtubei/v1/[action]
+    _ACTION_URL = 'https://www.youtube.com/youtubei/v1/%s?key=%s' % ('%s', YoutubeBaseInfoExtractor._YOUTUBE_API_KEY)
+    _ACTION_NAME = 'browse'
+
+    _YTI_CONTEXT = {
+        "client": {
+            "hl": "en-US",
+            "clientName": "WEB",
+            "clientVersion": "2.20201112.04.01",
+        },
+    }
+
+    def _download_continuation(self, continuation, list_id, page_no):
+        """Request the next page by sending the continuation token as a JSON body."""
+        return self._download_json(self._ACTION_URL % (self._ACTION_NAME), list_id,
+                                   note='Downloading %s page #%d (yti1)' % (self._LIST_NAME, page_no),
+                                   headers={
+                                       'Content-Type': 'application/json',
+                                   }, data=json.dumps({
+                                       'context': self._YTI_CONTEXT,
+                                       'continuation': continuation,
+                                   }).encode('utf-8'))
+
+
 class YoutubeChannelIE(YoutubeAjaxListInfoExtractor):
     IE_NAME = 'youtube:channel'
     _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?!watch|playlist|v|e|embed|shared)(?:(?P<type>user|channel|c)/)?(?P<id>\w+)(?!/live)'
@@ -2577,6 +2615,59 @@ class YoutubePlaylistIE(YoutubeAjaxListInfoExtractor):
         }
 
 
+class YoutubeSearchIE(SearchInfoExtractor, YoutubeYti1ListInfoExtractor):
+    IE_NAME = 'youtube:search'
+    _SEARCH_KEY = 'ytsearch'
+    _MAX_RESULTS = float('inf')
+    _ACTION_NAME = 'search'
+    _LIST_NAME = 'search results'
+    _searcher = YoutubeBaseListInfoExtractor._real_extract
+
+    def _download_first_data(self, url, list_id, query=''):
+        """Send the search query as a JSON body; there is no webpage, so return ''."""
+        return self._download_json(self._ACTION_URL % (self._ACTION_NAME), list_id,
+                                   note='Downloading %s page #1 (yti1)' % (self._LIST_NAME),
+                                   headers={
+                                       'Content-Type': 'application/json',
+                                   }, data=json.dumps({
+                                       'context': self._YTI_CONTEXT,
+                                       'query': query,
+                                   }).encode('utf-8')), ''
+
+    def _parse_init_video_list(self, data):
+        """Parse an initial or continuation search response into entries and the next token."""
+        renderer = try_get(data, [
+            # initial
+            lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer'],
+            # continuation
+            lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems'][0]['itemSectionRenderer'],
+        ])
+        if not renderer:
+            raise ExtractorError('Could not extract %s item list renderer' % self._LIST_NAME)
+        rend_items = try_get(renderer, [
+            lambda x: x['contents'],
+        ])
+        if not rend_items:
+            raise ExtractorError('Could not extract %s renderer item list' % self._LIST_NAME)
+        entries = []
+        for item in rend_items:
+            entries.append(self._parse_video(item, entry_key='videoRenderer'))
+        return {
+            'entries': entries,
+            'continuation': try_get(data, [
+                # initial
+                lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][-1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
+                # continuation
+                lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems'][-1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
+            ], expected_type=compat_str),
+            'info_dict': {},
+        }
+
+    def _get_n_results(self, query, n):
+        """Return up to n results for query via the shared list extraction routine."""
+        return self._searcher('ytsearch', results=n, query=query)
+
+
 class YoutubeTruncatedURLIE(InfoExtractor):
     IE_NAME = 'youtube:truncated_url'
     IE_DESC = False  # Do not list