[youtube] search info extractor

2020-11-15 20:31:40 +01:00 · 2020-11-15 20:31:40 +01:00 · 158d4e9088
parent ede99f9f13
commit 158d4e9088
2 changed files with 97 additions and 10 deletions
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@ -1484,6 +1484,7 @@ from .youtube import (
    YoutubeIE,
    YoutubeChannelIE,
    YoutubePlaylistIE,
+    YoutubeSearchIE,
    YoutubeTruncatedIDIE,
    YoutubeTruncatedURLIE,
 )
--- a/haruhi_dl/extractor/youtube.py
+++ b/haruhi_dl/extractor/youtube.py
@ -7,7 +7,7 @@ import random
 import re
 import time

-from .common import InfoExtractor
+from .common import InfoExtractor, SearchInfoExtractor
 from ..compat import (
    compat_chr,
    compat_kwargs,
@ -65,6 +65,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
        'x-youtube-client-version': '2.20201112.04.01',
    }

+    _YOUTUBE_API_KEY = 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'
+
    def _set_language(self):
        self._set_cookie(
            '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
@ -2396,6 +2398,7 @@ class YoutubeBaseListInfoExtractor(YoutubeBaseInfoExtractor):
            ], expected_type=compat_str),
            'channel_url': try_get(entry, [
                lambda x: 'https://www.youtube.com' + x['shortBylineText']['runs'][0]['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'],
+                lambda x: 'https://www.youtube.com/channel/' + x['shortBylineText']['runs'][0]['navigationEndpoint']['browseEndpoint']['browseId'],
            ], expected_type=compat_str) or try_get(full_data, [
                lambda x: x['metadata']['channelMetadataRenderer']['ownerUrls'][0],
                lambda x: x['metadata']['channelMetadataRenderer']['vanityChannelUrl'],
@ -2403,22 +2406,26 @@ class YoutubeBaseListInfoExtractor(YoutubeBaseInfoExtractor):
            ]),
        }

-    def _real_extract(self, url):
-        list_id = self._match_id(url)
-        if self._handle_url:
-            url = self._handle_url(url)
+    def _download_first_data(self, url, list_id, query=None):
+        # Fetch page #1 of the list as a regular webpage and pull the embedded
+        # ytInitialData JSON object out of it.
+        # Returns a (parsed initial data, raw webpage) tuple; the webpage is
+        # handed back so the caller can still read the og:title from it.
+        # `query` is not used here; the search extractor's override of this
+        # method is the one that reads it.
        webpage = self._download_webpage(url, list_id,
                                         note='Downloading %s page #1 (webpage)' % (self._LIST_NAME))
-        data = self._parse_json(
+        return self._parse_json(
            self._search_regex(
                r'(?:window(?:\["|\.)|var )ytInitialData(?:"])?\s*=\s*({.+});',
-                webpage, 'initial data JSON'), 'initial data JSON')
+                webpage, 'initial data JSON'), 'initial data JSON'), webpage
+
+    def _real_extract(self, url, results=None, query=None):
+        is_search = True if query else False
+        list_id = query or self._match_id(url)
+        if self._handle_url:
+            url = self._handle_url(url)
+        data, webpage = self._download_first_data(url, list_id, query=query)
        videos = self._parse_init_video_list(data)
        entries = videos['entries']
        continuation_token = videos['continuation']
-        if continuation_token:
+        if continuation_token and (not is_search or results):
            page_no = 2
-            while continuation_token is not None:
+            while continuation_token is not None and (len(entries) < results if results else True):
                cont_res = self._download_continuation(continuation_token, list_id, page_no)
                cont_parser = self._parse_continuation_video_list
                if not cont_parser:
@ -2437,7 +2444,10 @@ class YoutubeBaseListInfoExtractor(YoutubeBaseInfoExtractor):
        if 'info_dict' in videos:
            info_dict.update(videos['info_dict'])
        if 'title' not in info_dict:
-            info_dict['title'] = self._og_search_title(webpage)
+            if is_search:
+                info_dict['title'] = list_id
+            else:
+                info_dict['title'] = self._og_search_title(webpage)

        info_dict['entries'] = []
        for _entry in entries:
@ -2449,6 +2459,8 @@ class YoutubeBaseListInfoExtractor(YoutubeBaseInfoExtractor):
                }
                entry.update(_entry)
                info_dict['entries'].append(entry)
+                if results and len(info_dict['entries']) >= results:
+                    break

        return info_dict

@ -2462,6 +2474,30 @@ class YoutubeAjaxListInfoExtractor(YoutubeBaseListInfoExtractor):
                                   })


+class YoutubeYti1ListInfoExtractor(YoutubeBaseListInfoExtractor):
+    # List extractor base whose continuation pages are requested from
+    # /youtubei/v1/[action] with a JSON body instead of the ajax endpoint.
+    # _ACTION_URL keeps a single '%s' placeholder for the action name; the
+    # API key is substituted once, at class creation time.
+    _ACTION_URL = 'https://www.youtube.com/youtubei/v1/%s?key=%s' % ('%s', YoutubeBaseInfoExtractor._YOUTUBE_API_KEY)
+    # Action substituted into _ACTION_URL; subclasses may override it.
+    _ACTION_NAME = 'browse'
+
+    # Client description sent as the 'context' member of every request body.
+    # clientVersion currently holds the same value as the
+    # 'x-youtube-client-version' header in YoutubeBaseInfoExtractor.
+    _YTI_CONTEXT = {
+        "client": {
+            "hl": "en-US",
+            "clientName": "WEB",
+            "clientVersion": "2.20201112.04.01",
+        },
+    }
+
+    def _download_continuation(self, continuation, list_id, page_no):
+        # Request the next page of the list identified by `continuation`.
+        #   continuation: token taken from the previous page
+        #   list_id:      id passed through to _download_json
+        #   page_no:      page number, only used in the progress note
+        # Returns whatever _download_json returns for the response.
+        #
+        # The body is encoded with str.encode() rather than
+        # bytes(..., encoding=...): on Python 2 `bytes` is `str` and does not
+        # accept an encoding argument, so the latter raises TypeError there.
+        payload = json.dumps({
+            'context': self._YTI_CONTEXT,
+            'continuation': continuation,
+        }).encode('utf-8')
+        return self._download_json(
+            self._ACTION_URL % (self._ACTION_NAME), list_id,
+            note='Downloading %s page #%d (yti1)' % (self._LIST_NAME, page_no),
+            headers={
+                'Content-Type': 'application/json',
+            }, data=payload)
+
+
 class YoutubeChannelIE(YoutubeAjaxListInfoExtractor):
    IE_NAME = 'youtube:channel'
    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?!watch|playlist|v|e|embed|shared)(?:(?P<type>user|channel|c)/)?(?P<id>\w+)(?!/live)'
@ -2577,6 +2613,56 @@ class YoutubePlaylistIE(YoutubeAjaxListInfoExtractor):
        }


+class YoutubeSearchIE(SearchInfoExtractor, YoutubeYti1ListInfoExtractor):
+    # Search extractor backed by the /youtubei/v1/search action.
+    IE_NAME = 'youtube:search'
+    _SEARCH_KEY = 'ytsearch'
+    _MAX_RESULTS = float('inf')
+    _ACTION_NAME = 'search'
+    _LIST_NAME = 'search results'
+    # Alias of the list extractor's _real_extract, so _get_n_results can call
+    # it explicitly with the results/query keyword arguments.
+    # NOTE(review): presumably needed because SearchInfoExtractor supplies its
+    # own _real_extract ahead of it in the MRO - confirm.
+    _searcher = YoutubeBaseListInfoExtractor._real_extract
+
+    def _download_first_data(self, url, list_id, query=''):
+        # Fetch the first page of results for `query` from the search action.
+        # `url` is not used: the request goes to _ACTION_URL.
+        # Returns a (response JSON, '') tuple; the empty string fills the
+        # webpage slot of the tuple, as no webpage is downloaded for a search.
+        #
+        # The body is encoded with str.encode() rather than
+        # bytes(..., encoding=...): on Python 2 `bytes` is `str` and does not
+        # accept an encoding argument, so the latter raises TypeError there.
+        payload = json.dumps({
+            'context': self._YTI_CONTEXT,
+            'query': query,
+        }).encode('utf-8')
+        response = self._download_json(
+            self._ACTION_URL % (self._ACTION_NAME), list_id,
+            note='Downloading %s page #1 (yti1)' % (self._LIST_NAME),
+            headers={
+                'Content-Type': 'application/json',
+            }, data=payload)
+        return response, ''
+
+    def _parse_init_video_list(self, data):
+        # Turn one search response into
+        # {'entries': [...], 'continuation': token, 'info_dict': {}}.
+        # The same lookups cover both response layouts: the initial response
+        # and a continuation response.
+        # Raises ExtractorError when the item section renderer or its
+        # (non-empty) 'contents' list cannot be found.
+        renderer = try_get(data, [
+            # initial
+            lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][0]['itemSectionRenderer'],
+            # continuation
+            lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems'][0]['itemSectionRenderer'],
+        ])
+        if not renderer:
+            raise ExtractorError('Could not extract %s item list renderer' % self._LIST_NAME)
+        rend_items = try_get(renderer, [
+            lambda x: x['contents'],
+        ])
+        if not rend_items:
+            raise ExtractorError('Could not extract %s renderer item list' % self._LIST_NAME)
+        entries = [
+            self._parse_video(item, entry_key='videoRenderer')
+            for item in rend_items
+        ]
+        return {
+            'entries': entries,
+            # The continuation token sits in the last element of the list.
+            'continuation': try_get(data, [
+                # initial
+                lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'][-1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
+                # continuation
+                lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems'][-1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
+            ], expected_type=compat_str),
+            'info_dict': {},
+        }
+
+    def _get_n_results(self, query, n):
+        # Return the result of the list extractor for up to `n` results of
+        # `query`. The first argument only fills the `url` parameter; this
+        # class's _download_first_data does not use it.
+        return self._searcher(self._SEARCH_KEY, results=n, query=query)
+
+
 class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list