From 1786d6c1c4a30620d1a22e292a6413b55746316e Mon Sep 17 00:00:00 2001 From: Lauren Liberda Date: Mon, 1 Mar 2021 21:44:26 +0100 Subject: [PATCH] [peertube] playlist, channel and account extractor --- haruhi_dl/extractor/extractors.py | 7 +- haruhi_dl/extractor/peertube.py | 319 ++++++++++++++++++++++-------- 2 files changed, 241 insertions(+), 85 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 53e34e798..6cb94b4c9 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -913,7 +913,12 @@ from .parliamentliveuk import ParliamentLiveUKIE from .patreon import PatreonIE from .pbs import PBSIE from .pearvideo import PearVideoIE -from .peertube import PeerTubeSHIE +from .peertube import ( + PeerTubeSHIE, + PeerTubePlaylistSHIE, + PeerTubeAccountSHIE, + PeerTubeChannelSHIE, +) from .people import PeopleIE from .performgroup import PerformGroupIE from .periscope import ( diff --git a/haruhi_dl/extractor/peertube.py b/haruhi_dl/extractor/peertube.py index 6cba97a7b..f8aaaa7d6 100644 --- a/haruhi_dl/extractor/peertube.py +++ b/haruhi_dl/extractor/peertube.py @@ -16,17 +16,112 @@ from ..utils import ( ) -class PeerTubeSHIE(SelfhostedInfoExtractor): +class PeerTubeBaseExtractor(SelfhostedInfoExtractor): _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}' - _API_BASE = 'https://%s/api/v1/videos/%s/%s' - _VALID_URL = r'peertube:(?P[^:]+):(?P%s)' % (_UUID_RE) - _SH_VALID_URL = r'https?://(?P[^/]+)/(?:videos/(?:watch|embed)|api/v\d/videos)/(?P%s)' % _UUID_RE + _API_BASE = 'https://%s/api/v1/%s/%s/%s' _SH_VALID_CONTENT_STRINGS = ( 'PeerTube<', 'There will be other non JS-based clients to access PeerTube', + '>There are other non JS-based unofficial clients to access PeerTube', '>We are sorry but it seems that PeerTube is not compatible with your web browser.<', + '<meta property="og:platform" content="PeerTube"', ) + def _call_api(self, host, resource, resource_id, path, note=None, errnote=None, fatal=True): + return self._download_json( + self._API_BASE % (host, resource, resource_id, path), resource_id, + note=note, errnote=errnote, fatal=fatal) + + def _parse_video(self, video, url): + host, display_id = self._match_id_and_host(url) + info_dict = {} + + formats = [] + files = video.get('files') or [] + for playlist in (video.get('streamingPlaylists') or []): + if not isinstance(playlist, dict): + continue + playlist_files = playlist.get('files') + if not (playlist_files and isinstance(playlist_files, list)): + continue + files.extend(playlist_files) + for file_ in files: + if not isinstance(file_, dict): + continue + file_url = url_or_none(file_.get('fileUrl')) + if not file_url: + continue + file_size = int_or_none(file_.get('size')) + format_id = try_get( + file_, lambda x: x['resolution']['label'], compat_str) + f = parse_resolution(format_id) + f.update({ + 'url': file_url, + 'format_id': format_id, + 'filesize': file_size, + }) + if format_id == '0p': + f['vcodec'] = 'none' + else: + f['fps'] = int_or_none(file_.get('fps')) + formats.append(f) + if files: + self._sort_formats(formats) + info_dict['formats'] = formats + else: + info_dict.update({ + '_type': 'url_transparent', + 'url': 'peertube:%s:%s' % (host, video['uuid']), + 'ie_key': 'PeerTubeSH', + }) + + def data(section, field, type_): + return try_get(video, lambda x: x[section][field], type_) + + def account_data(field, type_): + return data('account', field, type_) + + def channel_data(field, type_): + return data('channel', field, type_) + + category = data('category', 'label', compat_str) + categories = [category] if category else None + + nsfw = video.get('nsfw') + if nsfw is bool: + age_limit = 18 if nsfw else 0 + else: + age_limit = None + + info_dict.update({ + 'id': video['uuid'], + 'title': video['name'], + 'description': video.get('description'), + 'thumbnail': urljoin(url, video.get('thumbnailPath')), + 'timestamp': unified_timestamp(video.get('publishedAt')), + 'uploader': account_data('displayName', compat_str), + 'uploader_id': str_or_none(account_data('id', int)), + 'uploader_url': url_or_none(account_data('url', compat_str)), + 'channel': channel_data('displayName', compat_str), + 'channel_id': str_or_none(channel_data('id', int)), + 'channel_url': url_or_none(channel_data('url', compat_str)), + 'language': data('language', 'id', compat_str), + 'license': data('licence', 'label', compat_str), + 'duration': int_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likes')), + 'dislike_count': int_or_none(video.get('dislikes')), + 'age_limit': age_limit, + 'tags': try_get(video, lambda x: x['tags'], list), + 'categories': categories, + }) + return info_dict + + +class PeerTubeSHIE(PeerTubeBaseExtractor): + _VALID_URL = r'peertube:(?P<host>[^:]+):(?P<id>%s)' % (PeerTubeBaseExtractor._UUID_RE) + _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:videos/(?:watch|embed)|api/v\d/videos)/(?P<id>%s)' % (PeerTubeBaseExtractor._UUID_RE) + _TESTS = [{ 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', 'md5': '9bed8c0137913e17b86334e5885aacff', @@ -91,14 +186,9 @@ class PeerTubeSHIE(SelfhostedInfoExtractor): return ['peertube:%s:%s' % (mobj.group('host'), mobj.group('video_id')) for mobj in entries] - def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True): - return self._download_json( - self._API_BASE % (host, video_id, path), video_id, - note=note, errnote=errnote, fatal=fatal) - def _get_subtitles(self, host, video_id): captions = self._call_api( - host, video_id, 'captions', note='Downloading captions JSON', + host, 'videos', video_id, 'captions', note='Downloading captions JSON', fatal=False) if not isinstance(captions, dict): return @@ -117,101 +207,162 @@ class PeerTubeSHIE(SelfhostedInfoExtractor): return subtitles def _selfhosted_extract(self, url, webpage=None): - mobj = re.match(self._VALID_URL, url) - if not mobj: - mobj = re.match(self._SH_VALID_URL, url) - host = mobj.group('host') - video_id = mobj.group('id') + host, video_id = self._match_id_and_host(url) video = self._call_api( - host, video_id, '', note='Downloading video JSON') + host, 'videos', video_id, '', note='Downloading video JSON') - title = video['name'] + info_dict = self._parse_video(video, url) - formats = [] - files = video.get('files') or [] - for playlist in (video.get('streamingPlaylists') or []): - if not isinstance(playlist, dict): - continue - playlist_files = playlist.get('files') - if not (playlist_files and isinstance(playlist_files, list)): - continue - files.extend(playlist_files) - for file_ in files: - if not isinstance(file_, dict): - continue - file_url = url_or_none(file_.get('fileUrl')) - if not file_url: - continue - file_size = int_or_none(file_.get('size')) - format_id = try_get( - file_, lambda x: x['resolution']['label'], compat_str) - f = parse_resolution(format_id) - f.update({ - 'url': file_url, - 'format_id': format_id, - 'filesize': file_size, - }) - if format_id == '0p': - f['vcodec'] = 'none' - else: - f['fps'] = int_or_none(file_.get('fps')) - formats.append(f) - self._sort_formats(formats) + info_dict['subtitles'] = self.extract_subtitles(host, video_id) description = None if webpage: description = self._og_search_description(webpage) if not description: full_description = self._call_api( - host, video_id, 'description', note='Downloading description JSON', + host, 'videos', video_id, 'description', note='Downloading description JSON', fatal=False) if isinstance(full_description, dict): description = str_or_none(full_description.get('description')) if not description: description = video.get('description') + info_dict['description'] = description - subtitles = self.extract_subtitles(host, video_id) + return info_dict - def data(section, field, type_): - return try_get(video, lambda x: x[section][field], type_) - def account_data(field, type_): - return data('account', field, type_) +class PeerTubePlaylistSHIE(PeerTubeBaseExtractor): + _VALID_URL = r'peertube:playlist:(?P<host>[^:]+):(?P<id>.+)' + _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:videos/(?:watch|embed)/playlist|api/v\d/video-playlists)/(?P<id>%s)' % (PeerTubeBaseExtractor._UUID_RE) - def channel_data(field, type_): - return data('channel', field, type_) + _TESTS = [{ + 'url': 'https://video.internet-czas-dzialac.pl/videos/watch/playlist/3c81b894-acde-4539-91a2-1748b208c14c?playlistPosition=1', + 'info_dict': { + 'id': '3c81b894-acde-4539-91a2-1748b208c14c', + 'title': 'Podcast Internet. Czas Działać!', + 'uploader_id': 3, + 'uploader': 'Internet. Czas działać!', + }, + 'playlist_mincount': 14, + }] - category = data('category', 'label', compat_str) - categories = [category] if category else None + def _selfhosted_extract(self, url, webpage=None): + host, display_id = self._match_id_and_host(url) - nsfw = video.get('nsfw') - if nsfw is bool: - age_limit = 18 if nsfw else 0 - else: - age_limit = None + playlist_data = self._call_api(host, 'video-playlists', display_id, '', 'Downloading playlist metadata') + entries = [] + i = 0 + videos = {'total': 0} + while len(entries) < videos['total'] or i == 0: + videos = self._call_api(host, 'video-playlists', display_id, + 'videos?start=%d&count=25' % (i * 25), + note=('Downloading playlist video list (page #%d)' % i)) + i += 1 + for video in videos['data']: + entries.append(self._parse_video(video['video'], url)) return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': urljoin(url, video.get('thumbnailPath')), - 'timestamp': unified_timestamp(video.get('publishedAt')), - 'uploader': account_data('displayName', compat_str), - 'uploader_id': str_or_none(account_data('id', int)), - 'uploader_url': url_or_none(account_data('url', compat_str)), - 'channel': channel_data('displayName', compat_str), - 'channel_id': str_or_none(channel_data('id', int)), - 'channel_url': url_or_none(channel_data('url', compat_str)), - 'language': data('language', 'id', compat_str), - 'license': data('licence', 'label', compat_str), - 'duration': int_or_none(video.get('duration')), - 'view_count': int_or_none(video.get('views')), - 'like_count': int_or_none(video.get('likes')), - 'dislike_count': int_or_none(video.get('dislikes')), - 'age_limit': age_limit, - 'tags': try_get(video, lambda x: x['tags'], list), - 'categories': categories, - 'formats': formats, - 'subtitles': subtitles + '_type': 'playlist', + 'entries': entries, + 'id': playlist_data['uuid'], + 'title': playlist_data['displayName'], + 'description': playlist_data.get('description'), + 'channel': playlist_data['videoChannel']['displayName'], + 'channel_id': playlist_data['videoChannel']['id'], + 'channel_url': playlist_data['videoChannel']['url'], + 'uploader': playlist_data['ownerAccount']['displayName'], + 'uploader_id': playlist_data['ownerAccount']['id'], + 'uploader_url': playlist_data['ownerAccount']['url'], + } + + +class PeerTubeChannelSHIE(PeerTubeBaseExtractor): + _VALID_URL = r'peertube:channel:(?P<host>[^:]+):(?P<id>.+)' + _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:api/v\d/)?video-channels/(?P<id>[^/?#]+)(?:/videos)?' + + _TESTS = [{ + 'url': 'https://video.internet-czas-dzialac.pl/video-channels/internet_czas_dzialac/videos', + 'info_dict': { + 'id': '2', + 'title': 'internet_czas_dzialac', + 'description': 'md5:4d2e215ea0d9ae4501a556ef6e9a5308', + 'uploader_id': 3, + 'uploader': 'Internet. Czas działać!', + }, + 'playlist_mincount': 14, + }] + + def _selfhosted_extract(self, url, webpage=None): + host, display_id = self._match_id_and_host(url) + + channel_data = self._call_api(host, 'video-channels', display_id, '', 'Downloading channel metadata') + entries = [] + i = 0 + videos = {'total': 0} + while len(entries) < videos['total'] or i == 0: + videos = self._call_api(host, 'video-channels', display_id, + 'videos?start=%d&count=25&sort=publishedAt' % (i * 25), + note=('Downloading channel video list (page #%d)' % i)) + i += 1 + for video in videos['data']: + entries.append(self._parse_video(video, url)) + + return { + '_type': 'playlist', + 'entries': entries, + 'id': str(channel_data['id']), + 'title': channel_data['displayName'], + 'display_id': channel_data['name'], + 'description': channel_data.get('description'), + 'channel': channel_data['displayName'], + 'channel_id': channel_data['id'], + 'channel_url': channel_data['url'], + 'uploader': channel_data['ownerAccount']['displayName'], + 'uploader_id': channel_data['ownerAccount']['id'], + 'uploader_url': channel_data['ownerAccount']['url'], + } + + +class PeerTubeAccountSHIE(PeerTubeBaseExtractor): + _VALID_URL = r'peertube:account:(?P<host>[^:]+):(?P<id>.+)' + _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:api/v\d/)?accounts/(?P<id>[^/?#]+)(?:/video(?:s|-channels))?' + + _TESTS = [{ + 'url': 'https://video.internet-czas-dzialac.pl/accounts/icd/video-channels', + 'info_dict': { + 'id': '3', + 'description': 'md5:ab3c9b934dd39030eea1c9fe76079870', + 'uploader': 'Internet. Czas działać!', + 'title': 'Internet. Czas działać!', + 'uploader_id': 3, + }, + 'playlist_mincount': 14, + }] + + def _selfhosted_extract(self, url, webpage=None): + host, display_id = self._match_id_and_host(url) + + account_data = self._call_api(host, 'accounts', display_id, '', 'Downloading account metadata') + entries = [] + i = 0 + videos = {'total': 0} + while len(entries) < videos['total'] or i == 0: + videos = self._call_api(host, 'accounts', display_id, + 'videos?start=%d&count=25&sort=publishedAt' % (i * 25), + note=('Downloading account video list (page #%d)' % i)) + i += 1 + for video in videos['data']: + entries.append(self._parse_video(video, url)) + + return { + '_type': 'playlist', + 'entries': entries, + 'id': str(account_data['id']), + 'title': account_data['displayName'], + 'display_id': account_data['name'], + 'description': account_data.get('description'), + 'uploader': account_data['displayName'], + 'uploader_id': account_data['id'], + 'uploader_url': account_data['url'], }