funkwhale extractors

merge-requests/5/head
Laura Liberda 2020-12-12 06:12:40 +01:00
parent b2e1200c40
commit 4221c2ee68
2 changed files with 307 additions and 0 deletions


@@ -403,6 +403,13 @@ from .frontendmasters import (
)
from .funimation import FunimationIE
from .funk import FunkIE
from .funkwhale import (
    FunkwhaleAlbumSHIE,
    FunkwhaleArtistSHIE,
    FunkwhaleChannelSHIE,
    FunkwhalePlaylistSHIE,
    FunkwhaleTrackSHIE,
)
from .fusion import FusionIE
from .fxnetworks import FXNetworksIE
from .gaia import GaiaIE
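
The hunk above registers the five Funkwhale self-hosted extractors defined in the new file below. As a standalone illustration (not part of this commit), the URL shapes those classes claim can be checked with plain re; the patterns are copied verbatim from their _VALID_URL / _SH_VALID_URL attributes, while actual dispatch in the project is done by the SelfhostedInfoExtractor machinery, not by this snippet.

import re

# Patterns copied from the extractor classes added below; both the
# funkwhale:<kind>:<host>:<id> pseudo-URL form and the plain https form match.
PATTERNS = {
    'track (pseudo-URL)': r'funkwhale:track:(?P<host>[^:]+):(?P<id>.+)',
    'track': r'https?://(?P<host>[^/]+)/library/tracks/(?P<id>\d+)',
    'album': r'https?://(?P<host>[^/]+)/library/albums/(?P<id>\d+)',
    'channel': r'https?://(?P<host>[^/]+)/channels/(?P<id>[\w-]+)',
}

for url in ('https://podcast.midline.pl/library/tracks/10/',
            'funkwhale:track:podcast.midline.pl:10'):
    for name, pattern in PATTERNS.items():
        match = re.match(pattern, url)
        if match:
            print(name, match.group('host'), match.group('id'))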


@@ -0,0 +1,300 @@
# coding: utf-8
from __future__ import unicode_literals

from .common import SelfhostedInfoExtractor
from ..utils import (
    compat_str,
    compat_urllib_parse_urlencode,
    try_get,
    parse_iso8601,
)


class FunkwhaleBaseExtractor(SelfhostedInfoExtractor):
    _SH_VALID_CONTENT_STRINGS = (
        "<noscript><strong>We're sorry but Funkwhale doesn't work",
    )

    def _call_api(self, host, method, params, vis_id, note='Downloading JSON metadata'):
        # GET https://<host>/api/v1/<method>?<params> and parse the JSON response
        # basic querystring handling
        qs = ''
        if isinstance(params, dict):
            qs = compat_urllib_parse_urlencode(params)
        return self._download_json('https://%s/api/v1/%s?%s' % (host, method, qs), vis_id, note)

    def _cover_to_thumbnails(self, cover_data):
        # build a thumbnails list from Funkwhale cover metadata (original file plus square crops)
        if cover_data is None:
            return cover_data
        thumbnails = [{
            'url': cover_data['urls']['original'],
            'filesize': cover_data['size'],
            'preference': 500,
        }]
        for quality in ('large_square_crop', 'medium_square_crop'):
            if cover_data['urls'].get(quality):
                thumbnails.append({
                    'url': cover_data['urls'][quality],
                })
        return thumbnails

    def _track_data_to_entry(self, track_data, host):
        # map a Funkwhale track object (and its uploads) to an info dict
        formats = []
        for upload in track_data.get('uploads') or ():
            formats.append({
                'url': 'https://%s%s' % (host, upload['listen_url']),
                'ext': upload['extension'],
                'abr': upload['bitrate'],
                'filesize': upload['size'],
            })
        channel_data = track_data.get('artist', {})
        info_dict = {
            'id': compat_str(track_data['id']),
            'formats': formats,
            'title': track_data['title'],
            'description': try_get(track_data, lambda x: x['description']['text'], compat_str),
            'channel': channel_data.get('name'),
            'channel_url': 'https://%s/library/artists/%d/' % (host, channel_data.get('id'))
            if isinstance(channel_data.get('id'), int) else None,
            'thumbnails': self._cover_to_thumbnails(try_get(track_data, (
                lambda x: x['cover'],
                lambda x: x['album']['cover'],
            ), dict)),
            'duration': try_get(track_data, lambda x: x['uploads'][0]['duration'], int),
            'timestamp': parse_iso8601(track_data.get('creation_date')),
            'view_count': track_data.get('downloads_count'),
            'license': track_data.get('license'),
            'tags': track_data.get('tags'),
        }
        info_dict.update(self._uploader_data_to_info_dict(track_data.get('attributed_to')))
        return info_dict

    def _uploader_data_to_info_dict(self, uploader_data):
        if uploader_data is None:
            return {}
        return {
            'uploader': uploader_data.get('name'),
            'uploader_url': 'https://%s/@%s' % (uploader_data.get('domain'), uploader_data.get('preferred_username')),
        }

class FunkwhaleTrackSHIE(FunkwhaleBaseExtractor):
    IE_NAME = 'funkwhale:track'
    _VALID_URL = r'funkwhale:track:(?P<host>[^:]+):(?P<id>.+)'
    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/library/tracks/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://podcast.midline.pl/library/tracks/10/',
        'info_dict': {
            'id': '10',
            'ext': 'mp3',
            'uploader': 'Internet. Czas działać!',
            'title': '#0 - Podcast "Internet. Czas działać! | Trailer',
            'description': '"Internet. Czas działać!" to podcast, z którego dowiecie się, jak internetowe technologie wpływają na społeczeństwo i jak być ich świadomym konsumentem.',
            'upload_date': '20201207',
            'timestamp': 1607301944,
        },
    }]

    def _selfhosted_extract(self, url, webpage=None):
        host, vis_id = self._match_id_and_host(url)
        track_data = self._call_api(host, 'tracks/%s' % vis_id, None, vis_id)
        return self._track_data_to_entry(track_data, host)

class FunkwhaleArtistSHIE(FunkwhaleBaseExtractor):
    IE_NAME = 'funkwhale:artist'
    _VALID_URL = r'funkwhale:artist:(?P<host>[^:]+):(?P<id>.+)'
    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/library/artists/(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://open.audio/library/artists/13556/',
        'info_dict': {
            'id': '13556',
            'title': 'Violons_Populaires_en_Nouvelle_Aquitaine',
            'uploader': 'Violons_Populaires_en_Nouvelle_Aquitaine',
        },
        'playlist_mincount': 38,  # 77 tracks, but just 38 of them are playable 🤷‍♀️
    }]

    def _selfhosted_extract(self, url, webpage=None):
        host, vis_id = self._match_id_and_host(url)
        artist_data = self._call_api(host, 'artists/%s' % vis_id, None, vis_id)

        # the same is done on the frontend
        # https://dev.funkwhale.audio/funkwhale/funkwhale/-/blob/89037a76/front/src/components/library/ArtistBase.vue#L189
        if artist_data.get('channel'):
            return self.url_result('funkwhale:channel:%s:%s' % (host, artist_data['channel']['uuid']), ie='FunkwhaleChannelSH')

        tracks_data = self._call_api(host, 'tracks', {
            'artist': vis_id,
            'hidden': '',
            'playable': 'true',
        }, vis_id, 'Downloading track list')
        tracks = tracks_data['results']
        page = 1
        while tracks_data.get('next') is not None:
            page += 1
            tracks_data = self._call_api(host, 'tracks', {
                'artist': vis_id,
                'hidden': '',
                'playable': 'true',
                'page': page,
            }, vis_id, 'Downloading track list (page #%d)' % page)
            tracks.extend(tracks_data['results'])

        entries = [self._track_data_to_entry(track, host) for track in tracks]

        info_dict = {
            '_type': 'playlist',
            'id': vis_id,
            'entries': entries,
            'title': artist_data['attributed_to'].get('name'),
        }
        info_dict.update(self._uploader_data_to_info_dict(artist_data['attributed_to']))
        return info_dict

class FunkwhaleChannelSHIE(FunkwhaleBaseExtractor):
    IE_NAME = 'funkwhale:channel'
    _VALID_URL = r'funkwhale:channel:(?P<host>[^:]+):(?P<id>.+)'
    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/channels/(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://podcast.midline.pl/channels/Midline/',
        'info_dict': {
            'id': 'd98ae7a5-5bd5-48c8-a178-a9a12e84cfc7',
        },
        'playlist_mincount': 9,
    }, {
        'url': 'https://podcast.midline.pl/channels/d98ae7a5-5bd5-48c8-a178-a9a12e84cfc7/',
        'only_matching': True,
    }]

    def _selfhosted_extract(self, url, webpage=None):
        host, vis_id = self._match_id_and_host(url)
        channel_data = self._call_api(host, 'channels/%s' % vis_id, None, vis_id)
        # channels may be addressed by name or by UUID; the API returns the UUID either way
        uuid = channel_data['uuid']

        tracks_data = self._call_api(host, 'tracks', {
            'channel': uuid,
            'include_channels': 'true',
            'playable': 'true',
        }, uuid, 'Downloading track list')
        tracks = tracks_data['results']
        page = 1
        while tracks_data.get('next') is not None:
            page += 1
            tracks_data = self._call_api(host, 'tracks', {
                'channel': uuid,
                'include_channels': 'true',
                'playable': 'true',
                'page': page,
            }, vis_id, 'Downloading track list (page #%d)' % page)
            tracks.extend(tracks_data['results'])

        entries = [self._track_data_to_entry(track, host) for track in tracks]

        info_dict = {
            '_type': 'playlist',
            'id': uuid,
            'title': channel_data['attributed_to'].get('name'),
            'entries': entries,
        }
        info_dict.update(self._uploader_data_to_info_dict(channel_data['attributed_to']))
        return info_dict

class FunkwhalePlaylistSHIE(FunkwhaleBaseExtractor):
    IE_NAME = 'funkwhale:playlist'
    _VALID_URL = r'funkwhale:playlist:(?P<host>[^:]+):(?P<id>.+)'
    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/library/playlists/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://open.audio/library/playlists/268',
        'info_dict': {
            'id': '268',
            'title': 'Cleaning',
            'uploader': 'trash',
        },
        'playlist_mincount': 180,
    }]

    def _selfhosted_extract(self, url, webpage=None):
        host, vis_id = self._match_id_and_host(url)
        playlist_data = self._call_api(host, 'playlists/%s' % vis_id, None, vis_id)
        tracks_data = self._call_api(host, 'playlists/%s/tracks' % vis_id, {
            'playable': 'true',
        }, vis_id, 'Downloading track list')
        # playlist results wrap each track object in a 'track' key
        entries = [self._track_data_to_entry(track.get('track'), host) for track in tracks_data['results']]

        info_dict = {
            '_type': 'playlist',
            'id': vis_id,
            'title': playlist_data['name'],
            'entries': entries,
        }
        info_dict.update(self._uploader_data_to_info_dict(playlist_data.get('actor')))
        return info_dict

class FunkwhaleAlbumSHIE(FunkwhaleBaseExtractor):
    IE_NAME = 'funkwhale:album'
    _VALID_URL = r'funkwhale:album:(?P<host>[^:]+):(?P<id>.+)'
    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/library/albums/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://open.audio/library/albums/5623/',
        'info_dict': {
            'id': '5623',
            'title': 'Volume 5',
        },
        'playlist_mincount': 115,
    }]

    def _selfhosted_extract(self, url, webpage=None):
        host, vis_id = self._match_id_and_host(url)
        album_data = self._call_api(host, 'albums/%s' % vis_id, None, vis_id)

        tracks_data = self._call_api(host, 'tracks', {
            'ordering': 'disc_number,position',
            'album': vis_id,
            'include_channels': 'true',
            'playable': 'true',
        }, vis_id, 'Downloading track list')
        tracks = tracks_data['results']
        page = 1
        while tracks_data.get('next') is not None:
            page += 1
            tracks_data = self._call_api(host, 'tracks', {
                'ordering': 'disc_number,position',
                'album': vis_id,
                'include_channels': 'true',
                'playable': 'true',
                'page': page,
            }, vis_id, 'Downloading track list (page #%d)' % page)
            tracks.extend(tracks_data['results'])

        entries = [self._track_data_to_entry(track, host) for track in tracks]
        thumbnails = self._cover_to_thumbnails(album_data.get('cover'))

        info_dict = {
            '_type': 'playlist',
            'id': vis_id,
            'title': album_data['title'],
            'entries': entries,
            'thumbnails': thumbnails,
        }
        return info_dict
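
The page-walking loop over the paginated /api/v1/tracks listing appears three times above (artist, channel and album extractors). A minimal sketch of how it could be factored into one shared helper, using a hypothetical iter_paginated_tracks name that is not part of this commit; call_api stands in for FunkwhaleBaseExtractor._call_api with the host already bound:

def iter_paginated_tracks(call_api, params, vis_id):
    # Hypothetical helper, not part of this commit: yield every track from the
    # track listing, requesting further pages while the API reports a 'next' page.
    page = 1
    data = call_api('tracks', dict(params), vis_id, 'Downloading track list')
    for track in data['results']:
        yield track
    while data.get('next') is not None:
        page += 1
        data = call_api('tracks', dict(params, page=page), vis_id,
                        'Downloading track list (page #%d)' % page)
        for track in data['results']:
            yield track

Each extractor could then build its entries as [self._track_data_to_entry(track, host) for track in iter_paginated_tracks(...)] instead of repeating the loop.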