funkwhale extractors

merge-requests/5/head
Laura Liberda 2020-12-12 06:12:40 +01:00
parent b2e1200c40
commit 4221c2ee68
2 changed files with 307 additions and 0 deletions


@@ -403,6 +403,13 @@ from .frontendmasters import (
)
from .funimation import FunimationIE
from .funk import FunkIE
from .funkwhale import (
    FunkwhaleAlbumSHIE,
    FunkwhaleArtistSHIE,
    FunkwhaleChannelSHIE,
    FunkwhalePlaylistSHIE,
    FunkwhaleTrackSHIE,
)
from .fusion import FusionIE
from .fxnetworks import FXNetworksIE
from .gaia import GaiaIE
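
The hunk above registers the five Funkwhale self-hosted extractors defined in the new file below. As a standalone illustration (not part of this commit), the URL shapes those classes claim can be checked with plain re; the patterns are copied verbatim from their _VALID_URL / _SH_VALID_URL attributes, while actual dispatch in the project is done by the SelfhostedInfoExtractor machinery, not by this snippet.

import re

# Patterns copied from the extractor classes added below; both the
# funkwhale:<kind>:<host>:<id> pseudo-URL form and the plain https form match.
PATTERNS = {
    'track (pseudo-URL)': r'funkwhale:track:(?P<host>[^:]+):(?P<id>.+)',
    'track': r'https?://(?P<host>[^/]+)/library/tracks/(?P<id>\d+)',
    'album': r'https?://(?P<host>[^/]+)/library/albums/(?P<id>\d+)',
    'channel': r'https?://(?P<host>[^/]+)/channels/(?P<id>[\w-]+)',
}

for url in ('https://podcast.midline.pl/library/tracks/10/',
            'funkwhale:track:podcast.midline.pl:10'):
    for name, pattern in PATTERNS.items():
        match = re.match(pattern, url)
        if match:
            print(name, match.group('host'), match.group('id'))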


@@ -0,0 +1,300 @@
# coding: utf-8
from __future__ import unicode_literals

from .common import SelfhostedInfoExtractor
from ..utils import (
    compat_str,
    compat_urllib_parse_urlencode,
    try_get,
    parse_iso8601,
)


class FunkwhaleBaseExtractor(SelfhostedInfoExtractor):
    _SH_VALID_CONTENT_STRINGS = (
        "<noscript><strong>We're sorry but Funkwhale doesn't work",
    )

    def _call_api(self, host, method, params, vis_id, note='Downloading JSON metadata'):
        # GET https://<host>/api/v1/<method>?<params> and parse the JSON response
        # basic querystring handling
        qs = ''
        if isinstance(params, dict):
            qs = compat_urllib_parse_urlencode(params)
        return self._download_json('https://%s/api/v1/%s?%s' % (host, method, qs), vis_id, note)

    def _cover_to_thumbnails(self, cover_data):
        # build a thumbnails list from Funkwhale cover metadata (original file plus square crops)
        if cover_data is None:
            return cover_data
        thumbnails = [{
            'url': cover_data['urls']['original'],
            'filesize': cover_data['size'],
            'preference': 500,
        }]
        for quality in ('large_square_crop', 'medium_square_crop'):
            if cover_data['urls'].get(quality):
                thumbnails.append({
                    'url': cover_data['urls'][quality],
                })
        return thumbnails

    def _track_data_to_entry(self, track_data, host):
        # map a Funkwhale track object (and its uploads) to an info dict
        formats = []
        for upload in track_data.get('uploads') or ():
            formats.append({
                'url': 'https://%s%s' % (host, upload['listen_url']),
                'ext': upload['extension'],
                'abr': upload['bitrate'],
                'filesize': upload['size'],
            })
        channel_data = track_data.get('artist', {})
        info_dict = {
            'id': compat_str(track_data['id']),
            'formats': formats,
            'title': track_data['title'],
            'description': try_get(track_data, lambda x: x['description']['text'], compat_str),
            'channel': channel_data.get('name'),
            'channel_url': 'https://%s/library/artists/%d/' % (host, channel_data.get('id'))
            if isinstance(channel_data.get('id'), int) else None,
            'thumbnails': self._cover_to_thumbnails(try_get(track_data, (
                lambda x: x['cover'],
                lambda x: x['album']['cover'],
            ), dict)),
            'duration': try_get(track_data, lambda x: x['uploads'][0]['duration'], int),
            'timestamp': parse_iso8601(track_data.get('creation_date')),
            'view_count': track_data.get('downloads_count'),
            'license': track_data.get('license'),
            'tags': track_data.get('tags'),
        }
        info_dict.update(self._uploader_data_to_info_dict(track_data.get('attributed_to')))
        return info_dict

    def _uploader_data_to_info_dict(self, uploader_data):
        if uploader_data is None:
            return {}
        return {
            'uploader': uploader_data.get('name'),
            'uploader_url': 'https://%s/@%s' % (uploader_data.get('domain'), uploader_data.get('preferred_username')),
        }

class FunkwhaleTrackSHIE(FunkwhaleBaseExtractor):
    IE_NAME = 'funkwhale:track'
    _VALID_URL = r'funkwhale:track:(?P<host>[^:]+):(?P<id>.+)'
    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/library/tracks/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://podcast.midline.pl/library/tracks/10/',
        'info_dict': {
            'id': '10',
            'ext': 'mp3',
            'uploader': 'Internet. Czas działać!',
            'title': '#0 - Podcast "Internet. Czas działać! | Trailer',
            'description': '"Internet. Czas działać!" to podcast, z którego dowiecie się, jak internetowe technologie wpływają na społeczeństwo i jak być ich świadomym konsumentem.',
            'upload_date': '20201207',
            'timestamp': 1607301944,
        },
    }]

    def _selfhosted_extract(self, url, webpage=None):
        host, vis_id = self._match_id_and_host(url)
        track_data = self._call_api(host, 'tracks/%s' % vis_id, None, vis_id)
        return self._track_data_to_entry(track_data, host)

class FunkwhaleArtistSHIE(FunkwhaleBaseExtractor):
    IE_NAME = 'funkwhale:artist'
    _VALID_URL = r'funkwhale:artist:(?P<host>[^:]+):(?P<id>.+)'
    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/library/artists/(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://open.audio/library/artists/13556/',
        'info_dict': {
            'id': '13556',
            'title': 'Violons_Populaires_en_Nouvelle_Aquitaine',
            'uploader': 'Violons_Populaires_en_Nouvelle_Aquitaine',
        },
        'playlist_mincount': 38,  # 77 tracks, but just 38 of them are playable 🤷‍♀️
    }]

    def _selfhosted_extract(self, url, webpage=None):
        host, vis_id = self._match_id_and_host(url)
        artist_data = self._call_api(host, 'artists/%s' % vis_id, None, vis_id)

        # the same is done on the frontend
        # https://dev.funkwhale.audio/funkwhale/funkwhale/-/blob/89037a76/front/src/components/library/ArtistBase.vue#L189
        if artist_data.get('channel'):
            return self.url_result('funkwhale:channel:%s:%s' % (host, artist_data['channel']['uuid']), ie='FunkwhaleChannelSH')

        tracks_data = self._call_api(host, 'tracks', {
            'artist': vis_id,
            'hidden': '',
            'playable': 'true',
        }, vis_id, 'Downloading track list')
        tracks = tracks_data['results']
        page = 1
        while tracks_data.get('next') is not None:
            page += 1
            tracks_data = self._call_api(host, 'tracks', {
                'artist': vis_id,
                'hidden': '',
                'playable': 'true',
                'page': page,
            }, vis_id, 'Downloading track list (page #%d)' % page)
            tracks.extend(tracks_data['results'])

        entries = [self._track_data_to_entry(track, host) for track in tracks]

        info_dict = {
            '_type': 'playlist',
            'id': vis_id,
            'entries': entries,
            'title': artist_data['attributed_to'].get('name'),
        }
        info_dict.update(self._uploader_data_to_info_dict(artist_data['attributed_to']))
        return info_dict

class FunkwhaleChannelSHIE(FunkwhaleBaseExtractor):
    IE_NAME = 'funkwhale:channel'
    _VALID_URL = r'funkwhale:channel:(?P<host>[^:]+):(?P<id>.+)'
    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/channels/(?P<id>[\w-]+)'

    _TESTS = [{
        'url': 'https://podcast.midline.pl/channels/Midline/',
        'info_dict': {
            'id': 'd98ae7a5-5bd5-48c8-a178-a9a12e84cfc7',
        },
        'playlist_mincount': 9,
    }, {
        'url': 'https://podcast.midline.pl/channels/d98ae7a5-5bd5-48c8-a178-a9a12e84cfc7/',
        'only_matching': True,
    }]

    def _selfhosted_extract(self, url, webpage=None):
        host, vis_id = self._match_id_and_host(url)
        channel_data = self._call_api(host, 'channels/%s' % vis_id, None, vis_id)
        # channels may be addressed by name or by UUID; the API returns the UUID either way
        uuid = channel_data['uuid']

        tracks_data = self._call_api(host, 'tracks', {
            'channel': uuid,
            'include_channels': 'true',
            'playable': 'true',
        }, uuid, 'Downloading track list')
        tracks = tracks_data['results']
        page = 1
        while tracks_data.get('next') is not None:
            page += 1
            tracks_data = self._call_api(host, 'tracks', {
                'channel': uuid,
                'include_channels': 'true',
                'playable': 'true',
                'page': page,
            }, vis_id, 'Downloading track list (page #%d)' % page)
            tracks.extend(tracks_data['results'])

        entries = [self._track_data_to_entry(track, host) for track in tracks]

        info_dict = {
            '_type': 'playlist',
            'id': uuid,
            'title': channel_data['attributed_to'].get('name'),
            'entries': entries,
        }
        info_dict.update(self._uploader_data_to_info_dict(channel_data['attributed_to']))
        return info_dict

class FunkwhalePlaylistSHIE(FunkwhaleBaseExtractor):
    IE_NAME = 'funkwhale:playlist'
    _VALID_URL = r'funkwhale:playlist:(?P<host>[^:]+):(?P<id>.+)'
    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/library/playlists/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://open.audio/library/playlists/268',
        'info_dict': {
            'id': '268',
            'title': 'Cleaning',
            'uploader': 'trash',
        },
        'playlist_mincount': 180,
    }]

    def _selfhosted_extract(self, url, webpage=None):
        host, vis_id = self._match_id_and_host(url)
        playlist_data = self._call_api(host, 'playlists/%s' % vis_id, None, vis_id)
        tracks_data = self._call_api(host, 'playlists/%s/tracks' % vis_id, {
            'playable': 'true',
        }, vis_id, 'Downloading track list')
        # playlist results wrap each track object in a 'track' key
        entries = [self._track_data_to_entry(track.get('track'), host) for track in tracks_data['results']]

        info_dict = {
            '_type': 'playlist',
            'id': vis_id,
            'title': playlist_data['name'],
            'entries': entries,
        }
        info_dict.update(self._uploader_data_to_info_dict(playlist_data.get('actor')))
        return info_dict

class FunkwhaleAlbumSHIE(FunkwhaleBaseExtractor):
    IE_NAME = 'funkwhale:album'
    _VALID_URL = r'funkwhale:album:(?P<host>[^:]+):(?P<id>.+)'
    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/library/albums/(?P<id>\d+)'

    _TESTS = [{
        'url': 'https://open.audio/library/albums/5623/',
        'info_dict': {
            'id': '5623',
            'title': 'Volume 5',
        },
        'playlist_mincount': 115,
    }]

    def _selfhosted_extract(self, url, webpage=None):
        host, vis_id = self._match_id_and_host(url)
        album_data = self._call_api(host, 'albums/%s' % vis_id, None, vis_id)

        tracks_data = self._call_api(host, 'tracks', {
            'ordering': 'disc_number,position',
            'album': vis_id,
            'include_channels': 'true',
            'playable': 'true',
        }, vis_id, 'Downloading track list')
        tracks = tracks_data['results']
        page = 1
        while tracks_data.get('next') is not None:
            page += 1
            tracks_data = self._call_api(host, 'tracks', {
                'ordering': 'disc_number,position',
                'album': vis_id,
                'include_channels': 'true',
                'playable': 'true',
                'page': page,
            }, vis_id, 'Downloading track list (page #%d)' % page)
            tracks.extend(tracks_data['results'])

        entries = [self._track_data_to_entry(track, host) for track in tracks]
        thumbnails = self._cover_to_thumbnails(album_data.get('cover'))

        info_dict = {
            '_type': 'playlist',
            'id': vis_id,
            'title': album_data['title'],
            'entries': entries,
            'thumbnails': thumbnails,
        }
        return info_dict
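
The page-walking loop over the paginated /api/v1/tracks listing appears three times above (artist, channel and album extractors). A minimal sketch of how it could be factored into one shared helper, using a hypothetical iter_paginated_tracks name that is not part of this commit; call_api stands in for FunkwhaleBaseExtractor._call_api with the host already bound:

def iter_paginated_tracks(call_api, params, vis_id):
    # Hypothetical helper, not part of this commit: yield every track from the
    # track listing, requesting further pages while the API reports a 'next' page.
    page = 1
    data = call_api('tracks', dict(params), vis_id, 'Downloading track list')
    for track in data['results']:
        yield track
    while data.get('next') is not None:
        page += 1
        data = call_api('tracks', dict(params, page=page), vis_id,
                        'Downloading track list (page #%d)' % page)
        for track in data['results']:
            yield track

Each extractor could then build its entries as [self._track_data_to_entry(track, host) for track in iter_paginated_tracks(...)] instead of repeating the loop.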