From 91d6c6dbd48ea48818f280789fb4ba9c11655f5c Mon Sep 17 00:00:00 2001
From: Laura Liberda <laura@selfisekai.rocks>
Date: Sun, 6 Dec 2020 00:28:55 +0100
Subject: [PATCH] [polskieradio] livestream player extractor

---
 haruhi_dl/extractor/extractors.py   |   1 +
 haruhi_dl/extractor/polskieradio.py | 104 ++++++++++++++++++++++++++++
 2 files changed, 105 insertions(+)
diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index 79f0de755..7811acbec 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -871,6 +871,7 @@ from .pokemon import PokemonIE
 from .polskieradio import (
     PolskieRadioIE,
     PolskieRadioCategoryIE,
+    PolskieRadioPlayerIE,
 )
 from .popcorntimes import PopcorntimesIE
 from .popcorntv import PopcornTVIE
diff --git a/haruhi_dl/extractor/polskieradio.py b/haruhi_dl/extractor/polskieradio.py
index 27c4b4041..2259275ee 100644
--- a/haruhi_dl/extractor/polskieradio.py
+++ b/haruhi_dl/extractor/polskieradio.py
@@ -12,6 +12,7 @@ from ..compat import (
 )
 from ..utils import (
     extract_attributes,
+    ExtractorError,
     int_or_none,
     strip_or_none,
     unescapeHTML,
@@ -193,3 +194,106 @@ class PolskieRadioCategoryIE(InfoExtractor):
         return self.playlist_result(
             self._entries(url, webpage, category_id),
             category_id, title)
+
+
+class PolskieRadioPlayerIE(InfoExtractor):
+    """Extract live radio streams from player.polskieradio.pl/anteny/<slug>."""
+    IE_NAME = 'polskieradio:player'
+    _VALID_URL = r'https?://player\.polskieradio\.pl/anteny/(?P<id>[^/]+)'
+
+    _BASE_URL = 'https://player.polskieradio.pl'
+    _PLAYER_URL = 'https://player.polskieradio.pl/main.bundle.js'
+    _STATIONS_API_URL = 'https://apipr.polskieradio.pl/api/stacje'
+
+    _TESTS = [{
+        'url': 'https://player.polskieradio.pl/anteny/trojka',
+        'info_dict': {
+            'id': '3',
+            'ext': 'm3u8',
+            'title': 'Trójka',
+        },
+        'params': {
+            'format': 'bestaudio',
+            # endless stream
+            'skip_download': True,
+        },
+    }]
+
+    def _get_channel_list(self, channel_url='no_channel'):
+        """Return the channel list embedded in the JS player bundle.
+
+        The list is a JavaScript array literal, so it is rewritten into
+        valid JSON before parsing. channel_url is only passed along as the
+        id for the download and JSON-parse helpers.
+        """
+        player_code = self._download_webpage(
+            self._PLAYER_URL, channel_url,
+            note='Downloading js player')
+        channel_list = self._search_regex(
+            r';var r="anteny",a=(\[.+?\])},', player_code, 'channel list')
+        # insert keys inside quotemarks ("key")
+        channel_list = re.sub(r'([{,])(\w+):', r'\1"\2":', channel_list)
+        # Expand minified booleans. In JS a negated non-zero number is false
+        # (!1, !-0.1) and a negated zero is true (!0); the non-zero pattern
+        # must run first so "!0.5" is not left half-replaced by the "!0" rule.
+        channel_list = re.sub(r':\s*!-?(?:[1-9]\d*(?:\.\d+)?|0\.\d+)', r':false', channel_list)
+        channel_list = re.sub(r':\s*!0', r':true', channel_list)
+
+        return self._parse_json(channel_list, channel_url)
+
+    def _real_extract(self, url):
+        """Match the URL's channel to its API station and build a live info dict."""
+        channel_url = self._match_id(url)
+        channel = next(
+            (c for c in self._get_channel_list(channel_url)
+             if c.get('url') == channel_url), None)
+        if not channel:
+            raise ExtractorError('Channel not found')
+
+        station_list = self._download_json(self._STATIONS_API_URL, channel_url,
+                                           note='Downloading stream url list',
+                                           headers={
+                                               'Accept': 'application/json',
+                                               'Referer': url,
+                                               'Origin': self._BASE_URL,
+                                           })
+        # stations are matched by name; the channel's streamName wins if set
+        station_name = channel.get('streamName') or channel.get('name')
+        station = next((s for s in station_list if s.get('Name') == station_name), None)
+        if not station:
+            raise ExtractorError('Station not found even though we extracted channel (this is crazy)')
+
+        formats = []
+        # I have no idea who thought providing just a list of undescribed URLs is ok
+        for stream_url in station.get('Streams') or []:
+            if stream_url.startswith('//'):
+                # assume https on protocol independent URLs
+                stream_url = 'https:' + stream_url
+            if stream_url.endswith('/playlist.m3u8'):
+                formats.extend(self._extract_m3u8_formats(stream_url, channel_url, preference=500, live=True))
+            elif stream_url.endswith('/manifest.f4m'):
+                # f4m is an Adobe HDS manifest, not MPEG-DASH
+                formats.extend(self._extract_f4m_formats(stream_url, channel_url))
+            elif stream_url.endswith('/Manifest'):
+                formats.extend(self._extract_ism_formats(stream_url, channel_url))
+            elif stream_url.startswith(('rtmp://', 'rtsp://', 'mms://')):
+                formats.append({
+                    'url': stream_url,
+                    'preference': -1000,
+                })
+            else:
+                formats.append({
+                    'url': stream_url,
+                    'preference': -500,
+                })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': compat_str(channel['id']),
+            'formats': formats,
+            'title': channel.get('name') or channel.get('streamName'),
+            'display_id': channel_url,
+            'thumbnail': '%s/images/%s-color-logo.png' % (self._BASE_URL, channel_url),
+            'is_live': True,
+        }