[agora] wyborcza/wysokieobcasy/tokfm podcast fixes

2021-01-23 00:31:40 +01:00 · 2021-01-23 00:31:40 +01:00 · 570cf794a9
parent 3bb3d99229
commit 570cf794a9
1 changed file with 44 additions and 7 deletions
--- a/haruhi_dl/extractor/agora.py
+++ b/haruhi_dl/extractor/agora.py
@@ -63,7 +63,12 @@ class WyborczaVideoIE(InfoExtractor):


 class WyborczaPodcastIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?wyborcza\.pl/podcast(?:/0,172673\.html(?:\?(?:[^&]+?&)*?podcast=(?P<episode_id>\d+))?)?'
+    _VALID_URL = r'''(?x)
+        https?://(?:www\.)?
+            (?:wyborcza\.pl/podcast(?:/0,172673\.html)?
+            |wysokieobcasy\.pl/wysokie-obcasy/0,176631\.html)
+        (?:\?(?:[^&]+?&)*?podcast=(?P<episode_id>\d+))?
+    '''
    _TESTS = [{
        'url': 'https://wyborcza.pl/podcast/0,172673.html?podcast=100720#S.main_topic-K.C-B.6-L.1.podcast',
        'info_dict': {
@@ -74,6 +79,16 @@ class WyborczaPodcastIE(InfoExtractor):
            'upload_date': '20210117',
            'description': 'md5:49f0a06ffc4c1931210d3ab1416a651d',
        },
+    }, {
+        'url': 'https://www.wysokieobcasy.pl/wysokie-obcasy/0,176631.html?podcast=100673',
+        'info_dict': {
+            'id': '100673',
+            'ext': 'mp3',
+            'title': 'Czym jest ubóstwo menstruacyjne i dlaczego dotyczy każdej i każdego z nas?',
+            'uploader': 'Agnieszka Urazińska ',
+            'upload_date': '20210115',
+            'description': 'md5:c161dc035f8dbb60077011fc41274899',
+        },
    }, {
        'url': 'https://wyborcza.pl/podcast',
        'info_dict': {
@@ -81,6 +96,13 @@ class WyborczaPodcastIE(InfoExtractor):
            'title': 'Gościnnie w TOK FM: Wyborcza, 8:10',
        },
        'playlist_mincount': 370,
+    }, {
+        'url': 'https://www.wysokieobcasy.pl/wysokie-obcasy/0,176631.html',
+        'info_dict': {
+            'id': '395',
+            'title': 'Gościnnie w TOK FM: Wysokie Obcasy',
+        },
+        'playlist_mincount': 12,
    }]

    def _real_extract(self, url):
@@ -90,10 +112,12 @@ class WyborczaPodcastIE(InfoExtractor):
        if not podcast_id:
            return {
                '_type': 'url',
-                'url': 'tokfm:audition:334',
+                'url': 'tokfm:audition:%s' % ('395' if 'wysokieobcasy.pl/' in url else '334'),
                'ie_key': 'TokFMAudition',
            }
-        meta = self._download_json('https://wyborcza.pl/api/podcast?guid=%s' % podcast_id, podcast_id)
+        meta = self._download_json('https://wyborcza.pl/api/podcast?guid=%s%s' % (podcast_id,
+                                                                                  '&type=wo' if 'wysokieobcasy.pl/' in url else ''),
+                                   podcast_id)
        published_date = meta['publishedDate'].split(' ')
        upload_date = '%s%s%s' % (published_date[2], {
            'stycznia': '01',
@@ -184,18 +208,31 @@ class TokFMAuditionIE(InfoExtractor):
    def _real_extract(self, url):
        audition_id = self._match_id(url)

+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Linux; Android 9; Redmi 3S Build/PQ3A.190801.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/87.0.4280.101 Mobile Safari/537.36',
+        }
+
        data = self._download_json(
            'https://api.podcast.radioagora.pl/api4/getSeries?series_id=%s' % (audition_id),
-            audition_id, 'Downloading audition metadata')
+            audition_id, 'Downloading audition metadata', headers=headers)

        if len(data) == 0:
            raise ExtractorError('No such audition')
        data = data[0]
        entries = []
        for page in range(0, (int(data['total_podcasts']) // 30) + 1):
-            podcast_page = self._download_json(
-                'https://api.podcast.radioagora.pl/api4/getPodcasts?series_id=%s&limit=30&offset=%d&with_guests=true&with_leaders_for_mobile=true' % (audition_id, page),
-                audition_id, 'Downloading podcast list (page #%d)' % (page + 1))
+            podcast_page = False
+            retries = 0
+            while retries <= 5 and podcast_page is False:
+                podcast_page = self._download_json(
+                    'https://api.podcast.radioagora.pl/api4/getPodcasts?series_id=%s&limit=30&offset=%d&with_guests=true&with_leaders_for_mobile=true' % (audition_id, page),
+                    audition_id, 'Downloading podcast list (page #%d%s)' % (
+                        page + 1,
+                        (', try %d' % retries) if retries > 0 else ''),
+                    headers=headers)
+                retries += 1
+            if podcast_page is False:
+                raise ExtractorError('Agora returned shit 5 times in a row', expected=True)
            for podcast in podcast_page:
                entries.append({
                    '_type': 'url_transparent',