[polskieradio] radiokierowcow.pl extractor

merge-requests/5/head
Lauren Liberda 2021-03-16 14:57:21 +01:00
parent 999ab0298b
commit 3b151afce7
2 changed files with 72 additions and 22 deletions

View File

@ -964,6 +964,7 @@ from .polskieradio import (
PolskieRadioPlayerIE,
PolskieRadioPodcastIE,
PolskieRadioPodcastListIE,
PolskieRadioRadioKierowcowIE,
)
from .popcorntimes import PopcorntimesIE
from .popcorntv import PopcornTVIE

View File

@ -23,7 +23,36 @@ from ..utils import (
)
class PolskieRadioIE(InfoExtractor):
class PolskieRadioBaseExtractor(InfoExtractor):
    """Base class holding the player-scraping logic shared by subclasses."""

    def _extract_webpage_player_entries(self, webpage, playlist_id, base_data):
        """Build playlist entries from ``data-media`` attributes in HTML.

        :param webpage: HTML text to scan for tags carrying a ``data-media``
            attribute whose value is a JSON object.
        :param playlist_id: id passed to the JSON parser for its messages.
        :param base_data: dict of fields copied into every entry; its
            ``title`` (if any) is kept when a media item has no description.
        :return: list of entry dicts, one per distinct media URL, in the
            order the media items appear in ``webpage``.
        """
        entries = []
        media_urls = set()
        for data_media in re.findall(r'<[^>]+data-media=(["\']?)({[^>]+})\1', webpage):
            media = self._parse_json(unescapeHTML(data_media[1]), playlist_id, fatal=False)
            # Parsing is non-fatal, so a malformed attribute may leave us with
            # no object at all; skip it instead of failing the whole page.
            if not media or not media.get('file'):
                continue
            media_url = self._proto_relative_url(media['file'], 'https:')
            # The same media can be embedded several times on one page.
            if media_url in media_urls:
                continue
            media_urls.add(media_url)
            entry = base_data.copy()
            entry.update({
                'id': compat_str(media['id']),
                'url': media_url,
                'duration': int_or_none(media.get('length')),
                'vcodec': 'none' if media.get('provider') == 'audio' else None,
            })
            # The description is URL-encoded; when it is absent or empty the
            # title inherited from base_data stays in place.
            entry_title = compat_urllib_parse_unquote(media.get('desc') or '')
            if entry_title:
                entry['title'] = entry_title
            entries.append(entry)
        return entries
class PolskieRadioIE(PolskieRadioBaseExtractor):
_VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)'
_TESTS = [{
# like data-media={"type":"muzyka"}
@ -92,29 +121,13 @@ class PolskieRadioIE(InfoExtractor):
thumbnail_url = self._og_search_thumbnail(webpage)
entries = []
media_urls = set()
title = self._og_search_title(webpage).strip()
for data_media in re.findall(r'<[^>]+data-media=(["\']?)({[^>]+})\1', content):
media = self._parse_json(unescapeHTML(data_media[1]), playlist_id, fatal=False)
if not media.get('file'):
continue
media_url = self._proto_relative_url(media['file'], 'http:')
if media_url in media_urls:
continue
media_urls.add(media_url)
entries.append({
'id': compat_str(media['id']),
'url': media_url,
'title': compat_urllib_parse_unquote(media['desc']) or title,
'duration': int_or_none(media.get('length')),
'vcodec': 'none' if media.get('provider') == 'audio' else None,
'timestamp': timestamp,
'thumbnail': thumbnail_url
})
entries = self._extract_webpage_player_entries(content, playlist_id, {
'title': title,
'timestamp': timestamp,
'thumbnail': thumbnail_url,
})
description = strip_or_none(self._og_search_description(webpage))
@ -383,3 +396,39 @@ class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor):
'Content-Type': 'application/json',
})
return self._parse_episode(data[0])
class PolskieRadioRadioKierowcowIE(PolskieRadioBaseExtractor):
    """Extractor for article pages on radiokierowcow.pl."""

    _VALID_URL = r'https?://(?:www\.)?radiokierowcow\.pl/artykul/(?P<id>[0-9]+)'
    IE_NAME = 'polskieradio:kierowcow'

    _TESTS = [{
        'url': 'https://radiokierowcow.pl/artykul/2694529',
        'info_dict': {
            'id': '2694529',
            'title': 'Zielona fala reliktem przeszłości?',
            'description': 'md5:343950a8717c9818fdfd4bd2b8ca9ff2',
        },
        'playlist_count': 3,
    }]

    def _real_extract(self, url):
        """Return a playlist of the media items embedded in an article.

        The article page is downloaded only to read the Next.js build id,
        which is needed to address the article's JSON data endpoint; the
        entries are then scraped from the article content in that JSON.
        """
        media_id = self._match_id(url)
        webpage = self._download_webpage(url, media_id)
        nextjs_build = self._search_nextjs_data(webpage, media_id)['buildId']
        article = self._download_json(
            'https://radiokierowcow.pl/_next/data/%s/artykul/%s.json?articleId=%s' % (nextjs_build, media_id, media_id),
            media_id)
        data = article['pageProps']['data']
        title = data['title']
        # The article title is the fallback title for every entry.
        entries = self._extract_webpage_player_entries(data['content'], media_id, {
            'title': title,
        })
        return {
            '_type': 'playlist',
            'id': media_id,
            'entries': entries,
            'title': title,
            # The description is optional metadata: a missing lead must not
            # abort an otherwise successful extraction.
            'description': data.get('lead'),
        }