rmf extractors

This commit is contained in:
Laura Liberda 2021-01-07 00:51:08 +01:00
parent 0aa7cb240c
commit c758741d55
2 changed files with 147 additions and 0 deletions

View file

@ -962,6 +962,11 @@ from .reuters import ReutersIE
from .reverbnation import ReverbNationIE from .reverbnation import ReverbNationIE
from .rice import RICEIE from .rice import RICEIE
from .rmcdecouverte import RMCDecouverteIE from .rmcdecouverte import RMCDecouverteIE
from .rmf import (
RMFonPodcastsIE,
RMFonStreamIE,
RMF24IE,
)
from .ro220 import Ro220IE from .ro220 import Ro220IE
from .rockstargames import RockstarGamesIE from .rockstargames import RockstarGamesIE
from .roosterteeth import RoosterTeethIE from .roosterteeth import RoosterTeethIE

142
haruhi_dl/extractor/rmf.py Normal file
View file

@ -0,0 +1,142 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import (
compat_str,
)
from ..utils import (
JSON_LD_RE,
parse_iso8601,
unescapeHTML,
)
class RMFonStreamIE(InfoExtractor):
    """Extractor for live radio stations on rmfon.pl (``/play,<id>`` URLs)."""

    IE_NAME = 'rmfon:stream'
    _VALID_URL = r'https?://(?:www\.)?rmfon\.pl/play,(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.rmfon.pl/play,5#p',
        'info_dict': {
            'id': '5',
            'ext': 'mp3',
            'title': 'RMF FM',
        },
    }]

    def _real_extract(self, url):
        """Return the info dict of the live station referenced by *url*.

        Two XML documents are downloaded: the per-station stream list, which
        yields the formats, and the global station list, which is only used
        to look up the station name for the title.
        """
        stream_id = self._match_id(url)

        streams = self._download_xml(
            'https://rmfon.pl/stacje/flash_aac_%s.xml.txt' % stream_id,
            stream_id, 'Downloading station stream list')

        formats = []
        # <item> entries are labelled as AAC, <item_mp3> entries as MP3;
        # the AAC entries are listed first, as before.
        for tag, ext in (('item', 'aac'), ('item_mp3', 'mp3')):
            for stream in streams.iter(tag):
                stream_url = (stream.text or '').strip()
                if not stream_url:
                    # An empty element would produce a format without a URL.
                    continue
                formats.append({
                    'url': stream_url,
                    'ext': ext,
                })

        # seems to have lower size than /json/app.txt and the website
        stream_list = self._download_xml(
            'https://www.rmfon.pl/xml/stations.txt',
            stream_id, 'Downloading station list for metadata')

        station_name = None
        for station in stream_list.iter('station'):
            if station.attrib.get('id') == stream_id:
                station_name = station.attrib.get('name')
                break

        if not station_name:
            # The station list is metadata only: do not give up on a
            # playable stream just because its name could not be found.
            self.report_warning(
                'Station %s not found in the station list, '
                'using its id as the title' % stream_id)
            station_name = stream_id

        return {
            'id': stream_id,
            'formats': formats,
            'title': station_name,
            'is_live': True,
        }
# there doesn't seem to be a way to link to a specific podcast episode...
class RMFonPodcastsIE(InfoExtractor):
    """Extractor for rmfon.pl podcast pages; returns all episodes as a playlist."""

    IE_NAME = 'rmfon:podcasts'
    _VALID_URL = r'https?://(?:www\.)?rmfon\.pl/podcasty/(?P<id>[^/?#]+)'
    _TESTS = [{
        'url': 'https://www.rmfon.pl/podcasty/poranna-rozmowa',
        'info_dict': {
            'id': 'poranna-rozmowa',
            'title': 'Poranna rozmowa w\xa0RMF\xa0FM',
            'description': 'Na poranną publicystykę zaprasza Robert Mazurek. Codziennie, od poniedziałku do piątku o 8:02 polecamy Poranną rozmowę w RMF FM. Gośćmi są nie tylko politycy, ale i ludzie ze świata kultury czy sportu.',
        },
        'playlist_mincount': 30,
    }]

    @staticmethod
    def _episode_id(episode_url):
        """Derive an episode id from the file name of its media URL.

        The query string, fragment and file extension are dropped. An empty
        string is returned when the URL has no usable last path component.
        """
        path = re.split(r'[?#]', episode_url or '', maxsplit=1)[0].rstrip('/')
        basename = path.rpartition('/')[2]
        stem = basename.rpartition('.')[0]
        return stem or basename

    def _real_extract(self, url):
        """Return a playlist with one entry per episode of the podcast."""
        podcast_slug = self._match_id(url)
        meta = self._download_json(
            'https://www.rmfon.pl/json/podcasts.php?c=%s' % (podcast_slug),
            podcast_slug)

        entries = []
        for index, ep in enumerate(meta['episodes'], 1):
            episode_url = ep['url']
            # No episode id is read from the JSON, so one is derived from the
            # media URL. A shared constant id would make every episode look
            # like the same item to output templates and the download archive.
            episode_id = (
                self._episode_id(episode_url)
                or '%s-%d' % (podcast_slug, index))
            entries.append({
                'id': episode_id,
                'url': episode_url,
                'title': ep['t'],
                'description': ep.get('desc'),
                'duration': ep.get('sec'),
                'timestamp': parse_iso8601(ep.get('d')),
                'thumbnail': ep.get('img'),
            })

        return {
            '_type': 'playlist',
            'id': podcast_slug,
            'title': unescapeHTML(meta['title']),
            'description': unescapeHTML(meta.get('description')),
            'thumbnail': meta.get('img'),
            'entries': entries,
        }
class RMF24IE(InfoExtractor):
    """Extractor for rmf24.pl article pages; returns embedded media as a playlist."""

    IE_NAME = 'rmf24'
    _VALID_URL = r'https?://(?:www\.)?rmf24\.pl(?:/[^/?#,]+)+,nId,(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://www.rmf24.pl/tylko-w-rmf24/poranna-rozmowa/news-marek-suski-chyba-sie-zaszczepie-chociaz-pewne-obawy-mam,nId,4942865',
        'info_dict': {
            'id': '4942865',
            'title': 'Marek Suski: Chyba się zaszczepię, chociaż pewne obawy mam ',
            'description': 'md5:1cee8cb54827b5aa9eb39ab1333d4b24',
        },
        'playlist_count': 3,
    }]

    def _real_extract(self, url):
        """Return a playlist built from the page's JSON-LD ``VideoObject`` blocks.

        Every JSON-LD block found in the page is passed through ``_json_ld``;
        only results that carry a string ``url`` become playlist entries.
        Playlist title, description and thumbnail come from OpenGraph tags.
        """
        page_id = self._match_id(url)
        webpage = self._download_webpage(url, page_id)

        entries = []
        for json_ld_mobj in re.finditer(JSON_LD_RE, webpage):
            entry = self._json_ld(
                self._parse_json(json_ld_mobj.group('json_ld'), page_id),
                page_id, expected_type='VideoObject')
            if not isinstance(entry, dict):
                continue
            media_url = entry.get('url')
            if not isinstance(media_url, compat_str):
                continue

            # The media id is the path segment following "/-/" in the URL.
            # Fall back to a page-scoped id instead of crashing when a URL
            # does not have that shape.
            id_mobj = re.match(r'https?://[^/]+/-/([^/-]+)', media_url)
            if id_mobj:
                entry['id'] = id_mobj.group(1)
            else:
                entry['id'] = '%s-%d' % (page_id, len(entries) + 1)
            entries.append(entry)

        return {
            '_type': 'playlist',
            'id': page_id,
            'title': self._og_search_title(webpage),
            'description': self._og_search_description(webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
            'entries': entries,
        }