diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index b6d3a6c82..a48564139 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1107,7 +1107,10 @@ from .scte import ( SCTECourseIE, ) from .seeker import SeekerIE -from .sejmpl import SejmPlArchivalIE +from .sejmpl import ( + SejmPlArchivalIE, + SejmPlVideoIE, +) from .senateisvp import SenateISVPIE from .sendtonews import SendtoNewsIE from .servus import ServusIE diff --git a/haruhi_dl/extractor/sejmpl.py b/haruhi_dl/extractor/sejmpl.py index 31e3d766c..666d5f4dc 100644 --- a/haruhi_dl/extractor/sejmpl.py +++ b/haruhi_dl/extractor/sejmpl.py @@ -3,19 +3,36 @@ from .common import InfoExtractor from ..utils import ( clean_html, + js_to_json, ) import datetime import re +from urllib.parse import parse_qs class SejmPlArchivalIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?sejm\.gov\.pl/Sejm(?P\d+)\.nsf/transmisje_arch\.xsp(?:\?(?:[^&\s]+(?:&[^&\s]+)*)?)?(?:#|unid=)(?P[\dA-F]+)' IE_NAME = 'sejm.pl:archival' + _TESTS = [{ + # multiple cameras, PJM translator + 'url': 'https://www.sejm.gov.pl/Sejm9.nsf/transmisje_arch.xsp#9587D63364A355A1C1258562004DCF21', + 'info_dict': { + 'id': '9587D63364A355A1C1258562004DCF21', + 'title': '11. posiedzenie Sejmu IX kadencji', + }, + 'playlist_count': 10, + }] + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) term, video_id = mobj.group('term', 'id') + frame = self._download_webpage( + 'https://sejm-embed.redcdn.pl/Sejm%s.nsf/VideoFrame.xsp/%s' % (term, video_id), + video_id, headers={ + 'Referer': 'https://www.sejm.gov.pl/Sejm%s.nsf/transmisje_arch.xsp' % (term), + }) data = self._download_json( 'https://www.sejm.gov.pl/Sejm%s.nsf/transmisje_arch.xsp/json/%s' % (term, video_id), video_id, headers={ @@ -32,30 +49,77 @@ class SejmPlArchivalIE(InfoExtractor): start_time = iso_date_to_wtf_atende_wants(params['start']) stop_time = iso_date_to_wtf_atende_wants(params['stop']) - time_query = 'startTime=%d&stopTime=%d' % (start_time, stop_time) - file = params['file'].replace('http:', 'https:') + duration = stop_time - start_time + + entries = [] + + def add_entry(file): + if not file: + return + file = 'https:%s?startTime=%d&stopTime=%d' % (file, start_time, stop_time) + stream_id = self._search_regex(r'/o2/sejm/([^/]+)/[^./]+\.livx', file, 'stream id') + entries.append({ + '_type': 'url_transparent', + 'url': file, + 'ie_key': 'SejmPlVideo', + 'id': stream_id, + 'title': stream_id, + 'duration': duration, + }) + + cameras = self._parse_json( + self._search_regex(r'(?s)var cameras = (\[.+?\]);', frame, 'camera list'), + video_id, js_to_json) + for camera in cameras: + add_entry(camera['file']['flv']) + + if params.get('mig'): + add_entry(self._search_regex(r"var sliUrl = '(.+?)';", frame, 'migacz url', fatal=False)) + + return { + '_type': 'multi_video', + 'entries': entries, + 'id': video_id, + 'title': data['title'], + 'description': clean_html(data['desc']), + 'duration': duration, + } + + +class SejmPlVideoIE(InfoExtractor): + _VALID_URL = r'https?://[^.]+\.dcs\.redcdn\.pl/[^/]+/o2/sejm/(?P[^/]+)/(?P[^./]+)\.livx\?(?P.+)' + IE_NAME = 'sejm.pl:video' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + camera, filename, qs = mobj.group('id', 'filename', 'qs') + qs = parse_qs(qs) + start_time, stop_time = int(qs["startTime"][0]), int(qs["stopTime"][0]) + + file = f'https://r.dcs.redcdn.pl/%s/o2/sejm/{camera}/{filename}.livx?startTime={start_time}&stopTime={stop_time}' + file_index = file + '&indexMode=true' + + # sejm videos don't have an id, just a camera (pov) id and time range + video_id = '%s-%d-%d' % (camera, start_time, stop_time) + formats = [{ - 'url': file + '?' + time_query, + 'url': file % 'nvr', 'ext': 'flv', 'format_id': 'direct-0', 'preference': -1, # VERY slow to download (~200 KiB/s, compared to ~10-15 MiB/s by DASH/HLS) }] - formats.extend(self._extract_mpd_formats( - file.replace('/nvr/', '/livedash/') + '?indexMode=true&' + time_query, - video_id, mpd_id='dash')) + formats.extend(self._extract_mpd_formats(file_index % 'livedash', video_id, mpd_id='dash')) formats.extend(self._extract_m3u8_formats( - file.replace('/nvr/', '/livehls/') + '/playlist.m3u8?indexMode=true&' + time_query, - video_id, m3u8_id='hls')) + file_index.replace('?', '/playlist.m3u8?') % 'livehls', video_id, m3u8_id='hls')) self._sort_formats(formats) duration = stop_time - start_time return { - 'formats': formats, 'id': video_id, - 'title': data['title'], - 'description': clean_html(data['desc']), + 'title': camera, + 'formats': formats, 'duration': duration, }