[sejm.pl] multiple cameras and PJM translator

2021-04-12 12:07:19 +02:00 · 2021-04-12 12:07:19 +02:00 · 8324350f32
parent 059505a9ff
commit 8324350f32
2 changed files with 79 additions and 12 deletions
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@ -1107,7 +1107,10 @@ from .scte import (
    SCTECourseIE,
 )
 from .seeker import SeekerIE
-from .sejmpl import SejmPlArchivalIE
+from .sejmpl import (
+    SejmPlArchivalIE,
+    SejmPlVideoIE,
+)
 from .senateisvp import SenateISVPIE
 from .sendtonews import SendtoNewsIE
 from .servus import ServusIE
--- a/haruhi_dl/extractor/sejmpl.py
+++ b/haruhi_dl/extractor/sejmpl.py
@ -3,19 +3,36 @@
 from .common import InfoExtractor
 from ..utils import (
    clean_html,
+    js_to_json,
 )

 import datetime
 import re
+from urllib.parse import parse_qs


 class SejmPlArchivalIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?sejm\.gov\.pl/Sejm(?P<term>\d+)\.nsf/transmisje_arch\.xsp(?:\?(?:[^&\s]+(?:&[^&\s]+)*)?)?(?:#|unid=)(?P<id>[\dA-F]+)'
    IE_NAME = 'sejm.pl:archival'

+    _TESTS = [{
+        # multiple cameras, PJM translator
+        'url': 'https://www.sejm.gov.pl/Sejm9.nsf/transmisje_arch.xsp#9587D63364A355A1C1258562004DCF21',
+        'info_dict': {
+            'id': '9587D63364A355A1C1258562004DCF21',
+            'title': '11. posiedzenie Sejmu IX kadencji',
+        },
+        'playlist_count': 10,
+    }]
+
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        term, video_id = mobj.group('term', 'id')
+        frame = self._download_webpage(
+            'https://sejm-embed.redcdn.pl/Sejm%s.nsf/VideoFrame.xsp/%s' % (term, video_id),
+            video_id, headers={
+                'Referer': 'https://www.sejm.gov.pl/Sejm%s.nsf/transmisje_arch.xsp' % (term),
+            })
        data = self._download_json(
            'https://www.sejm.gov.pl/Sejm%s.nsf/transmisje_arch.xsp/json/%s' % (term, video_id),
            video_id, headers={
@ -32,30 +49,77 @@ class SejmPlArchivalIE(InfoExtractor):

        start_time = iso_date_to_wtf_atende_wants(params['start'])
        stop_time = iso_date_to_wtf_atende_wants(params['stop'])
-        time_query = 'startTime=%d&stopTime=%d' % (start_time, stop_time)

-        file = params['file'].replace('http:', 'https:')
+        duration = stop_time - start_time
+
+        entries = []
+
+        def add_entry(file):
+            if not file:
+                return
+            file = 'https:%s?startTime=%d&stopTime=%d' % (file, start_time, stop_time)
+            stream_id = self._search_regex(r'/o2/sejm/([^/]+)/[^./]+\.livx', file, 'stream id')
+            entries.append({
+                '_type': 'url_transparent',
+                'url': file,
+                'ie_key': 'SejmPlVideo',
+                'id': stream_id,
+                'title': stream_id,
+                'duration': duration,
+            })
+
+        cameras = self._parse_json(
+            self._search_regex(r'(?s)var cameras = (\[.+?\]);', frame, 'camera list'),
+            video_id, js_to_json)
+        for camera in cameras:
+            add_entry(camera['file']['flv'])
+
+        if params.get('mig'):
+            add_entry(self._search_regex(r"var sliUrl = '(.+?)';", frame, 'migacz url', fatal=False))
+
+        return {
+            '_type': 'multi_video',
+            'entries': entries,
+            'id': video_id,
+            'title': data['title'],
+            'description': clean_html(data['desc']),
+            'duration': duration,
+        }
+
+
+class SejmPlVideoIE(InfoExtractor):
+    _VALID_URL = r'https?://[^.]+\.dcs\.redcdn\.pl/[^/]+/o2/sejm/(?P<id>[^/]+)/(?P<filename>[^./]+)\.livx\?(?P<qs>.+)'
+    IE_NAME = 'sejm.pl:video'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        camera, filename, qs = mobj.group('id', 'filename', 'qs')
+        qs = parse_qs(qs)
+        start_time, stop_time = int(qs["startTime"][0]), int(qs["stopTime"][0])
+
+        file = f'https://r.dcs.redcdn.pl/%s/o2/sejm/{camera}/{filename}.livx?startTime={start_time}&stopTime={stop_time}'
+        file_index = file + '&indexMode=true'
+
+        # sejm videos don't have an id, just a camera (pov) id and time range
+        video_id = '%s-%d-%d' % (camera, start_time, stop_time)
+
        formats = [{
-            'url': file + '?' + time_query,
+            'url': file % 'nvr',
            'ext': 'flv',
            'format_id': 'direct-0',
            'preference': -1,   # VERY slow to download (~200 KiB/s, compared to ~10-15 MiB/s by DASH/HLS)
        }]
-        formats.extend(self._extract_mpd_formats(
-            file.replace('/nvr/', '/livedash/') + '?indexMode=true&' + time_query,
-            video_id, mpd_id='dash'))
+        formats.extend(self._extract_mpd_formats(file_index % 'livedash', video_id, mpd_id='dash'))
        formats.extend(self._extract_m3u8_formats(
-            file.replace('/nvr/', '/livehls/') + '/playlist.m3u8?indexMode=true&' + time_query,
-            video_id, m3u8_id='hls'))
+            file_index.replace('?', '/playlist.m3u8?') % 'livehls', video_id, m3u8_id='hls'))

        self._sort_formats(formats)

        duration = stop_time - start_time

        return {
-            'formats': formats,
            'id': video_id,
-            'title': data['title'],
-            'description': clean_html(data['desc']),
+            'title': camera,
+            'formats': formats,
            'duration': duration,
        }