# coding: utf-8
from __future__ import unicode_literals

from .common import InfoExtractor


class EurozetArticleIE(InfoExtractor):
    """Extract the video embedded in an article page.

    The page is expected to carry ``data-*`` HTML attributes:
    ``data-storage-id`` holds the video id, and ``data-source-ss``,
    ``data-source-dash`` and ``data-source-hls`` hold manifest URLs.
    Remaining metadata is taken from the page's JSON-LD.
    """

    IE_NAME = 'eurozet:article'

    # The last path segment is captured as ``id``; it is used only as the
    # display id while downloading the page.
    # NOTE(review): the original pattern was damaged (text between ``(?`` and
    # ``[^/\s]+)`` was lost, leaving an invalid regex).  The host part below
    # is limited to radiozet.pl, the only domain visible in _TESTS -- confirm
    # against the intended list of supported sites and extend if needed.
    _VALID_URL = r'https?://(?:[a-z]+\.)*radiozet\.pl/(?:[^/\s]+/)*(?P<id>[^/\s]+)'

    # Template for reading one ``data-<name>="..."`` attribute; the value is
    # exposed as the named group ``content`` used by _real_extract.
    _DATA_RE = r'data-%s="(?P<content>.+?)"'

    _TESTS = [{
        'url': 'https://wiadomosci.radiozet.pl/Gosc-Radia-ZET/Margot-Trzeba-uzywac-mocnych-srodkow-zeby-byc-irytujacym-dla-wladzy',
        'info_dict': {
            'id': '131014',
            'ext': 'm3u8',
            'upload_date': '20200902',
            'title': 'Margot: Trzeba używać mocnych środków, żeby być irytującym dla władzy',
            'timestamp': 1599021420,
            'description': 'md5:d01ba0a7f10c84ed0c7921720411a886',
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for the article at *url*.

        Downloads the page, reads the video id from ``data-storage-id``,
        seeds the result from JSON-LD, then collects formats from every
        manifest URL present on the page.  The returned dict is the JSON-LD
        result with ``id`` and ``formats`` overwritten.
        """
        page_slug = self._match_id(url)
        webpage = self._download_webpage(url, page_slug)

        video_id = self._html_search_regex(
            self._DATA_RE % 'storage-id', webpage, 'video id',
            group='content')

        info_dict = self._search_json_ld(webpage, video_id)

        # Attribute suffix -> extractor for that manifest type, in the order
        # the manifests are probed.
        manifest_extractors = (
            ('ss', self._extract_ism_formats),
            ('dash', self._extract_mpd_formats),
            ('hls', self._extract_m3u8_formats),
        )

        formats = []
        for streaming_std, extract_formats in manifest_extractors:
            # A missing attribute is not an error: fatal=False makes the
            # lookup yield a falsy value and that manifest is skipped.
            stream_url = self._html_search_regex(
                self._DATA_RE % ('source-%s' % streaming_std), webpage,
                '%s manifest url' % streaming_std,
                group='content', fatal=False)
            if not stream_url:
                continue
            formats.extend(extract_formats(stream_url, video_id))

        self._sort_formats(formats)

        info_dict.update({
            'id': video_id,
            'formats': formats,
        })
        return info_dict