From d2e522de09ed2f2a73b41b6e1fc73c537667e897 Mon Sep 17 00:00:00 2001
From: Laura Liberda
Date: Mon, 25 Jan 2021 15:45:55 +0100
Subject: [PATCH] wp.pl extractor

---
 haruhi_dl/extractor/extractors.py |  1 +
 haruhi_dl/extractor/wppl.py       | 81 +++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+)
 create mode 100644 haruhi_dl/extractor/wppl.py

diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index 0b8eb1202..a68090c79 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -1465,6 +1465,7 @@ from .weibo import (
 from .weiqitv import WeiqiTVIE
 from .wistia import WistiaIE
 from .worldstarhiphop import WorldStarHipHopIE
+from .wppl import WpPlIE
 from .wsj import (
     WSJIE,
     WSJArticleIE,
diff --git a/haruhi_dl/extractor/wppl.py b/haruhi_dl/extractor/wppl.py
new file mode 100644
index 000000000..b299f2d07
--- /dev/null
+++ b/haruhi_dl/extractor/wppl.py
@@ -0,0 +1,81 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+)
+
+
+class WpPlIE(InfoExtractor):
+    """Extractor for videos embedded in wp.pl (Wirtualna Polska) pages."""
+
+    # The named group "id" is what _match_id() reads; it captures the
+    # numeric page identifier together with its trailing "v".
+    _VALID_URL = r'https://(?:[^/]+\.)?wp\.pl/[^/]+-(?P<id>\d+v)'
+    IE_NAME = 'wp.pl'
+    IE_DESC = 'Wirtualna Polska'
+    _TESTS = [{
+        'url': 'https://wiadomosci.wp.pl/piotr-wawrzyk-na-rpo-dzieki-psl-to-byloby-trzesienie-ziemi-w-polskiej-polityce-6600013103609985v',
+        'info_dict': {
+            'id': '2062884',
+            'ext': 'mp4',
+            'title': 'Piotr Wawrzyk na RPO dzięki PSL? "To byłoby trzęsienie ziemi w polskiej polityce"',
+            'description': 'md5:c9b41dce48678c605cedf3f3fe5282c5',
+        },
+    }]
+
+    def _real_extract(self, url):
+        """Download the page and build the info dict for its video.
+
+        The player element in the page carries the video id, which is then
+        used to fetch the clip metadata and stream list from the embed API.
+        """
+        page_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, page_id)
+        # The element type is not relied upon; any tag with the id matches.
+        video_id = self._search_regex(
+            r'<[^>]+\bid="video-player-(\d+)', webpage, 'video id')
+
+        video_data = self._download_json(
+            'https://wideo.wp.pl/api/v2/embed/%s/secured' % video_id,
+            video_id)['clip']
+
+        formats = []
+        for fmt in video_data['url']:
+            ext = determine_ext(fmt['url'])
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    fmt['url'], video_id, m3u8_id=fmt['type']))
+            elif ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    fmt['url'], video_id, mpd_id=fmt['type']))
+            else:
+                http_format = {
+                    'url': fmt['url'],
+                    'ext': ext,
+                    'format_id': '%s-%s' % (fmt['type'], fmt['quality']),
+                }
+                # "resolution" is expected to look like "1280x720"; skip the
+                # dimensions instead of crashing when it is absent or odd.
+                mobj = re.match(r'(\d+)x(\d+)', fmt.get('resolution') or '')
+                if mobj:
+                    http_format.update({
+                        'width': int(mobj.group(1)),
+                        'height': int(mobj.group(2)),
+                    })
+                formats.append(http_format)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'formats': formats,
+            'title': video_data['title'],
+            'description': video_data.get('description'),
+            'thumbnail': video_data.get('screenshot'),
+            'duration': video_data.get('duration'),
+            'age_limit': video_data.get('minimalAge'),
+        }