From b55552ad1aeb3f53b7788f782b4ec7973446cbc4 Mon Sep 17 00:00:00 2001 From: Laura Liberda Date: Thu, 11 Feb 2021 13:18:57 +0100 Subject: [PATCH] [vod.pl] fix extraction --- haruhi_dl/extractor/vodpl.py | 48 +++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/haruhi_dl/extractor/vodpl.py b/haruhi_dl/extractor/vodpl.py index 4bb75e841..8b07aa655 100644 --- a/haruhi_dl/extractor/vodpl.py +++ b/haruhi_dl/extractor/vodpl.py @@ -2,6 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + parse_duration, + parse_iso8601, +) from .pulsembed import PulseVideoIE @@ -9,16 +15,25 @@ class VODPlIE(InfoExtractor): _VALID_URL = r'https?://vod\.pl/(?:[^/]+/)+(?P[0-9a-zA-Z]+)' _TESTS = [{ - 'url': 'https://vod.pl/filmy/chlopaki-nie-placza/3ep3jns', - 'md5': 'a7dc3b2f7faa2421aefb0ecaabf7ec74', + 'url': 'https://vod.pl/filmy-dokumentalne/wielce-krolewski-slub/wcl5tx0', 'info_dict': { - 'id': '3ep3jns', + 'id': '2163051.179206518', 'ext': 'mp4', - 'title': 'Chłopaki nie płaczą', - 'description': 'md5:f5f03b84712e55f5ac9f0a3f94445224', - 'timestamp': 1463415154, - 'duration': 5765, - 'upload_date': '20160516', + 'title': 'Wielce królewski ślub', + 'description': 'md5:9de1b6df5dba5c44fcde37584ad13302', + 'timestamp': 1580313604, + 'upload_date': '20200129', + }, + }, { + 'url': 'https://vod.pl/filmy/autopsja/62gx8n1', + 'info_dict': { + 'id': '1973639.1440605974', + 'ext': 'mp4', + 'title': 'Autopsja', + 'description': 'md5:94cb987a8caeecd5755e3597d4c0bd66', + 'upload_date': '20190203', + 'timestamp': 1549227901, + 'age_limit': 18, }, }, { 'url': 'https://vod.pl/seriale/belfer-na-planie-praca-kamery-online/2c10heh', @@ -28,8 +43,23 @@ class VODPlIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + data = self._parse_json( + self._search_regex(r'try {\s*vodDataLayer = ({.+?});', webpage, 'vod data layer'), + video_id) + description = clean_html( + self._search_regex( + r'(?s)]+itemprop="description"[^>]*>(.+?)', + webpage, 'description', default=None)) + age_limit = int_or_none(self._search_regex( + r'