From 4cfa7883a3b125023bb20f9195e361bef10eb94f Mon Sep 17 00:00:00 2001 From: Laura Liberda Date: Thu, 11 Feb 2021 14:08:07 +0100 Subject: [PATCH] [clip.rs] fix extraction --- haruhi_dl/extractor/cliprs.py | 42 ++++++++++++++++++++++++++------ haruhi_dl/extractor/pulsembed.py | 15 +++++++++++- 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/haruhi_dl/extractor/cliprs.py b/haruhi_dl/extractor/cliprs.py index 8dda33c87..4e211298f 100644 --- a/haruhi_dl/extractor/cliprs.py +++ b/haruhi_dl/extractor/cliprs.py @@ -2,14 +2,16 @@ from __future__ import unicode_literals from .common import InfoExtractor -from .pulsembed import PulseVideoIE +from ..utils import ( + ExtractorError, +) +from .pulsembed import PulseVideoIE, PulsEmbedIE class ClipRsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P[^/]+)/\d+' _TESTS = [{ - 'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', - 'md5': 'c412d57815ba07b56f9edc7b5d6a14e5', + 'url': 'https://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', 'info_dict': { 'id': '1488842.1399140381', 'ext': 'mp4', @@ -19,16 +21,42 @@ class ClipRsIE(InfoExtractor): 'timestamp': 1459850243, 'upload_date': '20160405', } + }, { + 'url': 'https://www.clip.rs/u-novom-sadu-se-sinoc-desio-jedan-zimski-neum-svi-su-zaboravili-na-koronu-uhvatili-se-u-kolo-i-nastao-je-hit-video/15686', + 'info_dict': { + 'id': '2210721.1689293351', + 'ext': 'mp4', + 'title': 'U Novom Sadu se sinoć desio jedan zimski Neum: Svi su zaboravili na koronu, uhvatili se u kolo i nastao je HIT VIDEO', + 'description': 'md5:b1d7d6c0b029b922f06a2a08c9761852', + 'timestamp': 1609405068, + 'upload_date': '20201231', + }, }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) + info_dict = {} - return { + mvp_id = PulseVideoIE._search_mvp_id(webpage, default=None) + if mvp_id: + info_dict.update({ + 'url': 'pulsevideo:%s' % PulseVideoIE._search_mvp_id(webpage), + 'ie_key': PulseVideoIE.ie_key(), + }) + else: + entries = PulsEmbedIE._extract_entries(webpage) + if not entries: + raise ExtractorError('Video ID not found on webpage') + if len(entries) > 1: + raise ExtractorError('More than 1 PulsEmbed') + info_dict.update(entries[0]) + + info_dict.update({ '_type': 'url_transparent', - 'url': 'pulsevideo:%s' % PulseVideoIE._search_mvp_id(webpage), - 'ie_key': PulseVideoIE.ie_key(), + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), 'display_id': display_id, - } + }) + return info_dict diff --git a/haruhi_dl/extractor/pulsembed.py b/haruhi_dl/extractor/pulsembed.py index 0fac0a7ad..0151053eb 100644 --- a/haruhi_dl/extractor/pulsembed.py +++ b/haruhi_dl/extractor/pulsembed.py @@ -43,6 +43,10 @@ class PulseVideoIE(InfoExtractor): r'data-(?:params-)?mvp=["\'](\d+\.\d+)', webpage) if mvp: return mvp.group(1) + mvp = re.search( + r'\sid=(["\']?)mvp:(\d+\.\d+)\1', webpage) + if mvp: + return mvp.group(2) if default != NO_DEFAULT: return default raise ExtractorError('Could not extract mvp') @@ -222,7 +226,16 @@ class PulsEmbedIE(InfoExtractor): 'url': smuggle_url('pulsembed:%s' % embed.group('id'), {'referer': url}), 'ie_key': 'PulsEmbed', }) - return entries + + ids = [] + + def dedupe(entry): + if entry['url'] not in ids: + ids.append(entry['url']) + return True + return False + + return list(filter(dedupe, entries)) def _real_extract(self, url): video_id = self._match_id(url)