refactor from onet mvp to pulsevideo

This commit is contained in:
Laura Liberda 2021-02-11 01:09:05 +01:00
parent f603f36c3f
commit 630a86c5e3
5 changed files with 172 additions and 156 deletions

View file

@ -1,12 +1,13 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from .onet import OnetBaseIE from .common import InfoExtractor
from .pulsembed import PulseVideoIE
class ClipRsIE(OnetBaseIE): class ClipRsIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P<id>[^/]+)/\d+' _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P<id>[^/]+)/\d+'
_TEST = { _TESTS = [{
'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', 'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732',
'md5': 'c412d57815ba07b56f9edc7b5d6a14e5', 'md5': 'c412d57815ba07b56f9edc7b5d6a14e5',
'info_dict': { 'info_dict': {
@ -18,16 +19,16 @@ class ClipRsIE(OnetBaseIE):
'timestamp': 1459850243, 'timestamp': 1459850243,
'upload_date': '20160405', 'upload_date': '20160405',
} }
} }]
def _real_extract(self, url): def _real_extract(self, url):
display_id = self._match_id(url) display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
mvp_id = self._search_mvp_id(webpage) return {
'_type': 'url_transparent',
info_dict = self._extract_from_id(mvp_id, webpage) 'url': 'pulsevideo:%s' % PulseVideoIE._search_mvp_id(webpage),
info_dict['display_id'] = display_id 'ie_key': PulseVideoIE.ie_key(),
'display_id': display_id,
return info_dict }

View file

@ -824,10 +824,7 @@ from .odnoklassniki import OdnoklassnikiIE
from .okopress import OKOPressIE from .okopress import OKOPressIE
from .oktoberfesttv import OktoberfestTVIE from .oktoberfesttv import OktoberfestTVIE
from .ondemandkorea import OnDemandKoreaIE from .ondemandkorea import OnDemandKoreaIE
from .onet import ( from .onet import OnetPlIE
OnetMVPIE,
OnetPlIE,
)
from .onionstudios import OnionStudiosIE from .onionstudios import OnionStudiosIE
from .onnetwork import ( from .onnetwork import (
OnNetworkLoaderIE, OnNetworkLoaderIE,
@ -922,7 +919,10 @@ from .puhutv import (
PuhuTVIE, PuhuTVIE,
PuhuTVSerieIE, PuhuTVSerieIE,
) )
from .pulsembed import PulsEmbedIE from .pulsembed import (
PulsEmbedIE,
PulseVideoIE,
)
from .presstv import PressTVIE from .presstv import PressTVIE
from .prosiebensat1 import ProSiebenSat1IE from .prosiebensat1 import ProSiebenSat1IE
from .puls4 import Puls4IE from .puls4 import Puls4IE

View file

@ -1,125 +1,14 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
determine_ext,
ExtractorError, ExtractorError,
float_or_none,
int_or_none,
NO_DEFAULT,
parse_iso8601,
) )
from .pulsembed import (
PulsEmbedIE,
class OnetBaseIE(InfoExtractor): PulseVideoIE,
@staticmethod )
# Find the MVP asset id embedded in a page's HTML.
# Returns the first "<digits>.<digits>" value found in a data-mvp or
# data-params-mvp attribute of `webpage`.
def _search_mvp_id(webpage, default=NO_DEFAULT):
mvp = re.search(
r'data-(?:params-)?mvp=["\'](\d+\.\d+)', webpage)
if mvp:
return mvp.group(1)
# No match: hand back the caller-supplied fallback when one was passed,
# otherwise fail with ExtractorError.
if default != NO_DEFAULT:
return default
raise ExtractorError('Could not extract mvp')
# Fetch asset details for `video_id` from the qi.ckm.onetapi.pl endpoint and
# turn them into an info dict with the keys id, title, description, duration,
# timestamp and formats.
# `webpage` is optional; when given, its OpenGraph title/description take
# precedence over the values in the API metadata.
# Raises ExtractorError (expected=True) when the response carries an 'error'.
def _extract_from_id(self, video_id, webpage=None):
# The request is expressed entirely through query parameters
# (method 'get_asset_detail', jsonrpc '2.0').
response = self._download_json(
'http://qi.ckm.onetapi.pl/', video_id,
query={
'body[id]': video_id,
'body[jsonrpc]': '2.0',
'body[method]': 'get_asset_detail',
'body[params][ID_Publikacji]': video_id,
'body[params][Service]': 'www.onet.pl',
'content-type': 'application/jsonp',
'x-onet-app': 'player.front.onetapi.pl',
})
error = response.get('error')
if error:
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, error['message']), expected=True)
# NOTE(review): .get('0') yields None when the key is absent, in which case
# the video['formats'] lookup below raises TypeError — confirm whether the
# API can return a result without a '0' entry.
video = response['result'].get('0')
formats = []
# video['formats'] is walked as: format_type -> format_id -> list of entries.
# Levels that do not have the expected container type are skipped.
for format_type, formats_dict in video['formats'].items():
if not isinstance(formats_dict, dict):
continue
for format_id, format_list in formats_dict.items():
if not isinstance(format_list, list):
continue
for f in format_list:
video_url = f.get('url')
if not video_url:
continue
ext = determine_ext(video_url)
# Manifest-based entries are expanded by the matching helper; each is
# non-fatal so one broken manifest does not abort the extraction.
if format_id.startswith('ism'):
formats.extend(self._extract_ism_formats(
video_url, video_id, 'mss', fatal=False))
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
video_url, video_id, mpd_id='dash', fatal=False))
elif format_id.startswith('hls'):
formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
else:
# Anything else is treated as a direct URL format.
http_f = {
'url': video_url,
'format_id': format_id,
'abr': float_or_none(f.get('audio_bitrate')),
}
# Entries under the 'audio' type are marked as having no video codec;
# all others get resolution and video bitrate.
if format_type == 'audio':
http_f['vcodec'] = 'none'
else:
http_f.update({
'height': int_or_none(f.get('vertical_resolution')),
'width': int_or_none(f.get('horizontal_resolution')),
'vbr': float_or_none(f.get('video_bitrate')),
})
formats.append(http_f)
self._sort_formats(formats)
meta = video.get('meta', {})
# Title is required: falls back to meta['title'], which raises KeyError
# when missing. Description is optional.
title = (self._og_search_title(
webpage, default=None) if webpage else None) or meta['title']
description = (self._og_search_description(
webpage, default=None) if webpage else None) or meta.get('description')
# 'lenght' is read as a fallback key — presumably a misspelling that the
# API emits; verify before removing.
duration = meta.get('length') or meta.get('lenght')
# ' ' is passed as the second argument — presumably the date/time
# delimiter; confirm against utils.parse_iso8601.
timestamp = parse_iso8601(meta.get('addDate'), ' ')
return {
'id': video_id,
'title': title,
'description': description,
'duration': duration,
'timestamp': timestamp,
'formats': formats,
}
# Extractor for internal "onetmvp:<id>" URLs, where <id> has the form
# "<digits>.<digits>".
class OnetMVPIE(OnetBaseIE):
_VALID_URL = r'onetmvp:(?P<id>\d+\.\d+)'
_TEST = {
'url': 'onetmvp:381027.1509591944',
'only_matching': True,
}
# The id captured by _VALID_URL is passed straight to _extract_from_id
# (no webpage is supplied).
def _real_extract(self, url):
return self._extract_from_id(self._match_id(url))
# Embed discovery: returns a one-element list with an "onetmvp:<id>" URL when
# `webpage` contains an MVP id, otherwise an empty list. Extra keyword
# arguments are accepted and ignored.
@staticmethod
def _extract_urls(webpage, **kw):
mvp = OnetBaseIE._search_mvp_id(webpage, default=None)
if mvp:
return ['onetmvp:%s' % mvp]
return []
class OnetPlIE(InfoExtractor): class OnetPlIE(InfoExtractor):
@ -133,11 +22,11 @@ class OnetPlIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'description': 'md5:0e70c7be673157c62ca183791d2b7b27', 'description': 'md5:0e70c7be673157c62ca183791d2b7b27',
'title': 'Podróż służbowa z wypadem na stok? "Załatwiamy wszystko na nartach"', 'title': 'Podróż służbowa z wypadem na stok? "Załatwiamy wszystko na nartach"',
'timestamp': 1607177736, 'timestamp': 1607174136,
'upload_date': '20201205', 'upload_date': '20201205',
} }
}, { }, {
# audio podcast form from libsyn.com via pulsembed.eu (2 iframes fucking nested in each other, who the fuck did this?) # audio podcast form from libsyn.com via pulsembed
'url': 'https://wiadomosci.onet.pl/tylko-w-onecie/milosc-w-czasach-zarazy/nbqxxwm', 'url': 'https://wiadomosci.onet.pl/tylko-w-onecie/milosc-w-czasach-zarazy/nbqxxwm',
'info_dict': { 'info_dict': {
'id': '12991166', 'id': '12991166',
@ -153,7 +42,7 @@ class OnetPlIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'title': 'Narodowy program szczepień na koronawirusa. Poznaliśmy szczegóły', 'title': 'Narodowy program szczepień na koronawirusa. Poznaliśmy szczegóły',
'description': 'md5:44f34f9718714e208797f62d851b58ec', 'description': 'md5:44f34f9718714e208797f62d851b58ec',
'timestamp': 1607111725, 'timestamp': 1607108125,
'upload_date': '20201204', 'upload_date': '20201204',
}, },
}, { }, {
@ -176,20 +65,24 @@ class OnetPlIE(InfoExtractor):
url = url.replace('.amp', '') url = url.replace('.amp', '')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
mvp_id = self._search_mvp_id(webpage, default=None) info_dict = self._search_json_ld(webpage, video_id, expected_type='NewsArticle')
info_dict['id'] = video_id
if not mvp_id: mvp_id = PulseVideoIE._search_mvp_id(webpage, default=None)
pulsembed_url = self._search_regex( if mvp_id:
r'data-src=(["\'])(?P<url>(?:https?:)?//pulsembed\.eu/.+?)\1', info_dict.update({
webpage, 'pulsembed url', group='url') 'url': 'pulsevideo:%s' % mvp_id,
webpage = self._download_webpage( 'ie_key': PulseVideoIE.ie_key(),
pulsembed_url, video_id, 'Downloading pulsembed webpage') })
mvp_id = self._search_mvp_id(webpage, default=None)
if not mvp_id:
libsyn_url = self._search_regex(r'src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/.+?)\1',
webpage, 'libsyn url', group='url')
if libsyn_url:
return self.url_result(libsyn_url, 'Libsyn')
return self.url_result( p2ems = PulsEmbedIE._extract_entries(webpage)
'onetmvp:%s' % mvp_id, OnetMVPIE.ie_key(), video_id=mvp_id) if len(p2ems) > 1:
info_dict.update({
'_type': 'playlist',
'entries': p2ems,
})
if p2ems:
info_dict.update(p2ems[0])
return info_dict
raise ExtractorError('PulsEmbed not found')

View file

@ -9,16 +9,130 @@ from ..compat import (
compat_str, compat_str,
) )
from ..utils import ( from ..utils import (
try_get, determine_ext,
float_or_none,
int_or_none,
parse_iso8601,
smuggle_url, smuggle_url,
try_get,
unescapeHTML, unescapeHTML,
unsmuggle_url, unsmuggle_url,
ExtractorError, ExtractorError,
NO_DEFAULT,
) )
from .libsyn import LibsynIE from .libsyn import LibsynIE
from .xnews import XLinkIE from .xnews import XLinkIE
from .tvp import TVPEmbedIE from .tvp import TVPEmbedIE
from .onet import OnetMVPIE
class PulseVideoIE(InfoExtractor):
    """
    Extractor for PulseVideo (formerly Onet MVP) assets.

    PulseVideo is a name used now by Ringier Axel Springer Tech.
    Onet MVP is a name used previously by Onet's DreamLab,
    before Onet became a part of Ringier Axel Springer Polska.

    Handles internal ``pulsevideo:<id>`` URLs; the legacy ``onetmvp:<id>``
    scheme is still matched by ``_VALID_URL``.
    """

    _VALID_URL = r'(?:pulsevideo|onetmvp):(?P<id>\d+\.\d+)'
    _TESTS = [{
        'url': 'onetmvp:381027.1509591944',
        'only_matching': True,
    }, {
        'url': 'pulsevideo:381027.1509591944',
        'only_matching': True,
    }]

    @staticmethod
    def _search_mvp_id(webpage, default=NO_DEFAULT):
        """
        Return the MVP id found in a ``data-mvp`` / ``data-params-mvp``
        attribute of ``webpage``.

        If nothing matches, ``default`` is returned when one was passed;
        otherwise ExtractorError is raised.
        """
        mvp = re.search(
            r'data-(?:params-)?mvp=["\'](\d+\.\d+)', webpage)
        if mvp:
            return mvp.group(1)
        # NO_DEFAULT is a sentinel, so test identity rather than equality
        if default is not NO_DEFAULT:
            return default
        raise ExtractorError('Could not extract mvp')

    def _extract_formats(self, video, video_id):
        """Build the (unsorted) format list from ``video['formats']``."""
        formats_by_type = video.get('formats')
        if not isinstance(formats_by_type, dict):
            return []

        formats = []
        # Layout walked here: format_type -> format_id -> list of entries
        for format_type, formats_dict in formats_by_type.items():
            if not isinstance(formats_dict, dict):
                continue
            for format_id, format_list in formats_dict.items():
                if not isinstance(format_list, list):
                    continue
                for f in format_list:
                    if not isinstance(f, dict):
                        continue
                    video_url = f.get('url')
                    if not video_url:
                        continue
                    ext = determine_ext(video_url)
                    # Manifest lookups are non-fatal so that one broken
                    # manifest does not abort the whole extraction
                    if format_id.startswith('ism'):
                        formats.extend(self._extract_ism_formats(
                            video_url, video_id, 'mss', fatal=False))
                    elif ext == 'mpd':
                        formats.extend(self._extract_mpd_formats(
                            video_url, video_id, mpd_id='dash', fatal=False))
                    elif format_id.startswith('hls'):
                        formats.extend(self._extract_m3u8_formats(
                            video_url, video_id, 'mp4', 'm3u8_native',
                            m3u8_id='hls', fatal=False))
                    else:
                        http_f = {
                            'url': video_url,
                            'format_id': format_id,
                            'abr': float_or_none(f.get('audio_bitrate')),
                        }
                        if format_type == 'audio':
                            http_f['vcodec'] = 'none'
                        else:
                            http_f.update({
                                'height': int_or_none(f.get('vertical_resolution')),
                                'width': int_or_none(f.get('horizontal_resolution')),
                                'vbr': float_or_none(f.get('video_bitrate')),
                            })
                        formats.append(http_f)
        return formats

    def _extract_from_id(self, video_id, webpage=None):
        """
        Download asset details for ``video_id`` and return an info dict.

        ``webpage`` is optional; when given, its OpenGraph title and
        description take precedence over the API metadata.

        Raises ExtractorError when the API reports an error or returns
        no usable video entry.
        """
        response = self._download_json(
            'http://qi.ckm.onetapi.pl/', video_id,
            query={
                'body[id]': video_id,
                'body[jsonrpc]': '2.0',
                'body[method]': 'get_asset_detail',
                'body[params][ID_Publikacji]': video_id,
                'body[params][Service]': 'www.onet.pl',
                'content-type': 'application/jsonp',
                'x-onet-app': 'player.front.onetapi.pl',
            })

        error = response.get('error')
        if error:
            raise ExtractorError(
                '%s said: %s' % (self.IE_NAME, error['message']), expected=True)

        # A missing or malformed result used to surface as a bare
        # KeyError/TypeError further down; fail with a clear message instead
        video = try_get(response, lambda x: x['result']['0'], dict)
        if not video:
            raise ExtractorError(
                '%s returned no video data' % self.IE_NAME)

        formats = self._extract_formats(video, video_id)
        self._sort_formats(formats)

        # "or {}" also covers a meta key that is present but null
        meta = video.get('meta') or {}

        # The title is mandatory, hence the hard meta['title'] lookup
        title = (self._og_search_title(
            webpage, default=None) if webpage else None) or meta['title']
        description = (self._og_search_description(
            webpage, default=None) if webpage else None) or meta.get('description')
        # 'lenght' is presumably a misspelling emitted by the API;
        # verify before dropping the fallback
        duration = meta.get('length') or meta.get('lenght')
        timestamp = parse_iso8601(meta.get('addDate'), ' ')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
        }

    def _real_extract(self, url):
        return self._extract_from_id(self._match_id(url))

    @staticmethod
    def _extract_urls(webpage, **kw):
        """
        Return a list with one internal ``pulsevideo:<id>`` URL when
        ``webpage`` embeds an MVP id, otherwise an empty list.
        Extra keyword arguments are accepted and ignored.
        """
        mvp = PulseVideoIE._search_mvp_id(webpage, default=None)
        if mvp:
            # Same scheme the other extractors emit; still matches _VALID_URL
            return ['pulsevideo:%s' % mvp]
        return []
class PulsEmbedIE(InfoExtractor): class PulsEmbedIE(InfoExtractor):
@ -131,7 +245,7 @@ class PulsEmbedIE(InfoExtractor):
LibsynIE, LibsynIE,
XLinkIE, XLinkIE,
TVPEmbedIE, TVPEmbedIE,
OnetMVPIE, PulseVideoIE,
): ):
embie_urls = embie._extract_urls(webpage, url=referer) embie_urls = embie._extract_urls(webpage, url=referer)
if embie_urls: if embie_urls:
@ -147,6 +261,11 @@ class PulsEmbedIE(InfoExtractor):
unknown_iframe = self._html_search_regex(r'<iframe[^>]*\ssrc=(["\'])(?P<url>[^\1]+)\1', unknown_iframe = self._html_search_regex(r'<iframe[^>]*\ssrc=(["\'])(?P<url>[^\1]+)\1',
webpage, 'unknown iframe', group='url', default=None) webpage, 'unknown iframe', group='url', default=None)
if unknown_iframe: if unknown_iframe:
if any((s in unknown_iframe for s in (
# feel free to extend the list
'//forms.freshmail.io/',
))):
return
webpage = self._download_webpage(unknown_iframe, video_id, 'Downloading unknown nested iframe') webpage = self._download_webpage(unknown_iframe, video_id, 'Downloading unknown nested iframe')
referer = unknown_iframe referer = unknown_iframe
new_page = True new_page = True

View file

@ -1,10 +1,11 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from .onet import OnetBaseIE from .common import InfoExtractor
from .pulsembed import PulseVideoIE
class VODPlIE(OnetBaseIE): class VODPlIE(InfoExtractor):
_VALID_URL = r'https?://vod\.pl/(?:[^/]+/)+(?P<id>[0-9a-zA-Z]+)' _VALID_URL = r'https?://vod\.pl/(?:[^/]+/)+(?P<id>[0-9a-zA-Z]+)'
_TESTS = [{ _TESTS = [{
@ -27,6 +28,8 @@ class VODPlIE(OnetBaseIE):
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
info_dict = self._extract_from_id(self._search_mvp_id(webpage), webpage) return {
info_dict['id'] = video_id '_type': 'url_transparent',
return info_dict 'url': 'pulsevideo:%s' % PulseVideoIE._search_mvp_id(webpage),
'ie_key': PulseVideoIE.ie_key(),
}