A bit more embed-search normalization: move the Vimeo, SoundCloud, and Kaltura embed lookups into the shared extractor loop

This commit is contained in:
Laura Liberda 2021-01-05 20:06:22 +01:00
parent 8f86520b7a
commit 1d601522cc
4 changed files with 25 additions and 23 deletions

View file

@ -2561,12 +2561,28 @@ class GenericIE(InfoExtractor):
ExpressenIE, ExpressenIE,
ZypeIE, ZypeIE,
OnNetworkLoaderIE, OnNetworkLoaderIE,
VimeoIE,
SoundcloudEmbedIE,
KalturaIE,
): ):
try: try:
embie_urls = embie._extract_urls(webpage, embie_urls = embie._extract_urls(webpage,
url=url) url=url)
if embie_urls: if embie_urls:
return self.playlist_from_matches(embie_urls, video_id, video_title, ie=embie.ie_key()) entries = []
for embie_url in embie_urls:
entries.append({
'_type': 'url_transparent',
'url': smuggle_url(unescapeHTML(embie_url), {'source_url': embie_url}),
'ie_key': embie.ie_key(),
})
return {
'_type': 'playlist',
'entries': entries,
'id': video_id,
'title': video_title,
'uploader': video_uploader,
}
except Exception as exc: except Exception as exc:
self.report_warning('The exception above was caused by: %sIE' % embie.ie_key()) self.report_warning('The exception above was caused by: %sIE' % embie.ie_key())
raise exc raise exc
@ -2578,10 +2594,6 @@ class GenericIE(InfoExtractor):
if matches: if matches:
return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')
vimeo_urls = VimeoIE._extract_urls(url, webpage)
if vimeo_urls:
return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())
vid_me_embed_url = self._search_regex( vid_me_embed_url = self._search_regex(
r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
webpage, 'vid.me embed', default=None) webpage, 'vid.me embed', default=None)
@ -2778,11 +2790,6 @@ class GenericIE(InfoExtractor):
if myvi_url: if myvi_url:
return self.url_result(myvi_url) return self.url_result(myvi_url)
# Look for embedded soundcloud player
soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage)
if soundcloud_urls:
return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML)
# Look for embedded mtvservices player # Look for embedded mtvservices player
mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage)
if mtvservices_url: if mtvservices_url:
@ -2842,14 +2849,6 @@ class GenericIE(InfoExtractor):
if mobj is not None: if mobj is not None:
return self.url_result(mobj.group('url'), 'Zapiks') return self.url_result(mobj.group('url'), 'Zapiks')
# Look for Kaltura embeds
kaltura_urls = KalturaIE._extract_urls(webpage)
if kaltura_urls:
return self.playlist_from_matches(
kaltura_urls, video_id, video_title,
getter=lambda x: smuggle_url(x, {'source_url': url}),
ie=KalturaIE.ie_key())
# Look for EaglePlatform embeds # Look for EaglePlatform embeds
eagleplatform_url = EaglePlatformIE._extract_url(webpage) eagleplatform_url = EaglePlatformIE._extract_url(webpage)
if eagleplatform_url: if eagleplatform_url:

View file

@ -117,7 +117,7 @@ class KalturaIE(InfoExtractor):
return urls[0] if urls else None return urls[0] if urls else None
@staticmethod @staticmethod
def _extract_urls(webpage): def _extract_urls(webpage, url=None):
# Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site
finditer = ( finditer = (
re.finditer( re.finditer(
@ -159,13 +159,15 @@ class KalturaIE(InfoExtractor):
for k, v in embed_info.items(): for k, v in embed_info.items():
if v: if v:
embed_info[k] = v.strip() embed_info[k] = v.strip()
url = 'kaltura:%(partner_id)s:%(id)s' % embed_info result_url = 'kaltura:%(partner_id)s:%(id)s' % embed_info
escaped_pid = re.escape(embed_info['partner_id']) escaped_pid = re.escape(embed_info['partner_id'])
service_mobj = re.search( service_mobj = re.search(
r'<script[^>]+src=(["\'])(?P<id>(?:https?:)?//(?:(?!\1).)+)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid), r'<script[^>]+src=(["\'])(?P<id>(?:https?:)?//(?:(?!\1).)+)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid),
webpage) webpage)
smug = {'source_url': url}
if service_mobj: if service_mobj:
url = smuggle_url(url, {'service_url': service_mobj.group('id')}) smug['service_url'] = service_mobj.group('id')
url = smuggle_url(result_url, smug)
urls.append(url) urls.append(url)
return urls return urls

View file

@ -24,6 +24,7 @@ from ..utils import (
mimetype2ext, mimetype2ext,
str_or_none, str_or_none,
try_get, try_get,
unescapeHTML,
unified_timestamp, unified_timestamp,
update_url_query, update_url_query,
url_or_none, url_or_none,
@ -41,7 +42,7 @@ class SoundcloudEmbedIE(InfoExtractor):
@staticmethod @staticmethod
def _extract_urls(webpage, **kwargs): def _extract_urls(webpage, **kwargs):
return [m.group('url') for m in re.finditer( return [unescapeHTML(m.group('url')) for m in re.finditer(
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1',
webpage)] webpage)]

View file

@ -519,7 +519,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
return smuggle_url(url, {'http_headers': {'Referer': referrer_url}}) return smuggle_url(url, {'http_headers': {'Referer': referrer_url}})
@staticmethod @staticmethod
def _extract_urls(url, webpage): def _extract_urls(webpage, url=None):
urls = [] urls = []
# Look for embedded (iframe) Vimeo player # Look for embedded (iframe) Vimeo player
for mobj in re.finditer( for mobj in re.finditer(