[generic] normalizing embeds part 2137

This commit is contained in:
Laura Liberda 2021-01-31 01:26:39 +01:00
parent b1c1d64de0
commit a3816f69be
7 changed files with 48 additions and 57 deletions

View file

@ -363,6 +363,16 @@ class DailymotionPlaylistIE(DailymotionPlaylistBaseIE):
}]
_OBJECT_TYPE = 'collection'
@staticmethod
def _extract_urls(webpage, url=None):
    """Return Dailymotion playlist URLs embedded via a jukebox widget iframe.

    Only the first ``<iframe>`` whose ``src`` points at a Dailymotion
    ``/widget/jukebox`` URL is inspected; every ``list[]=/playlist/<id>/``
    parameter found in it yields one protocol-relative playlist URL.

    ``url`` is accepted for signature parity with the other
    ``_extract_urls`` helpers and is not used here.

    Returns a (possibly empty) list of URL strings.
    """
    jukebox = re.search(
        r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
    if not jukebox:
        return []
    # The src attribute is HTML-escaped in the page, so decode entities
    # before looking for the playlist parameters.
    playlist_ids = re.findall(
        r'list\[\]=/playlist/([^/]+)/', unescapeHTML(jukebox.group('url')))
    # re.findall already gives [] when nothing matches, so no extra guard.
    return ['//dailymotion.com/playlist/%s' % playlist_id for playlist_id in playlist_ids]
class DailymotionUserIE(DailymotionPlaylistBaseIE):
IE_NAME = 'dailymotion:user'

View file

@ -121,6 +121,7 @@ from .odnoklassniki import OdnoklassnikiIE
from .kinja import KinjaEmbedIE
from .onnetwork import OnNetworkLoaderIE
from .embetty import EmbettyIE
from .rtlnl import RtlNlIE
class GenericIE(InfoExtractor):
@ -2582,18 +2583,28 @@ class GenericIE(InfoExtractor):
VimeoIE,
SoundcloudEmbedIE,
KalturaIE,
RtlNlIE,
TeachableIE, # must be before Wistia
WistiaIE,
SVTIE,
):
try:
ie_key = embie.ie_key()
embie_urls = embie._extract_urls(webpage,
url=url)
if embie_urls:
entries = []
for embie_url in embie_urls:
entries.append({
entry = {
'_type': 'url_transparent',
'url': smuggle_url(unescapeHTML(embie_url), {'source_url': embie_url}),
'ie_key': embie.ie_key(),
})
'url': embie_url,
'ie_key': ie_key,
}
# Wistia entries additionally carry `video_uploader`, as the dedicated
# Wistia branch did before it was folded into this loop. The key must be
# set on the `entry` dict: `entries` is the list being built, and
# item-assigning a string key on a list raises TypeError.
if ie_key in ("Wistia", ):
    entry["uploader"] = video_uploader
# NOTE(review): clearing ie_key presumably lets the URL be dispatched by
# URL matching instead of forcing the Bandcamp extractor - confirm.
if ie_key in ("Bandcamp", ):
    entry["ie_key"] = None
entries.append(entry)
return {
'_type': 'playlist',
'entries': entries,
@ -2605,50 +2616,6 @@ class GenericIE(InfoExtractor):
self.report_warning('The exception above was caused by: %sIE' % embie.ie_key())
raise exc
# Look for embedded rtl.nl player
matches = re.findall(
r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',
webpage)
if matches:
return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')
vid_me_embed_url = self._search_regex(
r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
webpage, 'vid.me embed', default=None)
if vid_me_embed_url is not None:
return self.url_result(vid_me_embed_url, 'Vidme')
# Look for embedded Dailymotion playlist player (#3822)
m = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
if m:
playlists = re.findall(
r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
if playlists:
return self.playlist_from_matches(
playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
# Look for Teachable embeds, must be before Wistia
teachable_url = TeachableIE._extract_url(webpage, url)
if teachable_url:
return self.url_result(teachable_url)
# Look for embedded Wistia player
wistia_urls = WistiaIE._extract_urls(webpage)
if wistia_urls:
playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key())
for entry in playlist['entries']:
entry.update({
'_type': 'url_transparent',
'uploader': video_uploader,
})
return playlist
# Look for SVT player
svt_url = SVTIE._extract_url(webpage)
if svt_url:
return self.url_result(svt_url, 'SVT')
# Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
if mobj is not None:

View file

@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
@ -98,6 +100,12 @@ class RtlNlIE(InfoExtractor):
'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage, url=None):
    """Collect the ``src`` of every rtl.nl embed ``<iframe>`` in ``webpage``.

    Only double-quoted ``src`` attributes are recognised. ``url`` is part of
    the common ``_extract_urls`` signature and is not used here.

    Returns a (possibly empty) list of URL strings in document order.
    """
    embed_iframe_re = r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"'
    return [found.group(1) for found in re.finditer(embed_iframe_re, webpage)]
def _real_extract(self, url):
uuid = self._match_id(url)
info = self._download_json(

View file

@ -108,11 +108,10 @@ class SVTIE(SVTBaseIE):
}
@staticmethod
def _extract_url(webpage):
mobj = re.search(
def _extract_urls(webpage, url=None):
mobj = re.finditer(
r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL, webpage)
if mobj:
return mobj.group('url')
return [match.group('url') for match in mobj]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)

View file

@ -144,11 +144,11 @@ class TeachableIE(TeachableBaseIE):
webpage)
@staticmethod
def _extract_url(webpage, source_url):
def _extract_urls(webpage, url=None):
if not TeachableIE._is_teachable(webpage):
return
if re.match(r'https?://[^/]+/(?:courses|p)', source_url):
return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url)
return []
if re.match(r'https?://[^/]+/(?:courses|p)', url):
return ['%s%s' % (TeachableBaseIE._URL_PREFIX, url)]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)

View file

@ -1,6 +1,7 @@
from __future__ import unicode_literals
import itertools
import re
from .common import InfoExtractor
from ..compat import compat_HTTPError
@ -132,6 +133,12 @@ class VidmeIE(InfoExtractor):
},
}]
@staticmethod
def _extract_urls(webpage, url=None):
    """Collect every ``https?://vid.me/...`` URL used as a ``src`` attribute.

    Both single- and double-quoted attribute values are recognised. ``url``
    is part of the common ``_extract_urls`` signature and is not used here.

    Returns a (possibly empty) list of URL strings in document order.
    """
    vidme_src_re = r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]'
    return [found.group(1) for found in re.finditer(vidme_src_re, webpage)]
def _real_extract(self, url):
video_id = self._match_id(url)

View file

@ -49,7 +49,7 @@ class WistiaIE(InfoExtractor):
return urls[0] if urls else None
@staticmethod
def _extract_urls(webpage):
def _extract_urls(webpage, url=None):
urls = []
for match in re.finditer(
r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage):