[generic] normalizing embeds part 2137

This commit is contained in:
Laura Liberda 2021-01-31 01:26:39 +01:00
parent b1c1d64de0
commit a3816f69be
7 changed files with 48 additions and 57 deletions

View file

@ -363,6 +363,16 @@ class DailymotionPlaylistIE(DailymotionPlaylistBaseIE):
}] }]
_OBJECT_TYPE = 'collection' _OBJECT_TYPE = 'collection'
def _extract_urls(webpage, url=None):
m = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
if m:
playlists = re.findall(
r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
if playlists:
return ['//dailymotion.com/playlist/%s' % p for p in playlists]
return []
class DailymotionUserIE(DailymotionPlaylistBaseIE): class DailymotionUserIE(DailymotionPlaylistBaseIE):
IE_NAME = 'dailymotion:user' IE_NAME = 'dailymotion:user'

View file

@ -121,6 +121,7 @@ from .odnoklassniki import OdnoklassnikiIE
from .kinja import KinjaEmbedIE from .kinja import KinjaEmbedIE
from .onnetwork import OnNetworkLoaderIE from .onnetwork import OnNetworkLoaderIE
from .embetty import EmbettyIE from .embetty import EmbettyIE
from .rtlnl import RtlNlIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -2582,18 +2583,28 @@ class GenericIE(InfoExtractor):
VimeoIE, VimeoIE,
SoundcloudEmbedIE, SoundcloudEmbedIE,
KalturaIE, KalturaIE,
RtlNlIE,
TeachableIE, # must be before Wistia
WistiaIE,
SVTIE,
): ):
try: try:
ie_key = embie.ie_key()
embie_urls = embie._extract_urls(webpage, embie_urls = embie._extract_urls(webpage,
url=url) url=url)
if embie_urls: if embie_urls:
entries = [] entries = []
for embie_url in embie_urls: for embie_url in embie_urls:
entries.append({ entry = {
'_type': 'url_transparent', '_type': 'url_transparent',
'url': smuggle_url(unescapeHTML(embie_url), {'source_url': embie_url}), 'url': embie_url,
'ie_key': embie.ie_key(), 'ie_key': ie_key,
}) }
if ie_key in ("Wistia", ):
    # The uploader goes on this entry's dict; `entries` is the list being
    # built and cannot be indexed by a string key.
    entry["uploader"] = video_uploader
if ie_key in ("Bandcamp", ):
entry["ie_key"] = None
entries.append(entry)
return { return {
'_type': 'playlist', '_type': 'playlist',
'entries': entries, 'entries': entries,
@ -2605,50 +2616,6 @@ class GenericIE(InfoExtractor):
self.report_warning('The exception above was caused by: %sIE' % embie.ie_key()) self.report_warning('The exception above was caused by: %sIE' % embie.ie_key())
raise exc raise exc
# Look for embedded rtl.nl player
matches = re.findall(
r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',
webpage)
if matches:
return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')
vid_me_embed_url = self._search_regex(
r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
webpage, 'vid.me embed', default=None)
if vid_me_embed_url is not None:
return self.url_result(vid_me_embed_url, 'Vidme')
# Look for embedded Dailymotion playlist player (#3822)
m = re.search(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
if m:
playlists = re.findall(
r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
if playlists:
return self.playlist_from_matches(
playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
# Look for Teachable embeds, must be before Wistia
teachable_url = TeachableIE._extract_url(webpage, url)
if teachable_url:
return self.url_result(teachable_url)
# Look for embedded Wistia player
wistia_urls = WistiaIE._extract_urls(webpage)
if wistia_urls:
playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key())
for entry in playlist['entries']:
entry.update({
'_type': 'url_transparent',
'uploader': video_uploader,
})
return playlist
# Look for SVT player
svt_url = SVTIE._extract_url(webpage)
if svt_url:
return self.url_result(svt_url, 'SVT')
# Look for Bandcamp pages with custom domain # Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
if mobj is not None: if mobj is not None:

View file

@ -1,6 +1,8 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
@ -98,6 +100,12 @@ class RtlNlIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
@staticmethod
def _extract_urls(webpage, url=None):
return re.findall(
r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',
webpage)
def _real_extract(self, url): def _real_extract(self, url):
uuid = self._match_id(url) uuid = self._match_id(url)
info = self._download_json( info = self._download_json(

View file

@ -108,11 +108,10 @@ class SVTIE(SVTBaseIE):
} }
@staticmethod
def _extract_urls(webpage, url=None):
    """Return every SVT URL embedded in *webpage*.

    A URL is picked up when it appears as the value of an ``<iframe src=``
    or ``href=`` attribute and matches ``SVTIE._VALID_URL``. The *url*
    argument is accepted but not used. Returns a list, possibly empty.
    """
    embed_re = r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL
    found = []
    for hit in re.finditer(embed_re, webpage):
        found.append(hit.group('url'))
    return found
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)

View file

@ -144,11 +144,11 @@ class TeachableIE(TeachableBaseIE):
webpage) webpage)
@staticmethod
def _extract_urls(webpage, url=None):
    """Return the prefixed Teachable URL for the current page, if any.

    webpage -- HTML source, checked with ``TeachableIE._is_teachable``.
    url     -- URL of the page being inspected; may be None.

    Returns a one-element list holding *url* prefixed with
    ``TeachableBaseIE._URL_PREFIX`` when the page passes the Teachable check
    and *url* points at a ``/courses`` or ``/p`` path. Returns an empty list
    in every other case, so callers always receive a list.
    """
    if not TeachableIE._is_teachable(webpage):
        return []
    # url defaults to None; re.match would raise TypeError on it.
    if url and re.match(r'https?://[^/]+/(?:courses|p)', url):
        return ['%s%s' % (TeachableBaseIE._URL_PREFIX, url)]
    return []
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)

View file

@ -1,6 +1,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import itertools import itertools
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_HTTPError from ..compat import compat_HTTPError
@ -132,6 +133,12 @@ class VidmeIE(InfoExtractor):
}, },
}] }]
@staticmethod
def _extract_urls(webpage, url=None):
return re.findall(
r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
webpage)
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)

View file

@ -49,7 +49,7 @@ class WistiaIE(InfoExtractor):
return urls[0] if urls else None return urls[0] if urls else None
@staticmethod @staticmethod
def _extract_urls(webpage): def _extract_urls(webpage, url=None):
urls = [] urls = []
for match in re.finditer( for match in re.finditer(
r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage): r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage):