From 1d601522cc0603244df625f7248d2f52113d5057 Mon Sep 17 00:00:00 2001 From: Laura Liberda Date: Tue, 5 Jan 2021 20:06:22 +0100 Subject: [PATCH] a bit more embed searching normalization --- haruhi_dl/extractor/generic.py | 35 +++++++++++++++---------------- haruhi_dl/extractor/kaltura.py | 8 ++++--- haruhi_dl/extractor/soundcloud.py | 3 ++- haruhi_dl/extractor/vimeo.py | 2 +- 4 files changed, 25 insertions(+), 23 deletions(-) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index 66f80a8ee..06187b3de 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -2561,12 +2561,28 @@ class GenericIE(InfoExtractor): ExpressenIE, ZypeIE, OnNetworkLoaderIE, + VimeoIE, + SoundcloudEmbedIE, + KalturaIE, ): try: embie_urls = embie._extract_urls(webpage, url=url) if embie_urls: - return self.playlist_from_matches(embie_urls, video_id, video_title, ie=embie.ie_key()) + entries = [] + for embie_url in embie_urls: + entries.append({ + '_type': 'url_transparent', + 'url': smuggle_url(unescapeHTML(embie_url), {'source_url': embie_url}), + 'ie_key': embie.ie_key(), + }) + return { + '_type': 'playlist', + 'entries': entries, + 'id': video_id, + 'title': video_title, + 'uploader': video_uploader, + } except Exception as exc: self.report_warning('The exception above was caused by: %sIE' % embie.ie_key()) raise exc @@ -2578,10 +2594,6 @@ class GenericIE(InfoExtractor): if matches: return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') - vimeo_urls = VimeoIE._extract_urls(url, webpage) - if vimeo_urls: - return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) - vid_me_embed_url = self._search_regex( r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', webpage, 'vid.me embed', default=None) @@ -2778,11 +2790,6 @@ class GenericIE(InfoExtractor): if myvi_url: return self.url_result(myvi_url) - # Look for embedded soundcloud player - soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage) - if soundcloud_urls: - return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML) - # Look for embedded mtvservices player mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) if mtvservices_url: @@ -2842,14 +2849,6 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'Zapiks') - # Look for Kaltura embeds - kaltura_urls = KalturaIE._extract_urls(webpage) - if kaltura_urls: - return self.playlist_from_matches( - kaltura_urls, video_id, video_title, - getter=lambda x: smuggle_url(x, {'source_url': url}), - ie=KalturaIE.ie_key()) - # Look for EaglePlatform embeds eagleplatform_url = EaglePlatformIE._extract_url(webpage) if eagleplatform_url: diff --git a/haruhi_dl/extractor/kaltura.py b/haruhi_dl/extractor/kaltura.py index 49d13460d..b094cc987 100644 --- a/haruhi_dl/extractor/kaltura.py +++ b/haruhi_dl/extractor/kaltura.py @@ -117,7 +117,7 @@ class KalturaIE(InfoExtractor): return urls[0] if urls else None @staticmethod - def _extract_urls(webpage): + def _extract_urls(webpage, url=None): # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site finditer = ( re.finditer( @@ -159,13 +159,15 @@ class KalturaIE(InfoExtractor): for k, v in embed_info.items(): if v: embed_info[k] = v.strip() - url = 'kaltura:%(partner_id)s:%(id)s' % embed_info + result_url = 'kaltura:%(partner_id)s:%(id)s' % embed_info escaped_pid = re.escape(embed_info['partner_id']) service_mobj = re.search( r']+src=(["\'])(?P(?:https?:)?//(?:(?!\1).)+)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid), webpage) + smug = {'source_url': url} if service_mobj: - url = smuggle_url(url, {'service_url': service_mobj.group('id')}) + smug['service_url'] = service_mobj.group('id') + url = smuggle_url(result_url, smug) urls.append(url) return urls diff --git a/haruhi_dl/extractor/soundcloud.py b/haruhi_dl/extractor/soundcloud.py index 896fa77a4..723dcb8fd 100644 --- a/haruhi_dl/extractor/soundcloud.py +++ b/haruhi_dl/extractor/soundcloud.py @@ -24,6 +24,7 @@ from ..utils import ( mimetype2ext, str_or_none, try_get, + unescapeHTML, unified_timestamp, update_url_query, url_or_none, @@ -41,7 +42,7 @@ class SoundcloudEmbedIE(InfoExtractor): @staticmethod def _extract_urls(webpage, **kwargs): - return [m.group('url') for m in re.finditer( + return [unescapeHTML(m.group('url')) for m in re.finditer( r']+src=(["\'])(?P(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', webpage)] diff --git a/haruhi_dl/extractor/vimeo.py b/haruhi_dl/extractor/vimeo.py index da4f08704..9be1adcf9 100644 --- a/haruhi_dl/extractor/vimeo.py +++ b/haruhi_dl/extractor/vimeo.py @@ -519,7 +519,7 @@ class VimeoIE(VimeoBaseInfoExtractor): return smuggle_url(url, {'http_headers': {'Referer': referrer_url}}) @staticmethod - def _extract_urls(url, webpage): + def _extract_urls(webpage, url=None): urls = [] # Look for embedded (iframe) Vimeo player for mobj in re.finditer(