[generic] normalizing embeds part 2137

2021-01-31 01:26:39 +01:00 · 2021-01-31 01:26:39 +01:00 · a3816f69be
parent b1c1d64de0
commit a3816f69be
7 changed files with 48 additions and 57 deletions
--- a/haruhi_dl/extractor/dailymotion.py
+++ b/haruhi_dl/extractor/dailymotion.py
@ -363,6 +363,16 @@ class DailymotionPlaylistIE(DailymotionPlaylistBaseIE):
    }]
    _OBJECT_TYPE = 'collection'
    def _extract_urls(webpage, url=None):
        m = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
        if m:
            playlists = re.findall(
                r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
            if playlists:
                return ['//dailymotion.com/playlist/%s' % p for p in playlists]
        return []
 class DailymotionUserIE(DailymotionPlaylistBaseIE):
    IE_NAME = 'dailymotion:user'
--- a/haruhi_dl/extractor/generic.py
+++ b/haruhi_dl/extractor/generic.py
@ -121,6 +121,7 @@ from .odnoklassniki import OdnoklassnikiIE
 from .kinja import KinjaEmbedIE
 from .onnetwork import OnNetworkLoaderIE
 from .embetty import EmbettyIE
 from .rtlnl import RtlNlIE
 class GenericIE(InfoExtractor):
@ -2582,18 +2583,28 @@ class GenericIE(InfoExtractor):
            VimeoIE,
            SoundcloudEmbedIE,
            KalturaIE,
            RtlNlIE,
            TeachableIE,    # must be before Wistia
            WistiaIE,
            SVTIE,
        ):
            try:
                ie_key = embie.ie_key()
                embie_urls = embie._extract_urls(webpage,
                                                 url=url)
                if embie_urls:
                    entries = []
                    for embie_url in embie_urls:
-                        entries.append({
+                        entry = {
                            '_type': 'url_transparent',
-                            'url': smuggle_url(unescapeHTML(embie_url), {'source_url': embie_url}),
+                            'url': embie_url,
-                            'ie_key': embie.ie_key(),
+                            'ie_key': ie_key,
-                        })
+                        }
                        if ie_key in ("Wistia", ):
                            # Per-entry metadata goes on the entry dict;
                            # `entries` is the list of results, so indexing
                            # it with a string key would raise TypeError.
                            entry["uploader"] = video_uploader
                        if ie_key in ("Bandcamp", ):
                            entry["ie_key"] = None
                        entries.append(entry)
                    return {
                        '_type': 'playlist',
                        'entries': entries,
@ -2605,50 +2616,6 @@ class GenericIE(InfoExtractor):
                self.report_warning('The exception above was caused by: %sIE' % embie.ie_key())
                raise exc
        # Look for embedded rtl.nl player
        matches = re.findall(
            r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',
            webpage)
        if matches:
            return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')
        vid_me_embed_url = self._search_regex(
            r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
            webpage, 'vid.me embed', default=None)
        if vid_me_embed_url is not None:
            return self.url_result(vid_me_embed_url, 'Vidme')
        # Look for embedded Dailymotion playlist player (#3822)
        m = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
        if m:
            playlists = re.findall(
                r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
            if playlists:
                return self.playlist_from_matches(
                    playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
        # Look for Teachable embeds, must be before Wistia
        teachable_url = TeachableIE._extract_url(webpage, url)
        if teachable_url:
            return self.url_result(teachable_url)
        # Look for embedded Wistia player
        wistia_urls = WistiaIE._extract_urls(webpage)
        if wistia_urls:
            playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key())
            for entry in playlist['entries']:
                entry.update({
                    '_type': 'url_transparent',
                    'uploader': video_uploader,
                })
            return playlist
        # Look for SVT player
        svt_url = SVTIE._extract_url(webpage)
        if svt_url:
            return self.url_result(svt_url, 'SVT')
        # Look for Bandcamp pages with custom domain
        mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
        if mobj is not None:
--- a/haruhi_dl/extractor/rtlnl.py
+++ b/haruhi_dl/extractor/rtlnl.py
@ -1,6 +1,8 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..utils import (
    int_or_none,
@ -98,6 +100,12 @@ class RtlNlIE(InfoExtractor):
        'only_matching': True,
    }]
    @staticmethod
    def _extract_urls(webpage, url=None):
        return re.findall(
            r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',
            webpage)
    def _real_extract(self, url):
        uuid = self._match_id(url)
        info = self._download_json(
--- a/haruhi_dl/extractor/svt.py
+++ b/haruhi_dl/extractor/svt.py
@ -108,11 +108,10 @@ class SVTIE(SVTBaseIE):
    }
    @staticmethod
-    def _extract_url(webpage):
+    def _extract_urls(webpage, url=None):
-        mobj = re.search(
+        mobj = re.finditer(
            r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL, webpage)
-        if mobj:
+        return [match.group('url') for match in mobj]
-            return mobj.group('url')
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
--- a/haruhi_dl/extractor/teachable.py
+++ b/haruhi_dl/extractor/teachable.py
@ -144,11 +144,11 @@ class TeachableIE(TeachableBaseIE):
            webpage)
    @staticmethod
-    def _extract_url(webpage, source_url):
+    def _extract_urls(webpage, url=None):
        if not TeachableIE._is_teachable(webpage):
-            return
+            return []
-        if re.match(r'https?://[^/]+/(?:courses|p)', source_url):
+        if re.match(r'https?://[^/]+/(?:courses|p)', url):
-            return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url)
+            return ['%s%s' % (TeachableBaseIE._URL_PREFIX, url)]
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
--- a/haruhi_dl/extractor/vidme.py
+++ b/haruhi_dl/extractor/vidme.py
@ -1,6 +1,7 @@
 from __future__ import unicode_literals
 import itertools
 import re
 from .common import InfoExtractor
 from ..compat import compat_HTTPError
@ -132,6 +133,12 @@ class VidmeIE(InfoExtractor):
        },
    }]
    @staticmethod
    def _extract_urls(webpage, url=None):
        return re.findall(
            r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
            webpage)
    def _real_extract(self, url):
        video_id = self._match_id(url)
--- a/haruhi_dl/extractor/wistia.py
+++ b/haruhi_dl/extractor/wistia.py
@ -49,7 +49,7 @@ class WistiaIE(InfoExtractor):
        return urls[0] if urls else None
    @staticmethod
-    def _extract_urls(webpage):
+    def _extract_urls(webpage, url=None):
        urls = []
        for match in re.finditer(
                r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage):