[generic] normalizing embeds part 2137

2021-01-31 01:26:39 +01:00 · 2021-01-31 01:26:39 +01:00 · a3816f69be
parent b1c1d64de0
commit a3816f69be
7 changed files with 48 additions and 57 deletions
--- a/haruhi_dl/extractor/dailymotion.py
+++ b/haruhi_dl/extractor/dailymotion.py
@ -363,6 +363,16 @@ class DailymotionPlaylistIE(DailymotionPlaylistBaseIE):
    }]
    _OBJECT_TYPE = 'collection'

+    @staticmethod
+    def _extract_urls(webpage, url=None):
+        """Return playlist URLs referenced by an embedded jukebox widget.
+
+        Looks for the first <iframe> in ``webpage`` whose ``src`` points at
+        a dailymotion ``/widget/jukebox`` URL and returns one
+        protocol-relative ``//dailymotion.com/playlist/<id>`` URL for every
+        ``list[]=/playlist/<id>/`` parameter found in it. Returns an empty
+        list when there is no such iframe or it names no playlist.
+
+        ``url`` is accepted but not used by this method.
+        """
+        m = re.search(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
+        if not m:
+            return []
+        # The src attribute is HTML-escaped in the page, so unescape it
+        # before picking the playlist ids out of the query string.
+        playlists = re.findall(
+            r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
+        return ['//dailymotion.com/playlist/%s' % p for p in playlists]
+

 class DailymotionUserIE(DailymotionPlaylistBaseIE):
    IE_NAME = 'dailymotion:user'
--- a/haruhi_dl/extractor/generic.py
+++ b/haruhi_dl/extractor/generic.py
@ -121,6 +121,7 @@ from .odnoklassniki import OdnoklassnikiIE
 from .kinja import KinjaEmbedIE
 from .onnetwork import OnNetworkLoaderIE
 from .embetty import EmbettyIE
+from .rtlnl import RtlNlIE


 class GenericIE(InfoExtractor):
@ -2582,18 +2583,28 @@ class GenericIE(InfoExtractor):
            VimeoIE,
            SoundcloudEmbedIE,
            KalturaIE,
+            RtlNlIE,
+            TeachableIE,    # must be before Wistia
+            WistiaIE,
+            SVTIE,
        ):
            try:
+                ie_key = embie.ie_key()
                embie_urls = embie._extract_urls(webpage,
                                                 url=url)
                if embie_urls:
                    entries = []
                    for embie_url in embie_urls:
-                        entries.append({
+                        entry = {
                            '_type': 'url_transparent',
-                            'url': smuggle_url(unescapeHTML(embie_url), {'source_url': embie_url}),
-                            'ie_key': embie.ie_key(),
-                        })
+                            'url': embie_url,
+                            'ie_key': ie_key,
+                        }
+                        if ie_key in ("Wistia", ):
+                            # Set the uploader on this entry: `entries` is the
+                            # list being built and cannot take a string key.
+                            entry["uploader"] = video_uploader
+                        if ie_key in ("Bandcamp", ):
+                            entry["ie_key"] = None
+                        entries.append(entry)
                    return {
                        '_type': 'playlist',
                        'entries': entries,
@ -2605,50 +2616,6 @@ class GenericIE(InfoExtractor):
                self.report_warning('The exception above was caused by: %sIE' % embie.ie_key())
                raise exc

-        # Look for embedded rtl.nl player
-        matches = re.findall(
-            r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',
-            webpage)
-        if matches:
-            return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')
-
-        vid_me_embed_url = self._search_regex(
-            r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
-            webpage, 'vid.me embed', default=None)
-        if vid_me_embed_url is not None:
-            return self.url_result(vid_me_embed_url, 'Vidme')
-
-        # Look for embedded Dailymotion playlist player (#3822)
-        m = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
-        if m:
-            playlists = re.findall(
-                r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
-            if playlists:
-                return self.playlist_from_matches(
-                    playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
-
-        # Look for Teachable embeds, must be before Wistia
-        teachable_url = TeachableIE._extract_url(webpage, url)
-        if teachable_url:
-            return self.url_result(teachable_url)
-
-        # Look for embedded Wistia player
-        wistia_urls = WistiaIE._extract_urls(webpage)
-        if wistia_urls:
-            playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key())
-            for entry in playlist['entries']:
-                entry.update({
-                    '_type': 'url_transparent',
-                    'uploader': video_uploader,
-                })
-            return playlist
-
-        # Look for SVT player
-        svt_url = SVTIE._extract_url(webpage)
-        if svt_url:
-            return self.url_result(svt_url, 'SVT')
-
        # Look for Bandcamp pages with custom domain
        mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
        if mobj is not None:
--- a/haruhi_dl/extractor/rtlnl.py
+++ b/haruhi_dl/extractor/rtlnl.py
@ -1,6 +1,8 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import re
+
 from .common import InfoExtractor
 from ..utils import (
    int_or_none,
@ -98,6 +100,12 @@ class RtlNlIE(InfoExtractor):
        'only_matching': True,
    }]

+    @staticmethod
+    def _extract_urls(webpage, url=None):
+        """Return the src of every rtl.nl embed <iframe> found in webpage.
+
+        ``url`` is accepted but not used by this method.
+        """
+        embed_re = r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"'
+        return [hit.group(1) for hit in re.finditer(embed_re, webpage)]
+
    def _real_extract(self, url):
        uuid = self._match_id(url)
        info = self._download_json(
--- a/haruhi_dl/extractor/svt.py
+++ b/haruhi_dl/extractor/svt.py
@ -108,11 +108,10 @@ class SVTIE(SVTBaseIE):
    }

    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
+    def _extract_urls(webpage, url=None):
+        """Return every SVT URL found in webpage, in document order.
+
+        A URL is collected when it is the value of an ``<iframe src="..."``
+        or ``href="..."`` attribute and starts with a match of
+        ``SVTIE._VALID_URL``; the captured value runs up to the closing
+        double quote. ``url`` is accepted but not used by this method.
+        """
+        mobj = re.finditer(
            r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL, webpage)
-        if mobj:
-            return mobj.group('url')
+        return [match.group('url') for match in mobj]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
--- a/haruhi_dl/extractor/teachable.py
+++ b/haruhi_dl/extractor/teachable.py
@ -144,11 +144,11 @@ class TeachableIE(TeachableBaseIE):
            webpage)

    @staticmethod
-    def _extract_url(webpage, source_url):
+    def _extract_urls(webpage, url=None):
+        """Return the prefixed Teachable URL for this page as a list.
+
+        Yields a one-element list (``_URL_PREFIX`` + ``url``) when
+        ``webpage`` passes ``_is_teachable`` and ``url`` has a
+        ``/courses`` or ``/p`` path directly after the host; otherwise
+        returns an empty list. A missing ``url`` also gives an empty list.
+        """
        if not TeachableIE._is_teachable(webpage):
-            return
-        if re.match(r'https?://[^/]+/(?:courses|p)', source_url):
-            return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url)
+            return []
+        # `url` defaults to None, which re.match() rejects with TypeError.
+        if url and re.match(r'https?://[^/]+/(?:courses|p)', url):
+            return ['%s%s' % (TeachableBaseIE._URL_PREFIX, url)]
+        # Always hand back a list so the result can be iterated safely.
+        return []

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
--- a/haruhi_dl/extractor/vidme.py
+++ b/haruhi_dl/extractor/vidme.py
@ -1,6 +1,7 @@
 from __future__ import unicode_literals

 import itertools
+import re

 from .common import InfoExtractor
 from ..compat import compat_HTTPError
@ -132,6 +133,12 @@ class VidmeIE(InfoExtractor):
        },
    }]

+    @staticmethod
+    def _extract_urls(webpage, url=None):
+        """Return every vid.me URL used as a quoted src value in webpage.
+
+        ``url`` is accepted but not used by this method.
+        """
+        embed_re = r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]'
+        return [hit.group(1) for hit in re.finditer(embed_re, webpage)]
+
    def _real_extract(self, url):
        video_id = self._match_id(url)

--- a/haruhi_dl/extractor/wistia.py
+++ b/haruhi_dl/extractor/wistia.py
@ -49,7 +49,7 @@ class WistiaIE(InfoExtractor):
        return urls[0] if urls else None

    @staticmethod
-    def _extract_urls(webpage):
+    def _extract_urls(webpage, url=None):
        urls = []
        for match in re.finditer(
                r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage):