+ castos extractors

2021-04-13 00:17:17 +02:00 · 2021-04-13 00:17:17 +02:00 · 44ed85b18b
parent 2bd0f6069a
commit 44ed85b18b
3 changed files with 119 additions and 0 deletions
--- a/haruhi_dl/extractor/castos.py
+++ b/haruhi_dl/extractor/castos.py
@ -0,0 +1,91 @@
+# coding: utf-8
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    parse_duration,
+)
+
+
+class CastosHostedIE(InfoExtractor):
+    """Extract a single episode from the hosted Castos web player.
+
+    Matches ``/player/<id>`` and ``/episodes/<slug>`` pages on any
+    ``*.castos.com`` subdomain. ``_extract_urls`` additionally locates
+    hosted player iframes embedded in other pages.
+    """
+
+    _VALID_URL = r'https?://[^/.]+\.castos\.com/(?:player|episodes)/(?P<id>[\da-zA-Z-]+)'
+    IE_NAME = 'castos:hosted'
+
+    _TESTS = [{
+        'url': 'https://audience.castos.com/player/408278',
+        'info_dict': {
+            'id': '408278',
+            'ext': 'mp3',
+        },
+    }, {
+        'url': 'https://audience.castos.com/episodes/improve-your-podcast-production',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage, **kw):
+        """Return the URLs of all hosted Castos player iframes in *webpage*.
+
+        Extra keyword arguments are accepted and ignored. The result is a
+        list (empty when nothing matches).
+        """
+        # The (?<!-) lookbehind rejects attributes that merely end in
+        # "-src" (e.g. data-src), so only a real src attribute is used.
+        return [mobj.group(1) for mobj
+                in re.finditer(
+                    r'<iframe\b[^>]+(?<!-)src="(https?://[^/.]+\.castos\.com/player/\d+)',
+                    webpage)]
+
+    def _real_extract(self, url):
+        """Download the player page behind *url* and build its info dict.
+
+        The episode title and the audio URL are mandatory. The series name
+        and the duration are optional metadata: they are looked up with
+        ``fatal=False`` so that a page lacking them still yields a result.
+        """
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        # Mandatory fields: without these there is nothing to download.
+        title = self._html_search_regex(
+            r'<div class="episode-title">([^<]+)</div>', webpage, 'episode title')
+        audio_url = self._html_search_regex(
+            r'<audio class="clip">\s+<source\b[^>]+src="(https?://[^"]+)"', webpage, 'audio url')
+
+        # Optional fields: a missing value must not abort the extraction.
+        series = self._html_search_regex(
+            r'<div class="show">\s+<strong>([^<]+)</strong>', webpage, 'series name',
+            fatal=False)
+        duration = parse_duration(self._search_regex(
+            r'<time id="duration">(\d\d(?::\d\d)+)</time>', webpage, 'duration',
+            fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': audio_url,
+            'duration': duration,
+            'series': series,
+            'episode': title,
+        }
+
+
+class CastosSSPIE(InfoExtractor):
+    """Helper for pages that embed the self-hosted Castos player markup.
+
+    The class defines no ``_VALID_URL``; it only offers
+    ``_extract_entries`` for scanning an already downloaded page.
+    """
+
+    @staticmethod
+    def _search_entry(regex, fragment):
+        """Return the first group of *regex* in *fragment*, or None.
+
+        The captured text is passed through ``clean_html`` so that HTML
+        entities in titles, series names and URLs are decoded, matching
+        what ``_html_search_regex`` does for the hosted player.
+        """
+        mobj = re.search(regex, fragment)
+        if mobj is None:
+            return None
+        return clean_html(mobj.group(1))
+
+    @classmethod
+    def _extract_entries(cls, webpage, **kw):
+        """Collect one info dict per Castos player block found in *webpage*.
+
+        Player blocks without a title or an audio URL are skipped. Extra
+        keyword arguments are accepted and ignored. Returns a list, which
+        is empty when the page contains no usable player.
+        """
+        entries = []
+        for found in re.finditer(
+                r'(?s)<div class="castos-player[^"]*"[^>]*data-episode="(\d+)-[a-z\d]+">(.+?</nav>)\s*</div>',
+                webpage):
+            video_id, entry = found.group(1, 2)
+
+            title = cls._search_entry(
+                r'<div class="episode-title">([^<]+)</div>', entry)
+            audio_url = cls._search_entry(
+                r'<audio class="clip[^"]*">\s+<source\b[^>]+src="(https?://[^"]+)"',
+                entry)
+
+            # Title and audio URL are mandatory; skip unusable blocks
+            # before bothering with the optional metadata.
+            if not title or not audio_url:
+                continue
+
+            series = cls._search_entry(
+                r'<div class="show">\s+<strong>([^<]+)</strong>', entry)
+            duration = parse_duration(cls._search_entry(
+                r'<time id="duration[^"]*">(\d\d(?::\d\d)+)</time>', entry))
+
+            entries.append({
+                'id': video_id,
+                'title': title,
+                'url': audio_url,
+                'duration': duration,
+                'series': series,
+                'episode': title,
+            })
+        return entries
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@ -183,6 +183,7 @@ from .carambatv import (
    CarambaTVPageIE,
 )
 from .cartoonnetwork import CartoonNetworkIE
+from .castos import CastosHostedIE
 from .cbc import (
    CBCIE,
    CBCPlayerIE,
--- a/haruhi_dl/extractor/generic.py
+++ b/haruhi_dl/extractor/generic.py
@ -137,6 +137,10 @@ from .arcpublishing import ArcPublishingIE
 from .medialaan import MedialaanIE
 from .simplecast import SimplecastIE
 from .spreaker import SpreakerIE
+from .castos import (
+    CastosHostedIE,
+    CastosSSPIE,
+)


 class GenericIE(InfoExtractor):
@ -2316,6 +2320,24 @@ class GenericIE(InfoExtractor):
                'timestamp': 1617024666,
            },
        },
+        {
+            # Castos (hosted) player
+            'url': 'https://castos.com/enhanced-podcast-player/',
+            'info_dict': {
+                'id': '210448',
+                'ext': 'mp3',
+                'title': '4 Ways To Create A Video Podcast (And Why You Should Try It)',
+            },
+        },
+        {
+            # Castos Seriously Simple Podcasting (WordPress plugin, self-hosted)
+            'url': 'https://pzbn.pl/4-heated-terf-moment/',
+            'info_dict': {
+                'id': '38',
+                'ext': 'mp3',
+                'title': '#4: Heated TERF moment',
+            },
+        },
    ]

    def report_following_redirect(self, new_url):
@ -2755,6 +2777,7 @@ class GenericIE(InfoExtractor):
            MedialaanIE,
            SimplecastIE,
            SpreakerIE,
+            CastosHostedIE,
        ):
            try:
                ie_key = embie.ie_key()
@ -3217,6 +3240,10 @@ class GenericIE(InfoExtractor):
        if pulsembed_entries:
            return self.playlist_result(pulsembed_entries, video_id, video_title)

+        castos_ssp_entries = CastosSSPIE._extract_entries(webpage)
+        if castos_ssp_entries:
+            return self.playlist_result(castos_ssp_entries, video_id, video_title)
+
        # Look for HTML5 media
        entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
        if entries: