From 44ed85b18b4435ec1822732a3f40cf2b0fa43c3d Mon Sep 17 00:00:00 2001
From: Lauren Liberda <laura@selfisekai.rocks>
Date: Tue, 13 Apr 2021 00:17:17 +0200
Subject: [PATCH] + castos extractors

---
 haruhi_dl/extractor/castos.py     | 91 +++++++++++++++++++++++++++++++
 haruhi_dl/extractor/extractors.py |  1 +
 haruhi_dl/extractor/generic.py    | 27 +++++++++
 3 files changed, 119 insertions(+)
 create mode 100644 haruhi_dl/extractor/castos.py
diff --git a/haruhi_dl/extractor/castos.py b/haruhi_dl/extractor/castos.py
new file mode 100644
index 000000000..774e4a148
--- /dev/null
+++ b/haruhi_dl/extractor/castos.py
@@ -0,0 +1,91 @@
+# coding: utf-8
+
+from .common import InfoExtractor
+from ..utils import (
+    parse_duration,
+)
+
+import re
+
+
+class CastosHostedIE(InfoExtractor):
+    """Extractor for single episodes on the Castos hosted player (*.castos.com)."""
+    _VALID_URL = r'https?://[^/.]+\.castos\.com/(?:player|episodes)/(?P<id>[\da-zA-Z-]+)'
+    IE_NAME = 'castos:hosted'
+
+    _TESTS = [{
+        'url': 'https://audience.castos.com/player/408278',
+        'info_dict': {
+            'id': '408278',
+            'ext': 'mp3',
+        },
+    }, {
+        'url': 'https://audience.castos.com/episodes/improve-your-podcast-production',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage, **kw):
+        """Return the hosted-player URLs found in iframe src attributes of webpage."""
+        # (?<!-) rejects attributes that only end in "src", e.g. data-src
+        return re.findall(r'<iframe\b[^>]+(?<!-)src="(https?://[^/.]+\.castos\.com/player/\d+)', webpage)
+
+    def _real_extract(self, url):
+        """Scrape the page at url and return the episode's info dict."""
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # series and duration are optional metadata: never abort extraction over them
+        series = self._html_search_regex(
+            r'<div class="show">\s+<strong>([^<]+)</strong>', webpage, 'series name', fatal=False)
+        title = self._html_search_regex(
+            r'<div class="episode-title">([^<]+)</div>', webpage, 'episode title')
+        audio_url = self._html_search_regex(
+            r'<audio class="clip">\s+<source\b[^>]+src="(https?://[^"]+)"', webpage, 'audio url')
+        duration = parse_duration(self._search_regex(
+            r'<time id="duration">(\d\d(?::\d\d)+)</time>', webpage, 'duration', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': audio_url,
+            'duration': duration,
+            'series': series,
+            'episode': title,
+        }
+
+
+class CastosSSPIE(InfoExtractor):
+    """Scrapes Castos players embedded directly in a page (div.castos-player markup)."""
+
+    @classmethod
+    def _extract_entries(cls, webpage, **kw):
+        """Return one info dict per player in webpage that has a title and an audio URL."""
+        entries = []
+        for found in re.finditer(
+                r'(?s)<div class="castos-player[^"]*"[^>]*data-episode="(\d+)-[a-z\d]+">(.+?</nav>)\s*</div>',
+                webpage):
+            # group 1: leading digits of data-episode; group 2: markup up to the closing </nav>
+            video_id, entry = found.group(1, 2)
+
+            def search_entry(regex):
+                res = re.search(regex, entry)
+                return res.group(1) if res else None
+
+            series = search_entry(r'<div class="show">\s+<strong>([^<]+)</strong>')
+            title = search_entry(r'<div class="episode-title">([^<]+)</div>')
+            audio_url = search_entry(
+                r'<audio class="clip[^"]*">\s+<source\b[^>]+src="(https?://[^"]+)"')
+            if not title or not audio_url:
+                continue  # unusable without both; skip this player, keep scanning the page
+
+            entries.append({
+                'id': video_id,
+                'title': title,
+                'url': audio_url,
+                'duration': parse_duration(search_entry(
+                    r'<time id="duration[^"]*">(\d\d(?::\d\d)+)</time>')),
+                'series': series,
+                'episode': title,
+            })
+        return entries
diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index b9a835cfa..64ad080fe 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -183,6 +183,7 @@ from .carambatv import (
     CarambaTVPageIE,
 )
 from .cartoonnetwork import CartoonNetworkIE
+from .castos import CastosHostedIE
 from .cbc import (
     CBCIE,
     CBCPlayerIE,
diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py
index 1a9287d14..099dec091 100644
--- a/haruhi_dl/extractor/generic.py
+++ b/haruhi_dl/extractor/generic.py
@@ -137,6 +137,10 @@ from .arcpublishing import ArcPublishingIE
 from .medialaan import MedialaanIE
 from .simplecast import SimplecastIE
 from .spreaker import SpreakerIE
+from .castos import (
+    CastosHostedIE,
+    CastosSSPIE,
+)
 
 
 class GenericIE(InfoExtractor):
@@ -2316,6 +2320,24 @@ class GenericIE(InfoExtractor):
                 'timestamp': 1617024666,
             },
         },
+        {
+            # Castos (hosted) player
+            'url': 'https://castos.com/enhanced-podcast-player/',
+            'info_dict': {
+                'id': '210448',
+                'ext': 'mp3',
+                'title': '4 Ways To Create A Video Podcast (And Why You Should Try It)',
+            },
+        },
+        {
+            # Castos Seriously Simple Podcasting (WordPress plugin, self-hosted)
+            'url': 'https://pzbn.pl/4-heated-terf-moment/',
+            'info_dict': {
+                'id': '38',
+                'ext': 'mp3',
+                'title': '#4: Heated TERF moment',
+            },
+        },
     ]
 
     def report_following_redirect(self, new_url):
@@ -2755,6 +2777,7 @@ class GenericIE(InfoExtractor):
             MedialaanIE,
             SimplecastIE,
             SpreakerIE,
+            CastosHostedIE,
         ):
             try:
                 ie_key = embie.ie_key()
@@ -3217,6 +3240,10 @@ class GenericIE(InfoExtractor):
         if pulsembed_entries:
             return self.playlist_result(pulsembed_entries, video_id, video_title)
 
+        castos_ssp_entries = CastosSSPIE._extract_entries(webpage)
+        if castos_ssp_entries:
+            return self.playlist_result(castos_ssp_entries, video_id, video_title)
+
         # Look for HTML5 media
         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
         if entries: