fix/speedup ci

vider support
[polskieradio] fix PR4 audition shit
2021-09-09 12:38:11 +02:00 · 2021-09-06 22:34:06 +02:00 · 2021-08-31 20:25:12 +02:00 · 2021-08-07 02:23:28 +02:00 · 2021-08-07 01:08:07 +02:00 · 2021-08-01 17:44:07 +02:00
9 changed files with 140 additions and 37 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,5 +1,6 @@
 default:
  before_script:
+    - sed -i "s@dl-cdn.alpinelinux.org@alpine.sakamoto.pl@g" /etc/apk/repositories
    - apk add bash
    - pip install nose

--- a/6
+++ b/6
@@ -1,3 +1,9 @@
+version 2021.08.01
+Extractor
+* [youtube] fixed agegate
+* [niconico] dmc downloader from youtube-dlp
+* [peertube] new URL schemas
+
 version 2021.06.20
 Core
 * [playwright] fixed headlessness
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -1513,6 +1513,7 @@ from .videomore import (
 )
 from .videopress import VideoPressIE
 from .videotarget import VideoTargetIE
+from .vider import ViderIE
 from .vidio import VidioIE
 from .vidlii import VidLiiIE
 from .vidme import (
--- a/haruhi_dl/extractor/ipla.py
+++ b/haruhi_dl/extractor/ipla.py
@@ -8,6 +8,7 @@ from .common import InfoExtractor
 from ..utils import (
    int_or_none,
    url_or_none,
+    ExtractorError,
 )


@@ -79,7 +80,11 @@ class IplaIE(InfoExtractor):
            'Content-type': 'application/json'
        }

-        res = self._download_json('http://b2c-mobile.redefine.pl/rpc/navigation/', media_id, data=req, headers=headers)
+        res = self._download_json('https://b2c-mobile.redefine.pl/rpc/navigation/', media_id, data=req, headers=headers)
+        if not res.get('result'):
+            if res['error']['code'] == 13404:
+                raise ExtractorError('Video requires DRM protection', expected=True)
+            raise ExtractorError(f"Ipla said: {res['error']['message']} - {res['error']['data']['userMessage']}")
        return res['result']['mediaItem']

    def get_url(self, media_id, source_id):
@@ -93,4 +98,6 @@ class IplaIE(InfoExtractor):
        }

        res = self._download_json('https://b2c-mobile.redefine.pl/rpc/drm/', media_id, data=req, headers=headers)
+        if not res.get('result'):
+            raise ExtractorError(f"Ipla said: {res['error']['message']} - {res['error']['data']['userMessage']}")
        return res['result']['url']
--- a/haruhi_dl/extractor/peertube.py
+++ b/haruhi_dl/extractor/peertube.py
@@ -21,7 +21,7 @@ from ..utils import (


 class PeerTubeBaseExtractor(SelfhostedInfoExtractor):
-    _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'
+    _UUID_RE = r'[\da-zA-Z]{22}|[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'
    _API_BASE = 'https://%s/api/v1/%s/%s/%s'
    _SH_VALID_CONTENT_STRINGS = (
        '<title>PeerTube<',
@@ -180,16 +180,16 @@ class PeerTubeBaseExtractor(SelfhostedInfoExtractor):

 class PeerTubeSHIE(PeerTubeBaseExtractor):
    _VALID_URL = r'peertube:(?P<host>[^:]+):(?P<id>%s)' % (PeerTubeBaseExtractor._UUID_RE)
-    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:videos/(?:watch|embed)|api/v\d/videos)/(?P<id>%s)' % (PeerTubeBaseExtractor._UUID_RE)
+    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:videos/(?:watch|embed)|api/v\d/videos|w)/(?P<id>%s)' % (PeerTubeBaseExtractor._UUID_RE)

    _TESTS = [{
        'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d',
-        'md5': '9bed8c0137913e17b86334e5885aacff',
+        'md5': '8563064d245a4be5705bddb22bb00a28',
        'info_dict': {
            'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d',
            'ext': 'mp4',
            'title': 'What is PeerTube?',
-            'description': 'md5:3fefb8dde2b189186ce0719fda6f7b10',
+            'description': 'md5:96adbaf219b4d41747bfc5937df0b017',
            'thumbnail': r're:https?://.*\.(?:jpg|png)',
            'timestamp': 1538391166,
            'upload_date': '20181001',
@@ -220,6 +220,27 @@ class PeerTubeSHIE(PeerTubeBaseExtractor):
            'upload_date': '20200420',
            'uploader': 'Drew DeVault',
        }
+    }, {
+        # new url scheme since PeerTube 3.3
+        'url': 'https://peertube2.cpy.re/w/3fbif9S3WmtTP8gGsC5HBd',
+        'info_dict': {
+            'id': '122d093a-1ede-43bd-bd34-59d2931ffc5e',
+            'ext': 'mp4',
+            'title': 'E2E tests',
+            'uploader_id': '37855',
+            'timestamp': 1589276219,
+            'upload_date': '20200512',
+            'uploader': 'chocobozzz',
+        },
+    }, {
+        'url': 'https://peertube2.cpy.re/w/122d093a-1ede-43bd-bd34-59d2931ffc5e',
+        'only_matching': True,
+    }, {
+        'url': 'https://peertube2.cpy.re/api/v1/videos/3fbif9S3WmtTP8gGsC5HBd',
+        'only_matching': True,
+    }, {
+        'url': 'peertube:peertube2.cpy.re:3fbif9S3WmtTP8gGsC5HBd',
+        'only_matching': True,
    }, {
        'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44',
        'only_matching': True,
@@ -289,7 +310,7 @@ class PeerTubeSHIE(PeerTubeBaseExtractor):

        description = None
        if webpage:
-            description = self._og_search_description(webpage)
+            description = self._og_search_description(webpage, default=None)
        if not description:
            full_description = self._call_api(
                host, 'videos', video_id, 'description', note='Downloading description JSON',
@@ -305,7 +326,7 @@ class PeerTubeSHIE(PeerTubeBaseExtractor):

 class PeerTubePlaylistSHIE(PeerTubeBaseExtractor):
    _VALID_URL = r'peertube:playlist:(?P<host>[^:]+):(?P<id>.+)'
-    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:videos/(?:watch|embed)/playlist|api/v\d/video-playlists)/(?P<id>%s)' % (PeerTubeBaseExtractor._UUID_RE)
+    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:videos/(?:watch|embed)/playlist|api/v\d/video-playlists|w/p)/(?P<id>%s)' % (PeerTubeBaseExtractor._UUID_RE)

    _TESTS = [{
        'url': 'https://video.internet-czas-dzialac.pl/videos/watch/playlist/3c81b894-acde-4539-91a2-1748b208c14c?playlistPosition=1',
@@ -316,6 +337,9 @@ class PeerTubePlaylistSHIE(PeerTubeBaseExtractor):
            'uploader': 'Internet. Czas działać!',
        },
        'playlist_mincount': 14,
+    }, {
+        'url': 'https://peertube2.cpy.re/w/p/hrAdcvjkMMkHJ28upnoN21',
+        'only_matching': True,
    }]

    def _selfhosted_extract(self, url, webpage=None):
@@ -352,18 +376,21 @@ class PeerTubePlaylistSHIE(PeerTubeBaseExtractor):

 class PeerTubeChannelSHIE(PeerTubeBaseExtractor):
    _VALID_URL = r'peertube:channel:(?P<host>[^:]+):(?P<id>.+)'
-    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:api/v\d/)?video-channels/(?P<id>[^/?#]+)(?:/videos)?'
+    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:(?:api/v\d/)?video-channels|c)/(?P<id>[^/?#]+)(?:/videos)?'

    _TESTS = [{
        'url': 'https://video.internet-czas-dzialac.pl/video-channels/internet_czas_dzialac/videos',
        'info_dict': {
            'id': '2',
-            'title': 'internet_czas_dzialac',
-            'description': 'md5:4d2e215ea0d9ae4501a556ef6e9a5308',
+            'title': 'Internet. Czas działać!',
+            'description': 'md5:ac35d70f6625b04b189e0b4b76e62e17',
            'uploader_id': 3,
            'uploader': 'Internet. Czas działać!',
        },
        'playlist_mincount': 14,
+    }, {
+        'url': 'https://video.internet-czas-dzialac.pl/c/internet_czas_dzialac',
+        'only_matching': True,
    }]

    def _selfhosted_extract(self, url, webpage=None):
@@ -401,18 +428,21 @@ class PeerTubeChannelSHIE(PeerTubeBaseExtractor):

 class PeerTubeAccountSHIE(PeerTubeBaseExtractor):
    _VALID_URL = r'peertube:account:(?P<host>[^:]+):(?P<id>.+)'
-    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:api/v\d/)?accounts/(?P<id>[^/?#]+)(?:/video(?:s|-channels))?'
+    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:(?:api/v\d/)?accounts|a)/(?P<id>[^/?#]+)(?:/video(?:s|-channels))?'

    _TESTS = [{
        'url': 'https://video.internet-czas-dzialac.pl/accounts/icd/video-channels',
        'info_dict': {
            'id': '3',
-            'description': 'md5:ab3c9b934dd39030eea1c9fe76079870',
+            'description': 'md5:ac35d70f6625b04b189e0b4b76e62e17',
            'uploader': 'Internet. Czas działać!',
            'title': 'Internet. Czas działać!',
            'uploader_id': 3,
        },
        'playlist_mincount': 14,
+    }, {
+        'url': 'https://video.internet-czas-dzialac.pl/a/icd',
+        'only_matching': True,
    }]

    def _selfhosted_extract(self, url, webpage=None):
--- a/haruhi_dl/extractor/polskieradio.py
+++ b/haruhi_dl/extractor/polskieradio.py
@@ -91,6 +91,14 @@ class PolskieRadioIE(PolskieRadioBaseExtractor):
                'upload_date': '20201116',
            },
        }]
+    }, {
+        # PR4 audition - other frontend
+        'url': 'https://www.polskieradio.pl/10/6071/Artykul/2610977,Poglos-29-pazdziernika-godz-2301',
+        'info_dict': {
+            'id': '2610977',
+            'ext': 'mp3',
+            'title': 'Pogłos 29 października godz. 23:01',
+        },
    }, {
        'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
        'only_matching': True,
@@ -113,24 +121,34 @@ class PolskieRadioIE(PolskieRadioBaseExtractor):

        content = self._search_regex(
            r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
-            webpage, 'content')
+            webpage, 'content', default=None)

        timestamp = unified_timestamp(self._html_search_regex(
            r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
-            webpage, 'timestamp', fatal=False))
+            webpage, 'timestamp', default=None))

-        thumbnail_url = self._og_search_thumbnail(webpage)
+        thumbnail_url = self._og_search_thumbnail(webpage, default=None)

        title = self._og_search_title(webpage).strip()

+        description = strip_or_none(self._og_search_description(webpage, default=None))
+        
+        if not content:
+            return {
+                'id': playlist_id,
+                'url': 'https:' + self._search_regex(r"source:\s*'(//static\.prsa\.pl/[^']+)'", webpage, 'audition record url'),
+                'title': title,
+                'description': description,
+                'timestamp': timestamp,
+                'thumbnail': thumbnail_url,
+            }
+
        entries = self._extract_webpage_player_entries(content, playlist_id, {
            'title': title,
            'timestamp': timestamp,
            'thumbnail': thumbnail_url,
        })

-        description = strip_or_none(self._og_search_description(webpage))
-
        return self.playlist_result(entries, playlist_id, title, description)


--- a/haruhi_dl/extractor/vider.py
+++ b/haruhi_dl/extractor/vider.py
@@ -0,0 +1,37 @@
+from .common import InfoExtractor
+
+
+class ViderIE(InfoExtractor):
+    _VALID_URL = r'https?://vider\.(?:pl|info)/(?:vid/\+f|embed/video/)(?P<id>[a-z\d]+)'
+    _TESTS = [{
+        'url': 'https://vider.info/vid/+fsx51se',
+        'info_dict': {
+            'id': 'sx51se',
+            'ext': 'mp4',
+            'title': 'Big Buck Bunny',
+            'upload_date': '20210906',
+            'timestamp': 1630927351,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(f'https://vider.info/vid/+f{video_id}', video_id)
+
+        json_ld = self._parse_json(
+            self._search_regex(
+                r'(?s)<script type="application/ld\+json">(.+?)</script>',
+                webpage, 'JSON-LD'), video_id)
+        info_dict = self._json_ld(json_ld, video_id)
+        # generated SEO junk
+        info_dict['description'] = None
+        info_dict['id'] = video_id
+        info_dict['formats'] = [{
+            'url': self._search_regex(r'\?file=(.+)', json_ld['embedUrl'], 'video url'),
+            'http_headers': {
+                'Referer': 'https://vider.info/',
+            },
+        }]
+
+        return info_dict
--- a/haruhi_dl/extractor/youtube.py
+++ b/haruhi_dl/extractor/youtube.py
@@ -1441,29 +1441,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
        if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
                or re.search(r'player-age-gate-content">', video_webpage) is not None):
            age_gate = True
-            # We simulate the access to the video from www.youtube.com/v/{video_id}
-            # this can be viewed without login into Youtube
-            data = compat_urllib_parse_urlencode({
-                'video_id': video_id,
-                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
-                'html5': 1,
-                'c': 'TVHTML5',
-                'cver': '6.20180913',
-            })
-            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
            try:
-                video_info_webpage = self._download_webpage(
-                    video_info_url, video_id,
-                    note='Downloading age-gated video info',
+                yti1_player = self._download_webpage(
+                    proto + '://www.youtube.com/youtubei/v1/player', video_id,
+                    headers={
+                        'User-Agent': 'Mozilla/5.0 (SMART-TV; Linux; Tizen 4.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.0 Safari/537.36',
+                        'Content-Type': 'application/json',
+                        'X-Goog-Api-Key': self._YOUTUBE_API_KEY,
+                    },
+                    data=bytes(json.dumps({
+                        'context': {
+                            'client': {
+                                'clientName': 'WEB',
+                                'clientVersion': '2.20210721.00.00',
+                                'clientScreen': 'EMBED',
+                            },
+                        },
+                        'videoId': video_id,
+                    }).encode('utf-8')),
+                    note='Downloading age-gated player info',
                    errnote='unable to download video info')
            except ExtractorError:
-                video_info_webpage = None
-            if video_info_webpage:
-                video_info = compat_parse_qs(video_info_webpage)
-                pl_response = video_info.get('player_response', [None])[0]
-                player_response = extract_player_response(pl_response, video_id)
+                yti1_player = None
+            if yti1_player:
+                player_response = extract_player_response(yti1_player, video_id)
                add_dash_mpd(video_info)
-                view_count = extract_view_count(video_info)
+                view_count = extract_view_count(video_id)
        else:
            age_gate = False
            # Try looking directly into the video webpage
--- a/haruhi_dl/version.py
+++ b/haruhi_dl/version.py
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals

-__version__ = '2021.06.24.1'
+__version__ = '2021.08.01'

 if __name__ == '__main__':
    print(__version__)
Author	SHA1	Message	Date
Lauren Liberda	2f375d447c	fix/speedup ci	2021-09-09 12:38:11 +02:00
Lauren Liberda	d464b29113	vider support	2021-09-06 22:34:06 +02:00
Lauren Liberda	19602fb3f5	[polskieradio] fix PR4 audition shit	2021-08-31 20:25:12 +02:00
Lauren Liberda	a550e21b8c	[ipla] state the DRM requirement clearly	2021-08-07 02:23:28 +02:00
Lauren Liberda	1ae67712e8	[ipla] error handling	2021-08-07 01:08:07 +02:00
Dominika Liberda	a96bf110da	* version 2021.08.01	2021-08-01 17:44:07 +02:00
Lauren Liberda	973652cf4d	[youtube] fix age gate for some videos	2021-08-01 17:39:30 +02:00
Lauren Liberda	d81137a604	[peertube] pt 3.3+ url scheme support, fix tests, minor fixes	2021-07-30 20:40:19 +02:00