From b85fc0e982c09b85d2f4e90102ee2594931df3fb Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 01:17:49 +0100
Subject: [PATCH 001/384] [cnbc] fix extraction

---
 haruhi_dl/extractor/cnbc.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/haruhi_dl/extractor/cnbc.py b/haruhi_dl/extractor/cnbc.py
index 6889b0f40..7b9f4536a 100644
--- a/haruhi_dl/extractor/cnbc.py
+++ b/haruhi_dl/extractor/cnbc.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
 from .common import InfoExtractor
 from ..utils import smuggle_url
 
@@ -38,7 +39,7 @@ class CNBCIE(InfoExtractor):
 
 
 class CNBCVideoIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)'
     _TEST = {
         'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html',
         'info_dict': {
@@ -56,11 +57,15 @@ class CNBCVideoIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-        video_id = self._search_regex(
-            r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id,
-            'video id')
+        path, display_id = re.match(self._VALID_URL, url).groups()
+        video_id = self._download_json(
+            'https://webql-redesign.cnbcfm.com/graphql', display_id, query={
+                'query': '''{
+  page(path: "%s") {
+    vcpsId
+  }
+}''' % path,
+            })['data']['page']['vcpsId']
         return self.url_result(
-            'http://video.cnbc.com/gallery/?video=%s' % video_id,
+            'http://video.cnbc.com/gallery/?video=%d' % video_id,
            CNBCIE.ie_key())
-- 
GitLab

From 0f60a7c66cf704acb211dc7559ce043a15d423ca Mon Sep 17 00:00:00 2001
From: Edward Betts
Date: Fri, 26 Feb 2021 13:59:51 +0100
Subject: [PATCH 002/384] [devscripts/make_lazy_extractors] Correct a spelling
 mistake (#26991)

---
 devscripts/make_lazy_extractors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py
index 0cfdf37ca..32f344201 100644
--- a/devscripts/make_lazy_extractors.py
+++ b/devscripts/make_lazy_extractors.py
@@ -77,7 +77,7 @@ def build_lazy_ie(ie, name):
     return s
 
 
-# find the correct sorting and add the required base classes so that sublcasses
+# find the correct sorting and add the required base classes so that subclasses
 # can be correctly created
 classes = _ALL_CLASSES[:-1]
 ordered_cls = []
-- 
GitLab

From 4d26aa35af598a48a555bfc48bda3594e9416835 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:00:20 +0100
Subject: [PATCH 003/384] [nbc] fix NBCNews/Today/MSNBC extraction

---
 haruhi_dl/extractor/nbc.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/haruhi_dl/extractor/nbc.py b/haruhi_dl/extractor/nbc.py
index 6f3cb3003..ea5f5a315 100644
--- a/haruhi_dl/extractor/nbc.py
+++ b/haruhi_dl/extractor/nbc.py
@@ -10,7 +10,6 @@ from .adobepass import AdobePassIE
 from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     int_or_none,
-    js_to_json,
     parse_duration,
     smuggle_url,
     try_get,
@@ -394,8 +393,8 @@ class NBCNewsIE(ThePlatformIE):
         webpage = self._download_webpage(url, video_id)
 
         data = self._parse_json(self._search_regex(
-            r'window\.__data\s*=\s*({.+});', webpage,
-            'bootstrap json'), video_id, js_to_json)
+            r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
+            webpage, 'bootstrap json'), video_id)['props']['initialState']
         video_data = try_get(data, lambda x: x['video']['current'], dict)
         if not 
video_data:
             video_data = data['article']['content'][0]['primaryMedia']['video']
-- 
GitLab

From 33c8322b1d56a252f848d8f208fab1ca213c60cc Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:02:17 +0100
Subject: [PATCH 004/384] [usanetwork] fix extraction

---
 haruhi_dl/extractor/usanetwork.py | 82 ++++++-------------------------
 1 file changed, 16 insertions(+), 66 deletions(-)

diff --git a/haruhi_dl/extractor/usanetwork.py b/haruhi_dl/extractor/usanetwork.py
index 54c7495cc..e3784e55f 100644
--- a/haruhi_dl/extractor/usanetwork.py
+++ b/haruhi_dl/extractor/usanetwork.py
@@ -1,74 +1,24 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from .adobepass import AdobePassIE
-from ..utils import (
-    NO_DEFAULT,
-    smuggle_url,
-    update_url_query,
-)
+from .nbc import NBCIE
 
 
-class USANetworkIE(AdobePassIE):
-    _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P<id>[^/?#]+)'
-    _TEST = {
-        'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity',
-        'md5': '33c0d2ba381571b414024440d08d57fd',
+class USANetworkIE(NBCIE):
+    _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/[^/]+/video/[^/]+/(?P<id>\d+))'
+    _TESTS = [{
+        'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302',
         'info_dict': {
-            'id': '3086229',
+            'id': '4185302',
             'ext': 'mp4',
-            'title': 'HPE Cybersecurity',
-            'description': 'The more we digitize our world, the more vulnerable we are.',
-            'upload_date': '20160818',
-            'timestamp': 1471535460,
-            'uploader': 'NBCU-USA',
+            'title': 'Intelligence (Trailer)',
+            'description': 'A maverick NSA agent enlists the help of a junior systems analyst in a workplace power grab.',
+            'upload_date': '20200715',
+            'timestamp': 1594785600,
+            'uploader': 'NBCU-MPAT',
         },
-    }
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-
-        def _x(name, default=NO_DEFAULT):
-            return self._search_regex(
-                r'data-%s\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % name,
-                webpage, name, default=default, group='value')
-
-        video_id = _x('mpx-guid')
-        title = _x('episode-title')
-        mpx_account_id = _x('mpx-account-id', '2304992029')
-
-        query = {
-            'mbr': 'true',
-        }
-        if _x('is-full-episode', None) == '1':
-            query['manifest'] = 'm3u'
-
-        if _x('is-entitlement', None) == '1':
-            adobe_pass = {}
-            drupal_settings = self._search_regex(
-                r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
-                webpage, 'drupal settings', fatal=False)
-            if drupal_settings:
-                drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False)
-                if drupal_settings:
-                    adobe_pass = drupal_settings.get('adobePass', {})
-            resource = self._get_mvpd_resource(
-                adobe_pass.get('adobePassResourceId', 'usa'),
-                title, video_id, _x('episode-rating', 'TV-14'))
-            query['auth'] = self._extract_mvpd_auth(
-                url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource)
-
-        info = self._search_json_ld(webpage, video_id, default={})
-        info.update({
-            '_type': 'url_transparent',
-            'url': smuggle_url(update_url_query(
-                'http://link.theplatform.com/s/HNK2IC/media/guid/%s/%s' % (mpx_account_id, video_id),
-                query), {'force_smil_url': True}),
-            'id': video_id,
-            'title': title,
-            'series': _x('show-title', None),
-            'episode': title,
-            'ie_key': 'ThePlatform',
-        })
-        return info
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]
-- 
GitLab

From d52a2bf577a44f6bed6c6cefcadf0229266ca635 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:09:47 +0100
Subject: [PATCH 005/384] [rai] fix 
RaiPlay extraction --- haruhi_dl/extractor/rai.py | 61 +++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/haruhi_dl/extractor/rai.py b/haruhi_dl/extractor/rai.py index 207a6c247..bee2d53f5 100644 --- a/haruhi_dl/extractor/rai.py +++ b/haruhi_dl/extractor/rai.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -16,7 +17,6 @@ from ..utils import ( int_or_none, parse_duration, strip_or_none, - try_get, unescapeHTML, unified_strdate, unified_timestamp, @@ -141,6 +141,7 @@ class RaiPlayIE(RaiBaseIE): 'series': 'La Casa Bianca', 'season': '2016', }, + 'skip': 'This content is not available', }, { 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', @@ -148,14 +149,12 @@ class RaiPlayIE(RaiBaseIE): 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', 'ext': 'mp4', 'title': 'Report del 07/04/2014', - 'alt_title': 'S2013/14 - Puntata del 07/04/2014', - 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', + 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014', + 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 5', - 'creator': 'Rai 5', + 'uploader': 'Rai Gulp', 'duration': 6160, 'series': 'Report', - 'season_number': 5, 'season': '2013/14', }, 'params': { @@ -167,48 +166,51 @@ class RaiPlayIE(RaiBaseIE): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - url, video_id = mobj.group('url', 'id') + url, video_id = re.match(self._VALID_URL, url).groups() media = self._download_json( - '%s?json' % url, video_id, 'Downloading video JSON') + url.replace('.html', '.json'), video_id, 'Downloading video JSON') title = media['name'] video = media['video'] - relinker_info = self._extract_relinker_info(video['contentUrl'], video_id) + relinker_info = self._extract_relinker_info(video['content_url'], video_id) self._sort_formats(relinker_info['formats']) thumbnails = [] - if 'images' in media: - for _, value in media.get('images').items(): - if value: - thumbnails.append({ - 'url': value.replace('[RESOLUTION]', '600x400') - }) + for _, value in media.get('images', {}).items(): + if value: + thumbnails.append({ + 'url': urljoin(url, value), + }) - timestamp = unified_timestamp(try_get( - media, lambda x: x['availabilities'][0]['start'], compat_str)) + date_published = media.get('date_published') + time_published = media.get('time_published') + if date_published and time_published: + date_published += ' ' + time_published subtitles = self._extract_subtitles(url, video.get('subtitles')) + program_info = media.get('program_info') or {} + season = media.get('season') + info = { 'id': video_id, 'title': self._live_title(title) if relinker_info.get( 'is_live') else title, - 'alt_title': media.get('subtitle'), + 'alt_title': strip_or_none(media.get('subtitle')), 'description': media.get('description'), 'uploader': strip_or_none(media.get('channel')), - 'creator': strip_or_none(media.get('editor')), + 'creator': strip_or_none(media.get('editor') or None), 'duration': parse_duration(video.get('duration')), - 'timestamp': timestamp, + 'timestamp': unified_timestamp(date_published), 'thumbnails': thumbnails, - 'series': try_get( - media, lambda x: x['isPartOf']['name'], compat_str), - 'season_number': int_or_none(try_get( - media, lambda x: x['isPartOf']['numeroStagioni'])), - 'season': media.get('stagione') or None, + 'series': program_info.get('name'), 
+            'season_number': int_or_none(season),
+            'season': season if (season and not season.isdigit()) else None,
+            'episode': media.get('episode_title'),
+            'episode_number': int_or_none(media.get('episode')),
             'subtitles': subtitles,
         }
 
@@ -300,7 +302,8 @@ class RaiIE(RaiBaseIE):
             'thumbnail': r're:^https?://.*\.jpg$',
             'duration': 1758,
             'upload_date': '20140612',
-        }
+        },
+        'skip': 'This content is available only in Italy',
     }, {
         # with ContentItem in many metas
         'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html',
@@ -316,7 +319,7 @@
     }, {
         # with ContentItem in og:url
         'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html',
-        'md5': '11959b4e44fa74de47011b5799490adf',
+        'md5': '6865dd00cf0bbf5772fdd89d59bd768a',
         'info_dict': {
             'id': 'efb17665-691c-45d5-a60c-5301333cbb0c',
             'ext': 'mp4',
@@ -338,6 +341,7 @@
             'thumbnail': r're:^https?://.*\.jpg$',
             'upload_date': '20141221',
         },
+        'skip': 'This content is not available',
     }, {
         # initEdizione('ContentItem-...'
         'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
@@ -360,6 +364,7 @@
         'params': {
             'skip_download': True,
         },
+        'skip': 'This content is available only in Italy',
     }, {
         # HLS live stream with ContentItem in og:url
         'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',
-- 
GitLab

From 44676b32c351b02b26e50706df448bde8da4e207 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:09:54 +0100
Subject: [PATCH 006/384] [bandcamp] fix extraction

---
 haruhi_dl/extractor/bandcamp.py | 151 +++++++++++++-------------------
 1 file changed, 59 insertions(+), 92 deletions(-)

diff --git a/haruhi_dl/extractor/bandcamp.py b/haruhi_dl/extractor/bandcamp.py
index 9ac93645e..82b605531 100644
--- a/haruhi_dl/extractor/bandcamp.py
+++ b/haruhi_dl/extractor/bandcamp.py
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 import random
@@ -5,10 +6,7 @@ import re
 import time
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_str,
-    compat_urlparse,
-)
+from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     float_or_none,
@@ -17,30 +15,32 @@ from ..utils import (
     parse_filesize,
     str_or_none,
     try_get,
-    unescapeHTML,
     update_url_query,
     unified_strdate,
     unified_timestamp,
     url_or_none,
+    urljoin,
 )
 
 
 class BandcampIE(InfoExtractor):
-    _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
+    _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
     _TESTS = [{
         'url': 'http://haruhi-dl.bandcamp.com/track/haruhi-dl-test-song',
         'md5': 'c557841d5e50261777a6585648adf439',
         'info_dict': {
             'id': '1812978515',
             'ext': 'mp3',
-            'title': "haruhi-dl \"'/\\\u00e4\u21ad - haruhi-dl test song \"'/\\\u00e4\u21ad",
+            'title': "haruhi-dl \"'/\\ä↭ - haruhi-dl \"'/\\ä↭ - haruhi-dl test song \"'/\\ä↭",
             'duration': 9.8485,
+            'uploader': 'haruhi-dl "\'/\\ä↭',
+            'upload_date': '20121129',
+            'timestamp': 1354224127,
         },
         '_skip': 'There is a limit of 200 free downloads / month for the test song'
     }, {
         # free download
         'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
-        'md5': '853e35bf34aa1d6fe2615ae612564b36',
         'info_dict': {
             'id': '2650410135',
             'ext': 'aiff',
@@ -79,11 +79,16 @@ class BandcampIE(InfoExtractor):
         },
     }]
 
+    def _extract_data_attr(self, webpage, 
video_id, attr='tralbum', fatal=True): + return self._parse_json(self._html_search_regex( + r'data-%s=(["\'])({.+?})\1' % attr, webpage, + attr + ' data', group=2), video_id, fatal=fatal) + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') + title = self._match_id(url) webpage = self._download_webpage(url, title) - thumbnail = self._html_search_meta('og:image', webpage, default=None) + tralbum = self._extract_data_attr(webpage, title) + thumbnail = self._og_search_thumbnail(webpage) track_id = None track = None @@ -91,10 +96,7 @@ class BandcampIE(InfoExtractor): duration = None formats = [] - track_info = self._parse_json( - self._search_regex( - r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', - webpage, 'track info', default='{}'), title) + track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict) if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -111,37 +113,25 @@ class BandcampIE(InfoExtractor): 'abr': int_or_none(abr_str), }) track = track_info.get('title') - track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) + track_id = str_or_none( + track_info.get('track_id') or track_info.get('id')) track_number = int_or_none(track_info.get('track_num')) duration = float_or_none(track_info.get('duration')) - def extract(key): - return self._search_regex( - r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key, - webpage, key, default=None, group='value') - - artist = extract('artist') - album = extract('album_title') + embed = self._extract_data_attr(webpage, title, 'embed', False) + current = tralbum.get('current') or {} + artist = embed.get('artist') or current.get('artist') or tralbum.get('artist') timestamp = unified_timestamp( - extract('publish_date') or extract('album_publish_date')) - release_date = unified_strdate(extract('album_release_date')) + current.get('publish_date') or tralbum.get('album_publish_date')) - download_link = self._search_regex( - r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'download link', default=None, group='url') + download_link = tralbum.get('freeDownloadPage') if download_link: - track_id = self._search_regex( - r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', - webpage, 'track id') + track_id = compat_str(tralbum['id']) download_webpage = self._download_webpage( download_link, track_id, 'Downloading free downloads page') - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, - 'blob', group='blob'), - track_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(download_webpage, track_id, 'blob') info = try_get( blob, (lambda x: x['digital_items'][0], @@ -207,20 +197,20 @@ class BandcampIE(InfoExtractor): 'thumbnail': thumbnail, 'uploader': artist, 'timestamp': timestamp, - 'release_date': release_date, + 'release_date': unified_strdate(tralbum.get('album_release_date')), 'duration': duration, 'track': track, 'track_number': track_number, 'track_id': track_id, 'artist': artist, - 'album': album, + 'album': embed.get('album_title'), 'formats': formats, } -class BandcampAlbumIE(InfoExtractor): +class BandcampAlbumIE(BandcampIE): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?' 
_TESTS = [{ 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -230,7 +220,10 @@ class BandcampAlbumIE(InfoExtractor): 'info_dict': { 'id': '1353101989', 'ext': 'mp3', - 'title': 'Intro', + 'title': 'Blazo - Intro', + 'timestamp': 1311756226, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, { @@ -238,7 +231,10 @@ class BandcampAlbumIE(InfoExtractor): 'info_dict': { 'id': '38097443', 'ext': 'mp3', - 'title': 'Kero One - Keep It Alive (Blazo remix)', + 'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)', + 'timestamp': 1311757238, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, ], @@ -294,41 +290,31 @@ class BandcampAlbumIE(InfoExtractor): else super(BandcampAlbumIE, cls).suitable(url)) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - uploader_id = mobj.group('subdomain') - album_id = mobj.group('album_id') + uploader_id, album_id = re.match(self._VALID_URL, url).groups() playlist_id = album_id or uploader_id webpage = self._download_webpage(url, playlist_id) - track_elements = re.findall( - r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage) - if not track_elements: + tralbum = self._extract_data_attr(webpage, playlist_id) + track_info = tralbum.get('trackinfo') + if not track_info: raise ExtractorError('The page doesn\'t contain any tracks') # Only tracks with duration info have songs entries = [ self.url_result( - compat_urlparse.urljoin(url, t_path), - ie=BandcampIE.ie_key(), - video_title=self._search_regex( - r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', - elem_content, 'track title', fatal=False)) - for elem_content, t_path in track_elements - if self._html_search_meta('duration', elem_content, default=None)] - - title = self._html_search_regex( - r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', - webpage, 'title', fatal=False) - if title: - title = title.replace(r'\"', '"') + urljoin(url, t['title_link']), BandcampIE.ie_key(), + str_or_none(t.get('track_id') or t.get('id')), t.get('title')) + for t in track_info + if t.get('duration')] + return { '_type': 'playlist', 'uploader_id': uploader_id, 'id': playlist_id, - 'title': title, + 'title': try_get(tralbum, lambda x: x['current']['title'], compat_str), 'entries': entries, } -class BandcampWeeklyIE(InfoExtractor): +class BandcampWeeklyIE(BandcampIE): IE_NAME = 'Bandcamp:weekly' _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' _TESTS = [{ @@ -343,29 +329,23 @@ class BandcampWeeklyIE(InfoExtractor): 'release_date': '20170404', 'series': 'Bandcamp Weekly', 'episode': 'Magic Moments', - 'episode_number': 208, 'episode_id': '224', - } + }, + 'params': { + 'format': 'opus-lo', + }, }, { 'url': 'https://bandcamp.com/?blah/blah@&show=228', 'only_matching': True }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage, - 'blob', group='blob'), - video_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(webpage, show_id, 'blob') - show = blob['bcw_show'] - - # This is desired because any invalid show id redirects to `bandcamp.com` - # which happens to expose the latest Bandcamp Weekly episode. 
- show_id = int_or_none(show.get('show_id')) or int_or_none(video_id) + show = blob['bcw_data'][show_id] formats = [] for format_id, format_url in show['audio_stream'].items(): @@ -390,20 +370,8 @@ class BandcampWeeklyIE(InfoExtractor): if subtitle: title += ' - %s' % subtitle - episode_number = None - seq = blob.get('bcw_seq') - - if seq and isinstance(seq, list): - try: - episode_number = next( - int_or_none(e.get('episode_number')) - for e in seq - if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) - except StopIteration: - pass - return { - 'id': video_id, + 'id': show_id, 'title': title, 'description': show.get('desc') or show.get('short_desc'), 'duration': float_or_none(show.get('audio_duration')), @@ -411,7 +379,6 @@ class BandcampWeeklyIE(InfoExtractor): 'release_date': unified_strdate(show.get('published_date')), 'series': 'Bandcamp Weekly', 'episode': show.get('subtitle'), - 'episode_number': episode_number, - 'episode_id': compat_str(video_id), + 'episode_id': show_id, 'formats': formats } -- GitLab From c62c95923ad915d07631afb2afe0ca2551f52681 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:10:03 +0100 Subject: [PATCH 007/384] [condenast] fix extraction and extract subtitles --- haruhi_dl/extractor/condenast.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/condenast.py b/haruhi_dl/extractor/condenast.py index ed278fefc..d5e77af32 100644 --- a/haruhi_dl/extractor/condenast.py +++ b/haruhi_dl/extractor/condenast.py @@ -16,6 +16,8 @@ from ..utils import ( mimetype2ext, orderedSet, parse_iso8601, + strip_or_none, + try_get, ) @@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor): 'uploader': 'gq', 'upload_date': '20170321', 'timestamp': 1490126427, + 'description': 'How much grimmer would things be if these people were competent?', }, }, { # JS embed @@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor): 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', 'uploader': 'arstechnica', 'upload_date': '20150916', - 'timestamp': 1442434955, + 'timestamp': 1442434920, } }, { 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', @@ -196,6 +199,13 @@ class CondeNastIE(InfoExtractor): }) self._sort_formats(formats) + subtitles = {} + for t, caption in video_info.get('captions', {}).items(): + caption_url = caption.get('src') + if not (t in ('vtt', 'srt', 'tml') and caption_url): + continue + subtitles.setdefault('en', []).append({'url': caption_url}) + return { 'id': video_id, 'formats': formats, @@ -208,6 +218,7 @@ class CondeNastIE(InfoExtractor): 'season': video_info.get('season_title'), 'timestamp': parse_iso8601(video_info.get('premiere_date')), 'categories': video_info.get('categories'), + 'subtitles': subtitles, } def _real_extract(self, url): @@ -225,8 +236,16 @@ class CondeNastIE(InfoExtractor): if url_type == 'series': return self._extract_series(url, webpage) else: - params = self._extract_video_params(webpage, display_id) - info = self._search_json_ld( - webpage, display_id, fatal=False) + video = try_get(self._parse_json(self._search_regex( + r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage, + 'preload state', '{}'), display_id), + lambda x: x['transformed']['video']) + if video: + params = {'videoId': video['id']} + info = {'description': strip_or_none(video.get('description'))} + else: + params = self._extract_video_params(webpage, display_id) + info = self._search_json_ld( + webpage, display_id, 
fatal=False) info.update(self._extract_video(params)) return info -- GitLab From 2901a6439ba3b9a36ae9c6b159d69eceaeb8e55c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:10:38 +0100 Subject: [PATCH 008/384] [lrt] fix extraction --- haruhi_dl/extractor/lrt.py | 91 +++++++++++++++----------------------- 1 file changed, 36 insertions(+), 55 deletions(-) diff --git a/haruhi_dl/extractor/lrt.py b/haruhi_dl/extractor/lrt.py index f5c997ef4..a89434adb 100644 --- a/haruhi_dl/extractor/lrt.py +++ b/haruhi_dl/extractor/lrt.py @@ -5,28 +5,26 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, - int_or_none, - parse_duration, - remove_end, + clean_html, + merge_dicts, ) class LRTIE(InfoExtractor): IE_NAME = 'lrt.lt' - _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))' _TESTS = [{ # m3u8 download - 'url': 'http://www.lrt.lt/mediateka/irasas/54391/', - 'md5': 'fe44cf7e4ab3198055f2c598fc175cb0', + 'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene', + 'md5': '85cb2bb530f31d91a9c65b479516ade4', 'info_dict': { - 'id': '54391', + 'id': '2000127261', 'ext': 'mp4', - 'title': 'Septynios Kauno dienos', - 'description': 'md5:24d84534c7dc76581e59f5689462411a', - 'duration': 1783, - 'view_count': int, - 'like_count': int, + 'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė', + 'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa', + 'duration': 3035, + 'timestamp': 1604079000, + 'upload_date': '20201030', }, }, { # direct mp3 download @@ -43,52 +41,35 @@ class LRTIE(InfoExtractor): }, }] + def _extract_js_var(self, webpage, var_name, default): + return self._search_regex( + r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name, + webpage, var_name.replace('_', ' '), default, group=2) + def _real_extract(self, url): - video_id = self._match_id(url) + path, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' - LRT') - - formats = [] - for _, file_url in re.findall( - r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): - ext = determine_ext(file_url) - if ext not in ('m3u8', 'mp3'): - continue - # mp3 served as m3u8 produces stuttered media file - if ext == 'm3u8' and '.mp3' in file_url: - continue - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - file_url, video_id, 'mp4', entry_protocol='m3u8_native', - fatal=False)) - elif ext == 'mp3': - formats.append({ - 'url': file_url, - 'vcodec': 'none', - }) - self._sort_formats(formats) + media_url = self._extract_js_var(webpage, 'main_url', path) + media = self._download_json(self._extract_js_var( + webpage, 'media_info_url', + 'https://www.lrt.lt/servisai/stream_url/vod/media_info/'), + video_id, query={'url': media_url}) + jw_data = self._parse_jwplayer_data( + media['playlist_item'], video_id, base_url=url) - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description(webpage) - duration = parse_duration(self._search_regex( - r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1', - webpage, 'duration', default=None, group='duration')) + json_ld_data = self._search_json_ld(webpage, video_id) - view_count = int_or_none(self._html_search_regex( - 
r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>', - webpage, 'view count', fatal=False, group='count')) - like_count = int_or_none(self._search_regex( - r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<', - webpage, 'like count', fatal=False, group='count')) + tags = [] + for tag in media.get('tags', []): + tag_name = tag.get('name') + if not tag_name: + continue + tags.append(tag_name) - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, + clean_info = { + 'description': clean_html(media.get('content')), + 'tags': tags, } + + return merge_dicts(clean_info, jw_data, json_ld_data) -- GitLab From bc38ef944526c0217e1c60351d920e22a233b5b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:10:46 +0100 Subject: [PATCH 009/384] [utils] Skip ! prefixed code in js_to_json --- haruhi_dl/utils.py | 5 +++-- test/test_utils.py | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/utils.py b/haruhi_dl/utils.py index 6a02c6f0d..2bba1b04c 100644 --- a/haruhi_dl/utils.py +++ b/haruhi_dl/utils.py @@ -4080,7 +4080,7 @@ def js_to_json(code): v = m.group(0) if v in ('true', 'false', 'null'): return v - elif v.startswith('/*') or v.startswith('//') or v == ',': + elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',': return "" if v[0] in ("'", '"'): @@ -4105,7 +4105,8 @@ def js_to_json(code): {comment}|,(?={skip}[\]}}])| (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*| \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| - [0-9]+(?={skip}:) + [0-9]+(?={skip}:)| + !+ '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) diff --git a/test/test_utils.py b/test/test_utils.py index a57863825..fcb86d92a 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -937,6 +937,28 @@ class TestUtil(unittest.TestCase): self.assertEqual(d['x'], 1) self.assertEqual(d['y'], 'a') + # Just drop ! 
prefix for now though this results in a wrong value + on = js_to_json('''{ + a: !0, + b: !1, + c: !!0, + d: !!42.42, + e: !!![], + f: !"abc", + g: !"", + !42: 42 + }''') + self.assertEqual(json.loads(on), { + 'a': 0, + 'b': 1, + 'c': 0, + 'd': 42.42, + 'e': [], + 'f': "abc", + 'g': "", + '42': 42 + }) + on = js_to_json('["abc", "def",]') self.assertEqual(json.loads(on), ['abc', 'def']) -- GitLab From 51dd5a4cc51e420bc93919062f3efeb50e581469 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:10:56 +0100 Subject: [PATCH 010/384] [xtube] Fix extraction (closes #26996) --- haruhi_dl/extractor/xtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/xtube.py b/haruhi_dl/extractor/xtube.py index 01b253dcb..18969058f 100644 --- a/haruhi_dl/extractor/xtube.py +++ b/haruhi_dl/extractor/xtube.py @@ -90,7 +90,7 @@ class XTubeIE(InfoExtractor): title, thumbnail, duration = [None] * 3 config = self._parse_json(self._search_regex( - r'playerConf\s*=\s*({.+?})\s*,\s*\n', webpage, 'config', + r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config', default='{}'), video_id, transform_source=js_to_json, fatal=False) if config: config = config.get('mainRoll') -- GitLab From 058b02f57f04e4b443bd2073eeb56d5f680fbd99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:11:04 +0100 Subject: [PATCH 011/384] =?UTF-8?q?[servus]=20Fix=20extraction=20(closes?= =?UTF-8?q?=20#26872,=20closes=20#26967,=20closes=20#26983,=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … closes #27000) --- haruhi_dl/extractor/servus.py | 106 +++++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 15 deletions(-) diff --git a/haruhi_dl/extractor/servus.py b/haruhi_dl/extractor/servus.py index 9401bf2cf..206bc1801 100644 --- a/haruhi_dl/extractor/servus.py +++ b/haruhi_dl/extractor/servus.py @@ -1,9 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + unified_timestamp, + urlencode_postdata, + url_or_none, +) class ServusIE(InfoExtractor): @@ -19,13 +25,22 @@ class ServusIE(InfoExtractor): _TESTS = [{ # new URL schema 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', - 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', + 'md5': '60474d4c21f3eb148838f215c37f02b9', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', 'ext': 'mp4', 'title': 'Die Grünen aus Sicht des Volkes', + 'alt_title': 'Talk im Hangar-7 Voxpops Gruene', 'description': 'md5:1247204d85783afe3682644398ff2ec4', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 62.442, + 'timestamp': 1605193976, + 'upload_date': '20201112', + 'series': 'Talk im Hangar-7', + 'season': 'Season 9', + 'season_number': 9, + 'episode': 'Episode 31 - September 14', + 'episode_number': 31, } }, { # old URL schema @@ -44,26 +59,87 @@ class ServusIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url).upper() - webpage = self._download_webpage(url, video_id) - title = self._search_regex( - (r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', - r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'), - webpage, 'title', default=None, - group='title') or self._og_search_title(webpage) - title = re.sub(r'\s*-\s*Servus TV\s*$', '', title) - 
description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + token = self._download_json( + 'https://auth.redbullmediahouse.com/token', video_id, + 'Downloading token', data=urlencode_postdata({ + 'grant_type': 'client_credentials', + }), headers={ + 'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==', + }) + access_token = token['access_token'] + token_type = token.get('token_type', 'Bearer') + + video = self._download_json( + 'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id, + video_id, 'Downloading video JSON', headers={ + 'Authorization': '%s %s' % (token_type, access_token), + }) - formats = self._extract_m3u8_formats( - 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id, - video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + formats = [] + thumbnail = None + for resource in video['resources']: + if not isinstance(resource, dict): + continue + format_url = url_or_none(resource.get('url')) + if not format_url: + continue + extension = resource.get('extension') + type_ = resource.get('type') + if extension == 'jpg' or type_ == 'reference_keyframe': + thumbnail = format_url + continue + ext = determine_ext(format_url) + if type_ == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + elif type_ == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif extension == 'mp4' or ext == 'mp4': + formats.append({ + 'url': format_url, + 'format_id': type_, + 'width': int_or_none(resource.get('width')), + 'height': int_or_none(resource.get('height')), + }) self._sort_formats(formats) + attrs = {} + for attribute in video['attributes']: + if not isinstance(attribute, dict): + continue + key = attribute.get('fieldKey') + value = attribute.get('fieldValue') + if not key or not value: + continue + attrs[key] = value + + title = attrs.get('title_stv') or video_id + alt_title = attrs.get('title') + description = attrs.get('long_description') or attrs.get('short_description') + series = attrs.get('label') + season = attrs.get('season') + episode = attrs.get('chapter') + duration = float_or_none(attrs.get('duration'), scale=1000) + season_number = int_or_none(self._search_regex( + r'Season (\d+)', season or '', 'season number', default=None)) + episode_number = int_or_none(self._search_regex( + r'Episode (\d+)', episode or '', 'episode number', default=None)) + return { 'id': video_id, 'title': title, + 'alt_title': alt_title, 'description': description, 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': unified_timestamp(video.get('lastPublished')), + 'series': series, + 'season': season, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, 'formats': formats, } -- GitLab From ae004ab316f3d44521833f333089c410496c0a93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:11:12 +0100 Subject: [PATCH 012/384] [servus] Add support for pm-wissen.com (closes #25869) --- haruhi_dl/extractor/servus.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/servus.py b/haruhi_dl/extractor/servus.py index 206bc1801..1610ddc2c 100644 --- a/haruhi_dl/extractor/servus.py +++ b/haruhi_dl/extractor/servus.py @@ -18,7 +18,7 @@ class 
ServusIE(InfoExtractor): (?:www\.)? (?: servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| - servustv\.com/videos + (?:servustv|pm-wissen)\.com/videos ) /(?P<id>[aA]{2}-\w+|\d+-\d+) ''' @@ -55,6 +55,9 @@ class ServusIE(InfoExtractor): }, { 'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/', 'only_matching': True, + }, { + 'url': 'https://www.pm-wissen.com/videos/aa-24mus4g2w2112/', + 'only_matching': True, }] def _real_extract(self, url): -- GitLab From 883cf213dc5155e01cde5c2060033589cfb81fba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:11:26 +0100 Subject: [PATCH 013/384] [ndr:embed:base] Extract subtitles (closes #25447, closes #26106) --- haruhi_dl/extractor/ndr.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/haruhi_dl/extractor/ndr.py b/haruhi_dl/extractor/ndr.py index 2447c812e..ddd828d92 100644 --- a/haruhi_dl/extractor/ndr.py +++ b/haruhi_dl/extractor/ndr.py @@ -81,6 +81,29 @@ class NDRIE(NDRBaseIE): 'params': { 'skip_download': True, }, + }, { + # with subtitles + 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', + 'info_dict': { + 'id': 'extra18674', + 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', + 'ext': 'mp4', + 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', + 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6', + 'uploader': 'ndrtv', + 'upload_date': '20201113', + 'duration': 1749, + 'subtitles': { + 'de': [{ + 'ext': 'ttml', + 'url': r're:^https://www\.ndr\.de.+', + }], + }, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html', 'only_matching': True, @@ -239,6 +262,20 @@ class NDREmbedBaseIE(InfoExtractor): 'preference': quality_key(thumbnail.get('quality')), }) + subtitles = {} + tracks = config.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + track_url = urljoin(url, track.get('src')) + if not track_url: + continue + subtitles.setdefault(track.get('srclang') or 'de', []).append({ + 'url': track_url, + 'ext': 'ttml', + }) + return { 'id': video_id, 'title': title, @@ -248,6 +285,7 @@ class NDREmbedBaseIE(InfoExtractor): 'duration': duration, 'thumbnails': thumbnails, 'formats': formats, + 'subtitles': subtitles, } -- GitLab From ebc218c4c4fa5db963a4407b80e3e39456eb4326 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:11:39 +0100 Subject: [PATCH 014/384] [lrt] fix extraction with empty tags(closes #20264) --- haruhi_dl/extractor/lrt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/lrt.py b/haruhi_dl/extractor/lrt.py index a89434adb..89d549858 100644 --- a/haruhi_dl/extractor/lrt.py +++ b/haruhi_dl/extractor/lrt.py @@ -61,7 +61,7 @@ class LRTIE(InfoExtractor): json_ld_data = self._search_json_ld(webpage, video_id) tags = [] - for tag in media.get('tags', []): + for tag in (media.get('tags') or []): tag_name = tag.get('name') if not tag_name: continue -- GitLab From 768e8bb238e9bd511f4faf197379d68d85923ff1 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:11:55 +0100 Subject: [PATCH 015/384] [urplay] fix extraction(closes #26828) --- haruhi_dl/extractor/urplay.py | 77 
+++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/haruhi_dl/extractor/urplay.py b/haruhi_dl/extractor/urplay.py index 6030b7cb5..10b817760 100644 --- a/haruhi_dl/extractor/urplay.py +++ b/haruhi_dl/extractor/urplay.py @@ -2,7 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_timestamp +from ..utils import ( + dict_get, + int_or_none, + unified_timestamp, +) class URPlayIE(InfoExtractor): @@ -15,8 +19,8 @@ class URPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd', 'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a', - 'timestamp': 1513512768, - 'upload_date': '20171217', + 'timestamp': 1513292400, + 'upload_date': '20171214', }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', @@ -25,7 +29,7 @@ class URPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'Tripp, Trapp, Träd : Sovkudde', 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', - 'timestamp': 1440093600, + 'timestamp': 1440086400, 'upload_date': '20150820', }, }, { @@ -35,37 +39,58 @@ class URPlayIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - + url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) - urplayer_data = self._parse_json(self._search_regex( - r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id) - host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] + urplayer_data = self._parse_json(self._html_search_regex( + r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['currentProduct'] + episode = urplayer_data['title'] + raw_streaming_info = urplayer_data['streamingInfo']['raw'] + host = self._download_json( + 'http://streaming-loadbalancer.ur.se/loadbalancer.json', + video_id)['redirect'] formats = [] - for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)): - file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr) + for k, v in raw_streaming_info.items(): + if not (k in ('sd', 'hd') and isinstance(v, dict)): + continue + file_http = v.get('location') if file_http: formats.extend(self._extract_wowza_formats( - 'http://%s/%splaylist.m3u8' % (host, file_http), video_id, skip_protocols=['rtmp', 'rtsp'])) + 'http://%s/%splaylist.m3u8' % (host, file_http), + video_id, skip_protocols=['f4m', 'rtmp', 'rtsp'])) self._sort_formats(formats) - subtitles = {} - for subtitle in urplayer_data.get('subtitles', []): - subtitle_url = subtitle.get('file') - kind = subtitle.get('kind') - if not subtitle_url or (kind and kind != 'captions'): - continue - subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({ - 'url': subtitle_url, - }) + image = urplayer_data.get('image') or {} + thumbnails = [] + for k, v in image.items(): + t = { + 'id': k, + 'url': v, + } + wh = k.split('x') + if len(wh) == 2: + t.update({ + 'width': int_or_none(wh[0]), + 'height': int_or_none(wh[1]), + }) + thumbnails.append(t) + + series = urplayer_data.get('series') or {} + series_title = dict_get(series, ('seriesTitle', 'title')) or dict_get(urplayer_data, ('seriesTitle', 'mainTitle')) return { 'id': video_id, - 'title': urplayer_data['title'], - 'description': self._og_search_description(webpage), - 'thumbnail': 
urplayer_data.get('image'), - 'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), webpage, 'timestamp')), - 'series': urplayer_data.get('series_title'), - 'subtitles': subtitles, + 'title': '%s : %s' % (series_title, episode) if series_title else episode, + 'description': urplayer_data.get('description'), + 'thumbnails': thumbnails, + 'timestamp': unified_timestamp(urplayer_data.get('publishedAt')), + 'series': series_title, 'formats': formats, + 'duration': int_or_none(urplayer_data.get('duration')), + 'categories': urplayer_data.get('categories'), + 'tags': urplayer_data.get('keywords'), + 'season': series.get('label'), + 'episode': episode, + 'episode_number': int_or_none(urplayer_data.get('episodeNumber')), } -- GitLab From 4826425743e6e22d7f5c7d01c0f10e29e86382ba Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:12:02 +0100 Subject: [PATCH 016/384] [bandcamp] extract playlist_description(closes #22684) --- haruhi_dl/extractor/bandcamp.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/bandcamp.py b/haruhi_dl/extractor/bandcamp.py index 82b605531..4c6b55035 100644 --- a/haruhi_dl/extractor/bandcamp.py +++ b/haruhi_dl/extractor/bandcamp.py @@ -270,6 +270,7 @@ class BandcampAlbumIE(BandcampIE): 'title': '"Entropy" EP', 'uploader_id': 'jstrecords', 'id': 'entropy-ep', + 'description': 'md5:0ff22959c943622972596062f2f366a5', }, 'playlist_mincount': 3, }, { @@ -279,6 +280,7 @@ class BandcampAlbumIE(BandcampIE): 'id': 'we-are-the-plague', 'title': 'WE ARE THE PLAGUE', 'uploader_id': 'insulters', + 'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f', }, 'playlist_count': 2, }] @@ -305,11 +307,14 @@ class BandcampAlbumIE(BandcampIE): for t in track_info if t.get('duration')] + current = tralbum.get('current') or {} + return { '_type': 'playlist', 'uploader_id': uploader_id, 'id': playlist_id, - 'title': try_get(tralbum, lambda x: x['current']['title'], compat_str), + 'title': current.get('title'), + 'description': current.get('about'), 'entries': entries, } -- GitLab From 14539655d5a3cf2f7fbebca02a427b7beac74fa6 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:12:10 +0100 Subject: [PATCH 017/384] [malltv] fix extraction(closes #27035) --- haruhi_dl/extractor/malltv.py | 60 +++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/haruhi_dl/extractor/malltv.py b/haruhi_dl/extractor/malltv.py index 6f4fd927f..fadfd9338 100644 --- a/haruhi_dl/extractor/malltv.py +++ b/haruhi_dl/extractor/malltv.py @@ -1,10 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import merge_dicts +from ..utils import ( + clean_html, + dict_get, + float_or_none, + int_or_none, + merge_dicts, + parse_duration, + try_get, +) class MallTVIE(InfoExtractor): @@ -17,7 +23,7 @@ class MallTVIE(InfoExtractor): 'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', 'ext': 'mp4', 'title': '18 miliard pro neziskovky. 
Opravdu jsou sportovci nebo Člověk v tísni pijavice?', - 'description': 'md5:25fc0ec42a72ba602b602c683fa29deb', + 'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35', 'duration': 216, 'timestamp': 1538870400, 'upload_date': '20181007', @@ -37,20 +43,46 @@ class MallTVIE(InfoExtractor): webpage = self._download_webpage( url, display_id, headers=self.geo_verification_headers()) - SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b' + video = self._parse_json(self._search_regex( + r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);', + webpage, 'video object'), display_id) + video_source = video['VideoSource'] video_id = self._search_regex( - SOURCE_RE, webpage, 'video id', group='id') + r'/([\da-z]+)/index\b', video_source, 'video id') + + formats = self._extract_m3u8_formats( + video_source + '.m3u8', video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + + subtitles = {} + for s in (video.get('Subtitles') or {}): + s_url = s.get('Url') + if not s_url: + continue + subtitles.setdefault(s.get('Language') or 'cz', []).append({ + 'url': s_url, + }) + + entity_counts = video.get('EntityCounts') or {} - media = self._parse_html5_media_entries( - url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id, - m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0] + def get_count(k): + v = entity_counts.get(k + 's') or {} + return int_or_none(dict_get(v, ('Count', 'StrCount'))) info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(media, info, { + return merge_dicts({ 'id': video_id, 'display_id': display_id, - 'title': self._og_search_title(webpage, default=None) or display_id, - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - }) + 'title': video.get('Title'), + 'description': clean_html(video.get('Description')), + 'thumbnail': video.get('ThumbnailUrl'), + 'formats': formats, + 'subtitles': subtitles, + 'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')), + 'view_count': get_count('View'), + 'like_count': get_count('Like'), + 'dislike_count': get_count('Dislike'), + 'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])), + 'comment_count': get_count('Comment'), + }, info) -- GitLab From 9f47f2a04e5c04510a18c15f1d785eceeb428a03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:12:29 +0100 Subject: [PATCH 018/384] [spiegel] Fix extraction (closes #24206, closes #24767) Code picked from PR #24767 since original repo is not available due to takedown. 
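
The rewrite drops the Nexx/spiegel.tv code paths entirely: the new extractor
only scrapes the JWPlatform media id out of the page and defers everything
else to JWPlatformIE via a url_transparent result. A rough, self-contained
sketch of that matching step follows (the sample JSON snippet and the helper
name are illustrative only, not part of the patch):

    # Illustration only: locate the JWPlatform media id the way the new
    # extractor does, using the same tempered regex from the diff below.
    import re

    def find_media_id(webpage):
        # Group 1/2 capture the quote characters so mixed quoting styles
        # in the page source still match.
        m = re.search(
            r'("|["\'])mediaId\1\s*:\s*("|["\'])(?P<id>(?:(?!\2).)+)\2',
            webpage)
        return m.group('id') if m else None

    assert find_media_id('{"mediaId": "II0BUyxY"}') == 'II0BUyxY'
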
--- haruhi_dl/extractor/extractors.py | 3 +- haruhi_dl/extractor/spiegel.py | 153 +++++------------------------- haruhi_dl/extractor/spiegeltv.py | 17 ---- 3 files changed, 25 insertions(+), 148 deletions(-) delete mode 100644 haruhi_dl/extractor/spiegeltv.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 412d02955..1341b84bd 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1101,8 +1101,7 @@ from .spankbang import ( SpankBangPlaylistIE, ) from .spankwire import SpankwireIE -from .spiegel import SpiegelIE, SpiegelArticleIE -from .spiegeltv import SpiegeltvIE +from .spiegel import SpiegelIE from .spike import ( BellatorIE, ParamountNetworkIE, diff --git a/haruhi_dl/extractor/spiegel.py b/haruhi_dl/extractor/spiegel.py index 4df7f4ddc..2da32b9b2 100644 --- a/haruhi_dl/extractor/spiegel.py +++ b/haruhi_dl/extractor/spiegel.py @@ -1,159 +1,54 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .spiegeltv import SpiegeltvIE -from ..compat import compat_urlparse -from ..utils import ( - parse_duration, - strip_or_none, - unified_timestamp, -) +from .jwplatform import JWPlatformIE class SpiegelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', - 'md5': 'b57399839d055fccfeb9a0455c439868', + 'md5': '50c7948883ec85a3e431a0a44b7ad1d6', 'info_dict': { - 'id': '563747', + 'id': 'II0BUyxY', + 'display_id': '1259285', 'ext': 'mp4', - 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', + 'title': 'Vulkan Tungurahua in Ecuador ist wieder aktiv - DER SPIEGEL - Wissenschaft', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', - 'duration': 49, + 'duration': 48.0, 'upload_date': '20130311', - 'timestamp': 1362994320, + 'timestamp': 1362997920, }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', - 'md5': '5b6c2f4add9d62912ed5fc78a1faed80', - 'info_dict': { - 'id': '580988', - 'ext': 'mp4', - 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', - 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', - 'duration': 983, - 'upload_date': '20131115', - 'timestamp': 1384546642, - }, + 'only_matching': True, }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', - 'md5': '97b91083a672d72976faa8433430afb9', - 'info_dict': { - 'id': '601883', - 'ext': 'mp4', - 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. 
Hier kommen seine Antworten auf die besten sechs Fragen.', - 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', - 'upload_date': '20140904', - 'timestamp': 1409834160, - } + 'url': 'https://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html', + 'only_matching': True, }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', + 'url': 'https://www.spiegel.de/panorama/urteile-im-goldmuenzenprozess-haftstrafen-fuer-clanmitglieder-a-aae8df48-43c1-4c61-867d-23f0a2d254b7', 'only_matching': True, }, { - # nexx video 'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html', 'only_matching': True, + }, { + 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id - handle = self._request_webpage(metadata_url, video_id) - - # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html - if SpiegeltvIE.suitable(handle.geturl()): - return self.url_result(handle.geturl(), 'Spiegeltv') - - video_data = self._parse_json(self._webpage_read_content( - handle, metadata_url, video_id), video_id) - title = video_data['title'] - nexx_id = video_data['nexxOmniaId'] - domain_id = video_data.get('nexxOmniaDomain') or '748' - + webpage = self._download_webpage(url, video_id) + media_id = self._html_search_regex( + r'("|["\'])mediaId\1\s*:\s*("|["\'])(?P<id>(?:(?!\2).)+)\2', + webpage, 'media id', group='id') return { '_type': 'url_transparent', 'id': video_id, - 'url': 'nexx:%s:%s' % (domain_id, nexx_id), - 'title': title, - 'description': strip_or_none(video_data.get('teaser')), - 'duration': parse_duration(video_data.get('duration')), - 'timestamp': unified_timestamp(video_data.get('datum')), - 'ie_key': NexxIE.ie_key(), + 'display_id': video_id, + 'url': 'jwplatform:%s' % media_id, + 'title': self._og_search_title(webpage, default=None), + 'ie_key': JWPlatformIE.ie_key(), } - - -class SpiegelArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' - IE_NAME = 'Spiegel:Article' - IE_DESC = 'Articles on spiegel.de' - _TESTS = [{ - 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html', - 'info_dict': { - 'id': '1516455', - 'ext': 'mp4', - 'title': 'Faszination Badminton: Nennt es bloß nicht Federball', - 'description': 're:^Patrick Kämnitz gehört.{100,}', - 'upload_date': '20140825', - }, - }, { - 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', - 'info_dict': { - - }, - 'playlist_count': 6, - }, { - # Nexx iFrame embed - 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html', - 'info_dict': { - 'id': '161464', - 'ext': 'mp4', - 'title': 'Nervenkitzel Achterbahn', - 'alt_title': 'Karussellbauer in Deutschland', - 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', - 'release_year': 2005, - 'creator': 'SPIEGEL TV', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2761, - 'timestamp': 1394021479, - 'upload_date': '20140305', - }, - 'params': { - 'format': 'bestvideo', - 
'skip_download': True, - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - # Single video on top of the page - video_link = self._search_regex( - r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage, - 'video page URL', default=None) - if video_link: - video_url = compat_urlparse.urljoin( - self.http_scheme() + '//spiegel.de/', video_link) - return self.url_result(video_url) - - # Multiple embedded videos - embeds = re.findall( - r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"', - webpage) - entries = [ - self.url_result(compat_urlparse.urljoin( - self.http_scheme() + '//spiegel.de/', embed_path)) - for embed_path in embeds] - if embeds: - return self.playlist_result(entries) - - return self.playlist_from_matches( - NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key()) diff --git a/haruhi_dl/extractor/spiegeltv.py b/haruhi_dl/extractor/spiegeltv.py deleted file mode 100644 index 6ccf4c342..000000000 --- a/haruhi_dl/extractor/spiegeltv.py +++ /dev/null @@ -1,17 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from .nexx import NexxIE - - -class SpiegeltvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)' - _TEST = { - 'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/', - 'only_matching': True, - } - - def _real_extract(self, url): - return self.url_result( - 'https://api.nexx.cloud/v3/748/videos/byid/%s' - % self._match_id(url), ie=NexxIE.ie_key()) -- GitLab From ff92752e7c448d8b7b7c8d3d6f98b7e02ae78726 Mon Sep 17 00:00:00 2001 From: gdzx <6490707+gdzx@users.noreply.github.com> Date: Fri, 26 Feb 2021 14:12:41 +0100 Subject: [PATCH 019/384] [francetv] Add fallback video url extraction (#27047) Fallback on another API endpoint when no video formats are found. 
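
In outline, the fallback flow added here looks like the sketch below. The
endpoint, the query parameters and the 'statut' == 'ONLINE' filter are the
ones used in the diff; the collect_videos helper and the injected
download_json callable are illustrative names only, not part of the actual
extractor:

    def collect_videos(info, video_id, download_json):
        # Primary source: videos listed in the legacy API response.
        videos = [v for v in info.get('videos', [])
                  if v.get('statut') == 'ONLINE' and v.get('url')]
        if videos:
            return videos
        # Fall back on the newer player endpoint, once per device profile.
        for device_type in ('desktop', 'mobile'):
            fallback = download_json(
                'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id,
                query={'device_type': device_type, 'browser': 'chrome'})
            if fallback and fallback.get('video'):
                videos.append(fallback['video'])
        return videos

The real change also marks the fallback request as non-fatal, so extraction
still proceeds when the endpoint is unreachable.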
Closes ytdl-org#22561 --- haruhi_dl/extractor/francetv.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/francetv.py b/haruhi_dl/extractor/francetv.py index 8598576e5..f29fd2666 100644 --- a/haruhi_dl/extractor/francetv.py +++ b/haruhi_dl/extractor/francetv.py @@ -128,17 +128,37 @@ class FranceTVIE(InfoExtractor): is_live = None - formats = [] + videos = [] + for video in info['videos']: if video['statut'] != 'ONLINE': continue + if not video['url']: + continue + videos.append(video) + + if not videos: + for device_type in ['desktop', 'mobile']: + fallback_info = self._download_json( + 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, + video_id, 'Downloading fallback %s video JSON' % device_type, query={ + 'device_type': device_type, + 'browser': 'chrome', + }, fatal=False) + + if fallback_info and fallback_info.get('video'): + videos.append(fallback_info['video']) + + formats = [] + for video in videos: video_url = video['url'] if not video_url: continue if is_live is None: - is_live = (try_get( - video, lambda x: x['plages_ouverture'][0]['direct'], - bool) is True) or '/live.francetv.fr/' in video_url + is_live = ((try_get( + video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True) + or video.get('is_live') is True + or '/live.francetv.fr/' in video_url) format_id = video['format'] ext = determine_ext(video_url) if ext == 'f4m': @@ -154,6 +174,9 @@ class FranceTVIE(InfoExtractor): sign(video_url, format_id), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False)) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, @@ -166,6 +189,7 @@ class FranceTVIE(InfoExtractor): 'url': video_url, 'format_id': format_id, }) + self._sort_formats(formats) title = info['titre'] -- GitLab From 9a4014d3941329610cb6c3b3569d7304601a6969 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:12:48 +0100 Subject: [PATCH 020/384] [francetv] improve info extraction --- haruhi_dl/extractor/francetv.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/haruhi_dl/extractor/francetv.py b/haruhi_dl/extractor/francetv.py index f29fd2666..ab2280630 100644 --- a/haruhi_dl/extractor/francetv.py +++ b/haruhi_dl/extractor/francetv.py @@ -17,6 +17,7 @@ from ..utils import ( parse_duration, try_get, url_or_none, + urljoin, ) from .dailymotion import DailymotionIE @@ -130,10 +131,10 @@ class FranceTVIE(InfoExtractor): videos = [] - for video in info['videos']: - if video['statut'] != 'ONLINE': + for video in (info.get('videos') or []): + if video.get('statut') != 'ONLINE': continue - if not video['url']: + if not video.get('url'): continue videos.append(video) @@ -151,15 +152,15 @@ class FranceTVIE(InfoExtractor): formats = [] for video in videos: - video_url = video['url'] + video_url = video.get('url') if not video_url: continue if is_live is None: - is_live = ((try_get( - video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True) + is_live = (try_get( + video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True or video.get('is_live') is True or '/live.francetv.fr/' in video_url) - format_id = video['format'] + format_id = video.get('format') ext = determine_ext(video_url) if ext == 'f4m': if georestricted: @@ -209,10 +210,10 @@ class 
FranceTVIE(InfoExtractor): return { 'id': video_id, 'title': self._live_title(title) if is_live else title, - 'description': clean_html(info['synopsis']), - 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']), - 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), - 'timestamp': int_or_none(info['diffusion']['timestamp']), + 'description': clean_html(info.get('synopsis')), + 'thumbnail': urljoin('http://pluzz.francetv.fr', info.get('image')), + 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')), + 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])), 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, -- GitLab From e2b997d3bf703e0e4ab7cc7f6ace650b27202ace Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:13:00 +0100 Subject: [PATCH 021/384] =?UTF-8?q?[extractor/common]=20Output=20error=20f?= =?UTF-8?q?or=20invalid=20URLs=20in=20=5Fis=5Fvalid=5Furl=20(re=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …fs #21400, refs #24151, refs #25617, refs #25618, refs #25586, refs #26068, refs #27072) --- haruhi_dl/extractor/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py index 699aced61..fb616b05a 100644 --- a/haruhi_dl/extractor/common.py +++ b/haruhi_dl/extractor/common.py @@ -1474,9 +1474,10 @@ class InfoExtractor(object): try: self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) return True - except ExtractorError: + except ExtractorError as e: self.to_screen( - '%s: %s URL is invalid, skipping' % (video_id, item)) + '%s: %s URL is invalid, skipping: %s' + % (video_id, item, error_to_compat_str(e.cause))) return False def http_scheme(self): -- GitLab From f8fb19832619230796df57c2fd258c3c40f6094d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:13:08 +0100 Subject: [PATCH 022/384] [mgtv] fix format extraction(closes #26415) --- haruhi_dl/extractor/mgtv.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/haruhi_dl/extractor/mgtv.py b/haruhi_dl/extractor/mgtv.py index 71fc3ec56..cab3aa045 100644 --- a/haruhi_dl/extractor/mgtv.py +++ b/haruhi_dl/extractor/mgtv.py @@ -17,9 +17,8 @@ from ..utils import ( class MGTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html' + _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html' IE_DESC = '芒果TV' - _GEO_COUNTRIES = ['CN'] _TESTS = [{ 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', @@ -34,14 +33,18 @@ class MGTVIE(InfoExtractor): }, { 'url': 'http://www.mgtv.com/b/301817/3826653.html', 'only_matching': True, + }, { + 'url': 'https://w.mgtv.com/b/301817/3826653.html', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1] try: api_data = self._download_json( 'https://pcweb.api.mgtv.com/player/video', video_id, query={ - 'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1], + 'tk2': tk2, 'video_id': video_id, }, headers=self.geo_verification_headers())['data'] except 
ExtractorError as e: @@ -56,6 +59,7 @@ class MGTVIE(InfoExtractor): stream_data = self._download_json( 'https://pcweb.api.mgtv.com/player/getSource', video_id, query={ 'pm2': api_data['atc']['pm2'], + 'tk2': tk2, 'video_id': video_id, }, headers=self.geo_verification_headers())['data'] stream_domain = stream_data['stream_domain'][0] -- GitLab From 93064492e9ef1574e9a157afcfd21370996c0818 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:13:13 +0100 Subject: [PATCH 023/384] [arte] Extract m3u8 formats (closes #27061) --- haruhi_dl/extractor/arte.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/arte.py b/haruhi_dl/extractor/arte.py index 2bd3bfe8a..b80467548 100644 --- a/haruhi_dl/extractor/arte.py +++ b/haruhi_dl/extractor/arte.py @@ -11,6 +11,7 @@ from ..utils import ( qualities, try_get, unified_strdate, + url_or_none, ) # There are different sources of video in arte.tv, the extraction process @@ -63,8 +64,13 @@ class ArteTVBaseIE(InfoExtractor): langcode = LANGS.get(lang, lang) formats = [] + m3u8_formats = [] for format_id, format_dict in vsr.items(): f = dict(format_dict) + format_url = url_or_none(f.get('url')) + streamer = f.get('streamer') + if not format_url and not streamer: + continue versionCode = f.get('versionCode') l = re.escape(langcode) @@ -107,6 +113,15 @@ class ArteTVBaseIE(InfoExtractor): else: lang_pref = -1 + media_type = f.get('mediaType') + if media_type == 'hls': + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + for m3u8_format in m3u8_formats: + m3u8_format['language_preference'] = lang_pref + continue + format = { 'format_id': format_id, 'preference': -10 if f.get('videoFormat') == 'M3U8' else None, @@ -118,7 +133,7 @@ class ArteTVBaseIE(InfoExtractor): 'quality': qfunc(f.get('quality')), } - if f.get('mediaType') == 'rtmp': + if media_type == 'rtmp': format['url'] = f['streamer'] format['play_path'] = 'mp4:' + f['url'] format['ext'] = 'flv' @@ -128,6 +143,8 @@ class ArteTVBaseIE(InfoExtractor): formats.append(format) self._check_formats(formats, video_id) + + formats.extend(m3u8_formats) self._sort_formats(formats) info_dict['formats'] = formats -- GitLab From 1451f4f49864c0c07c9db01b9af8fc3730091b02 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:13:19 +0100 Subject: [PATCH 024/384] [arte] Rework extractors * Reimplement embed and playlist extractors to delegate to the single entrypoint artetv extractor Beware reluctant download archive extractor keys breakage. 
* Improve embeds detection (closes #27057) - Remove obsolete code --- haruhi_dl/extractor/arte.py | 154 ++++++++++++++++++------------ haruhi_dl/extractor/extractors.py | 2 +- haruhi_dl/extractor/generic.py | 9 +- 3 files changed, 100 insertions(+), 65 deletions(-) diff --git a/haruhi_dl/extractor/arte.py b/haruhi_dl/extractor/arte.py index b80467548..03abdbfaf 100644 --- a/haruhi_dl/extractor/arte.py +++ b/haruhi_dl/extractor/arte.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( ExtractorError, int_or_none, @@ -14,14 +17,44 @@ from ..utils import ( url_or_none, ) -# There are different sources of video in arte.tv, the extraction process -# is different for each one. The videos usually expire in 7 days, so we can't -# add tests. - class ArteTVBaseIE(InfoExtractor): - def _extract_from_json_url(self, json_url, video_id, lang, title=None): - info = self._download_json(json_url, video_id) + _ARTE_LANGUAGES = 'fr|de|en|es|it|pl' + _API_BASE = 'https://api.arte.tv/api/player/v1' + + +class ArteTVIE(ArteTVBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos| + api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s) + ) + /(?P<id>\d{6}-\d{3}-[AF]) + ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES} + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'info_dict': { + 'id': '088501-000-A', + 'ext': 'mp4', + 'title': 'Mexico: Stealing Petrol to Survive', + 'upload_date': '20190628', + }, + }, { + 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', + 'only_matching': True, + }, { + 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + lang = mobj.group('lang') or mobj.group('lang_2') + + info = self._download_json( + '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id) player_info = info['videoJsonPlayer'] vsr = try_get(player_info, lambda x: x['VSR'], dict) @@ -38,18 +71,11 @@ class ArteTVBaseIE(InfoExtractor): if not upload_date_str: upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] - title = (player_info.get('VTI') or title or player_info['VID']).strip() + title = (player_info.get('VTI') or player_info['VID']).strip() subtitle = player_info.get('VSU', '').strip() if subtitle: title += ' - %s' % subtitle - info_dict = { - 'id': player_info['VID'], - 'title': title, - 'description': player_info.get('VDE'), - 'upload_date': unified_strdate(upload_date_str), - 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), - } qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ']) LANGS = { @@ -64,7 +90,6 @@ class ArteTVBaseIE(InfoExtractor): langcode = LANGS.get(lang, lang) formats = [] - m3u8_formats = [] for format_id, format_dict in vsr.items(): f = dict(format_dict) format_url = url_or_none(f.get('url')) @@ -120,6 +145,7 @@ class ArteTVBaseIE(InfoExtractor): m3u8_id=format_id, fatal=False) for m3u8_format in m3u8_formats: m3u8_format['language_preference'] = lang_pref + formats.extend(m3u8_formats) continue format = { @@ -142,58 +168,50 @@ class ArteTVBaseIE(InfoExtractor): formats.append(format) - self._check_formats(formats, video_id) - - formats.extend(m3u8_formats) 
self._sort_formats(formats) - info_dict['formats'] = formats - return info_dict - + return { + 'id': player_info.get('VID') or video_id, + 'title': title, + 'description': player_info.get('VDE'), + 'upload_date': unified_strdate(upload_date_str), + 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), + 'formats': formats, + } -class ArteTVPlus7IE(ArteTVBaseIE): - IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])' +class ArteTVEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+' _TESTS = [{ - 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A', 'info_dict': { - 'id': '088501-000-A', + 'id': '100605-013-A', 'ext': 'mp4', - 'title': 'Mexico: Stealing Petrol to Survive', - 'upload_date': '20190628', + 'title': 'United we Stream November Lockdown Edition #13', + 'description': 'md5:be40b667f45189632b78c1425c7c2ce1', + 'upload_date': '20201116', }, + }, { + 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A', + 'only_matching': True, }] - def _real_extract(self, url): - lang, video_id = re.match(self._VALID_URL, url).groups() - return self._extract_from_json_url( - 'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id), - video_id, lang) - - -class ArteTVEmbedIE(ArteTVPlus7IE): - IE_NAME = 'arte.tv:embed' - _VALID_URL = r'''(?x) - https://www\.arte\.tv - /player/v3/index\.php\?json_url= - (?P<json_url> - https?://api\.arte\.tv/api/player/v1/config/ - (?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF]) - ) - ''' - - _TESTS = [] + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1', + webpage)] def _real_extract(self, url): - json_url, lang, video_id = re.match(self._VALID_URL, url).groups() - return self._extract_from_json_url(json_url, video_id, lang) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + json_url = qs['json_url'][0] + video_id = ArteTVIE._match_id(json_url) + return self.url_result( + json_url, ie=ArteTVIE.ie_key(), video_id=video_id) class ArteTVPlaylistIE(ArteTVBaseIE): - IE_NAME = 'arte.tv:playlist' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})' - + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', 'info_dict': { @@ -202,17 +220,35 @@ class ArteTVPlaylistIE(ArteTVBaseIE): 'description': 'md5:d322c55011514b3a7241f7fb80d494c2', }, 'playlist_mincount': 6, + }, { + 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/', + 'only_matching': True, }] def _real_extract(self, url): lang, playlist_id = re.match(self._VALID_URL, url).groups() collection = self._download_json( - 'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' - % (lang, playlist_id), playlist_id) + '%s/collectionData/%s/%s?source=videos' + % (self._API_BASE, lang, playlist_id), playlist_id) + entries = [] + for video in collection['videos']: + if not isinstance(video, dict): + continue + 
video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl')) + if not video_url: + continue + video_id = video.get('programId') + entries.append({ + '_type': 'url_transparent', + 'url': video_url, + 'id': video_id, + 'title': video.get('title'), + 'alt_title': video.get('subtitle'), + 'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)), + 'duration': int_or_none(video.get('durationSeconds')), + 'view_count': int_or_none(video.get('views')), + 'ie_key': ArteTVIE.ie_key(), + }) title = collection.get('title') description = collection.get('shortDescription') or collection.get('teaserText') - entries = [ - self._extract_from_json_url( - video['jsonUrl'], video.get('programId') or playlist_id, lang) - for video in collection['videos'] if video.get('jsonUrl')] return self.playlist_result(entries, playlist_id, title, description) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 1341b84bd..6a7fc43f4 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -68,7 +68,7 @@ from .ard import ( ARDMediathekIE, ) from .arte import ( - ArteTVPlus7IE, + ArteTVIE, ArteTVEmbedIE, ArteTVPlaylistIE, ) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index c81247dd0..babc59dcc 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -91,6 +91,7 @@ from .piksel import PikselIE from .videa import VideaIE from .twentymin import TwentyMinutenIE from .ustream import UstreamIE +from .arte import ArteTVEmbedIE from .videopress import VideoPressIE from .rutube import RutubeIE from .limelight import LimelightBaseIE @@ -2751,11 +2752,9 @@ class GenericIE(InfoExtractor): return self.url_result(ustream_url, UstreamIE.ie_key()) # Look for embedded arte.tv player - mobj = re.search( - r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'ArteTVEmbed') + arte_urls = ArteTVEmbedIE._extract_urls(webpage) + if arte_urls: + return self.playlist_from_matches(arte_urls, video_id, video_title) # Look for embedded francetv player mobj = re.search( -- GitLab From 3a32ea072b9837e75e989893229ca64ebc789438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:13:24 +0100 Subject: [PATCH 025/384] =?UTF-8?q?[youporn]=20Fix=20upload=20date=20extra?= =?UTF-8?q?ction=20and=20make=20comment=20count=20optional=20=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …(closes #26986) --- haruhi_dl/extractor/youporn.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/youporn.py b/haruhi_dl/extractor/youporn.py index c178e2f39..901651b8b 100644 --- a/haruhi_dl/extractor/youporn.py +++ b/haruhi_dl/extractor/youporn.py @@ -29,7 +29,6 @@ class YouPornIE(InfoExtractor): 'upload_date': '20101217', 'average_rating': int, 'view_count': int, - 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -48,7 +47,6 @@ class YouPornIE(InfoExtractor): 'upload_date': '20110418', 'average_rating': int, 'view_count': int, - 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -156,7 +154,8 @@ class YouPornIE(InfoExtractor): r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>', webpage, 'uploader', fatal=False) upload_date = 
unified_strdate(self._html_search_regex( - [r'Date\s+[Aa]dded:\s*<span>([^<]+)', + [r'UPLOADED:\s*<span>([^<]+)', + r'Date\s+[Aa]dded:\s*<span>([^<]+)', r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'], webpage, 'upload date', fatal=False)) @@ -171,7 +170,7 @@ class YouPornIE(InfoExtractor): webpage, 'view count', fatal=False, group='count')) comment_count = str_to_int(self._search_regex( r'>All [Cc]omments? \(([\d,.]+)\)', - webpage, 'comment count', fatal=False)) + webpage, 'comment count', default=None)) def extract_tag_box(regex, title): tag_box = self._search_regex(regex, webpage, title, default=None) -- GitLab From 46fce7272c49df0e74683a229a68eb2faeba93e8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:13:32 +0100 Subject: [PATCH 026/384] [mtv] fix mgid extraction(closes #26841) --- haruhi_dl/extractor/mtv.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/haruhi_dl/extractor/mtv.py b/haruhi_dl/extractor/mtv.py index fedd5f46b..df1034fc5 100644 --- a/haruhi_dl/extractor/mtv.py +++ b/haruhi_dl/extractor/mtv.py @@ -349,6 +349,18 @@ class MTVIE(MTVServicesInfoExtractor): 'only_matching': True, }] + @staticmethod + def extract_child_with_type(parent, t): + children = parent['children'] + return next(c for c in children if c.get('type') == t) + + def _extract_mgid(self, webpage): + data = self._parse_json(self._search_regex( + r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) + main_container = self.extract_child_with_type(data, 'MainContainer') + video_player = self.extract_child_with_type(main_container, 'VideoPlayer') + return video_player['props']['media']['video']['config']['uri'] + class MTVJapanIE(MTVServicesInfoExtractor): IE_NAME = 'mtvjapan' -- GitLab From 514683921adf72de4d5b40726fbea5a736f6e16b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:13:40 +0100 Subject: [PATCH 027/384] [vimeo:album] fix extraction(closes #27079) --- haruhi_dl/extractor/vimeo.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/vimeo.py b/haruhi_dl/extractor/vimeo.py index e14551459..e8a4547cd 100644 --- a/haruhi_dl/extractor/vimeo.py +++ b/haruhi_dl/extractor/vimeo.py @@ -946,10 +946,13 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): def _real_extract(self, url): album_id = self._match_id(url) - webpage = self._download_webpage(url, album_id) - viewer = self._parse_json(self._search_regex( - r'bootstrap_data\s*=\s*({.+?})</script>', - webpage, 'bootstrap data'), album_id)['viewer'] + viewer = self._download_json( + 'https://vimeo.com/_rv/viewer', album_id, fatal=False) + if not viewer: + webpage = self._download_webpage(url, album_id) + viewer = self._parse_json(self._search_regex( + r'bootstrap_data\s*=\s*({.+?})</script>', + webpage, 'bootstrap data'), album_id)['viewer'] jwt = viewer['jwt'] album = self._download_json( 'https://api.vimeo.com/albums/' + album_id, -- GitLab From 9a527679ed18644ebc16577442b6fd2373920d5c Mon Sep 17 00:00:00 2001 From: Joost Verdoorn <jpverdoorn@gmail.com> Date: Fri, 26 Feb 2021 14:14:59 +0100 Subject: [PATCH 028/384] [Amara] Add new extractor (#20618) * [Amara] Add new extractor --- haruhi_dl/extractor/amara.py | 76 +++++++++++++++++++++++++++++++ haruhi_dl/extractor/extractors.py | 1 + 2 files changed, 77 insertions(+) create mode 100644 haruhi_dl/extractor/amara.py diff --git a/haruhi_dl/extractor/amara.py b/haruhi_dl/extractor/amara.py new file mode 100644 index 000000000..b222154bd --- 
/dev/null +++ b/haruhi_dl/extractor/amara.py @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals +from .common import InfoExtractor + + +class AmaraIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)' + _TESTS = [ + { + 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', + 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', + 'info_dict': { + 'id': 'h6ZuVdvYnfE', + 'ext': 'mp4', + 'title': 'Why jury trials are becoming less common', + 'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20160813', + 'uploader': 'PBS NewsHour', + 'uploader_id': 'PBSNewsHour' + } + }, + { + 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', + 'md5': '99392c75fa05d432a8f11df03612195e', + 'info_dict': { + 'id': '18622084', + 'ext': 'mov', + 'title': 'Vimeo at CES 2011!', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'timestamp': 1294649110, + 'upload_date': '20110110', + 'uploader': 'Sam Morrill', + 'uploader_id': 'sammorrill' + } + }, + { + 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', + 'md5': 'd3970f08512738ee60c5807311ff5d3f', + 'info_dict': { + 'id': 'ChimamandaAdichie_2009G-transcript', + 'ext': 'mp4', + 'title': 'The danger of a single story', + 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20131206' + } + } + ] + + def get_subtitles_for_language(self, language): + return [{ + 'ext': type, + 'url': language['subtitles_uri'].replace('format=json', 'format=' + type) + } for type in ['vtt', 'srt', 'json']] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._download_json('https://amara.org/api/videos/%s/' % video_id, video_id, query={'format': 'json'}) + + video_url = meta.get('all_urls')[0] + subtitles = dict([(language['code'], self.get_subtitles_for_language(language)) for language in meta.get('languages', []) if language['published']]) + + return { + '_type': 'url_transparent', + 'url': video_url, + 'id': video_id, + 'subtitles': subtitles, + 'title': meta['title'], + 'description': meta.get('description'), + 'thumbnail': meta.get('thumbnail') + } diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 6a7fc43f4..bd6003c93 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -43,6 +43,7 @@ from .airmozilla import AirMozillaIE from .albicla import AlbiclaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE +from .amara import AmaraIE from .amcnetworks import AMCNetworksIE from .americastestkitchen import AmericasTestKitchenIE from .animeondemand import AnimeOnDemandIE -- GitLab From 339f127540188c06c99ec6f09ff03a1a03e21893 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:16:30 +0100 Subject: [PATCH 029/384] [amara] improve extraction --- haruhi_dl/extractor/amara.py | 143 +++++++++++++++++++++-------------- 1 file changed, 85 insertions(+), 58 deletions(-) diff --git a/haruhi_dl/extractor/amara.py b/haruhi_dl/extractor/amara.py index b222154bd..61d469574 100644 --- a/haruhi_dl/extractor/amara.py +++ b/haruhi_dl/extractor/amara.py @@ -1,76 +1,103 @@ # coding: utf-8 from __future__ import unicode_literals + from .common import 
InfoExtractor +from .youtube import YoutubeIE +from .vimeo import VimeoIE +from ..utils import ( + int_or_none, + parse_iso8601, + update_url_query, +) class AmaraIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)' - _TESTS = [ - { - 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', - 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', - 'info_dict': { - 'id': 'h6ZuVdvYnfE', - 'ext': 'mp4', - 'title': 'Why jury trials are becoming less common', - 'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1', - 'thumbnail': r're:^https?://.*\.jpg$', - 'subtitles': dict, - 'upload_date': '20160813', - 'uploader': 'PBS NewsHour', - 'uploader_id': 'PBSNewsHour' - } - }, - { - 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', - 'md5': '99392c75fa05d432a8f11df03612195e', - 'info_dict': { - 'id': '18622084', - 'ext': 'mov', - 'title': 'Vimeo at CES 2011!', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'subtitles': dict, - 'timestamp': 1294649110, - 'upload_date': '20110110', - 'uploader': 'Sam Morrill', - 'uploader_id': 'sammorrill' - } - }, - { - 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', - 'md5': 'd3970f08512738ee60c5807311ff5d3f', - 'info_dict': { - 'id': 'ChimamandaAdichie_2009G-transcript', - 'ext': 'mp4', - 'title': 'The danger of a single story', - 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', - 'thumbnail': r're:^https?://.*\.jpg$', - 'subtitles': dict, - 'upload_date': '20131206' - } + _TESTS = [{ + # Youtube + 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', + 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', + 'info_dict': { + 'id': 'h6ZuVdvYnfE', + 'ext': 'mp4', + 'title': 'Why jury trials are becoming less common', + 'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20160813', + 'uploader': 'PBS NewsHour', + 'uploader_id': 'PBSNewsHour', + 'timestamp': 1549639570, } - ] - - def get_subtitles_for_language(self, language): - return [{ - 'ext': type, - 'url': language['subtitles_uri'].replace('format=json', 'format=' + type) - } for type in ['vtt', 'srt', 'json']] + }, { + # Vimeo + 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', + 'md5': '99392c75fa05d432a8f11df03612195e', + 'info_dict': { + 'id': '18622084', + 'ext': 'mov', + 'title': 'Vimeo at CES 2011!', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'timestamp': 1294763658, + 'upload_date': '20110111', + 'uploader': 'Sam Morrill', + 'uploader_id': 'sammorrill' + } + }, { + # Direct Link + 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', + 'md5': 'd3970f08512738ee60c5807311ff5d3f', + 'info_dict': { + 'id': 's8KL7I3jLmh6', + 'ext': 'mp4', + 'title': 'The danger of a single story', + 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20091007', + 'timestamp': 1254942511, + } + }] def _real_extract(self, url): video_id = self._match_id(url) - meta = self._download_json('https://amara.org/api/videos/%s/' % video_id, video_id, query={'format': 'json'}) + meta = self._download_json( + 'https://amara.org/api/videos/%s/' % video_id, + video_id, query={'format': 'json'}) + title = 
meta['title'] + video_url = meta['all_urls'][0] - video_url = meta.get('all_urls')[0] - subtitles = dict([(language['code'], self.get_subtitles_for_language(language)) for language in meta.get('languages', []) if language['published']]) + subtitles = {} + for language in (meta.get('languages') or []): + subtitles_uri = language.get('subtitles_uri') + if not (subtitles_uri and language.get('published')): + continue + subtitle = subtitles.setdefault(language.get('code') or 'en', []) + for f in ('json', 'srt', 'vtt'): + subtitle.append({ + 'ext': f, + 'url': update_url_query(subtitles_uri, {'format': f}), + }) - return { - '_type': 'url_transparent', + info = { 'url': video_url, 'id': video_id, 'subtitles': subtitles, - 'title': meta['title'], + 'title': title, 'description': meta.get('description'), - 'thumbnail': meta.get('thumbnail') + 'thumbnail': meta.get('thumbnail'), + 'duration': int_or_none(meta.get('duration')), + 'timestamp': parse_iso8601(meta.get('created')), } + + for ie in (YoutubeIE, VimeoIE): + if ie.suitable(video_url): + info.update({ + '_type': 'url_transparent', + 'ie_key': ie.ie_key(), + }) + break + + return info -- GitLab From 9adedd82f32a81f1439fbf4e85e9985642dffdb9 Mon Sep 17 00:00:00 2001 From: beefchop <32330393+beefchop@users.noreply.github.com> Date: Fri, 26 Feb 2021 14:18:39 +0100 Subject: [PATCH 030/384] [viki] fix stream extraction from mpd (#27092) Co-authored-by: beefchop <beefchop@users.noreply.github.com> --- haruhi_dl/extractor/viki.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/haruhi_dl/extractor/viki.py b/haruhi_dl/extractor/viki.py index b0dcdc0e6..48ab7b944 100644 --- a/haruhi_dl/extractor/viki.py +++ b/haruhi_dl/extractor/viki.py @@ -296,6 +296,9 @@ class VikiIE(VikiBaseIE): if f.get('acodec') == 'none' and f.get('vcodec') != 'none': f['acodec'] = None formats.extend(m3u8_formats) + elif format_id == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, 'mpd-%s' % protocol, fatal=False)) elif format_url.startswith('rtmp'): mobj = re.search( r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$', -- GitLab From ddc62043ed31799433868043188bcd9e90c2ccca Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:18:46 +0100 Subject: [PATCH 031/384] [viki] improve format extraction --- haruhi_dl/extractor/viki.py | 142 +++++++++++++++++++++--------------- 1 file changed, 83 insertions(+), 59 deletions(-) diff --git a/haruhi_dl/extractor/viki.py b/haruhi_dl/extractor/viki.py index 48ab7b944..a003b7af8 100644 --- a/haruhi_dl/extractor/viki.py +++ b/haruhi_dl/extractor/viki.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import hashlib import hmac import itertools @@ -9,6 +10,10 @@ import re import time from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( ExtractorError, int_or_none, @@ -165,19 +170,20 @@ class VikiIE(VikiBaseIE): }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '5fa476a902e902783ac7a4d615cdbc7a', + 'md5': '94e0e34fd58f169f40c184f232356cfe', 'info_dict': { 'id': '44699v', 'ext': 'mp4', 'title': 'Boys Over Flowers - Episode 1', 'description': 'md5:b89cf50038b480b88b5b3c93589a9076', - 'duration': 4204, + 'duration': 4172, 'timestamp': 1270496524, 'upload_date': '20100405', 'uploader': 'group8', 'like_count': int, 'age_limit': 13, - } + }, + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH 
manifest'], }, { # youtube external 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', @@ -194,14 +200,15 @@ class VikiIE(VikiBaseIE): 'uploader_id': 'ad14065n', 'like_count': int, 'age_limit': 13, - } + }, + 'skip': 'Page not found!', }, { 'url': 'http://www.viki.com/player/44699v', 'only_matching': True, }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': '1713ae35df5a521b31f6dc40730e7c9c', + 'md5': 'adf9e321a0ae5d0aace349efaaff7691', 'info_dict': { 'id': '158036v', 'ext': 'mp4', @@ -217,8 +224,11 @@ class VikiIE(VikiBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - video = self._call_api( - 'videos/%s.json' % video_id, video_id, 'Downloading video JSON') + resp = self._download_json( + 'https://www.viki.com/api/videos/' + video_id, + video_id, 'Downloading video JSON', + headers={'x-viki-app-ver': '4.0.57'}) + video = resp['video'] self._check_errors(video) @@ -265,60 +275,74 @@ class VikiIE(VikiBaseIE): 'subtitles': subtitles, } - streams = self._call_api( - 'videos/%s/streams.json' % video_id, video_id, - 'Downloading video streams JSON') - - if 'external' in streams: - result.update({ - '_type': 'url_transparent', - 'url': streams['external']['url'], - }) - return result - formats = [] - for format_id, stream_dict in streams.items(): - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)) - for protocol, format_dict in stream_dict.items(): - # rtmps URLs does not seem to work - if protocol == 'rtmps': - continue - format_url = format_dict['url'] - if format_id == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', - m3u8_id='m3u8-%s' % protocol, fatal=False) - # Despite CODECS metadata in m3u8 all video-only formats - # are actually video+audio - for f in m3u8_formats: - if f.get('acodec') == 'none' and f.get('vcodec') != 'none': - f['acodec'] = None - formats.extend(m3u8_formats) - elif format_id == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, video_id, 'mpd-%s' % protocol, fatal=False)) - elif format_url.startswith('rtmp'): - mobj = re.search( - r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$', - format_url) - if not mobj: + + def add_format(format_id, format_dict, protocol='http'): + # rtmps URLs does not seem to work + if protocol == 'rtmps': + return + format_url = format_dict.get('url') + if not format_url: + return + qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query) + stream = qs.get('stream', [None])[0] + if stream: + format_url = base64.b64decode(stream).decode() + if format_id in ('m3u8', 'hls'): + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', + m3u8_id='m3u8-%s' % protocol, fatal=False) + # Despite CODECS metadata in m3u8 all video-only formats + # are actually video+audio + for f in m3u8_formats: + if '_drm/index_' in f['url']: continue - formats.append({ - 'format_id': 'rtmp-%s' % format_id, - 'ext': 'flv', - 'url': mobj.group('url'), - 'play_path': mobj.group('playpath'), - 'app': mobj.group('app'), - 'page_url': url, - }) - else: - formats.append({ - 'url': format_url, - 'format_id': '%s-%s' % (format_id, protocol), - 'height': height, - }) + if f.get('acodec') == 'none' and f.get('vcodec') != 'none': + f['acodec'] = None + formats.append(f) + elif format_id in ('mpd', 'dash'): + formats.extend(self._extract_mpd_formats( + format_url, video_id, 'mpd-%s' % protocol, 
fatal=False))
+            elif format_url.startswith('rtmp'):
+                mobj = re.search(
+                    r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
+                    format_url)
+                if not mobj:
+                    return
+                formats.append({
+                    'format_id': 'rtmp-%s' % format_id,
+                    'ext': 'flv',
+                    'url': mobj.group('url'),
+                    'play_path': mobj.group('playpath'),
+                    'app': mobj.group('app'),
+                    'page_url': url,
+                })
+            else:
+                formats.append({
+                    'url': format_url,
+                    'format_id': '%s-%s' % (format_id, protocol),
+                    'height': int_or_none(self._search_regex(
+                        r'^(\d+)[pP]$', format_id, 'height', default=None)),
+                })
+
+        for format_id, format_dict in (resp.get('streams') or {}).items():
+            add_format(format_id, format_dict)
+        if not formats:
+            streams = self._call_api(
+                'videos/%s/streams.json' % video_id, video_id,
+                'Downloading video streams JSON')
+
+            if 'external' in streams:
+                result.update({
+                    '_type': 'url_transparent',
+                    'url': streams['external']['url'],
+                })
+                return result
+
+            for format_id, stream_dict in streams.items():
+                for protocol, format_dict in stream_dict.items():
+                    add_format(format_id, format_dict, protocol)
         self._sort_formats(formats)

         result['formats'] = formats
-- 
GitLab


From 9fd254036be0a95a4d29e0ff1a0d6eb51a2e3785 Mon Sep 17 00:00:00 2001
From: Leonardo Taccari <iamleot@gmail.com>
Date: Fri, 26 Feb 2021 14:18:51 +0100
Subject: [PATCH 032/384] [rai] Fix extraction for recent raiplay.it updates (#27077)

- Remove first test of RaiPlayIE: it is no longer available
- Make RaiPlayIE extension-agnostic (passing possible `.json' URLs is now supported too)
- Adjust RaiPlayLiveIE to recent raiplay.it updates. Passing it as `url_transparent' is no longer supported (there is no longer an accessible ContentItem)
- Adjust RaiPlayPlaylistIE to recent raiplay.it updates and instruct it about ContentSet-s.
- Update a RaiIE test and remove two tests that are no longer available

Thanks to @remitamine for the review!
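
The ContentSet handling amounts to the traversal sketched below. The JSON
shape (blocks -> sets -> per-set item lists fetched from <base>/<set_id>.json)
is the one used in the diff; the function name and the fetch_set_json
callable are illustrative placeholders, not part of the actual extractor:

    def iter_program_videos(program, fetch_set_json):
        # Walk the raiplay.it program JSON: blocks -> sets -> ContentSet items.
        for block in program.get('blocks', []):
            for content_set in block.get('sets', []):
                set_id = content_set.get('id')
                if not set_id:
                    continue
                # Each set id resolves to its own <base>/<set_id>.json document.
                medias = fetch_set_json(set_id)
                if not medias:
                    continue
                for item in medias.get('items', []):
                    path_id = item.get('path_id')
                    if path_id:
                        yield path_id

Each yielded path_id is then resolved against the page URL and handed to
RaiPlayIE via url_result, as the diff does.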
--- haruhi_dl/extractor/rai.py | 126 +++++++++++++++---------------------- 1 file changed, 52 insertions(+), 74 deletions(-) diff --git a/haruhi_dl/extractor/rai.py b/haruhi_dl/extractor/rai.py index bee2d53f5..dae7800d2 100644 --- a/haruhi_dl/extractor/rai.py +++ b/haruhi_dl/extractor/rai.py @@ -17,7 +17,6 @@ from ..utils import ( int_or_none, parse_duration, strip_or_none, - unescapeHTML, unified_strdate, unified_timestamp, update_url_query, @@ -122,27 +121,8 @@ class RaiBaseIE(InfoExtractor): class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE._UUID_RE + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)\.(?:html|json)' % RaiBaseIE._UUID_RE _TESTS = [{ - 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', - 'md5': '340aa3b7afb54bfd14a8c11786450d76', - 'info_dict': { - 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', - 'ext': 'mp4', - 'title': 'La Casa Bianca', - 'alt_title': 'S2016 - Puntata del 23/10/2016', - 'description': 'md5:a09d45890850458077d1f68bb036e0a5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 3', - 'creator': 'Rai 3', - 'duration': 3278, - 'timestamp': 1477764300, - 'upload_date': '20161029', - 'series': 'La Casa Bianca', - 'season': '2016', - }, - 'skip': 'This content is not available', - }, { 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { @@ -166,10 +146,11 @@ class RaiPlayIE(RaiBaseIE): }] def _real_extract(self, url): - url, video_id = re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + base, video_id, = mobj.group('base', 'id') media = self._download_json( - url.replace('.html', '.json'), video_id, 'Downloading video JSON') + '%s%s.json' % (base, video_id), video_id, 'Downloading video JSON') title = media['name'] @@ -219,7 +200,7 @@ class RaiPlayIE(RaiBaseIE): class RaiPlayLiveIE(RaiBaseIE): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)' + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))' _TEST = { 'url': 'http://www.raiplay.it/dirette/rainews24', 'info_dict': { @@ -227,7 +208,7 @@ class RaiPlayLiveIE(RaiBaseIE): 'display_id': 'rainews24', 'ext': 'mp4', 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:6eca31500550f9376819f174e5644754', + 'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497', 'uploader': 'Rai News 24', 'creator': 'Rai News 24', 'is_live': True, @@ -238,53 +219,75 @@ class RaiPlayLiveIE(RaiBaseIE): } def _real_extract(self, url): - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + base, display_id, = mobj.group('base', 'id') + + media = self._download_json( + '%s.json' % base, + display_id, 'Downloading channel JSON') - webpage = self._download_webpage(url, display_id) + title = media['name'] + video = media['video'] + video_id = media['id'].replace('ContentItem-', '') - video_id = self._search_regex( - r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE, - webpage, 'content id') + relinker_info = self._extract_relinker_info(video['content_url'], video_id) + self._sort_formats(relinker_info['formats']) - return { - '_type': 'url_transparent', - 'ie_key': RaiPlayIE.ie_key(), - 'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, + info = { 'id': video_id, 'display_id': 
display_id, + 'title': self._live_title(title) if relinker_info.get( + 'is_live') else title, + 'description': media.get('description'), + 'uploader': strip_or_none(media.get('channel')), + 'creator': strip_or_none(media.get('editor')), } + info.update(relinker_info) + return info + class RaiPlayPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)' + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo', - 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', + 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', }, 'playlist_mincount': 12, }] def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + base, playlist_id, = mobj.group('base', 'id') - webpage = self._download_webpage(url, playlist_id) + media = self._download_json( + '%s.json' % base, + playlist_id, 'Downloading program JSON') - title = self._html_search_meta( - ('programma', 'nomeProgramma'), webpage, 'title') - description = unescapeHTML(self._html_search_meta( - ('description', 'og:description'), webpage, 'description')) + title = media.get('name') + description = None + if media.get('program_info') and media['program_info'].get('description'): + description = media['program_info']['description'] entries = [] - for mobj in re.finditer( - r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1', - webpage): - video_url = urljoin(url, mobj.group('path')) - entries.append(self.url_result( - video_url, ie=RaiPlayIE.ie_key(), - video_id=RaiPlayIE._match_id(video_url))) + for b in media.get('blocks', []): + for s in b.get('sets', []): + cs = s.get('id') + if not cs: + continue + medias = self._download_json( + '%s/%s.json' % (base, cs), + cs, 'Downloading content set JSON', fatal=False) + if not medias: + continue + for m in medias['items']: + video_url = urljoin(url, m['path_id']) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) return self.playlist_result(entries, playlist_id, title, description) @@ -329,19 +332,6 @@ class RaiIE(RaiBaseIE): 'duration': 2214, 'upload_date': '20161103', } - }, { - # drawMediaRaiTV(...) - 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': '2dd727e61114e1ee9c47f0da6914e178', - 'info_dict': { - 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'mp4', - 'title': 'Il pacco', - 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20141221', - }, - 'skip': 'This content is not available', }, { # initEdizione('ContentItem-...' 
'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', @@ -353,18 +343,6 @@ class RaiIE(RaiBaseIE): 'upload_date': '20170401', }, 'skip': 'Changes daily', - }, { - # HDS live stream with only relinker URL - 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', - 'info_dict': { - 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', - 'ext': 'flv', - 'title': 'EuroNews', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This content is available only in Italy', }, { # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', -- GitLab From a7bd83e154d2adf01096662e14173d5848305fa8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:18:55 +0100 Subject: [PATCH 033/384] [rai] improve extraction --- haruhi_dl/extractor/rai.py | 82 +++++++++++++------------------------- 1 file changed, 27 insertions(+), 55 deletions(-) diff --git a/haruhi_dl/extractor/rai.py b/haruhi_dl/extractor/rai.py index dae7800d2..b072a0f38 100644 --- a/haruhi_dl/extractor/rai.py +++ b/haruhi_dl/extractor/rai.py @@ -16,7 +16,9 @@ from ..utils import ( GeoRestrictedError, int_or_none, parse_duration, + remove_start, strip_or_none, + try_get, unified_strdate, unified_timestamp, update_url_query, @@ -121,7 +123,7 @@ class RaiBaseIE(InfoExtractor): class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)\.(?:html|json)' % RaiBaseIE._UUID_RE + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE _TESTS = [{ 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', @@ -146,11 +148,10 @@ class RaiPlayIE(RaiBaseIE): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - base, video_id, = mobj.group('base', 'id') + base, video_id = re.match(self._VALID_URL, url).groups() media = self._download_json( - '%s%s.json' % (base, video_id), video_id, 'Downloading video JSON') + base + '.json', video_id, 'Downloading video JSON') title = media['name'] @@ -177,7 +178,8 @@ class RaiPlayIE(RaiBaseIE): season = media.get('season') info = { - 'id': video_id, + 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, + 'display_id': video_id, 'title': self._live_title(title) if relinker_info.get( 'is_live') else title, 'alt_title': strip_or_none(media.get('subtitle')), @@ -199,9 +201,9 @@ class RaiPlayIE(RaiBaseIE): return info -class RaiPlayLiveIE(RaiBaseIE): +class RaiPlayLiveIE(RaiPlayIE): _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))' - _TEST = { + _TESTS = [{ 'url': 'http://www.raiplay.it/dirette/rainews24', 'info_dict': { 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', @@ -216,35 +218,7 @@ class RaiPlayLiveIE(RaiBaseIE): 'params': { 'skip_download': True, }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - base, display_id, = mobj.group('base', 'id') - - media = self._download_json( - '%s.json' % base, - display_id, 'Downloading channel JSON') - - title = media['name'] - video = media['video'] - video_id = media['id'].replace('ContentItem-', '') - - relinker_info = self._extract_relinker_info(video['content_url'], video_id) - self._sort_formats(relinker_info['formats']) - - info = { - 'id': video_id, 
- 'display_id': display_id, - 'title': self._live_title(title) if relinker_info.get( - 'is_live') else title, - 'description': media.get('description'), - 'uploader': strip_or_none(media.get('channel')), - 'creator': strip_or_none(media.get('editor')), - } - - info.update(relinker_info) - return info + }] class RaiPlayPlaylistIE(InfoExtractor): @@ -260,36 +234,34 @@ class RaiPlayPlaylistIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - base, playlist_id, = mobj.group('base', 'id') - - media = self._download_json( - '%s.json' % base, - playlist_id, 'Downloading program JSON') + base, playlist_id = re.match(self._VALID_URL, url).groups() - title = media.get('name') - description = None - if media.get('program_info') and media['program_info'].get('description'): - description = media['program_info']['description'] + program = self._download_json( + base + '.json', playlist_id, 'Downloading program JSON') entries = [] - for b in media.get('blocks', []): - for s in b.get('sets', []): - cs = s.get('id') - if not cs: + for b in (program.get('blocks') or []): + for s in (b.get('sets') or []): + s_id = s.get('id') + if not s_id: continue medias = self._download_json( - '%s/%s.json' % (base, cs), - cs, 'Downloading content set JSON', fatal=False) + '%s/%s.json' % (base, s_id), s_id, + 'Downloading content set JSON', fatal=False) if not medias: continue - for m in medias['items']: - video_url = urljoin(url, m['path_id']) + for m in (medias.get('items') or []): + path_id = m.get('path_id') + if not path_id: + continue + video_url = urljoin(url, path_id) entries.append(self.url_result( video_url, ie=RaiPlayIE.ie_key(), video_id=RaiPlayIE._match_id(video_url))) - return self.playlist_result(entries, playlist_id, title, description) + return self.playlist_result( + entries, playlist_id, program.get('name'), + try_get(program, lambda x: x['program_info']['description'])) class RaiIE(RaiBaseIE): -- GitLab From 493d2796046ac269c105e69704bb86b6e74d3f93 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:19:02 +0100 Subject: [PATCH 034/384] [rai] fix unavailable video format detection --- haruhi_dl/extractor/rai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/rai.py b/haruhi_dl/extractor/rai.py index b072a0f38..06958966f 100644 --- a/haruhi_dl/extractor/rai.py +++ b/haruhi_dl/extractor/rai.py @@ -68,7 +68,7 @@ class RaiBaseIE(InfoExtractor): # This does not imply geo restriction (e.g. 
# http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) - if media_url == 'http://download.rai.it/video_no_available.mp4': + if '/video_no_available.mp4' in media_url: continue ext = determine_ext(media_url) -- GitLab From a7324932923e9fee73a92441e1c033fb1a6c071d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:19:07 +0100 Subject: [PATCH 035/384] [rai] fix protocol relative relinker URLs(closes #22766) --- haruhi_dl/extractor/rai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/rai.py b/haruhi_dl/extractor/rai.py index 06958966f..ecb628f14 100644 --- a/haruhi_dl/extractor/rai.py +++ b/haruhi_dl/extractor/rai.py @@ -424,7 +424,7 @@ class RaiIE(RaiBaseIE): except ExtractorError: pass - relinker_url = self._search_regex( + relinker_url = self._proto_relative_url(self._search_regex( r'''(?x) (?: var\s+videoURL| @@ -436,7 +436,7 @@ class RaiIE(RaiBaseIE): //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 ''', - webpage, 'relinker URL', group='url') + webpage, 'relinker URL', group='url')) relinker_info = self._extract_relinker_info( urljoin(url, relinker_url), video_id) -- GitLab From 3ffb6438448804348c3a0c9f45b16fdbc5118c41 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:19:12 +0100 Subject: [PATCH 036/384] [discoverynetworks] add support new TLC/DMAX URLs(closes #27100) --- haruhi_dl/extractor/discoverynetworks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/discoverynetworks.py b/haruhi_dl/extractor/discoverynetworks.py index 607a54948..c512b95d0 100644 --- a/haruhi_dl/extractor/discoverynetworks.py +++ b/haruhi_dl/extractor/discoverynetworks.py @@ -7,7 +7,7 @@ from .dplay import DPlayIE class DiscoveryNetworksDeIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show)/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)' _TESTS = [{ 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', @@ -29,6 +29,9 @@ class DiscoveryNetworksDeIE(DPlayIE): }, { 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', 'only_matching': True, + }, { + 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', + 'only_matching': True, }] def _real_extract(self, url): -- GitLab From 8175a5e8b18d1ad04ce412eecafa01f65e6f0f53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:19:38 +0100 Subject: [PATCH 037/384] =?UTF-8?q?[YoutubeDL]=20Fix=20--ignore-errors=20f?= =?UTF-8?q?or=20playlists=20with=20generator-based=20en=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …tries of url_transparent (closes #27064) --- haruhi_dl/HaruhiDL.py | 52 +++++++++++++++++++------------- test/test_HaruhiDL.py | 70 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 21 deletions(-) diff --git a/haruhi_dl/HaruhiDL.py b/haruhi_dl/HaruhiDL.py index 0a1a5a5a9..e67c01a9d 100755 --- a/haruhi_dl/HaruhiDL.py +++ b/haruhi_dl/HaruhiDL.py @@ -797,21 +797,14 @@ class HaruhiDL(object): self.report_warning('The 
program functionality for this site has been marked as broken, ' 'and will probably not work.') + return self.__extract_info(url, ie, download, extra_info, process) + else: + self.report_error('no suitable InfoExtractor for URL %s' % url) + + def __handle_extraction_exceptions(func): + def wrapper(self, *args, **kwargs): try: - ie_result = ie.extract(url) - if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) - break - if isinstance(ie_result, list): - # Backwards compatibility: old IE result format - ie_result = { - '_type': 'compat_list', - 'entries': ie_result, - } - self.add_default_extra_info(ie_result, ie, url) - if process: - return self.process_ie_result(ie_result, download, extra_info) - else: - return ie_result + return func(self, *args, **kwargs) except GeoRestrictedError as e: msg = e.msg if e.countries: @@ -819,20 +812,33 @@ class HaruhiDL(object): map(ISO3166Utils.short2full, e.countries)) msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' self.report_error(msg) - break except ExtractorError as e: # An error we somewhat expected self.report_error(compat_str(e), e.format_traceback()) - break except MaxDownloadsReached: raise except Exception as e: if self.params.get('ignoreerrors', False): self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) - break else: raise + return wrapper + + @__handle_extraction_exceptions + def __extract_info(self, url, ie, download, extra_info, process): + ie_result = ie.extract(url) + if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) + return + if isinstance(ie_result, list): + # Backwards compatibility: old IE result format + ie_result = { + '_type': 'compat_list', + 'entries': ie_result, + } + self.add_default_extra_info(ie_result, ie, url) + if process: + return self.process_ie_result(ie_result, download, extra_info) else: - self.report_error('no suitable InfoExtractor for URL %s' % url) + return ie_result def add_default_extra_info(self, ie_result, ie, url): self.add_extra_info(ie_result, { @@ -1007,9 +1013,8 @@ class HaruhiDL(object): self.to_screen('[download] ' + reason) continue - entry_result = self.process_ie_result(entry, - download=download, - extra_info=extra) + entry_result = self.__process_iterable_entry(entry, download, extra) + # TODO: skip failed (empty) entries? 
playlist_results.append(entry_result) ie_result['entries'] = playlist_results self.to_screen('[download] Finished downloading playlist: %s' % playlist) @@ -1038,6 +1043,11 @@ class HaruhiDL(object): else: raise Exception('Invalid result type: %s' % result_type) + @__handle_extraction_exceptions + def __process_iterable_entry(self, entry, download, extra_info): + return self.process_ie_result( + entry, download=download, extra_info=extra_info) + def _build_format_filter(self, filter_spec): " Returns a function to filter the formats according to the filter_spec " diff --git a/test/test_HaruhiDL.py b/test/test_HaruhiDL.py index c6346118a..7b93d0cdb 100644 --- a/test/test_HaruhiDL.py +++ b/test/test_HaruhiDL.py @@ -922,6 +922,76 @@ class TestHaruhiDL(unittest.TestCase): self.assertEqual(downloaded['extractor'], 'testex') self.assertEqual(downloaded['extractor_key'], 'TestEx') + # Test case for https://github.com/hdl-org/haruhi-dl/issues/27064 + def test_ignoreerrors_for_playlist_with_url_transparent_iterable_entries(self): + + class _YDL(YDL): + def __init__(self, *args, **kwargs): + super(_YDL, self).__init__(*args, **kwargs) + + def trouble(self, s, tb=None): + pass + + ydl = _YDL({ + 'format': 'extra', + 'ignoreerrors': True, + }) + + class VideoIE(InfoExtractor): + _VALID_URL = r'video:(?P<id>\d+)' + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = [{ + 'format_id': 'default', + 'url': 'url:', + }] + if video_id == '0': + raise ExtractorError('foo') + if video_id == '2': + formats.append({ + 'format_id': 'extra', + 'url': TEST_URL, + }) + return { + 'id': video_id, + 'title': 'Video %s' % video_id, + 'formats': formats, + } + + class PlaylistIE(InfoExtractor): + _VALID_URL = r'playlist:' + + def _entries(self): + for n in range(3): + video_id = compat_str(n) + yield { + '_type': 'url_transparent', + 'ie_key': VideoIE.ie_key(), + 'id': video_id, + 'url': 'video:%s' % video_id, + 'title': 'Video Transparent %s' % video_id, + } + + def _real_extract(self, url): + return self.playlist_result(self._entries()) + + ydl.add_info_extractor(VideoIE(ydl)) + ydl.add_info_extractor(PlaylistIE(ydl)) + info = ydl.extract_info('playlist:') + entries = info['entries'] + self.assertEqual(len(entries), 3) + self.assertTrue(entries[0] is None) + self.assertTrue(entries[1] is None) + self.assertEqual(len(ydl.downloaded_info_dicts), 1) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(entries[2], downloaded) + self.assertEqual(downloaded['url'], TEST_URL) + self.assertEqual(downloaded['title'], 'Video Transparent 2') + self.assertEqual(downloaded['id'], '2') + self.assertEqual(downloaded['extractor'], 'Video') + self.assertEqual(downloaded['extractor_key'], 'Video') + if __name__ == '__main__': unittest.main() -- GitLab From acfb99b684a5b7cdb9b02691ea6f476b0eff88c6 Mon Sep 17 00:00:00 2001 From: Laura Liberda <laura@selfisekai.rocks> Date: Fri, 26 Feb 2021 14:27:42 +0100 Subject: [PATCH 038/384] improve copykitku patch hook --- devscripts/copykitku-patch-hook.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/devscripts/copykitku-patch-hook.js b/devscripts/copykitku-patch-hook.js index cd6e93a84..1db74257a 100644 --- a/devscripts/copykitku-patch-hook.js +++ b/devscripts/copykitku-patch-hook.js @@ -4,16 +4,18 @@ module.exports = function patchHook(patchContent) { [ + [/(?:youtube-|yt-?)dl\.org/g, 'haruhi.download'], [/youtube_dl/g, 'haruhi_dl'], [/youtube-dl/g, 'haruhi-dl'], [/youtubedl/g, 'haruhidl'], [/YoutubeDL/g, 'HaruhiDL'], [/ytdl/g, 'hdl'], - 
[/(?:youtube-|yt-?)dl\.org/g, 'haruhi.download'],
     [/yt-dl/g, 'h-dl'],
+    [/ydl/g, 'hdl'],
 
     // prevent from linking to non-existent repository
     [/github\.com\/ytdl-org\/haruhi-dl/g, 'github.com/ytdl-org/youtube-dl'],
+    [/github\.com\/rg3\/haruhi-dl/g, 'github.com/ytdl-org/youtube-dl'],
 
     // prevent changing the smuggle URLs (for compatibility with ytdl)
     [/__haruhidl_smuggle/g, '__youtubedl_smuggle'],
   ].forEach(([regex, replacement]) => patchContent = patchContent.replace(regex, replacement));
-- 
GitLab


From bb0f8c2607eb0cfb1b21e1733a2d70a17f568458 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 14:27:49 +0100
Subject: [PATCH 039/384] =?UTF-8?q?[downloader/http]=20Fix=20crash=20durin?=
 =?UTF-8?q?g=20urlopen=20caused=20by=20missing=20reason=20o=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…f URLError
---
 haruhi_dl/downloader/http.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/haruhi_dl/downloader/http.py b/haruhi_dl/downloader/http.py
index c9c908b34..6d3d8f1e5 100644
--- a/haruhi_dl/downloader/http.py
+++ b/haruhi_dl/downloader/http.py
@@ -109,7 +109,9 @@ class HttpFD(FileDownloader):
             try:
                 ctx.data = self.hdl.urlopen(request)
             except (compat_urllib_error.URLError, ) as err:
-                if isinstance(err.reason, socket.timeout):
+                # reason may not be available, e.g. for urllib2.HTTPError on python 2.6
+                reason = getattr(err, 'reason', None)
+                if isinstance(reason, socket.timeout):
                     raise RetryDownload(err)
                 raise err
             # When trying to resume, Content-Range HTTP header of response has to be checked
-- 
GitLab


From f3c426a2ee0db526c1e3a732d67ffc95f6b41d52 Mon Sep 17 00:00:00 2001
From: renalid <renalid@gmail.com>
Date: Fri, 26 Feb 2021 14:28:03 +0100
Subject: [PATCH 040/384] [francetv] Update to fix thumbnail URL issue (#27120)

Fix the thumbnail URL. The issue had been present for many years and was
never fixed. It's done! :-)
Example: https://www.france.tv/france-2/de-gaulle-l-eclat-et-le-secret/de-gaulle-l-eclat-et-le-secret-saison-1/2035247-solitude.html

Broken thumbnail URL that used to be generated: http://pluzz.francetv.fr/staticftv/ref_emissions/2020-11-02/EMI_1104da66f533cc7dc5d0d07a181a18c2e2fe1d81_20201014122553940.jpg

Correct thumbnail URL after the fix: https://sivideo.webservices.francetelevisions.fr/staticftv/ref_emissions/2020-11-02/EMI_1104da66f533cc7dc5d0d07a181a18c2e2fe1d81_20201014122553940.jpg
---
 haruhi_dl/extractor/francetv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/francetv.py b/haruhi_dl/extractor/francetv.py
index ab2280630..3cb17751e 100644
--- a/haruhi_dl/extractor/francetv.py
+++ b/haruhi_dl/extractor/francetv.py
@@ -211,7 +211,7 @@ class FranceTVIE(InfoExtractor):
             'id': video_id,
             'title': self._live_title(title) if is_live else title,
             'description': clean_html(info.get('synopsis')),
-            'thumbnail': urljoin('http://pluzz.francetv.fr', info.get('image')),
+            'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', info.get('image')),
             'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')),
             'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])),
             'is_live': is_live,
-- 
GitLab


From 968583c56f3731d30f89f87675014a1e1ab8b0ee Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:28:13 +0100
Subject: [PATCH 041/384] [infoq] fix format extraction(closes #25984)

---
 haruhi_dl/extractor/infoq.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/haruhi_dl/extractor/infoq.py b/haruhi_dl/extractor/infoq.py
index 18249cf9b..0a70a1fb4 100644
--- a/haruhi_dl/extractor/infoq.py
+++ b/haruhi_dl/extractor/infoq.py
@@ -54,7 +54,7 @@ class InfoQIE(BokeCCBaseIE):
 
     def _extract_rtmp_video(self, webpage):
         # The server URL is hardcoded
-        video_url = 'rtmpe://video.infoq.com/cfx/st/'
+        video_url = 'rtmpe://videof.infoq.com/cfx/st/'
 
         # Extract video URL
         encoded_id = self._search_regex(
@@ -86,17 +86,18 @@ class InfoQIE(BokeCCBaseIE):
         return [{
             'format_id': 'http_video',
             'url': http_video_url,
+            'http_headers': {'Referer': 'https://www.infoq.com/'},
         }]
 
     def _extract_http_audio(self, webpage, video_id):
-        fields = self._hidden_inputs(webpage)
+        fields = self._form_hidden_inputs('mp3Form', webpage)
        http_audio_url = fields.get('filename')
         if not http_audio_url:
             return []
 
         # base URL is found in the Location header in the response returned by
         # GET https://www.infoq.com/mp3download.action?filename=... when logged in.
-        http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url)
+        http_audio_url = compat_urlparse.urljoin('http://ress.infoq.com/downloads/mp3downloads/', http_audio_url)
         http_audio_url = update_url_query(http_audio_url, self._extract_cf_auth(webpage))
 
         # audio file seem to be missing some times even if there is a download link
-- 
GitLab


From e1c07eb79f62e6305dc2b63766509433852df6a8 Mon Sep 17 00:00:00 2001
From: Mattias Wadman <mattias.wadman@gmail.com>
Date: Fri, 26 Feb 2021 14:32:52 +0100
Subject: [PATCH 042/384] [svt] Extract timestamp and thumbnail in more cases
 (#27130)

Add timestamp, set to "valid from", which I think can be seen as the
publish time.

Add thumbnail in more cases; it seems this was previously only done in the
embedded data case for some reason.

Switch the svtplay test URL to an existing video, one that also has no
expiry date.

Also add an additional thumbnail URL test regex.
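
As a rough illustration of the new timestamp logic (a hedged sketch, not
part of the diff below; the 'validFrom' value shown is an assumed example
reverse-engineered from the updated test's expected timestamp):

    from haruhi_dl.utils import unified_timestamp

    # The SVT video info JSON carries a rights window; its start is treated
    # as the publish time. An ISO 8601 string like this hypothetical one
    # parses to the exact 'timestamp' asserted in the updated test.
    rights = {'validFrom': '2020-04-05T00:00:00Z'}  # assumed example value
    assert unified_timestamp(rights.get('validFrom')) == 1586044800  # upload_date '20200405'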
--- haruhi_dl/extractor/svt.py | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/haruhi_dl/extractor/svt.py b/haruhi_dl/extractor/svt.py index a5e480f0b..0f8b2d61f 100644 --- a/haruhi_dl/extractor/svt.py +++ b/haruhi_dl/extractor/svt.py @@ -9,6 +9,7 @@ from ..utils import ( determine_ext, dict_get, int_or_none, + unified_timestamp, str_or_none, strip_or_none, try_get, @@ -44,7 +45,8 @@ class SVTBaseIE(InfoExtractor): 'format_id': player_type, 'url': vurl, }) - if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): + rights = try_get(video_info, lambda x: x['rights'], dict) or {} + if not formats and rights.get('geoBlockedSweden'): self.raise_geo_restricted( 'This video is only available in Sweden', countries=self._GEO_COUNTRIES) @@ -70,6 +72,7 @@ class SVTBaseIE(InfoExtractor): episode = video_info.get('episodeTitle') episode_number = int_or_none(video_info.get('episodeNumber')) + timestamp = unified_timestamp(rights.get('validFrom')) duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration'))) age_limit = None adult = dict_get( @@ -84,6 +87,7 @@ class SVTBaseIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, 'duration': duration, + 'timestamp': timestamp, 'age_limit': age_limit, 'series': series, 'season_number': season_number, @@ -140,21 +144,30 @@ class SVTPlayIE(SVTPlayBaseIE): ) ''' _TESTS = [{ - 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', - 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', + 'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen', + 'md5': '2382036fd6f8c994856c323fe51c426e', 'info_dict': { - 'id': '5996901', + 'id': 'jNwpV9P', 'ext': 'mp4', - 'title': 'Flygplan till Haile Selassie', - 'duration': 3527, - 'thumbnail': r're:^https?://.*[\.-]jpg$', + 'title': 'Det h\xe4r \xe4r himlen', + 'timestamp': 1586044800, + 'upload_date': '20200405', + 'duration': 3515, + 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', 'age_limit': 0, 'subtitles': { 'sv': [{ - 'ext': 'wsrt', + 'ext': 'vtt', }] }, }, + 'params': { + 'format': 'bestvideo', + # skip for now due to download test asserts that segment is > 10000 bytes and svt uses + # init segments that are smaller + # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B + 'skip_download': True, + }, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -235,7 +248,10 @@ class SVTPlayIE(SVTPlayBaseIE): r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'), webpage, 'video id') - return self._extract_by_video_id(svt_id, webpage) + info_dict = self._extract_by_video_id(svt_id, webpage) + info_dict['thumbnail'] = thumbnail + + return info_dict class SVTSeriesIE(SVTPlayBaseIE): -- GitLab From 5a94d1b61d23f78644e7aec3b3b45a9ff3358742 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:32:59 +0100 Subject: [PATCH 043/384] [svtplay] Add support for svt.se/barnkanalen (closes #24817) --- haruhi_dl/extractor/svt.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/svt.py b/haruhi_dl/extractor/svt.py index 0f8b2d61f..3b1908a82 100644 --- a/haruhi_dl/extractor/svt.py +++ b/haruhi_dl/extractor/svt.py @@ -139,7 +139,11 @@ class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' _VALID_URL = r'''(?x) (?: - svt:(?P<svt_id>[^/?#&]+)| + 
(?: + svt:| + https?://(?:www\.)?svt\.se/barnkanalen/barnplay/[^/]+/ + ) + (?P<svt_id>[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+) ) ''' @@ -184,6 +188,12 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'svt:14278044', 'only_matching': True, + }, { + 'url': 'https://www.svt.se/barnkanalen/barnplay/kar/eWv5MLX/', + 'only_matching': True, + }, { + 'url': 'svt:eWv5MLX', + 'only_matching': True, }] def _adjust_title(self, info): @@ -375,7 +385,7 @@ class SVTPageIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPageIE, cls).suitable(url) def _real_extract(self, url): path, display_id = re.match(self._VALID_URL, url).groups() -- GitLab From d64e153832c59fb3120cad5643b895286cb6e3a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:33:05 +0100 Subject: [PATCH 044/384] [svtplay] Fix test title --- haruhi_dl/extractor/svt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/svt.py b/haruhi_dl/extractor/svt.py index 3b1908a82..1c2e747c8 100644 --- a/haruhi_dl/extractor/svt.py +++ b/haruhi_dl/extractor/svt.py @@ -153,7 +153,7 @@ class SVTPlayIE(SVTPlayBaseIE): 'info_dict': { 'id': 'jNwpV9P', 'ext': 'mp4', - 'title': 'Det h\xe4r \xe4r himlen', + 'title': 'Det här är himlen', 'timestamp': 1586044800, 'upload_date': '20200405', 'duration': 3515, -- GitLab From 9a5816f425afe7d37c86aeb60f397bf3dc67f927 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:33:38 +0100 Subject: [PATCH 045/384] [pinterest] Add extractor (closes #25747) --- haruhi_dl/extractor/extractors.py | 4 + haruhi_dl/extractor/pinterest.py | 176 ++++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 haruhi_dl/extractor/pinterest.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index bd6003c93..27fb2062d 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -880,6 +880,10 @@ from .picarto import ( ) from .piksel import PikselIE from .pinkbike import PinkbikeIE +from .pinterest import ( + PinterestIE, + PinterestCollectionIE, +) from .pladform import PladformIE from .platzi import ( PlatziIE, diff --git a/haruhi_dl/extractor/pinterest.py b/haruhi_dl/extractor/pinterest.py new file mode 100644 index 000000000..2bb4ca660 --- /dev/null +++ b/haruhi_dl/extractor/pinterest.py @@ -0,0 +1,176 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +class PinterestBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' + + def _extract_resource(self, webpage, video_id): + return self._parse_json( + self._search_regex( + r'<script[^>]+\bid=["\']initial-state["\'][^>]*>({.+?})</script>', + webpage, 'application json'), + video_id)['resourceResponses'] + + def _extract_video(self, data, extract_formats=True): + 
video_id = data['id'] + + title = (data.get('title') or data.get('grid_title') or video_id).strip() + + formats = [] + duration = None + if extract_formats: + for format_id, format_dict in data['videos']['video_list'].items(): + if not isinstance(format_dict, dict): + continue + format_url = url_or_none(format_dict.get('url')) + if not format_url: + continue + duration = float_or_none(format_dict.get('duration'), scale=1000) + ext = determine_ext(format_url) + if 'hls' in format_id.lower() or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'width': int_or_none(format_dict.get('width')), + 'height': int_or_none(format_dict.get('height')), + 'duration': duration, + }) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) + + description = data.get('description') or data.get('description_html') or data.get('seo_description') + timestamp = unified_timestamp(data.get('created_at')) + + def _u(field): + return try_get(data, lambda x: x['closeup_attribution'][field], compat_str) + + uploader = _u('full_name') + uploader_id = _u('id') + + repost_count = int_or_none(data.get('repin_count')) + comment_count = int_or_none(data.get('comment_count')) + categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list) + tags = data.get('hashtags') + + thumbnails = [] + images = data.get('images') + if isinstance(images, dict): + for thumbnail_id, thumbnail in images.items(): + if not isinstance(thumbnail, dict): + continue + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'repost_count': repost_count, + 'comment_count': comment_count, + 'categories': categories, + 'tags': tags, + 'formats': formats, + 'extractor_key': PinterestIE.ie_key(), + } + + +class PinterestIE(PinterestBaseIE): + _VALID_URL = r'%s/pin/(?P<id>\d+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.com/pin/664281013778109217/', + 'md5': '6550c2af85d6d9f3fe3b88954d1577fc', + 'info_dict': { + 'id': '664281013778109217', + 'ext': 'mp4', + 'title': 'Origami', + 'description': 'md5:b9d90ddf7848e897882de9e73344f7dd', + 'duration': 57.7, + 'timestamp': 1593073622, + 'upload_date': '20200625', + 'uploader': 'Love origami -I am Dafei', + 'uploader_id': '586523688879454212', + 'repost_count': 50, + 'comment_count': 0, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://co.pinterest.com/pin/824721750502199491/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data = self._extract_resource(webpage, video_id)[0]['response']['data'] + return self._extract_video(data) + + +class PinterestCollectionIE(PinterestBaseIE): + _VALID_URL = r'%s/[^/]+/(?P<id>[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', + 'info_dict': { + 'id': '585890301462791043', + 'title': 'cool diys', + }, + 'playlist_count': 8, + }] + + 
@classmethod + def suitable(cls, url): + return False if PinterestIE.suitable(url) else super( + PinterestCollectionIE, cls).suitable(url) + + def _real_extract(self, url): + collection_name = self._match_id(url) + webpage = self._download_webpage(url, collection_name) + resource = self._extract_resource(webpage, collection_name)[1] + entries = [] + for item in resource['response']['data']: + if not isinstance(item, dict) or item.get('type') != 'pin': + continue + video_id = item.get('id') + if video_id: + # Some pins may not be available anonymously via pin URL + # video = self._extract_video(item, extract_formats=False) + # video.update({ + # '_type': 'url_transparent', + # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, + # }) + # entries.append(video) + entries.append(self._extract_video(item)) + title = try_get( + resource, lambda x: x['options']['board_title'], compat_str) + collection_id = try_get( + resource, lambda x: x['options']['board_id'], + compat_str) or collection_name + return self.playlist_result( + entries, playlist_id=collection_id, playlist_title=title) -- GitLab From ac852e57a0f46a28cfbfe1aafbfe4ab873bec7a9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:33:51 +0100 Subject: [PATCH 046/384] [extractor/common] add generic support for akamai http format extraction --- haruhi_dl/extractor/common.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py index fb616b05a..32a391a85 100644 --- a/haruhi_dl/extractor/common.py +++ b/haruhi_dl/extractor/common.py @@ -2614,6 +2614,7 @@ class InfoExtractor(object): def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats = [] + hdcore_sign = 'hdcore=3.7.0' f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') hds_host = hosts.get('hds') @@ -2626,6 +2627,7 @@ class InfoExtractor(object): for entry in f4m_formats: entry.update({'extra_param_to_segment_url': hdcore_sign}) formats.extend(f4m_formats) + m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') hls_host = hosts.get('hls') if hls_host: @@ -2633,6 +2635,31 @@ class InfoExtractor(object): formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + http_host = hosts.get('http') + if http_host and 'hdnea=' not in manifest_url: + REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+' + qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') + qualities_length = len(qualities) + if len(formats) in (qualities_length + 1, qualities_length * 2 + 1): + i = 0 + http_formats = [] + for f in formats: + if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none': + for protocol in ('http', 'https'): + http_f = f.copy() + del http_f['manifest_url'] + http_url = re.sub( + REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url']) + http_f.update({ + 'format_id': http_f['format_id'].replace('hls-', protocol + '-'), + 'url': http_url, + 'protocol': protocol, + }) + http_formats.append(http_f) + i += 1 + formats.extend(http_formats) + return formats def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): -- GitLab From 186e07f960702ff25df8065ee7e9fb61c84495a8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:34:00 +0100 Subject: [PATCH 047/384] [skyit] 
add support for multiple Sky Italia websites(closes #26629) --- haruhi_dl/extractor/extractors.py | 10 ++ haruhi_dl/extractor/skyit.py | 239 ++++++++++++++++++++++++++++++ 2 files changed, 249 insertions(+) create mode 100644 haruhi_dl/extractor/skyit.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 27fb2062d..87d509df5 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1060,6 +1060,16 @@ from .shared import ( from .showroomlive import ShowRoomLiveIE from .sina import SinaIE from .sixplay import SixPlayIE +from .skyit import ( + SkyItPlayerIE, + SkyItVideoIE, + SkyItVideoLiveIE, + SkyItIE, + SkyItAcademyIE, + SkyItArteIE, + CieloTVItIE, + TV8ItIE, +) from .skylinewebcams import SkylineWebcamsIE from .skynewsarabia import ( SkyNewsArabiaIE, diff --git a/haruhi_dl/extractor/skyit.py b/haruhi_dl/extractor/skyit.py new file mode 100644 index 000000000..14a4d8d4c --- /dev/null +++ b/haruhi_dl/extractor/skyit.py @@ -0,0 +1,239 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + dict_get, + int_or_none, + parse_duration, + unified_timestamp, +) + + +class SkyItPlayerIE(InfoExtractor): + IE_NAME = 'player.sky.it' + _VALID_URL = r'https?://player\.sky\.it/player/(?:external|social)\.html\?.*?\bid=(?P<id>\d+)' + _GEO_BYPASS = False + _DOMAIN = 'sky' + _PLAYER_TMPL = 'https://player.sky.it/player/external.html?id=%s&domain=%s' + # http://static.sky.it/static/skyplayer/conf.json + _TOKEN_MAP = { + 'cielo': 'Hh9O7M8ks5yi6nSROL7bKYz933rdf3GhwZlTLMgvy4Q', + 'hotclub': 'kW020K2jq2lk2eKRJD2vWEg832ncx2EivZlTLQput2C', + 'mtv8': 'A5Nn9GGb326CI7vP5e27d7E4PIaQjota', + 'salesforce': 'C6D585FD1615272C98DE38235F38BD86', + 'sitocommerciale': 'VJwfFuSGnLKnd9Phe9y96WkXgYDCguPMJ2dLhGMb2RE', + 'sky': 'F96WlOd8yoFmLQgiqv6fNQRvHZcsWk5jDaYnDvhbiJk', + 'skyacademy': 'A6LAn7EkO2Q26FRy0IAMBekX6jzDXYL3', + 'skyarte': 'LWk29hfiU39NNdq87ePeRach3nzTSV20o0lTv2001Cd', + 'theupfront': 'PRSGmDMsg6QMGc04Obpoy7Vsbn7i2Whp', + } + + def _player_url_result(self, video_id): + return self.url_result( + self._PLAYER_TMPL % (video_id, self._DOMAIN), + SkyItPlayerIE.ie_key(), video_id) + + def _parse_video(self, video, video_id): + title = video['title'] + is_live = video.get('type') == 'live' + hls_url = video.get(('streaming' if is_live else 'hls') + '_url') + if not hls_url and video.get('geoblock' if is_live else 'geob'): + self.raise_geo_restricted(countries=['IT']) + + if is_live: + formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4') + else: + formats = self._extract_akamai_formats( + hls_url, video_id, {'http': 'videoplatform.sky.it'}) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'formats': formats, + 'thumbnail': dict_get(video, ('video_still', 'video_still_medium', 'thumb')), + 'description': video.get('short_desc') or None, + 'timestamp': unified_timestamp(video.get('create_date')), + 'duration': int_or_none(video.get('duration_sec')) or parse_duration(video.get('duration')), + 'is_live': is_live, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + domain = compat_parse_qs(compat_urllib_parse_urlparse( + url).query).get('domain', [None])[0] + token = dict_get(self._TOKEN_MAP, (domain, 'sky')) + video = self._download_json( + 'https://apid.sky.it/vdp/v1/getVideoData', + video_id, 
query={ + 'caller': 'sky', + 'id': video_id, + 'token': token + }, headers=self.geo_verification_headers()) + return self._parse_video(video, video_id) + + +class SkyItVideoIE(SkyItPlayerIE): + IE_NAME = 'video.sky.it' + _VALID_URL = r'https?://(?:masterchef|video|xfactor)\.sky\.it(?:/[^/]+)*/video/[0-9a-z-]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://video.sky.it/news/mondo/video/uomo-ucciso-da-uno-squalo-in-australia-631227', + 'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd', + 'info_dict': { + 'id': '631227', + 'ext': 'mp4', + 'title': 'Uomo ucciso da uno squalo in Australia', + 'timestamp': 1606036192, + 'upload_date': '20201122', + } + }, { + 'url': 'https://xfactor.sky.it/video/x-factor-2020-replay-audizioni-1-615820', + 'only_matching': True, + }, { + 'url': 'https://masterchef.sky.it/video/masterchef-9-cosa-e-successo-nella-prima-puntata-562831', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._player_url_result(video_id) + + +class SkyItVideoLiveIE(SkyItPlayerIE): + IE_NAME = 'video.sky.it:live' + _VALID_URL = r'https?://video\.sky\.it/diretta/(?P<id>[^/?&#]+)' + _TEST = { + 'url': 'https://video.sky.it/diretta/tg24', + 'info_dict': { + 'id': '1', + 'ext': 'mp4', + 'title': r're:Diretta TG24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'description': 'Guarda la diretta streaming di SkyTg24, segui con Sky tutti gli appuntamenti e gli speciali di Tg24.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + asset_id = compat_str(self._parse_json(self._search_regex( + r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', + webpage, 'next data'), display_id)['props']['initialState']['livePage']['content']['asset_id']) + livestream = self._download_json( + 'https://apid.sky.it/vdp/v1/getLivestream', + asset_id, query={'id': asset_id}) + return self._parse_video(livestream, asset_id) + + +class SkyItIE(SkyItPlayerIE): + IE_NAME = 'sky.it' + _VALID_URL = r'https?://(?:sport|tg24)\.sky\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://sport.sky.it/calcio/serie-a/2020/11/21/juventus-cagliari-risultato-gol', + 'info_dict': { + 'id': '631201', + 'ext': 'mp4', + 'title': 'Un rosso alla violenza: in campo per i diritti delle donne', + 'upload_date': '20201121', + 'timestamp': 1605995753, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'https://tg24.sky.it/mondo/2020/11/22/australia-squalo-uccide-uomo', + 'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd', + 'info_dict': { + 'id': '631227', + 'ext': 'mp4', + 'title': 'Uomo ucciso da uno squalo in Australia', + 'timestamp': 1606036192, + 'upload_date': '20201122', + }, + }] + _VIDEO_ID_REGEX = r'data-videoid="(\d+)"' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + self._VIDEO_ID_REGEX, webpage, 'video id') + return self._player_url_result(video_id) + + +class SkyItAcademyIE(SkyItIE): + IE_NAME = 'skyacademy.it' + _VALID_URL = r'https?://(?:www\.)?skyacademy\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.skyacademy.it/eventi-speciali/2019/07/05/a-lezione-di-cinema-con-sky-academy-/', + 'md5': 'ced5c26638b7863190cbc44dd6f6ba08', + 'info_dict': { + 'id': '523458', + 'ext': 'mp4', + 'title': 'Sky Academy "The Best CineCamp 2019"', + 'timestamp': 1562843784, + 
'upload_date': '20190711', + } + }] + _DOMAIN = 'skyacademy' + _VIDEO_ID_REGEX = r'id="news-videoId_(\d+)"' + + +class SkyItArteIE(SkyItIE): + IE_NAME = 'arte.sky.it' + _VALID_URL = r'https?://arte\.sky\.it/video/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://arte.sky.it/video/serie-musei-venezia-collezionismo-12-novembre/', + 'md5': '515aee97b87d7a018b6c80727d3e7e17', + 'info_dict': { + 'id': '627926', + 'ext': 'mp4', + 'title': "Musei Galleria Franchetti alla Ca' d'Oro Palazzo Grimani", + 'upload_date': '20201106', + 'timestamp': 1604664493, + } + }] + _DOMAIN = 'skyarte' + _VIDEO_ID_REGEX = r'(?s)<iframe[^>]+src="(?:https:)?//player\.sky\.it/player/external\.html\?[^"]*\bid=(\d+)' + + +class CieloTVItIE(SkyItIE): + IE_NAME = 'cielotv.it' + _VALID_URL = r'https?://(?:www\.)?cielotv\.it/video/(?P<id>[^.]+)\.html' + _TESTS = [{ + 'url': 'https://www.cielotv.it/video/Il-lunedi-e-sempre-un-dramma.html', + 'md5': 'c4deed77552ba901c2a0d9258320304b', + 'info_dict': { + 'id': '499240', + 'ext': 'mp4', + 'title': 'Il lunedì è sempre un dramma', + 'upload_date': '20190329', + 'timestamp': 1553862178, + } + }] + _DOMAIN = 'cielo' + _VIDEO_ID_REGEX = r'videoId\s*=\s*"(\d+)"' + + +class TV8ItIE(SkyItVideoIE): + IE_NAME = 'tv8.it' + _VALID_URL = r'https?://tv8\.it/showvideo/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://tv8.it/showvideo/630529/ogni-mattina-ucciso-asino-di-andrea-lo-cicero/18-11-2020/', + 'md5': '9ab906a3f75ea342ed928442f9dabd21', + 'info_dict': { + 'id': '630529', + 'ext': 'mp4', + 'title': 'Ogni mattina - Ucciso asino di Andrea Lo Cicero', + 'timestamp': 1605721374, + 'upload_date': '20201118', + } + }] + _DOMAIN = 'mtv8' -- GitLab From abe5d97246c8c782a4d54cc2bf530ee6b1fa36bc Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:34:06 +0100 Subject: [PATCH 048/384] [rumble] add support for embed pages(#10785) --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/rumble.py | 67 +++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 haruhi_dl/extractor/rumble.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 87d509df5..0fc7d1d7f 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1010,6 +1010,7 @@ from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETe from .rtvnh import RTVNHIE from .rtvs import RTVSIE from .ruhd import RUHDIE +from .rumble import RumbleEmbedIE from .rutube import ( RutubeIE, RutubeChannelIE, diff --git a/haruhi_dl/extractor/rumble.py b/haruhi_dl/extractor/rumble.py new file mode 100644 index 000000000..4a0225109 --- /dev/null +++ b/haruhi_dl/extractor/rumble.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + parse_iso8601, + try_get, +) + + +class RumbleEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)' + _TESTS = [{ + 'url': 'https://rumble.com/embed/v5pv5f', + 'md5': '36a18a049856720189f30977ccbb2c34', + 'info_dict': { + 'id': 'v5pv5f', + 'ext': 'mp4', + 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', + 'timestamp': 1571611968, + 'upload_date': '20191020', + } + }, { + 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._download_json( + 
'https://rumble.com/embedJS/', video_id, + query={'request': 'video', 'v': video_id}) + title = video['title'] + + formats = [] + for height, ua in (video.get('ua') or {}).items(): + for i in range(2): + f_url = try_get(ua, lambda x: x[i], compat_str) + if f_url: + ext = determine_ext(f_url) + f = { + 'ext': ext, + 'format_id': '%s-%sp' % (ext, height), + 'height': int_or_none(height), + 'url': f_url, + } + bitrate = try_get(ua, lambda x: x[i + 2]['bitrate']) + if bitrate: + f['tbr'] = int_or_none(bitrate) + formats.append(f) + self._sort_formats(formats) + + author = video.get('author') or {} + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': video.get('i'), + 'timestamp': parse_iso8601(video.get('pubDate')), + 'channel': author.get('name'), + 'channel_url': author.get('url'), + 'duration': int_or_none(video.get('duration')), + } -- GitLab From 7a0255f6e2b5b68efcf5dcc017a124a1fd1eb0c0 Mon Sep 17 00:00:00 2001 From: Jia Rong Yee <28086837+fourjr@users.noreply.github.com> Date: Fri, 26 Feb 2021 14:34:21 +0100 Subject: [PATCH 049/384] [nytimes] Add new cooking.nytimes.com extractor (#27143) * [nytimes] support cooking.nytimes.com, resolves #27112 Co-authored-by: remitamine <remitamine@gmail.com> --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/nytimes.py | 38 +++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 0fc7d1d7f..0a002df66 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -817,6 +817,7 @@ from .ntvru import NTVRuIE from .nytimes import ( NYTimesIE, NYTimesArticleIE, + NYTimesCookingIE, ) from .nuvid import NuvidIE from .nzz import NZZIE diff --git a/haruhi_dl/extractor/nytimes.py b/haruhi_dl/extractor/nytimes.py index fc78ca56c..976b1c694 100644 --- a/haruhi_dl/extractor/nytimes.py +++ b/haruhi_dl/extractor/nytimes.py @@ -221,3 +221,41 @@ class NYTimesArticleIE(NYTimesBaseIE): r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'), webpage, 'podcast data') return self._extract_podcast_from_json(podcast_data, page_id, webpage) + + +class NYTimesCookingIE(NYTimesBaseIE): + _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart', + 'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3', + 'info_dict': { + 'id': '100000004756089', + 'ext': 'mov', + 'timestamp': 1479383008, + 'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON', + 'title': 'Cranberry Tart', + 'upload_date': '20161117', + 'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.', + }, + }, { + 'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey', + 'md5': '4b2e8c70530a89b8d905a2b572316eb8', + 'info_dict': { + 'id': '100000003951728', + 'ext': 'mov', + 'timestamp': 1445509539, + 'description': 'Turkey guide', + 'upload_date': '20151022', + 'title': 'Turkey', + } + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + + webpage = self._download_webpage(url, page_id) + + video_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'video id') + + return self._extract_video_from_id(video_id) -- GitLab From e1c07eb79f62e6305dc2b63766509433852df6a8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:34:37 +0100 Subject: [PATCH 050/384] [box] Add new extractor(#5949) --- haruhi_dl/extractor/box.py | 
98 +++++++++++++++++++++++++++++++ haruhi_dl/extractor/extractors.py | 1 + 2 files changed, 99 insertions(+) create mode 100644 haruhi_dl/extractor/box.py diff --git a/haruhi_dl/extractor/box.py b/haruhi_dl/extractor/box.py new file mode 100644 index 000000000..aae82d1af --- /dev/null +++ b/haruhi_dl/extractor/box.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + parse_iso8601, + # try_get, + update_url_query, +) + + +class BoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/]+)/file/(?P<id>\d+)' + _TEST = { + 'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538', + 'md5': '1f81b2fd3960f38a40a3b8823e5fcd43', + 'info_dict': { + 'id': '510727257538', + 'ext': 'mp4', + 'title': 'Garber St. Louis will be 28th MLS team +scarving.mp4', + 'uploader': 'MLS Video', + 'timestamp': 1566320259, + 'upload_date': '20190820', + 'uploader_id': '235196876', + } + } + + def _real_extract(self, url): + shared_name, file_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, file_id) + request_token = self._parse_json(self._search_regex( + r'Box\.config\s*=\s*({.+?});', webpage, + 'Box config'), file_id)['requestToken'] + access_token = self._download_json( + 'https://app.box.com/app-api/enduserapp/elements/tokens', file_id, + 'Downloading token JSON metadata', + data=json.dumps({'fileIDs': [file_id]}).encode(), headers={ + 'Content-Type': 'application/json', + 'X-Request-Token': request_token, + 'X-Box-EndUser-API': 'sharedName=' + shared_name, + })[file_id]['read'] + shared_link = 'https://app.box.com/s/' + shared_name + f = self._download_json( + 'https://api.box.com/2.0/files/' + file_id, file_id, + 'Downloading file JSON metadata', headers={ + 'Authorization': 'Bearer ' + access_token, + 'BoxApi': 'shared_link=' + shared_link, + 'X-Rep-Hints': '[dash]', # TODO: extract `hls` formats + }, query={ + 'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size' + }) + title = f['name'] + + query = { + 'access_token': access_token, + 'shared_link': shared_link + } + + formats = [] + + # for entry in (try_get(f, lambda x: x['representations']['entries'], list) or []): + # entry_url_template = try_get( + # entry, lambda x: x['content']['url_template']) + # if not entry_url_template: + # continue + # representation = entry.get('representation') + # if representation == 'dash': + # TODO: append query to every fragment URL + # formats.extend(self._extract_mpd_formats( + # entry_url_template.replace('{+asset_path}', 'manifest.mpd'), + # file_id, query=query)) + + authenticated_download_url = f.get('authenticated_download_url') + if authenticated_download_url and f.get('is_download_available'): + formats.append({ + 'ext': f.get('extension') or determine_ext(title), + 'filesize': f.get('size'), + 'format_id': 'download', + 'url': update_url_query(authenticated_download_url, query), + }) + + self._sort_formats(formats) + + creator = f.get('created_by') or {} + + return { + 'id': file_id, + 'title': title, + 'formats': formats, + 'description': f.get('description') or None, + 'uploader': creator.get('name'), + 'timestamp': parse_iso8601(f.get('created_at')), + 'uploader_id': creator.get('id'), + } diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 0a002df66..234763076 
100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -132,6 +132,7 @@ from .blinkx import BlinkxIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bostonglobe import BostonGlobeIE +from .box import BoxIE from .bpb import BpbIE from .br import ( BRIE, -- GitLab From 950c574c2218a74610e9306a00cd8065078eb376 Mon Sep 17 00:00:00 2001 From: renalid <renalid@gmail.com> Date: Fri, 26 Feb 2021 14:35:11 +0100 Subject: [PATCH 051/384] [franceinter] add thumbnail url (#27153) Co-authored-by: remitamine <remitamine@gmail.com> --- haruhi_dl/extractor/franceinter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/haruhi_dl/extractor/franceinter.py b/haruhi_dl/extractor/franceinter.py index 05806895c..a009f4d38 100644 --- a/haruhi_dl/extractor/franceinter.py +++ b/haruhi_dl/extractor/franceinter.py @@ -16,6 +16,7 @@ class FranceInterIE(InfoExtractor): 'ext': 'mp3', 'title': 'Affaire Cahuzac : le contentieux du compte en Suisse', 'description': 'md5:401969c5d318c061f86bda1fa359292b', + 'thumbnail': r're:^https?://.*\.jpg', 'upload_date': '20160907', }, } @@ -31,6 +32,7 @@ class FranceInterIE(InfoExtractor): title = self._og_search_title(webpage) description = self._og_search_description(webpage) + thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) upload_date_str = self._search_regex( r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<', @@ -48,6 +50,7 @@ class FranceInterIE(InfoExtractor): 'id': video_id, 'title': title, 'description': description, + 'thumbnail' : thumbnail, 'upload_date': upload_date, 'formats': [{ 'url': video_url, -- GitLab From 3d030642c704eeefc3665da8fca56710427e2153 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:35:16 +0100 Subject: [PATCH 052/384] [franceinter] flake8 --- haruhi_dl/extractor/franceinter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/franceinter.py b/haruhi_dl/extractor/franceinter.py index a009f4d38..ae822a50e 100644 --- a/haruhi_dl/extractor/franceinter.py +++ b/haruhi_dl/extractor/franceinter.py @@ -50,7 +50,7 @@ class FranceInterIE(InfoExtractor): 'id': video_id, 'title': title, 'description': description, - 'thumbnail' : thumbnail, + 'thumbnail': thumbnail, 'upload_date': upload_date, 'formats': [{ 'url': video_url, -- GitLab From 2a368bc78e70c318847dc69a6d3b0ed55f693eaf Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:35:22 +0100 Subject: [PATCH 053/384] [pinterest] Add support for large collections(more than 25 pins) --- haruhi_dl/extractor/pinterest.py | 87 ++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 31 deletions(-) diff --git a/haruhi_dl/extractor/pinterest.py b/haruhi_dl/extractor/pinterest.py index 2bb4ca660..b249c9eda 100644 --- a/haruhi_dl/extractor/pinterest.py +++ b/haruhi_dl/extractor/pinterest.py @@ -1,6 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -16,12 +19,12 @@ from ..utils import ( class PinterestBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' - def _extract_resource(self, webpage, video_id): - return 
self._parse_json( - self._search_regex( - r'<script[^>]+\bid=["\']initial-state["\'][^>]*>({.+?})</script>', - webpage, 'application json'), - video_id)['resourceResponses'] + def _call_api(self, resource, video_id, options): + return self._download_json( + 'https://www.pinterest.com/resource/%sResource/get/' % resource, + video_id, 'Download %s JSON metadata' % resource, query={ + 'data': json.dumps({'options': options}) + })['resource_response'] def _extract_video(self, data, extract_formats=True): video_id = data['id'] @@ -128,13 +131,16 @@ class PinterestIE(PinterestBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - data = self._extract_resource(webpage, video_id)[0]['response']['data'] + data = self._call_api( + 'Pin', video_id, { + 'field_set_key': 'unauth_react_main_pin', + 'id': video_id, + })['data'] return self._extract_video(data) class PinterestCollectionIE(PinterestBaseIE): - _VALID_URL = r'%s/[^/]+/(?P<id>[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE + _VALID_URL = r'%s/(?P<username>[^/]+)/(?P<id>[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE _TESTS = [{ 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', 'info_dict': { @@ -142,6 +148,14 @@ class PinterestCollectionIE(PinterestBaseIE): 'title': 'cool diys', }, 'playlist_count': 8, + }, { + 'url': 'https://www.pinterest.ca/fudohub/videos/', + 'info_dict': { + 'id': '682858430939307450', + 'title': 'VIDEOS', + }, + 'playlist_mincount': 365, + 'skip': 'Test with extract_formats=False', }] @classmethod @@ -150,27 +164,38 @@ class PinterestCollectionIE(PinterestBaseIE): PinterestCollectionIE, cls).suitable(url) def _real_extract(self, url): - collection_name = self._match_id(url) - webpage = self._download_webpage(url, collection_name) - resource = self._extract_resource(webpage, collection_name)[1] + username, slug = re.match(self._VALID_URL, url).groups() + board = self._call_api( + 'Board', slug, { + 'slug': slug, + 'username': username + })['data'] + board_id = board['id'] + options = { + 'board_id': board_id, + 'page_size': 250, + } + bookmark = None entries = [] - for item in resource['response']['data']: - if not isinstance(item, dict) or item.get('type') != 'pin': - continue - video_id = item.get('id') - if video_id: - # Some pins may not be available anonymously via pin URL - # video = self._extract_video(item, extract_formats=False) - # video.update({ - # '_type': 'url_transparent', - # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, - # }) - # entries.append(video) - entries.append(self._extract_video(item)) - title = try_get( - resource, lambda x: x['options']['board_title'], compat_str) - collection_id = try_get( - resource, lambda x: x['options']['board_id'], - compat_str) or collection_name + while True: + if bookmark: + options['bookmarks'] = [bookmark] + board_feed = self._call_api('BoardFeed', board_id, options) + for item in (board_feed.get('data') or []): + if not isinstance(item, dict) or item.get('type') != 'pin': + continue + video_id = item.get('id') + if video_id: + # Some pins may not be available anonymously via pin URL + # video = self._extract_video(item, extract_formats=False) + # video.update({ + # '_type': 'url_transparent', + # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, + # }) + # entries.append(video) + entries.append(self._extract_video(item)) + bookmark = board_feed.get('bookmark') + if not bookmark: + break return self.playlist_result( - entries, playlist_id=collection_id, playlist_title=title) + 
entries, playlist_id=board_id, playlist_title=board.get('name')) -- GitLab From c7196194719d03e47f5e506e26cde42d1e61ae8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:35:49 +0100 Subject: [PATCH 054/384] [nrk] Fix extraction --- haruhi_dl/extractor/nrk.py | 424 ++++++++++++++++++++++--------------- 1 file changed, 248 insertions(+), 176 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 84aacbcda..4a395546f 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -9,6 +9,7 @@ from ..compat import ( compat_urllib_parse_unquote, ) from ..utils import ( + determine_ext, ExtractorError, int_or_none, js_to_json, @@ -16,185 +17,13 @@ from ..utils import ( parse_age_limit, parse_duration, try_get, + url_or_none, ) class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] - _api_host = None - - def _real_extract(self, url): - video_id = self._match_id(url) - - api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS - - for api_host in api_hosts: - data = self._download_json( - 'http://%s/mediaelement/%s' % (api_host, video_id), - video_id, 'Downloading mediaelement JSON', - fatal=api_host == api_hosts[-1]) - if not data: - continue - self._api_host = api_host - break - - title = data.get('fullTitle') or data.get('mainTitle') or data['title'] - video_id = data.get('id') or video_id - - entries = [] - - conviva = data.get('convivaStatistics') or {} - live = (data.get('mediaElementType') == 'Live' - or data.get('isLive') is True or conviva.get('isLive')) - - def make_title(t): - return self._live_title(t) if live else t - - media_assets = data.get('mediaAssets') - if media_assets and isinstance(media_assets, list): - def video_id_and_title(idx): - return ((video_id, title) if len(media_assets) == 1 - else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) - for num, asset in enumerate(media_assets, 1): - asset_url = asset.get('url') - if not asset_url: - continue - formats = self._extract_akamai_formats(asset_url, video_id) - if not formats: - continue - self._sort_formats(formats) - - # Some f4m streams may not work with hdcore in fragments' URLs - for f in formats: - extra_param = f.get('extra_param_to_segment_url') - if extra_param and 'hdcore' in extra_param: - del f['extra_param_to_segment_url'] - - entry_id, entry_title = video_id_and_title(num) - duration = parse_duration(asset.get('duration')) - subtitles = {} - for subtitle in ('webVtt', 'timedText'): - subtitle_url = asset.get('%sSubtitlesUrl' % subtitle) - if subtitle_url: - subtitles.setdefault('no', []).append({ - 'url': compat_urllib_parse_unquote(subtitle_url) - }) - entries.append({ - 'id': asset.get('carrierId') or entry_id, - 'title': make_title(entry_title), - 'duration': duration, - 'subtitles': subtitles, - 'formats': formats, - }) - - if not entries: - media_url = data.get('mediaUrl') - if media_url: - formats = self._extract_akamai_formats(media_url, video_id) - self._sort_formats(formats) - duration = parse_duration(data.get('duration')) - entries = [{ - 'id': video_id, - 'title': make_title(title), - 'duration': duration, - 'formats': formats, - }] - - if not entries: - MESSAGES = { - 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', - 'ProgramRightsHasExpired': 'Programmet har gått ut', - 'NoProgramRights': 'Ikke tilgjengelig', - 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', - } - 
message_type = data.get('messageType', '') - # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* - if 'IsGeoBlocked' in message_type: - self.raise_geo_restricted( - msg=MESSAGES.get('ProgramIsGeoBlocked'), - countries=self._GEO_COUNTRIES) - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, MESSAGES.get( - message_type, message_type)), - expected=True) - - series = conviva.get('seriesName') or data.get('seriesTitle') - episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') - - season_number = None - episode_number = None - if data.get('mediaElementType') == 'Episode': - _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \ - data.get('relativeOriginUrl', '') - EPISODENUM_RE = [ - r'/s(?P<season>\d{,2})e(?P<episode>\d{,2})\.', - r'/sesong-(?P<season>\d{,2})/episode-(?P<episode>\d{,2})', - ] - season_number = int_or_none(self._search_regex( - EPISODENUM_RE, _season_episode, 'season number', - default=None, group='season')) - episode_number = int_or_none(self._search_regex( - EPISODENUM_RE, _season_episode, 'episode number', - default=None, group='episode')) - - thumbnails = None - images = data.get('images') - if images and isinstance(images, dict): - web_images = images.get('webImages') - if isinstance(web_images, list): - thumbnails = [{ - 'url': image['imageUrl'], - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - } for image in web_images if image.get('imageUrl')] - - description = data.get('description') - category = data.get('mediaAnalytics', {}).get('category') - - common_info = { - 'description': description, - 'series': series, - 'episode': episode, - 'season_number': season_number, - 'episode_number': episode_number, - 'categories': [category] if category else None, - 'age_limit': parse_age_limit(data.get('legalAge')), - 'thumbnails': thumbnails, - } - - vcodec = 'none' if data.get('mediaType') == 'Audio' else None - - for entry in entries: - entry.update(common_info) - for f in entry['formats']: - f['vcodec'] = vcodec - - points = data.get('shortIndexPoints') - if isinstance(points, list): - chapters = [] - for next_num, point in enumerate(points, start=1): - if not isinstance(point, dict): - continue - start_time = parse_duration(point.get('startPoint')) - if start_time is None: - continue - end_time = parse_duration( - data.get('duration') - if next_num == len(points) - else points[next_num].get('startPoint')) - if end_time is None: - continue - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': point.get('title'), - }) - if chapters and len(entries) == 1: - entries[0]['chapters'] = chapters - - return self.playlist_result(entries, video_id, title, description) - class NRKIE(NRKBaseIE): _VALID_URL = r'''(?x) @@ -202,13 +31,13 @@ class NRKIE(NRKBaseIE): nrk:| https?:// (?: - (?:www\.)?nrk\.no/video/PS\*| + (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| v8[-.]psapi\.nrk\.no/mediaelement/ ) ) - (?P<id>[^?#&]+) + (?P<id>[^?\#&]+) ''' - _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no') + _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', @@ -240,8 +69,76 @@ class NRKIE(NRKBaseIE): }, { 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', + 'only_matching': True, }] + def 
_extract_from_playback(self, video_id): + manifest = self._download_json( + 'http://psapi.nrk.no/playback/manifest/%s' % video_id, + video_id, 'Downloading manifest JSON') + + playable = manifest['playable'] + + formats = [] + for asset in playable['assets']: + if not isinstance(asset, dict): + continue + if asset.get('encrypted'): + continue + format_url = url_or_none(asset.get('url')) + if not format_url: + continue + if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + data = self._download_json( + 'http://psapi.nrk.no/playback/metadata/%s' % video_id, + video_id, 'Downloading metadata JSON') + + preplay = data['preplay'] + titles = preplay['titles'] + title = titles['title'] + alt_title = titles.get('subtitle') + + description = preplay.get('description') + duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) + + thumbnails = [] + for image in try_get( + preplay, lambda x: x['poster']['images'], list) or []: + if not isinstance(image, dict): + continue + image_url = url_or_none(image.get('url')) + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('pixelWidth')), + 'height': int_or_none(image.get('pixelHeight')), + }) + + return { + 'id': video_id, + 'title': title, + 'alt_title': alt_title, + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_from_playback(video_id) + class NRKTVIE(NRKBaseIE): IE_DESC = 'NRK TV and NRK Radio' @@ -380,6 +277,181 @@ class NRKTVIE(NRKBaseIE): 'only_matching': True, }] + _api_host = None + + def _extract_from_mediaelement(self, video_id): + api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS + + for api_host in api_hosts: + data = self._download_json( + 'http://%s/mediaelement/%s' % (api_host, video_id), + video_id, 'Downloading mediaelement JSON', + fatal=api_host == api_hosts[-1]) + if not data: + continue + self._api_host = api_host + break + + title = data.get('fullTitle') or data.get('mainTitle') or data['title'] + video_id = data.get('id') or video_id + + entries = [] + + conviva = data.get('convivaStatistics') or {} + live = (data.get('mediaElementType') == 'Live' + or data.get('isLive') is True or conviva.get('isLive')) + + def make_title(t): + return self._live_title(t) if live else t + + media_assets = data.get('mediaAssets') + if media_assets and isinstance(media_assets, list): + def video_id_and_title(idx): + return ((video_id, title) if len(media_assets) == 1 + else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) + for num, asset in enumerate(media_assets, 1): + asset_url = asset.get('url') + if not asset_url: + continue + formats = self._extract_akamai_formats(asset_url, video_id) + if not formats: + continue + self._sort_formats(formats) + + # Some f4m streams may not work with hdcore in fragments' URLs + for f in formats: + extra_param = f.get('extra_param_to_segment_url') + if extra_param and 'hdcore' in extra_param: + del f['extra_param_to_segment_url'] + + entry_id, entry_title = video_id_and_title(num) + duration = parse_duration(asset.get('duration')) + subtitles = {} + for subtitle in ('webVtt', 'timedText'): + subtitle_url = asset.get('%sSubtitlesUrl' % subtitle) + if subtitle_url: + 
subtitles.setdefault('no', []).append({ + 'url': compat_urllib_parse_unquote(subtitle_url) + }) + entries.append({ + 'id': asset.get('carrierId') or entry_id, + 'title': make_title(entry_title), + 'duration': duration, + 'subtitles': subtitles, + 'formats': formats, + }) + + if not entries: + media_url = data.get('mediaUrl') + if media_url: + formats = self._extract_akamai_formats(media_url, video_id) + self._sort_formats(formats) + duration = parse_duration(data.get('duration')) + entries = [{ + 'id': video_id, + 'title': make_title(title), + 'duration': duration, + 'formats': formats, + }] + + if not entries: + MESSAGES = { + 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', + 'ProgramRightsHasExpired': 'Programmet har gått ut', + 'NoProgramRights': 'Ikke tilgjengelig', + 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', + } + message_type = data.get('messageType', '') + # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* + if 'IsGeoBlocked' in message_type: + self.raise_geo_restricted( + msg=MESSAGES.get('ProgramIsGeoBlocked'), + countries=self._GEO_COUNTRIES) + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, MESSAGES.get( + message_type, message_type)), + expected=True) + + series = conviva.get('seriesName') or data.get('seriesTitle') + episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') + + season_number = None + episode_number = None + if data.get('mediaElementType') == 'Episode': + _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \ + data.get('relativeOriginUrl', '') + EPISODENUM_RE = [ + r'/s(?P<season>\d{,2})e(?P<episode>\d{,2})\.', + r'/sesong-(?P<season>\d{,2})/episode-(?P<episode>\d{,2})', + ] + season_number = int_or_none(self._search_regex( + EPISODENUM_RE, _season_episode, 'season number', + default=None, group='season')) + episode_number = int_or_none(self._search_regex( + EPISODENUM_RE, _season_episode, 'episode number', + default=None, group='episode')) + + thumbnails = None + images = data.get('images') + if images and isinstance(images, dict): + web_images = images.get('webImages') + if isinstance(web_images, list): + thumbnails = [{ + 'url': image['imageUrl'], + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in web_images if image.get('imageUrl')] + + description = data.get('description') + category = data.get('mediaAnalytics', {}).get('category') + + common_info = { + 'description': description, + 'series': series, + 'episode': episode, + 'season_number': season_number, + 'episode_number': episode_number, + 'categories': [category] if category else None, + 'age_limit': parse_age_limit(data.get('legalAge')), + 'thumbnails': thumbnails, + } + + vcodec = 'none' if data.get('mediaType') == 'Audio' else None + + for entry in entries: + entry.update(common_info) + for f in entry['formats']: + f['vcodec'] = vcodec + + points = data.get('shortIndexPoints') + if isinstance(points, list): + chapters = [] + for next_num, point in enumerate(points, start=1): + if not isinstance(point, dict): + continue + start_time = parse_duration(point.get('startPoint')) + if start_time is None: + continue + end_time = parse_duration( + data.get('duration') + if next_num == len(points) + else points[next_num].get('startPoint')) + if end_time is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': point.get('title'), + }) + if chapters and len(entries) == 1: + 
entries[0]['chapters'] = chapters + + return self.playlist_result(entries, video_id, title, description) + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_from_mediaelement(video_id) + class NRKTVEpisodeIE(InfoExtractor): _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/\d+/episode/\d+)' -- GitLab From 00088ef4b17ea4b261626dc7903094a5c6c25a27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:35:54 +0100 Subject: [PATCH 055/384] =?UTF-8?q?[downloader/fragment]=20Set=20final=20f?= =?UTF-8?q?ile's=20mtime=20according=20to=20last=20fragme=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …nt's Last-Modified header (closes #11718, closes #18384, closes #27138) --- haruhi_dl/downloader/fragment.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/downloader/fragment.py b/haruhi_dl/downloader/fragment.py index 65d8c881d..090941024 100644 --- a/haruhi_dl/downloader/fragment.py +++ b/haruhi_dl/downloader/fragment.py @@ -97,12 +97,15 @@ class FragmentFD(FileDownloader): def _download_fragment(self, ctx, frag_url, info_dict, headers=None): fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index']) - success = ctx['dl'].download(fragment_filename, { + fragment_info_dict = { 'url': frag_url, 'http_headers': headers or info_dict.get('http_headers'), - }) + } + success = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False, None + if fragment_info_dict.get('filetime'): + ctx['fragment_filetime'] = fragment_info_dict.get('filetime') down, frag_sanitized = sanitize_open(fragment_filename, 'rb') ctx['fragment_filename_sanitized'] = frag_sanitized frag_content = down.read() @@ -258,6 +261,13 @@ class FragmentFD(FileDownloader): downloaded_bytes = ctx['complete_frags_downloaded_bytes'] else: self.try_rename(ctx['tmpfilename'], ctx['filename']) + if self.params.get('updatetime', True): + filetime = ctx.get('fragment_filetime') + if filetime: + try: + os.utime(ctx['filename'], (time.time(), filetime)) + except Exception: + pass downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) self._hook_progress({ -- GitLab From 997dc3ca4446b0f9f02189dfdcdbf72befdfe70c Mon Sep 17 00:00:00 2001 From: Joshua Lochner <admin@xenova.com> Date: Fri, 26 Feb 2021 14:36:01 +0100 Subject: [PATCH 056/384] [medaltv] Add new extractor (#27149) --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/medaltv.py | 138 ++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 haruhi_dl/extractor/medaltv.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 234763076..5de842c31 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -640,6 +640,7 @@ from .mastodon import MastodonSHIE from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE from .mdr import MDRIE +from .medaltv import MedalTVIE from .mediaset import MediasetIE from .mediasite import ( MediasiteIE, diff --git a/haruhi_dl/extractor/medaltv.py b/haruhi_dl/extractor/medaltv.py new file mode 100644 index 000000000..06f7b6e92 --- /dev/null +++ b/haruhi_dl/extractor/medaltv.py @@ -0,0 +1,138 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + try_get, + 
float_or_none, + int_or_none +) + + +class MedalTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://medal.tv/clips/34934644/3Is9zyGMoBMr', + 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', + 'info_dict': { + 'id': '34934644', + 'ext': 'mp4', + 'title': 'Quad Cold', + 'description': 'Medal,https://medal.tv/desktop/', + 'uploader': 'MowgliSB', + 'timestamp': 1603165266, + 'upload_date': '20201020', + 'uploader_id': 10619174, + } + }, { + 'url': 'https://medal.tv/clips/36787208', + 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', + 'info_dict': { + 'id': '36787208', + 'ext': 'mp4', + 'title': 'u tk me i tk u bigger', + 'description': 'Medal,https://medal.tv/desktop/', + 'uploader': 'Mimicc', + 'timestamp': 1605580939, + 'upload_date': '20201117', + 'uploader_id': 5156321, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + hydration_data = self._search_regex( + r'<script[^>]*>\s*(?:var\s*)?hydrationData\s*=\s*({.+?})\s*</script>', + webpage, 'hydration data', default='{}') + parsed = self._parse_json(hydration_data, video_id) + + clip_info = try_get(parsed, lambda x: x['clips'][video_id], dict) or {} + if not clip_info: + raise ExtractorError('Could not find video information.', + video_id=video_id) + + width = int_or_none(clip_info.get('sourceWidth')) + height = int_or_none(clip_info.get('sourceHeight')) + + aspect_ratio = (width / height) if(width and height) else (16 / 9) + + # ordered from lowest to highest resolution + heights = (144, 240, 360, 480, 720, 1080) + + formats = [] + thumbnails = [] + + for height in heights: + format_key = '{0}p'.format(height) + video_key = 'contentUrl{0}'.format(format_key) + thumbnail_key = 'thumbnail{0}'.format(format_key) + width = int(round(aspect_ratio * height)) + + # Second condition needed as sometimes medal says + # they have a format when in fact it is another format. + format_url = clip_info.get(video_key) + if(format_url and format_key in format_url): + formats.append({ + 'url': format_url, + 'format_id': format_key, + 'width': width, + 'height': height + }) + + thumbnail_url = clip_info.get(thumbnail_key) + if(thumbnail_url and format_key in thumbnail_url): + thumbnails.append({ + 'id': format_key, + 'url': thumbnail_url, + 'width': width, + 'height': height + }) + + # add source to formats + source_url = clip_info.get('contentUrl') + if(source_url): + formats.append({ + 'url': source_url, + 'format_id': 'source', + 'width': width, + 'height': height + }) + + error = clip_info.get('error') + if not formats and error: + if(error == 404): + raise ExtractorError('That clip does not exist.', + expected=True, video_id=video_id) + else: + raise ExtractorError('An unknown error occurred ({0}).'.format(error), + video_id=video_id) + + # Necessary because the id of the author is not known in advance. + # Won't raise an issue if no profile can be found as this is optional. 
+ author_info = try_get(parsed, + lambda x: list(x['profiles'].values())[0], dict + ) or {} + author_id = author_info.get('id') + author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None + + return { + 'id': video_id, + 'title': clip_info.get('contentTitle'), + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clip_info.get('contentDescription'), + + 'uploader': author_info.get('displayName'), + 'timestamp': float_or_none(clip_info.get('created'), 1000), + 'uploader_id': author_id, + 'uploader_url': author_url, + + 'duration': float_or_none(clip_info.get('videoLengthSeconds')), + 'view_count': int_or_none(clip_info.get('views')), + 'like_count': int_or_none(clip_info.get('likes')), + 'comment_count': int_or_none(clip_info.get('comments')) + } -- GitLab From d1114a12e179565e128fa6a384e04a6d45e78391 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:36:07 +0100 Subject: [PATCH 057/384] [medaltv] improve extraction --- haruhi_dl/extractor/medaltv.py | 131 ++++++++++++++++----------------- 1 file changed, 62 insertions(+), 69 deletions(-) diff --git a/haruhi_dl/extractor/medaltv.py b/haruhi_dl/extractor/medaltv.py index 06f7b6e92..1603b55f6 100644 --- a/haruhi_dl/extractor/medaltv.py +++ b/haruhi_dl/extractor/medaltv.py @@ -1,13 +1,16 @@ # coding: utf-8 - from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, - try_get, float_or_none, - int_or_none + int_or_none, + str_or_none, + try_get, ) @@ -45,94 +48,84 @@ class MedalTVIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - hydration_data = self._search_regex( + hydration_data = self._parse_json(self._search_regex( r'<script[^>]*>\s*(?:var\s*)?hydrationData\s*=\s*({.+?})\s*</script>', - webpage, 'hydration data', default='{}') - parsed = self._parse_json(hydration_data, video_id) + webpage, 'hydration data', default='{}'), video_id) - clip_info = try_get(parsed, lambda x: x['clips'][video_id], dict) or {} - if not clip_info: - raise ExtractorError('Could not find video information.', - video_id=video_id) + clip = try_get( + hydration_data, lambda x: x['clips'][video_id], dict) or {} + if not clip: + raise ExtractorError( + 'Could not find video information.', video_id=video_id) - width = int_or_none(clip_info.get('sourceWidth')) - height = int_or_none(clip_info.get('sourceHeight')) + title = clip['contentTitle'] - aspect_ratio = (width / height) if(width and height) else (16 / 9) + source_width = int_or_none(clip.get('sourceWidth')) + source_height = int_or_none(clip.get('sourceHeight')) - # ordered from lowest to highest resolution - heights = (144, 240, 360, 480, 720, 1080) - - formats = [] - thumbnails = [] + aspect_ratio = source_width / source_height if source_width and source_height else 16 / 9 - for height in heights: - format_key = '{0}p'.format(height) - video_key = 'contentUrl{0}'.format(format_key) - thumbnail_key = 'thumbnail{0}'.format(format_key) + def add_item(container, item_url, height, id_key='format_id', item_id=None): + item_id = item_id or '%dp' % height + if item_id not in item_url: + return width = int(round(aspect_ratio * height)) - - # Second condition needed as sometimes medal says - # they have a format when in fact it is another format. 
- format_url = clip_info.get(video_key) - if(format_url and format_key in format_url): - formats.append({ - 'url': format_url, - 'format_id': format_key, - 'width': width, - 'height': height - }) - - thumbnail_url = clip_info.get(thumbnail_key) - if(thumbnail_url and format_key in thumbnail_url): - thumbnails.append({ - 'id': format_key, - 'url': thumbnail_url, - 'width': width, - 'height': height - }) - - # add source to formats - source_url = clip_info.get('contentUrl') - if(source_url): - formats.append({ - 'url': source_url, - 'format_id': 'source', + container.append({ + 'url': item_url, + id_key: item_id, 'width': width, 'height': height }) - error = clip_info.get('error') + formats = [] + thumbnails = [] + for k, v in clip.items(): + if not (v and isinstance(v, compat_str)): + continue + mobj = re.match(r'(contentUrl|thumbnail)(?:(\d+)p)?$', k) + if not mobj: + continue + prefix = mobj.group(1) + height = int_or_none(mobj.group(2)) + if prefix == 'contentUrl': + add_item( + formats, v, height or source_height, + item_id=None if height else 'source') + elif prefix == 'thumbnail': + add_item(thumbnails, v, height, 'id') + + error = clip.get('error') if not formats and error: - if(error == 404): - raise ExtractorError('That clip does not exist.', - expected=True, video_id=video_id) + if error == 404: + raise ExtractorError( + 'That clip does not exist.', + expected=True, video_id=video_id) else: - raise ExtractorError('An unknown error occurred ({0}).'.format(error), - video_id=video_id) + raise ExtractorError( + 'An unknown error occurred ({0}).'.format(error), + video_id=video_id) + + self._sort_formats(formats) # Necessary because the id of the author is not known in advance. # Won't raise an issue if no profile can be found as this is optional. 
- author_info = try_get(parsed, - lambda x: list(x['profiles'].values())[0], dict - ) or {} - author_id = author_info.get('id') + author = try_get( + hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {} + author_id = str_or_none(author.get('id')) author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None return { 'id': video_id, - 'title': clip_info.get('contentTitle'), + 'title': title, 'formats': formats, 'thumbnails': thumbnails, - 'description': clip_info.get('contentDescription'), - - 'uploader': author_info.get('displayName'), - 'timestamp': float_or_none(clip_info.get('created'), 1000), + 'description': clip.get('contentDescription'), + 'uploader': author.get('displayName'), + 'timestamp': float_or_none(clip.get('created'), 1000), 'uploader_id': author_id, 'uploader_url': author_url, - - 'duration': float_or_none(clip_info.get('videoLengthSeconds')), - 'view_count': int_or_none(clip_info.get('views')), - 'like_count': int_or_none(clip_info.get('likes')), - 'comment_count': int_or_none(clip_info.get('comments')) + 'duration': int_or_none(clip.get('videoLengthSeconds')), + 'view_count': int_or_none(clip.get('views')), + 'like_count': int_or_none(clip.get('likes')), + 'comment_count': int_or_none(clip.get('comments')), } -- GitLab From 45eded9bd23596d982a8165e40cc8a44354ff6ed Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:36:12 +0100 Subject: [PATCH 058/384] [bbc] fix BBC News videos extraction --- haruhi_dl/extractor/bbc.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/bbc.py b/haruhi_dl/extractor/bbc.py index 1ff7834bf..fbd79ce4d 100644 --- a/haruhi_dl/extractor/bbc.py +++ b/haruhi_dl/extractor/bbc.py @@ -981,7 +981,7 @@ class BBCIE(BBCCoUkIE): group_id = self._search_regex( r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, webpage, 'group id', default=None) - if playlist_id: + if group_id: return self.url_result( 'https://www.bbc.co.uk/programmes/%s' % group_id, ie=BBCCoUkIE.ie_key()) @@ -1118,6 +1118,39 @@ class BBCIE(BBCCoUkIE): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + initial_data = self._parse_json(self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), playlist_id, fatal=False) + if initial_data: + def parse_media(media): + if not media: + return + for item in (try_get(media, lambda x: x['media']['items'], list) or []): + item_id = item.get('id') + item_title = item.get('title') + if not (item_id and item_title): + continue + formats, subtitles = self._download_media_selector(item_id) + self._sort_formats(formats) + entries.append({ + 'id': item_id, + 'title': item_title, + 'thumbnail': item.get('holdingImageUrl'), + 'formats': formats, + 'subtitles': subtitles, + }) + for resp in (initial_data.get('data') or {}).values(): + name = resp.get('name') + if name == 'media-experience': + parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) + elif name == 'article': + for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): + if block.get('type') != 'media': + continue + parse_media(block.get('model')) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), -- GitLab From 
3f6dc5d4ef783523e3a2e3dc473a668470f2293c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:36:19 +0100 Subject: [PATCH 059/384] [bbc] fix BBC Three clip extraction --- haruhi_dl/extractor/bbc.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/haruhi_dl/extractor/bbc.py b/haruhi_dl/extractor/bbc.py index fbd79ce4d..7aa3a11b5 100644 --- a/haruhi_dl/extractor/bbc.py +++ b/haruhi_dl/extractor/bbc.py @@ -1092,10 +1092,26 @@ class BBCIE(BBCCoUkIE): self._search_regex( r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, 'bbcthree config', default='{}'), - playlist_id, transform_source=js_to_json, fatal=False) - if bbc3_config: + playlist_id, transform_source=js_to_json, fatal=False) or {} + payload = bbc3_config.get('payload') or {} + if payload: + clip = payload.get('currentClip') or {} + clip_vpid = clip.get('vpid') + clip_title = clip.get('title') + if clip_vpid and clip_title: + formats, subtitles = self._download_media_selector(clip_vpid) + self._sort_formats(formats) + return { + 'id': clip_vpid, + 'title': clip_title, + 'thumbnail': dict_get(clip, ('poster', 'imageUrl')), + 'description': clip.get('description'), + 'duration': parse_duration(clip.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + } bbc3_playlist = try_get( - bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], + payload, lambda x: x['content']['bbcMedia']['playlist'], dict) if bbc3_playlist: playlist_title = bbc3_playlist.get('title') or playlist_title -- GitLab From 7a49184ca69784612ecee1d8e86bc6a3414816d2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:36:24 +0100 Subject: [PATCH 060/384] [viki] fix video API request(closes #27184) --- haruhi_dl/extractor/viki.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/viki.py b/haruhi_dl/extractor/viki.py index a003b7af8..a311f21ef 100644 --- a/haruhi_dl/extractor/viki.py +++ b/haruhi_dl/extractor/viki.py @@ -20,6 +20,7 @@ from ..utils import ( parse_age_limit, parse_iso8601, sanitized_Request, + std_headers, ) @@ -226,8 +227,10 @@ class VikiIE(VikiBaseIE): resp = self._download_json( 'https://www.viki.com/api/videos/' + video_id, - video_id, 'Downloading video JSON', - headers={'x-viki-app-ver': '4.0.57'}) + video_id, 'Downloading video JSON', headers={ + 'x-client-user-agent': std_headers['User-Agent'], + 'x-viki-app-ver': '4.0.57', + }) video = resp['video'] self._check_errors(video) -- GitLab From 79cd28f514a519a1221f7fc20fed914e4adad1f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:37:00 +0100 Subject: [PATCH 061/384] [spreaker] Add extractor (closes #13480, closes #13877) --- haruhi_dl/extractor/extractors.py | 6 + haruhi_dl/extractor/spreaker.py | 176 ++++++++++++++++++++++++++++++ 2 files changed, 182 insertions(+) create mode 100644 haruhi_dl/extractor/spreaker.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 5de842c31..3e26dfe40 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1129,6 +1129,12 @@ from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE +from .spreaker import ( + SpreakerIE, + SpreakerPageIE, + SpreakerShowIE, + SpreakerShowPageIE, +) from .springboardplatform import 
SpringboardPlatformIE from .sprout import SproutIE from .srgssr import ( diff --git a/haruhi_dl/extractor/spreaker.py b/haruhi_dl/extractor/spreaker.py new file mode 100644 index 000000000..beee6670c --- /dev/null +++ b/haruhi_dl/extractor/spreaker.py @@ -0,0 +1,176 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + int_or_none, + str_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +def _extract_episode(data, episode_id=None): + title = data['title'] + download_url = data['download_url'] + + series = try_get(data, lambda x: x['show']['title'], compat_str) + uploader = try_get(data, lambda x: x['author']['fullname'], compat_str) + + thumbnails = [] + for image in ('image_original', 'image_medium', 'image'): + image_url = url_or_none(data.get('%s_url' % image)) + if image_url: + thumbnails.append({'url': image_url}) + + def stats(key): + return int_or_none(try_get( + data, + (lambda x: x['%ss_count' % key], + lambda x: x['stats']['%ss' % key]))) + + def duration(key): + return float_or_none(data.get(key), scale=1000) + + return { + 'id': compat_str(episode_id or data['episode_id']), + 'url': download_url, + 'display_id': data.get('permalink'), + 'title': title, + 'description': data.get('description'), + 'timestamp': unified_timestamp(data.get('published_at')), + 'uploader': uploader, + 'uploader_id': str_or_none(data.get('author_id')), + 'creator': uploader, + 'duration': duration('duration') or duration('length'), + 'view_count': stats('play'), + 'like_count': stats('like'), + 'comment_count': stats('message'), + 'format': 'MPEG Layer 3', + 'format_id': 'mp3', + 'container': 'mp3', + 'ext': 'mp3', + 'thumbnails': thumbnails, + 'series': series, + 'extractor_key': SpreakerIE.ie_key(), + } + + +class SpreakerIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + api\.spreaker\.com/ + (?: + (?:download/)?episode| + v2/episodes + )/ + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'https://api.spreaker.com/episode/12534508', + 'info_dict': { + 'id': '12534508', + 'display_id': 'swm-ep15-how-to-market-your-music-part-2', + 'ext': 'mp3', + 'title': 'EP:15 | Music Marketing (Likes) - Part 2', + 'description': 'md5:0588c43e27be46423e183076fa071177', + 'timestamp': 1502250336, + 'upload_date': '20170809', + 'uploader': 'SWM', + 'uploader_id': '9780658', + 'duration': 1063.42, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'series': 'Success With Music (SWM)', + }, + }, { + 'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3', + 'only_matching': True, + }, { + 'url': 'https://api.spreaker.com/v2/episodes/12534508?export=episode_segments', + 'only_matching': True, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + data = self._download_json( + 'https://api.spreaker.com/v2/episodes/%s' % episode_id, + episode_id)['response']['episode'] + return _extract_episode(data, episode_id) + + +class SpreakerPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spreaker\.com/user/[^/]+/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/user/9780658/swm-ep15-how-to-market-your-music-part-2', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + episode_id = self._search_regex( + (r'data-episode_id=["\'](?P<id>\d+)', + 
r'episode_id\s*:\s*(?P<id>\d+)'), webpage, 'episode id') + return self.url_result( + 'https://api.spreaker.com/episode/%s' % episode_id, + ie=SpreakerIE.ie_key(), video_id=episode_id) + + +class SpreakerShowIE(InfoExtractor): + _VALID_URL = r'https?://api\.spreaker\.com/show/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/show/3-ninjas-podcast', + 'info_dict': { + 'id': '4652058', + }, + 'playlist_mincount': 118, + }] + + def _entries(self, show_id): + for page_num in itertools.count(1): + episodes = self._download_json( + 'https://api.spreaker.com/show/%s/episodes' % show_id, + show_id, note='Downloading JSON page %d' % page_num, query={ + 'page': page_num, + 'max_per_page': 100, + }) + pager = try_get(episodes, lambda x: x['response']['pager'], dict) + if not pager: + break + results = pager.get('results') + if not results or not isinstance(results, list): + break + for result in results: + if not isinstance(result, dict): + continue + yield _extract_episode(result) + if page_num == pager.get('last_page'): + break + + def _real_extract(self, url): + show_id = self._match_id(url) + return self.playlist_result(self._entries(show_id), playlist_id=show_id) + + +class SpreakerShowPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spreaker\.com/show/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/show/success-with-music', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + show_id = self._search_regex( + r'show_id\s*:\s*(?P<id>\d+)', webpage, 'show id') + return self.url_result( + 'https://api.spreaker.com/show/%s' % show_id, + ie=SpreakerShowIE.ie_key(), video_id=show_id) -- GitLab From 37108e29a6af96e058c31452aefec4a236cc520e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:37:08 +0100 Subject: [PATCH 062/384] [spreaker] fix SpreakerShowIE test URL --- haruhi_dl/extractor/spreaker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/spreaker.py b/haruhi_dl/extractor/spreaker.py index beee6670c..6c7e40ae4 100644 --- a/haruhi_dl/extractor/spreaker.py +++ b/haruhi_dl/extractor/spreaker.py @@ -126,7 +126,7 @@ class SpreakerPageIE(InfoExtractor): class SpreakerShowIE(InfoExtractor): _VALID_URL = r'https?://api\.spreaker\.com/show/(?P<id>\d+)' _TESTS = [{ - 'url': 'https://www.spreaker.com/show/3-ninjas-podcast', + 'url': 'https://api.spreaker.com/show/4652058', 'info_dict': { 'id': '4652058', }, -- GitLab From 34a34d7f710f96a67e2de02e89e895fa749a9a89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FAdrian=3D20Heine=3D20n=3DC3=3DA9=3D20Lan?= =?UTF-8?q?g=3F=3D?= <mail@adrianheine.de> Date: Fri, 26 Feb 2021 14:37:14 +0100 Subject: [PATCH 063/384] [videa] Adapt to updates (#26301) closes #25973, closes #25650. 
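This change swaps Videa's plain XML endpoint for the site's encrypted
player protocol: the extractor now derives a per-request key from the
player-page nonce, a random seed and the x-videa-xs response header, then
RC4-decrypts the base64 payload before parsing it as XML. For reference,
a minimal Python 3 sketch of the same RC4 key-scheduling/keystream logic
(the in-tree version below stays Python 2 compatible via compat_ord and
struct.pack; as in the patch, the key is a str and the ciphertext bytes):

    def rc4(ciphertext, key):
        # Key-scheduling algorithm (KSA): permute S using the key.
        S = list(range(256))
        j = 0
        for i in range(256):
            j = (j + S[i] + ord(key[i % len(key)])) % 256
            S[i], S[j] = S[j], S[i]

        # PRGA: XOR each ciphertext byte with the next keystream byte.
        out = bytearray()
        i = j = 0
        for byte in ciphertext:
            i = (i + 1) % 256
            j = (j + S[i]) % 256
            S[i], S[j] = S[j], S[i]
            out.append(byte ^ S[(S[i] + S[j]) % 256])
        return bytes(out)

    # RC4 is symmetric, so decrypting is encrypting with the same key:
    assert rc4(rc4(b'probe', 'secret'), 'secret') == b'probe'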
--- haruhi_dl/extractor/videa.py | 62 ++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/videa.py b/haruhi_dl/extractor/videa.py index 5830e7fd7..0f0702852 100644 --- a/haruhi_dl/extractor/videa.py +++ b/haruhi_dl/extractor/videa.py @@ -2,15 +2,24 @@ from __future__ import unicode_literals import re +import random +import string +import struct from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, mimetype2ext, parse_codecs, xpath_element, xpath_text, ) +from ..compat import ( + compat_b64decode, + compat_ord, + compat_parse_qs, +) class VideaIE(InfoExtractor): @@ -60,15 +69,63 @@ class VideaIE(InfoExtractor): r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', webpage)] + def rc4(self, ciphertext, key): + res = b'' + + keyLen = len(key) + S = list(range(256)) + + j = 0 + for i in range(256): + j = (j + S[i] + ord(key[i % keyLen])) % 256 + S[i], S[j] = S[j], S[i] + + i = 0 + j = 0 + for m in range(len(ciphertext)): + i = (i + 1) % 256 + j = (j + S[i]) % 256 + S[i], S[j] = S[j], S[i] + k = S[(S[i] + S[j]) % 256] + res += struct.pack("B", k ^ compat_ord(ciphertext[m])) + + return res + def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id, fatal=True) + error = self._search_regex(r'<p class="error-text">([^<]+)</p>', webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + + video_src_params_raw = self._search_regex(r'<iframe[^>]+id="videa_player_iframe"[^>]+src="/player\?([^"]+)"', webpage, 'video_src_params') + video_src_params = compat_parse_qs(video_src_params_raw) + player_page = self._download_webpage("https://videa.hu/videojs_player?%s" % video_src_params_raw, video_id, fatal=True) + nonce = self._search_regex(r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce') + random_seed = ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(8)) + static_secret = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' + l = nonce[:32] + s = nonce[32:] + result = '' + for i in range(0, 32): + result += s[i - (static_secret.index(l[i]) - 31)] - info = self._download_xml( + video_src_params['_s'] = random_seed + video_src_params['_t'] = result[:16] + encryption_key_stem = result[16:] + random_seed + + [b64_info, handle] = self._download_webpage_handle( 'http://videa.hu/videaplayer_get_xml.php', video_id, - query={'v': video_id}) + query=video_src_params, fatal=True) + + encrypted_info = compat_b64decode(b64_info) + key = encryption_key_stem + handle.info()['x-videa-xs'] + info_str = self.rc4(encrypted_info, key).decode('utf8') + info = self._parse_xml(info_str, video_id) video = xpath_element(info, './/video', 'video', fatal=True) sources = xpath_element(info, './/video_sources', 'sources', fatal=True) + hash_values = xpath_element(info, './/hash_values', 'hash_values', fatal=True) title = xpath_text(video, './title', fatal=True) @@ -77,6 +134,7 @@ class VideaIE(InfoExtractor): source_url = source.text if not source_url: continue + source_url += '?md5=%s&expires=%s' % (hash_values.find('hash_value_%s' % source.get('name')).text, source.get('exp')) f = parse_codecs(source.get('codecs')) f.update({ 'url': source_url, -- GitLab From de296b234ab6ffc9559b60f9cb401cbc36c11687 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:37:50 +0100 Subject: 
[PATCH 064/384] =?UTF-8?q?[YoutubeDL]=20Write=20static=20debug=20?= =?UTF-8?q?to=20stderr=20and=20respect=20quiet=20for=20dynami=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …c debug (closes #14579, closes #22593) TODO: logging and verbosity needs major refactoring (refs #10894) --- haruhi_dl/HaruhiDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/HaruhiDL.py b/haruhi_dl/HaruhiDL.py index e67c01a9d..ffc583e82 100755 --- a/haruhi_dl/HaruhiDL.py +++ b/haruhi_dl/HaruhiDL.py @@ -1614,7 +1614,7 @@ class HaruhiDL(object): if req_format is None: req_format = self._default_format_spec(info_dict, download=download) if self.params.get('verbose'): - self.to_stdout('[debug] Default format spec: %s' % req_format) + self._write_string('[debug] Default format spec: %s\n' % req_format) format_selector = self.build_format_selector(req_format) @@ -1875,7 +1875,7 @@ class HaruhiDL(object): for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): - self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) + self.to_screen('[debug] Invoking downloader on %r' % info.get('url')) return fd.download(name, info) if info_dict.get('requested_formats') is not None: -- GitLab From bbb93695b039b9037399ced819adaaeb05a5e10b Mon Sep 17 00:00:00 2001 From: bopol <bopol@e.email> Date: Fri, 26 Feb 2021 14:37:59 +0100 Subject: [PATCH 065/384] [ina] Add support for mobile URLs (#27229) --- haruhi_dl/extractor/ina.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/ina.py b/haruhi_dl/extractor/ina.py index 12695af27..b3b2683cb 100644 --- a/haruhi_dl/extractor/ina.py +++ b/haruhi_dl/extractor/ina.py @@ -12,7 +12,7 @@ from ..utils import ( class InaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)' _TESTS = [{ 'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', 'md5': 'a667021bf2b41f8dc6049479d9bb38a3', @@ -31,6 +31,9 @@ class InaIE(InfoExtractor): }, { 'url': 'https://www.ina.fr/video/P16173408-video.html', 'only_matching': True, + }, { + 'url': 'http://m.ina.fr/video/I12055569', + 'only_matching': True, }] def _real_extract(self, url): -- GitLab From f44820d71839ecd969158af4fb0c7884b6d6a622 Mon Sep 17 00:00:00 2001 From: Michael Munch <mm.munk@gmail.com> Date: Fri, 26 Feb 2021 14:38:09 +0100 Subject: [PATCH 066/384] [drtv] Extend _VALID_URL (#27243) --- haruhi_dl/extractor/drtv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/drtv.py b/haruhi_dl/extractor/drtv.py index 390e79f8c..c0036adb6 100644 --- a/haruhi_dl/extractor/drtv.py +++ b/haruhi_dl/extractor/drtv.py @@ -29,7 +29,7 @@ class DRTVIE(InfoExtractor): https?:// (?: (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*| - (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode)/ + (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/ ) (?P<id>[\da-z_-]+) ''' @@ -111,6 +111,9 @@ class DRTVIE(InfoExtractor): }, { 'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769', 'only_matching': True, + }, { + 'url': 'https://www.dr.dk/drtv/program/jagten_220924', + 'only_matching': True, }] def _real_extract(self, url): -- GitLab From 863cae8fe46a2114c5c2ec7409242c75a00d9403 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:38:16 +0100 Subject: [PATCH 067/384] =?UTF-8?q?[yandexmusic:track]=20Fix=20extraction?= =?UTF-8?q?=20(closes=20#26449,=20closes=20#26669,=20clo=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ses #26747, closes #26748, closes #26762) --- haruhi_dl/extractor/yandexmusic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/yandexmusic.py b/haruhi_dl/extractor/yandexmusic.py index c50bc8156..ffbbdcefa 100644 --- a/haruhi_dl/extractor/yandexmusic.py +++ b/haruhi_dl/extractor/yandexmusic.py @@ -109,8 +109,7 @@ class YandexMusicTrackIE(YandexMusicBaseIE): 'Downloading track location JSON', query={'format': 'json'}) key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + fd_data['path'][1:] + fd_data['s']).encode('utf-8')).hexdigest() - storage = track['storageDir'].split('.') - f_url = 'http://%s/get-mp3/%s/%s?track-id=%s ' % (fd_data['host'], key, fd_data['ts'] + fd_data['path'], storage[1]) + f_url = 'http://%s/get-mp3/%s/%s?track-id=%s ' % (fd_data['host'], key, fd_data['ts'] + fd_data['path'], track['id']) thumbnail = None cover_uri = track.get('albums', [{}])[0].get('coverUri') -- GitLab From 31a2706650940b063a270fed793e05c5bb25e215 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:38:48 +0100 Subject: [PATCH 068/384] [mediaset] add support for movie URLs(closes #27240) --- haruhi_dl/extractor/mediaset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/mediaset.py b/haruhi_dl/extractor/mediaset.py index 933df1495..2c16fc9e2 100644 --- a/haruhi_dl/extractor/mediaset.py +++ b/haruhi_dl/extractor/mediaset.py @@ -23,7 +23,7 @@ class MediasetIE(ThePlatformBaseIE): https?:// (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ (?: - (?:video|on-demand)/(?:[^/]+/)+[^/]+_| + (?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_| player/index\.html\?.*?\bprogramGuid= ) )(?P<id>[0-9A-Z]{16,}) @@ -88,6 +88,9 @@ class MediasetIE(ThePlatformBaseIE): }, { 'url': 'https://www.mediasetplay.mediaset.it/video/grandefratellovip/benedetta-una-doccia-gelata_F309344401044C135', 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/movie/herculeslaleggendahainizio/hercules-la-leggenda-ha-inizio_F305927501000102', + 'only_matching': True, }] @staticmethod -- GitLab From a321724c883cfde0de88d91b1307d049512f2061 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FRoman=3D20Ber=3DC3=3DA1nek=3F=3D?= <zavorka@users.noreply.github.com> Date: Fri, 26 Feb 2021 14:38:55 +0100 Subject: [PATCH 069/384] =?UTF-8?q?[cspan]=20Pass=20Referer=20header=20wit?= =?UTF-8?q?h=20format's=20video=20URL=20(#26032)=20(closes=20=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …#25729) --- haruhi_dl/extractor/cspan.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/haruhi_dl/extractor/cspan.py b/haruhi_dl/extractor/cspan.py index 67d6df4b0..3356cc280 100644 --- a/haruhi_dl/extractor/cspan.py +++ b/haruhi_dl/extractor/cspan.py @@ -165,6 +165,8 @@ class CSpanIE(InfoExtractor): formats = self._extract_m3u8_formats( path, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }] + for f in formats: + f.setdefault('http_headers', {})['Referer'] = url self._sort_formats(formats) entries.append({ 'id': '%s_%d' % (video_id, partnum + 1), -- GitLab 
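The fix above makes every extracted C-SPAN format carry the page URL as a
Referer header, which the downloader then replays on the media request
(per the linked issue, the streams fail without it). A minimal sketch of
that per-format header pattern as used here; the format dicts and URLs
below are illustrative only, not real C-SPAN data:

    def add_referer(formats, page_url):
        # setdefault reuses an existing http_headers dict, so headers a
        # format already carries (e.g. User-Agent) are preserved.
        for f in formats:
            f.setdefault('http_headers', {})['Referer'] = page_url

    formats = [
        {'url': 'https://example.com/master.m3u8', 'format_id': 'hls'},
        {'url': 'https://example.com/clip.mp4',
         'http_headers': {'User-Agent': 'probe'}},
    ]
    add_referer(formats, 'https://www.c-span.org/video/?1')
    assert all(f['http_headers']['Referer'] == 'https://www.c-span.org/video/?1'
               for f in formats)

The next patch builds on this: it parses the page's jwsetup JSON through
_parse_jwplayer_data instead of the legacy clip-id scraping and runs the
same Referer logic, factored into an add_referer helper, over the
resulting formats.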
From 8c785f84724e8624bf33138dc6215b72e9ad3923 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:39:10 +0100 Subject: [PATCH 070/384] =?UTF-8?q?[cspan]=20Extract=20info=20from=20jwpla?= =?UTF-8?q?yer=20data=20(closes=20#3672,=20closes=20#3734,=20=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …closes #10638, closes #13030, closes #18806, closes #23148, closes #24461, closes #26171, closes #26800, closes #27263) --- haruhi_dl/extractor/cspan.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/cspan.py b/haruhi_dl/extractor/cspan.py index 3356cc280..766942146 100644 --- a/haruhi_dl/extractor/cspan.py +++ b/haruhi_dl/extractor/cspan.py @@ -10,6 +10,8 @@ from ..utils import ( find_xpath_attr, get_element_by_class, int_or_none, + js_to_json, + merge_dicts, smuggle_url, unescapeHTML, ) @@ -98,6 +100,26 @@ class CSpanIE(InfoExtractor): bc_attr['data-bcid']) return self.url_result(smuggle_url(bc_url, {'source_url': url})) + def add_referer(formats): + for f in formats: + f.setdefault('http_headers', {})['Referer'] = url + + # As of 01.12.2020 this path looks to cover all cases making the rest + # of the code unnecessary + jwsetup = self._parse_json( + self._search_regex( + r'(?s)jwsetup\s*=\s*({.+?})\s*;', webpage, 'jwsetup', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if jwsetup: + info = self._parse_jwplayer_data( + jwsetup, video_id, require_title=False, m3u8_id='hls', + base_url=url) + add_referer(info['formats']) + ld_info = self._search_json_ld(webpage, video_id, default={}) + return merge_dicts(info, ld_info) + + # Obsolete # We first look for clipid, because clipprog always appears before patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] results = list(filter(None, (re.search(p, webpage) for p in patterns))) @@ -165,8 +187,7 @@ class CSpanIE(InfoExtractor): formats = self._extract_m3u8_formats( path, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }] - for f in formats: - f.setdefault('http_headers', {})['Referer'] = url + add_referer(formats) self._sort_formats(formats) entries.append({ 'id': '%s_%d' % (video_id, partnum + 1), -- GitLab From 9b24767e1e3951e745f0e8fa10193f83c5f6b4a6 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:39:18 +0100 Subject: [PATCH 071/384] [toggle] Add support for new MeWatch URLs (closes #27256) --- haruhi_dl/extractor/extractors.py | 5 ++- haruhi_dl/extractor/toggle.py | 74 ++++++++++++++++++------------- 2 files changed, 47 insertions(+), 32 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 3e26dfe40..314d7bfe0 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1232,7 +1232,10 @@ from .tnaflix import ( EMPFlixIE, MovieFapIE, ) -from .toggle import ToggleIE +from .toggle import ( + ToggleIE, + MeWatchIE, +) from .tonline import TOnlineIE from .toongoggles import ToonGogglesIE from .toutv import TouTvIE diff --git a/haruhi_dl/extractor/toggle.py b/haruhi_dl/extractor/toggle.py index ca2e36efe..cababa69e 100644 --- a/haruhi_dl/extractor/toggle.py +++ b/haruhi_dl/extractor/toggle.py @@ -11,13 +11,13 @@ from ..utils import ( float_or_none, int_or_none, parse_iso8601, - sanitized_Request, + 
strip_or_none, ) class ToggleIE(InfoExtractor): IE_NAME = 'toggle' - _VALID_URL = r'https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)' + _VALID_URL = r'(?:https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}|toggle:)(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.mewatch.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', 'info_dict': { @@ -96,16 +96,6 @@ class ToggleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - url, video_id, note='Downloading video page') - - api_user = self._search_regex( - r'apiUser\s*:\s*(["\'])(?P<user>.+?)\1', webpage, 'apiUser', - default=self._API_USER, group='user') - api_pass = self._search_regex( - r'apiPass\s*:\s*(["\'])(?P<pass>.+?)\1', webpage, 'apiPass', - default=self._API_PASS, group='pass') - params = { 'initObj': { 'Locale': { @@ -118,17 +108,16 @@ class ToggleIE(InfoExtractor): 'SiteGuid': 0, 'DomainID': '0', 'UDID': '', - 'ApiUser': api_user, - 'ApiPass': api_pass + 'ApiUser': self._API_USER, + 'ApiPass': self._API_PASS }, 'MediaID': video_id, 'mediaType': 0, } - req = sanitized_Request( + info = self._download_json( 'http://tvpapi.as.tvinci.com/v2_9/gateways/jsonpostgw.aspx?m=GetMediaInfo', - json.dumps(params).encode('utf-8')) - info = self._download_json(req, video_id, 'Downloading video info json') + video_id, 'Downloading video info json', data=json.dumps(params).encode('utf-8')) title = info['MediaName'] @@ -172,14 +161,6 @@ class ToggleIE(InfoExtractor): raise ExtractorError('No downloadable videos found', expected=True) self._sort_formats(formats) - duration = int_or_none(info.get('Duration')) - description = info.get('Description') - created_at = parse_iso8601(info.get('CreationDate') or None) - - average_rating = float_or_none(info.get('Rating')) - view_count = int_or_none(info.get('ViewCounter') or info.get('view_counter')) - like_count = int_or_none(info.get('LikeCounter') or info.get('like_counter')) - thumbnails = [] for picture in info.get('Pictures', []): if not isinstance(picture, dict): @@ -199,15 +180,46 @@ class ToggleIE(InfoExtractor): }) thumbnails.append(thumbnail) + def counter(prefix): + return int_or_none( + info.get(prefix + 'Counter') or info.get(prefix.lower() + '_counter')) + return { 'id': video_id, 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': created_at, - 'average_rating': average_rating, - 'view_count': view_count, - 'like_count': like_count, + 'description': strip_or_none(info.get('Description')), + 'duration': int_or_none(info.get('Duration')), + 'timestamp': parse_iso8601(info.get('CreationDate') or None), + 'average_rating': float_or_none(info.get('Rating')), + 'view_count': counter('View'), + 'like_count': counter('Like'), 'thumbnails': thumbnails, 'formats': formats, } + + +class MeWatchIE(InfoExtractor): + IE_NAME = 'mewatch' + _VALID_URL = r'https?://(?:www\.)?mewatch\.sg/watch/[0-9a-zA-Z-]+-(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.mewatch.sg/watch/Recipe-Of-Life-E1-179371', + 'info_dict': { + 'id': '1008625', + 'ext': 'mp4', + 'title': 'Recipe Of Life 味之道', + 'timestamp': 1603306526, + 'description': 'md5:6e88cde8af2068444fc8e1bc3ebf257c', + 'upload_date': '20201021', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + }] + + def _real_extract(self, url): + item_id = self._match_id(url) + custom_id = self._download_json( + 'https://cdn.mewatch.sg/api/items/' + item_id, + item_id, query={'segments': 
'all'})['customId'] + return self.url_result( + 'toggle:' + custom_id, ToggleIE.ie_key(), custom_id) -- GitLab From 9e5ac5f6291968a68392e5a47c08d62e0229210f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:39:24 +0100 Subject: [PATCH 072/384] [toggle] Detect DRM protected videos (closes #16479)(closes #20805) --- haruhi_dl/extractor/toggle.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/haruhi_dl/extractor/toggle.py b/haruhi_dl/extractor/toggle.py index cababa69e..91b8023b8 100644 --- a/haruhi_dl/extractor/toggle.py +++ b/haruhi_dl/extractor/toggle.py @@ -84,12 +84,6 @@ class ToggleIE(InfoExtractor): 'only_matching': True, }] - _FORMAT_PREFERENCES = { - 'wvm-STBMain': -10, - 'wvm-iPadMain': -20, - 'wvm-iPhoneMain': -30, - 'wvm-Android': -40, - } _API_USER = 'tvpapi_147' _API_PASS = '11111' @@ -130,11 +124,16 @@ class ToggleIE(InfoExtractor): vid_format = vid_format.replace(' ', '') # if geo-restricted, m3u8 is inaccessible, but mp4 is okay if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( video_url, video_id, ext='mp4', m3u8_id=vid_format, note='Downloading %s m3u8 information' % vid_format, errnote='Failed to download %s m3u8 information' % vid_format, - fatal=False)) + fatal=False) + for f in m3u8_formats: + # Apple FairPlay Streaming + if '/fpshls/' in f['url']: + continue + formats.append(f) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( video_url, video_id, mpd_id=vid_format, @@ -147,16 +146,17 @@ class ToggleIE(InfoExtractor): note='Downloading %s ISM manifest' % vid_format, errnote='Failed to download %s ISM manifest' % vid_format, fatal=False)) - elif ext in ('mp4', 'wvm'): - # wvm are drm-protected files + elif ext == 'mp4': formats.append({ 'ext': ext, 'url': video_url, 'format_id': vid_format, - 'preference': self._FORMAT_PREFERENCES.get(ext + '-' + vid_format) or -1, - 'format_note': 'DRM-protected video' if ext == 'wvm' else None }) if not formats: + for meta in (info.get('Metas') or []): + if meta.get('Key') == 'Encryption' and meta.get('Value') == '1': + raise ExtractorError( + 'This video is DRM protected.', expected=True) # Most likely because geo-blocked raise ExtractorError('No downloadable videos found', expected=True) self._sort_formats(formats) -- GitLab From 0475d9eaff9dcd5cf06bd2c45338e18a8c349e16 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:39:30 +0100 Subject: [PATCH 073/384] [tva] Add support for qub.ca (closes #27235) --- haruhi_dl/extractor/tva.py | 65 ++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 17 deletions(-) diff --git a/haruhi_dl/extractor/tva.py b/haruhi_dl/extractor/tva.py index 443f46e8a..52a4ddf32 100644 --- a/haruhi_dl/extractor/tva.py +++ b/haruhi_dl/extractor/tva.py @@ -4,7 +4,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( float_or_none, + int_or_none, smuggle_url, + strip_or_none, ) @@ -23,7 +25,8 @@ class TVAIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'https://video.tva.ca/details/_5596811470001', 'only_matching': True, @@ -32,26 +35,54 @@ class TVAIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'https://videos.tva.ca/proxy/item/_' + video_id, video_id, headers={ - 'Accept': 'application/json', - }, 
query={ - 'appId': '5955fc5f23eec60006c951f1', - }) - - def get_attribute(key): - for attribute in video_data.get('attributes', []): - if attribute.get('key') == key: - return attribute.get('value') - return None return { '_type': 'url_transparent', 'id': video_id, - 'title': get_attribute('title'), 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['CA']}), - 'description': get_attribute('description'), - 'thumbnail': get_attribute('image-background') or get_attribute('image-landscape'), - 'duration': float_or_none(get_attribute('video-duration'), 1000), 'ie_key': 'BrightcoveNew', } + + +class QubIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?qub\.ca/(?:[^/]+/)*[0-9a-z-]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.qub.ca/tvaplus/tva/alerte-amber/saison-1/episode-01-1000036619', + 'md5': '949490fd0e7aee11d0543777611fbd53', + 'info_dict': { + 'id': '6084352463001', + 'ext': 'mp4', + 'title': 'Épisode 01', + 'uploader_id': '5481942443001', + 'upload_date': '20190907', + 'timestamp': 1567899756, + 'description': 'md5:9c0d7fbb90939420c651fd977df90145', + }, + }, { + 'url': 'https://www.qub.ca/tele/video/lcn-ca-vous-regarde-rev-30s-ap369664-1009357943', + 'only_matching': True, + }] + # reference_id also works with old account_id(5481942443001) + # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5813221784001/default_default/index.html?videoId=ref:%s' + + def _real_extract(self, url): + entity_id = self._match_id(url) + entity = self._download_json( + 'https://www.qub.ca/proxy/pfu/content-delivery-service/v1/entities', + entity_id, query={'id': entity_id}) + video_id = entity['videoId'] + episode = strip_or_none(entity.get('name')) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': episode, + # 'url': self.BRIGHTCOVE_URL_TEMPLATE % entity['referenceId'], + 'url': 'https://videos.tva.ca/details/_' + video_id, + 'description': entity.get('longDescription'), + 'duration': float_or_none(entity.get('durationMillis'), 1000), + 'episode': episode, + 'episode_number': int_or_none(entity.get('episodeNumber')), + # 'ie_key': 'BrightcoveNew', + 'ie_key': TVAIE.ie_key(), + } -- GitLab From 87889f1fe81b3aefe56281179b266b74c4bf1ecd Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:39:36 +0100 Subject: [PATCH 074/384] [extractors] Add QubIE import --- haruhi_dl/extractor/extractors.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 314d7bfe0..a83e8fe6b 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1276,7 +1276,10 @@ from .tv2dk import ( from .tv2hu import TV2HuIE from .tv4 import TV4IE from .tv5mondeplus import TV5MondePlusIE -from .tva import TVAIE +from .tva import ( + TVAIE, + QubIE, +) from .tvanouvelles import ( TVANouvellesIE, TVANouvellesArticleIE, -- GitLab From b789c2b6bb25deec1e6d93122698b9723fb6a128 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:40:26 +0100 Subject: [PATCH 075/384] [tver] Add new extractor (closes #26662)(closes #27284) --- haruhi_dl/extractor/extractors.py | 2 + haruhi_dl/extractor/fujitv.py | 35 ++++++++++++++++ haruhi_dl/extractor/tver.py | 67 +++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+) create mode 100644 haruhi_dl/extractor/fujitv.py create mode 100644 haruhi_dl/extractor/tver.py diff --git a/haruhi_dl/extractor/extractors.py 
b/haruhi_dl/extractor/extractors.py index a83e8fe6b..46f3b604c 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -410,6 +410,7 @@ from .frontendmasters import ( FrontendMastersLessonIE, FrontendMastersCourseIE ) +from .fujitv import FujiTVFODPlus7IE from .funimation import FunimationIE from .funk import FunkIE from .funkwhale import ( @@ -1288,6 +1289,7 @@ from .tvc import ( TVCIE, TVCArticleIE, ) +from .tver import TVerIE from .tvigle import TvigleIE from .tvland import TVLandIE from .tvn24 import TVN24IE diff --git a/haruhi_dl/extractor/fujitv.py b/haruhi_dl/extractor/fujitv.py new file mode 100644 index 000000000..39685e075 --- /dev/null +++ b/haruhi_dl/extractor/fujitv.py @@ -0,0 +1,35 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class FujiTVFODPlus7IE(InfoExtractor): + _VALID_URL = r'https?://i\.fod\.fujitv\.co\.jp/plus7/web/[0-9a-z]{4}/(?P<id>[0-9a-z]+)' + _BASE_URL = 'http://i.fod.fujitv.co.jp/' + _BITRATE_MAP = { + 300: (320, 180), + 800: (640, 360), + 1200: (1280, 720), + 2000: (1280, 720), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = self._extract_m3u8_formats( + self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id) + for f in formats: + wh = self._BITRATE_MAP.get(f.get('tbr')) + if wh: + f.update({ + 'width': wh[0], + 'height': wh[1], + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + 'thumbnail': self._BASE_URL + 'pc/image/wbtn/wbtn_%s.jpg' % video_id, + } diff --git a/haruhi_dl/extractor/tver.py b/haruhi_dl/extractor/tver.py new file mode 100644 index 000000000..c5299722d --- /dev/null +++ b/haruhi_dl/extractor/tver.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + remove_start, + smuggle_url, + try_get, +) + + +class TVerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P<path>(?:corner|episode|feature)/(?P<id>f?\d+))' + # videos are only available for 7 days + _TESTS = [{ + 'url': 'https://tver.jp/corner/f0062178', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/feature/f0062413', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/episode/79622438', + 'only_matching': True, + }] + _TOKEN = None + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' + + def _real_initialize(self): + self._TOKEN = self._download_json( + 'https://tver.jp/api/access_token.php', None)['token'] + + def _real_extract(self, url): + path, video_id = re.match(self._VALID_URL, url).groups() + main = self._download_json( + 'https://api.tver.jp/v4/' + path, video_id, + query={'token': self._TOKEN})['main'] + p_id = main['publisher_id'] + service = remove_start(main['service'], 'ts_') + info = { + '_type': 'url_transparent', + 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str), + 'episode': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), + } + + if service == 'cx': + info.update({ + 'title': main.get('subtitle') or main['title'], + 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id), + 'ie_key': 'FujiTVFODPlus7', + }) + else: + r_id = main['reference_id'] + if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): + r_id = 'ref:' + r_id + bc_url = smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), + 
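# Note (editor's hedged aside, not patch content): smuggle_url() is how
# metadata crosses extractor boundaries in the youtube-dl lineage - it
# appends a JSON payload to the URL fragment, roughly
#     url + '#__youtubedl_smuggle=' + json.dumps(data)
# so the delegated BrightcoveNew extractor can call unsmuggle_url(url, {})
# and pick up {'geo_countries': ['JP']} as a geo-bypass hint. The exact
# fragment key used by this fork is an assumption on my part; only the
# mechanism is the point here.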
{'geo_countries': ['JP']})
+            info.update({
+                'url': bc_url,
+                'ie_key': 'BrightcoveNew',
+            })
+
+        return info

--
GitLab

From d60195c74b7d1e551bb7915aed6f9d5d3b6c2b7a Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:42:28 +0100
Subject: [PATCH 076/384] [extractor/common] improve Akamai HTTP format
 extraction

- Allow m3u8 manifest without an additional audio format
- Fix extraction for qualities starting with a number

Solution provided by @nixxo based on:
https://stackoverflow.com/a/5984688
---
 haruhi_dl/extractor/common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py
index 32a391a85..3492d8865 100644
--- a/haruhi_dl/extractor/common.py
+++ b/haruhi_dl/extractor/common.py
@@ -2641,7 +2641,7 @@ class InfoExtractor(object):
             REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+'
             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
             qualities_length = len(qualities)
-            if len(formats) in (qualities_length + 1, qualities_length * 2 + 1):
+            if len(formats) in (qualities_length, qualities_length + 1, qualities_length * 2, qualities_length * 2 + 1):
                 i = 0
                 http_formats = []
                 for f in formats:
@@ -2650,7 +2650,7 @@ class InfoExtractor(object):
                         http_f = f.copy()
                         del http_f['manifest_url']
                         http_url = re.sub(
-                            REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url'])
+                            REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                         http_f.update({
                             'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                             'url': http_url,

--
GitLab

From 0a48eb0c7f60db75c07e2dab821981749ba75c28 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:42:39 +0100
Subject: [PATCH 077/384] [tver] correct episode_number key

---
 haruhi_dl/extractor/tver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/tver.py b/haruhi_dl/extractor/tver.py
index c5299722d..931d4d650 100644
--- a/haruhi_dl/extractor/tver.py
+++ b/haruhi_dl/extractor/tver.py
@@ -43,7 +43,7 @@ class TVerIE(InfoExtractor):
         info = {
             '_type': 'url_transparent',
             'description': try_get(main, lambda x: x['note'][0]['text'], compat_str),
-            'episode': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])),
+            'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])),
         }

         if service == 'cx':

--
GitLab

From 76263cc89371812634d7d41b52c27b413caefa59 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:42:46 +0100
Subject: [PATCH 078/384] [extractor/common] improve Akamai HTTP formats
 extraction

---
 haruhi_dl/extractor/common.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py
index 3492d8865..2db95d592 100644
--- a/haruhi_dl/extractor/common.py
+++ b/haruhi_dl/extractor/common.py
@@ -2632,20 +2632,20 @@ class InfoExtractor(object):
         hls_host = hosts.get('hls')
         if hls_host:
             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
-        formats.extend(self._extract_m3u8_formats(
+        m3u8_formats = self._extract_m3u8_formats(
             m3u8_url, video_id, 'mp4', 'm3u8_native',
-            m3u8_id='hls', fatal=False))
+            m3u8_id='hls', fatal=False)
+        formats.extend(m3u8_formats)

         http_host = hosts.get('http')
-        if http_host and 'hdnea=' not in manifest_url:
-            REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+'
+        if http_host and m3u8_formats and
'hdnea=' not in m3u8_url: + REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+' qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') qualities_length = len(qualities) - if len(formats) in (qualities_length, qualities_length + 1, qualities_length * 2, qualities_length * 2 + 1): + if len(m3u8_formats) in (qualities_length, qualities_length + 1): i = 0 - http_formats = [] - for f in formats: - if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none': + for f in m3u8_formats: + if f['vcodec'] != 'none': for protocol in ('http', 'https'): http_f = f.copy() del http_f['manifest_url'] @@ -2656,9 +2656,8 @@ class InfoExtractor(object): 'url': http_url, 'protocol': protocol, }) - http_formats.append(http_f) + formats.append(http_f) i += 1 - formats.extend(http_formats) return formats -- GitLab From 2f04ca9dac9b11744d236e11b4bf0a12f56fddf4 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:42:52 +0100 Subject: [PATCH 079/384] [gamespot] Extract DASH and HTTP formats --- haruhi_dl/extractor/gamespot.py | 110 ++++++++------------------------ 1 file changed, 25 insertions(+), 85 deletions(-) diff --git a/haruhi_dl/extractor/gamespot.py b/haruhi_dl/extractor/gamespot.py index 4236a5ed8..7a1beae3c 100644 --- a/haruhi_dl/extractor/gamespot.py +++ b/haruhi_dl/extractor/gamespot.py @@ -1,16 +1,7 @@ from __future__ import unicode_literals -import re - from .once import OnceIE -from ..compat import ( - compat_urllib_parse_unquote, -) -from ..utils import ( - unescapeHTML, - url_basename, - dict_get, -) +from ..compat import compat_urllib_parse_unquote class GameSpotIE(OnceIE): @@ -24,17 +15,16 @@ class GameSpotIE(OnceIE): 'title': 'Arma 3 - Community Guide: SITREP I', 'description': 'Check out this video where some of the basics of Arma 3 is explained.', }, + 'skip': 'manifest URL give HTTP Error 404: Not Found', }, { 'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/', + 'md5': '173ea87ad762cf5d3bf6163dceb255a6', 'info_dict': { 'id': 'gs-2300-6424837', 'ext': 'mp4', 'title': 'Now Playing - The Witcher 3: Wild Hunt', 'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.', }, - 'params': { - 'skip_download': True, # m3u8 downloads - }, }, { 'url': 'https://www.gamespot.com/videos/embed/6439218/', 'only_matching': True, @@ -49,90 +39,40 @@ class GameSpotIE(OnceIE): def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - data_video_json = self._search_regex( - r'data-video=["\'](.*?)["\']', webpage, 'data video') - data_video = self._parse_json(unescapeHTML(data_video_json), page_id) + data_video = self._parse_json(self._html_search_regex( + r'data-video=(["\'])({.*?})\1', webpage, + 'video data', group=2), page_id) + title = compat_urllib_parse_unquote(data_video['title']) streams = data_video['videoStreams'] - - manifest_url = None formats = [] - f4m_url = streams.get('f4m_stream') - if f4m_url: - manifest_url = f4m_url - formats.extend(self._extract_f4m_formats( - f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False)) - m3u8_url = dict_get(streams, ('m3u8_stream', 'adaptive_stream')) + + m3u8_url = streams.get('adaptive_stream') if m3u8_url: - manifest_url = m3u8_url m3u8_formats = self._extract_m3u8_formats( m3u8_url, page_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - formats.extend(m3u8_formats) - progressive_url = dict_get( - streams, ('progressive_hd', 'progressive_high', 
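# Note (editor's hedged aside, not patch content): the progressive-URL
# code being removed here, and the common.py changes in patches 076/078
# above, both splice an HLS quality like "800" into a URL template. The
# switch from \1 to \g<1> matters because a plain backreference followed
# by a digit is ambiguous - re reads r'\1800' as group 18. A minimal,
# self-contained demonstration (host names below are made up):
import re

REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
hls_url = 'https://ex-vh.akamaihd.net/i/clip/video_,300,600,1000,.mp4.csmil/master.m3u8'
qualities = re.match(REPL_REGEX, hls_url).group(2).split(',')  # ['300', '600', '1000']
print(re.sub(REPL_REGEX, r'https://http-host/\g<1>%s\3' % qualities[0], hls_url))
# -> https://http-host/clip/video_300.mp4; with r'...\1%s\3' the template
# would contain r'\1300', which re rejects as an invalid reference to
# group 13.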
'progressive_low', 'other_lr')) - if progressive_url and manifest_url: - qualities_basename = self._search_regex( - r'/([^/]+)\.csmil/', - manifest_url, 'qualities basename', default=None) - if qualities_basename: - QUALITIES_RE = r'((,\d+)+,?)' - qualities = self._search_regex( - QUALITIES_RE, qualities_basename, - 'qualities', default=None) - if qualities: - qualities = list(map(lambda q: int(q), qualities.strip(',').split(','))) - qualities.sort() - http_template = re.sub(QUALITIES_RE, r'%d', qualities_basename) - http_url_basename = url_basename(progressive_url) - if m3u8_formats: - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none', m3u8_formats)) - if len(qualities) == len(m3u8_formats): - for q, m3u8_format in zip(qualities, m3u8_formats): - f = m3u8_format.copy() - f.update({ - 'url': progressive_url.replace( - http_url_basename, http_template % q), - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - for q in qualities: - formats.append({ - 'url': progressive_url.replace( - http_url_basename, http_template % q), - 'ext': 'mp4', - 'format_id': 'http-%d' % q, - 'tbr': q, - }) + for f in m3u8_formats: + formats.append(f) + http_f = f.copy() + del http_f['manifest_url'] + http_f.update({ + 'format_id': f['format_id'].replace('hls-', 'http-'), + 'protocol': 'http', + 'url': f['url'].replace('.m3u8', '.mp4'), + }) + formats.append(http_f) - onceux_json = self._search_regex( - r'data-onceux-options=["\'](.*?)["\']', webpage, 'data video', default=None) - if onceux_json: - onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri') - if onceux_url: - formats.extend(self._extract_once_formats(re.sub( - r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url), - http_formats_preference=-1)) + mpd_url = streams.get('adaptive_dash') + if mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, page_id, mpd_id='dash', fatal=False)) - if not formats: - for quality in ['sd', 'hd']: - # It's actually a link to a flv file - flv_url = streams.get('f4m_{0}'.format(quality)) - if flv_url is not None: - formats.append({ - 'url': flv_url, - 'ext': 'flv', - 'format_id': quality, - }) self._sort_formats(formats) return { - 'id': data_video['guid'], + 'id': data_video.get('guid') or page_id, 'display_id': page_id, - 'title': compat_urllib_parse_unquote(data_video['title']), + 'title': title, 'formats': formats, 'description': self._html_search_meta('description', webpage), 'thumbnail': self._og_search_thumbnail(webpage), -- GitLab From 8c8e98ffdd0a60cd479803740de00832cc7614bb Mon Sep 17 00:00:00 2001 From: Matthew Rayermann <matthew.rayermann@gmail.com> Date: Fri, 26 Feb 2021 14:42:58 +0100 Subject: [PATCH 080/384] [nhk] Add audio clip test to NHK extractor (#27269) --- haruhi_dl/extractor/nhk.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/nhk.py b/haruhi_dl/extractor/nhk.py index de6a707c4..6a61a47d2 100644 --- a/haruhi_dl/extractor/nhk.py +++ b/haruhi_dl/extractor/nhk.py @@ -10,7 +10,7 @@ class NhkVodIE(InfoExtractor): # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. 
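# Note (editor's hedged sketch, not patch content): _TESTS entries like
# the ones below drive the extractor test suite. The helper here is a
# simplified reconstruction of what the harness checks, not the real
# test_download implementation:
def run_test_case(ie, test):
    import hashlib
    # 'only_matching' cases only assert that _VALID_URL accepts the URL
    if test.get('only_matching'):
        assert ie.suitable(test['url'])
        return
    info = ie.extract(test['url'])
    # each 'info_dict' key must match the extracted metadata; values of
    # the form 'md5:...' are compared against the MD5 of the field text
    for key, expected in (test.get('info_dict') or {}).items():
        got = info.get(key)
        if isinstance(expected, str) and expected.startswith('md5:'):
            got = 'md5:' + hashlib.md5(got.encode('utf-8')).hexdigest()
        assert got == expected, (key, got, expected)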
_TESTS = [{ - # clip + # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', 'md5': '256a1be14f48d960a7e61e2532d95ec3', 'info_dict': { @@ -21,6 +21,19 @@ class NhkVodIE(InfoExtractor): 'timestamp': 1565965194, 'upload_date': '20190816', }, + }, { + # audio clip + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/', + 'info_dict': { + 'id': 'r_inventions-20201104-1-en', + 'ext': 'm4a', + 'title': "Japan's Top Inventions - Miniature Video Cameras", + 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', 'only_matching': True, -- GitLab From ecfc7cb9f199aaf3df9e2a797df1e4c5dcbd36e1 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:43:06 +0100 Subject: [PATCH 081/384] [zdf] extract webm formats(closes #26659) --- haruhi_dl/extractor/zdf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/zdf.py b/haruhi_dl/extractor/zdf.py index 656864b2e..5ed2946c2 100644 --- a/haruhi_dl/extractor/zdf.py +++ b/haruhi_dl/extractor/zdf.py @@ -40,7 +40,7 @@ class ZDFBaseIE(InfoExtractor): class ZDFIE(ZDFBaseIE): _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?]+)\.html' - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') _GEO_COUNTRIES = ['DE'] _TESTS = [{ @@ -119,7 +119,7 @@ class ZDFIE(ZDFBaseIE): if not ptmd_path: ptmd_path = t[ 'http://zdf.de/rels/streams/ptmd-template'].replace( - '{playerId}', 'portal') + '{playerId}', 'ngplayer_2_4') ptmd = self._call_api( urljoin(url, ptmd_path), player, url, video_id, 'metadata') -- GitLab From 841628af91d84f6941e329f1334ddc4b9c6f46d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:43:47 +0100 Subject: [PATCH 082/384] [nrktv] Relax _VALID_URL (closes #27299, closes #26185) --- haruhi_dl/extractor/nrk.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 4a395546f..0c4b126ed 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -146,7 +146,7 @@ class NRKTVIE(NRKBaseIE): _VALID_URL = r'''(?x) https?:// (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie(?:/[^/]+){1,2}|program)/ + (?:serie(?:/[^/]+){1,}|program)/ (?![Ee]pisodes)%s (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P<part_id>\d+))? 
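# Note (editor's hedged aside between hunks, not patch content): the
# quantifier change above, {1,2} -> {1,}, is what lets URLs with deeper
# /serie/ paths match. A self-contained check against a simplified form
# of the pattern (the simplification is mine):
import re

SIMPLIFIED_VALID_URL = (
    r'https?://(?:tv|radio)\.nrk(?:super)?\.no/'
    r'(?:serie(?:/[^/]+){1,}|program)/(?![Ee]pisodes)(?P<id>[a-zA-Z]{4}\d{8})')

for test_url in (
    'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314',
    # three segments between "serie" and the ID; the old {1,2} bound
    # could not consume the sesong/201507 part of this URL
    'https://radio.nrk.no/serie/dagsnytt/sesong/201507/NPUB21019315',
):
    print(re.match(SIMPLIFIED_VALID_URL, test_url).group('id'))
# -> MUHH48000314, then NPUB21019315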
@@ -275,6 +275,9 @@ class NRKTVIE(NRKBaseIE): }, { 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201507/NPUB21019315', + 'only_matching': True, }] _api_host = None -- GitLab From 58edf65c1b27fa476dfb695a77de8c138a1d3cc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:43:56 +0100 Subject: [PATCH 083/384] [pornhub] Handle HTTP errors gracefully (closes #26414) --- haruhi_dl/extractor/pornhub.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/pornhub.py b/haruhi_dl/extractor/pornhub.py index d91c869c4..20af84955 100644 --- a/haruhi_dl/extractor/pornhub.py +++ b/haruhi_dl/extractor/pornhub.py @@ -33,7 +33,12 @@ class PornHubBaseIE(InfoExtractor): def dl(*args, **kwargs): return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) - webpage, urlh = dl(*args, **kwargs) + ret = dl(*args, **kwargs) + + if not ret: + return ret + + webpage, urlh = ret if any(re.search(p, webpage) for p in ( r'<body\b[^>]+\bonload=["\']go\(\)', -- GitLab From d439a5df63cedcceb2b64b3796476beaff968635 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:44:04 +0100 Subject: [PATCH 084/384] =?UTF-8?q?[nrk]=20improve=20format=20extraction?= =?UTF-8?q?=20and=20geo-restriction=20detection=20(closes=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … #24221) --- haruhi_dl/extractor/nrk.py | 43 +++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 0c4b126ed..19d820f61 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -24,6 +24,11 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] + def _extract_nrk_formats(self, asset_url, video_id): + return self._extract_m3u8_formats( + re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url), + video_id, 'mp4', 'm3u8_native', fatal=False) + class NRKIE(NRKBaseIE): _VALID_URL = r'''(?x) @@ -94,9 +99,7 @@ class NRKIE(NRKBaseIE): if not format_url: continue if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + formats.extend(self._extract_nrk_formats(format_url, video_id)) self._sort_formats(formats) data = self._download_json( @@ -298,6 +301,7 @@ class NRKTVIE(NRKBaseIE): title = data.get('fullTitle') or data.get('mainTitle') or data['title'] video_id = data.get('id') or video_id + urls = [] entries = [] conviva = data.get('convivaStatistics') or {} @@ -314,19 +318,13 @@ class NRKTVIE(NRKBaseIE): else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) for num, asset in enumerate(media_assets, 1): asset_url = asset.get('url') - if not asset_url: + if not asset_url or asset_url in urls: continue - formats = self._extract_akamai_formats(asset_url, video_id) + formats = extract_nrk_formats(asset_url, video_id) if not formats: continue self._sort_formats(formats) - # Some f4m streams may not work with hdcore in fragments' URLs - for f in formats: - extra_param = f.get('extra_param_to_segment_url') - if extra_param and 'hdcore' in extra_param: - del f['extra_param_to_segment_url'] - entry_id, entry_title = 
video_id_and_title(num) duration = parse_duration(asset.get('duration')) subtitles = {} @@ -346,16 +344,17 @@ class NRKTVIE(NRKBaseIE): if not entries: media_url = data.get('mediaUrl') - if media_url: - formats = self._extract_akamai_formats(media_url, video_id) - self._sort_formats(formats) - duration = parse_duration(data.get('duration')) - entries = [{ - 'id': video_id, - 'title': make_title(title), - 'duration': duration, - 'formats': formats, - }] + if media_url and media_url not in urls: + formats = extract_nrk_formats(media_url, video_id) + if formats: + self._sort_formats(formats) + duration = parse_duration(data.get('duration')) + entries = [{ + 'id': video_id, + 'title': make_title(title), + 'duration': duration, + 'formats': formats, + }] if not entries: MESSAGES = { @@ -366,7 +365,7 @@ class NRKTVIE(NRKBaseIE): } message_type = data.get('messageType', '') # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* - if 'IsGeoBlocked' in message_type: + if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is Trues: self.raise_geo_restricted( msg=MESSAGES.get('ProgramIsGeoBlocked'), countries=self._GEO_COUNTRIES) -- GitLab From 226efefec6259a1d30dc436cc6eff9f693c8d899 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:44:09 +0100 Subject: [PATCH 085/384] [nrk] fix typo --- haruhi_dl/extractor/nrk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 19d820f61..0f69579c5 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -365,7 +365,7 @@ class NRKTVIE(NRKBaseIE): } message_type = data.get('messageType', '') # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* - if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is Trues: + if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True: self.raise_geo_restricted( msg=MESSAGES.get('ProgramIsGeoBlocked'), countries=self._GEO_COUNTRIES) -- GitLab From 75dc35e41846adf9872089c37a189d1f16f57731 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:44:14 +0100 Subject: [PATCH 086/384] [nrk] fix call to moved method --- haruhi_dl/extractor/nrk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 0f69579c5..8595f55b1 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -320,7 +320,7 @@ class NRKTVIE(NRKBaseIE): asset_url = asset.get('url') if not asset_url or asset_url in urls: continue - formats = extract_nrk_formats(asset_url, video_id) + formats = self._extract_nrk_formats(asset_url, video_id) if not formats: continue self._sort_formats(formats) @@ -345,7 +345,7 @@ class NRKTVIE(NRKBaseIE): if not entries: media_url = data.get('mediaUrl') if media_url and media_url not in urls: - formats = extract_nrk_formats(media_url, video_id) + formats = self._extract_nrk_formats(media_url, video_id) if formats: self._sort_formats(formats) duration = parse_duration(data.get('duration')) -- GitLab From 08fea1baa17abcfff65899e7a06f3f0c616547c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:44:20 +0100 Subject: [PATCH 087/384] [nrktv:season] Improve extraction --- haruhi_dl/extractor/nrk.py | 99 ++++++++++++++++++++++++++++++++------ 1 file changed, 83 
insertions(+), 16 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 8595f55b1..4d5f4c5ba 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor @@ -17,6 +18,7 @@ from ..utils import ( parse_age_limit, parse_duration, try_get, + urljoin, url_or_none, ) @@ -547,44 +549,109 @@ class NRKTVSerieBaseIE(InfoExtractor): return [] entries = [] for episode in entry_list: - nrk_id = episode.get('prfId') + nrk_id = episode.get('prfId') or episode.get('episodeId') if not nrk_id or not isinstance(nrk_id, compat_str): continue + if not re.match(NRKTVIE._EPISODE_RE, nrk_id): + continue entries.append(self.url_result( 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) return entries class NRKTVSeasonIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?P<domain>tv|radio)\.nrk\.no/serie/(?P<serie>[^/]+)/(?:sesong/)?(?P<id>\d+)' + _TESTS = [{ 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', 'info_dict': { - 'id': '1', + 'id': 'backstage/1', 'title': 'Sesong 1', }, 'playlist_mincount': 30, - } + }, { + # no /sesong/ in path + 'url': 'https://tv.nrk.no/serie/lindmo/2016', + 'info_dict': { + 'id': 'lindmo/2016', + 'title': '2016', + }, + 'playlist_mincount': 29, + }, { + # weird nested _embedded in catalog JSON response + 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens/sesong/1', + 'info_dict': { + 'id': 'dickie-dick-dickens/1', + 'title': 'Sesong 1', + }, + 'playlist_mincount': 11, + }, { + # 841 entries, multi page + 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201509', + 'info_dict': { + 'id': 'dagsnytt/201509', + 'title': 'September 2015', + }, + 'playlist_mincount': 841, + }, { + # 180 entries, single page + 'url': 'https://tv.nrk.no/serie/spangas/sesong/1', + 'only_matching': True, + }] @classmethod def suitable(cls, url): return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) else super(NRKTVSeasonIE, cls).suitable(url)) - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - series = self._extract_series(webpage, display_id) + _ASSETS_KEYS = ('episodes', 'instalments',) + + def _entries(self, data, display_id): + for page_num in itertools.count(1): + embedded = data.get('_embedded') + if not isinstance(embedded, dict): + break + # Extract entries + for asset_key in self._ASSETS_KEYS: + entries = try_get( + embedded, + (lambda x: x[asset_key]['_embedded'][asset_key], + lambda x: x[asset_key]), + list) + for e in self._extract_entries(entries): + yield e + # Find next URL + for asset_key in self._ASSETS_KEYS: + next_url = urljoin( + 'https://psapi.nrk.no/', + try_get( + data, + (lambda x: x['_links']['next']['href'], + lambda x: x['_embedded'][asset_key]['_links']['next']['href']), + compat_str)) + if next_url: + break + if not next_url: + break + data = self._download_json( + next_url, display_id, + 'Downloading season JSON page %d' % page_num, fatal=False) + if not data: + break - season = next( - s for s in series['seasons'] - if int(display_id) == s.get('seasonNumber')) + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + domain = mobj.group('domain') + serie = mobj.group('serie') + season_id = mobj.group('id') + display_id = '%s/%s' % (serie, season_id) - title = try_get(season, lambda x: x['titles']['title'], 
compat_str) + data = self._download_json( + 'https://psapi.nrk.no/%s/catalog/series/%s/seasons/%s' + % (domain, serie, season_id), display_id, query={'pageSize': 50}) + title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id return self.playlist_result( - self._extract_episodes(season), display_id, title) + self._entries(data, display_id), + display_id, title) class NRKTVSeriesIE(NRKTVSerieBaseIE): -- GitLab From ea80c8f15eb5194dcbd72e75ab1d805a96d1d237 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:44:26 +0100 Subject: [PATCH 088/384] [nrktv:series] Improve extraction --- haruhi_dl/extractor/nrk.py | 122 ++++++++++++++++++++++--------------- 1 file changed, 74 insertions(+), 48 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 4d5f4c5ba..7cfbe7856 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -558,6 +558,46 @@ class NRKTVSerieBaseIE(InfoExtractor): 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) return entries + _ASSETS_KEYS = ('episodes', 'instalments',) + + def _extract_assets_key(self, embedded): + for asset_key in self._ASSETS_KEYS: + if embedded.get(asset_key): + return asset_key + + def _entries(self, data, display_id): + for page_num in itertools.count(1): + embedded = data.get('_embedded') + if not isinstance(embedded, dict): + break + assets_key = self._extract_assets_key(embedded) + if not assets_key: + break + # Extract entries + entries = try_get( + embedded, + (lambda x: x[assets_key]['_embedded'][assets_key], + lambda x: x[assets_key]), + list) + for e in self._extract_entries(entries): + yield e + # Find next URL + next_url = urljoin( + 'https://psapi.nrk.no/', + try_get( + data, + (lambda x: x['_links']['next']['href'], + lambda x: x['_embedded'][assets_key]['_links']['next']['href']), + compat_str)) + if not next_url: + break + data = self._download_json( + next_url, display_id, + 'Downloading %s JSON page %d' % (assets_key, page_num), + fatal=False) + if not data: + break + class NRKTVSeasonIE(NRKTVSerieBaseIE): _VALID_URL = r'https?://(?P<domain>tv|radio)\.nrk\.no/serie/(?P<serie>[^/]+)/(?:sesong/)?(?P<id>\d+)' @@ -603,41 +643,6 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) else super(NRKTVSeasonIE, cls).suitable(url)) - _ASSETS_KEYS = ('episodes', 'instalments',) - - def _entries(self, data, display_id): - for page_num in itertools.count(1): - embedded = data.get('_embedded') - if not isinstance(embedded, dict): - break - # Extract entries - for asset_key in self._ASSETS_KEYS: - entries = try_get( - embedded, - (lambda x: x[asset_key]['_embedded'][asset_key], - lambda x: x[asset_key]), - list) - for e in self._extract_entries(entries): - yield e - # Find next URL - for asset_key in self._ASSETS_KEYS: - next_url = urljoin( - 'https://psapi.nrk.no/', - try_get( - data, - (lambda x: x['_links']['next']['href'], - lambda x: x['_embedded'][asset_key]['_links']['next']['href']), - compat_str)) - if next_url: - break - if not next_url: - break - data = self._download_json( - next_url, display_id, - 'Downloading season JSON page %d' % page_num, fatal=False) - if not data: - break - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) domain = mobj.group('domain') @@ -648,6 +653,7 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): data = self._download_json( 'https://psapi.nrk.no/%s/catalog/series/%s/seasons/%s' % 
(domain, serie, season_id), display_id, query={'pageSize': 50}) + title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id return self.playlist_result( self._entries(data, display_id), @@ -655,9 +661,22 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): class NRKTVSeriesIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?P<domain>tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)' _TESTS = [{ + # new layout, instalments + 'url': 'https://tv.nrk.no/serie/groenn-glede', + 'info_dict': { + 'id': 'groenn-glede', + 'title': 'Grønn glede', + 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', + }, + 'playlist_mincount': 90, + }, { + # new layout, instalments, more entries + 'url': 'https://tv.nrk.no/serie/lindmo', + 'only_matching': True, + }, { 'url': 'https://tv.nrk.no/serie/blank', 'info_dict': { 'id': 'blank', @@ -665,24 +684,17 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'description': 'md5:7664b4e7e77dc6810cd3bca367c25b6e', }, 'playlist_mincount': 30, + 'expected_warnings': ['HTTP Error 404: Not Found'], }, { # new layout, seasons 'url': 'https://tv.nrk.no/serie/backstage', 'info_dict': { 'id': 'backstage', 'title': 'Backstage', - 'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3', + 'description': 'md5:63692ceb96813d9a207e9910483d948b', }, 'playlist_mincount': 60, - }, { - # new layout, instalments - 'url': 'https://tv.nrk.no/serie/groenn-glede', - 'info_dict': { - 'id': 'groenn-glede', - 'title': 'Grønn glede', - 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', - }, - 'playlist_mincount': 10, + 'expected_warnings': ['HTTP Error 404: Not Found'], }, { # old layout 'url': 'https://tv.nrksuper.no/serie/labyrint', @@ -711,16 +723,30 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): else super(NRKTVSeriesIE, cls).suitable(url)) def _real_extract(self, url): - series_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + domain = mobj.group('domain') + series_id = mobj.group('id') + + title = description = None webpage = self._download_webpage(url, series_id) - # New layout (e.g. https://tv.nrk.no/serie/backstage) series = self._extract_series(webpage, series_id, fatal=False) if series: title = try_get(series, lambda x: x['titles']['title'], compat_str) description = try_get( series, lambda x: x['titles']['subtitle'], compat_str) + + data = self._download_json( + 'https://psapi.nrk.no/%s/catalog/series/%s/instalments' + % (domain, series_id), series_id, query={'pageSize': 50}, + fatal=False) + if data: + return self.playlist_result( + self._entries(data, series_id), series_id, title, description) + + # New layout (e.g. 
https://tv.nrk.no/serie/backstage) + if series: entries = [] entries.extend(self._extract_seasons(series.get('seasons'))) entries.extend(self._extract_entries(series.get('instalments'))) -- GitLab From d3b00a0fa6467033f7bfd926e3744b9c739a2fd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:44:34 +0100 Subject: [PATCH 089/384] [nrktv:series] Improve extraction (closes #21926) --- haruhi_dl/extractor/nrk.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 7cfbe7856..4a82b11fd 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -521,7 +521,8 @@ class NRKTVSerieBaseIE(InfoExtractor): config = self._parse_json( self._search_regex( (r'INITIAL_DATA(?:_V\d)?_*\s*=\s*({.+?})\s*;', - r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>'), + r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>', + r'PRELOADED_STATE_*\s*=\s*({.+?})\s*\n'), webpage, 'config', default='{}' if not fatal else NO_DEFAULT), display_id, fatal=False, transform_source=js_to_json) if not config: @@ -531,12 +532,26 @@ class NRKTVSerieBaseIE(InfoExtractor): (lambda x: x['initialState']['series'], lambda x: x['series']), dict) - def _extract_seasons(self, seasons): + def _extract_seasons(self, domain, series_id, seasons): + if isinstance(seasons, dict): + seasons = seasons.get('seasons') if not isinstance(seasons, list): return [] entries = [] for season in seasons: - entries.extend(self._extract_episodes(season)) + if not isinstance(season, dict): + continue + episodes = self._extract_episodes(season) + if episodes: + entries.extend(episodes) + continue + season_name = season.get('name') + if season_name and isinstance(season_name, compat_str): + entries.append(self.url_result( + 'https://%s.nrk.no/serie/%s/sesong/%s' + % (domain, series_id, season_name), + ie=NRKTVSeasonIE.ie_key(), + video_title=season.get('title'))) return entries def _extract_episodes(self, season): @@ -713,6 +728,13 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): }, { 'url': 'https://tv.nrk.no/serie/postmann-pat', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens', + 'info_dict': { + 'id': 'dickie-dick-dickens', + }, + 'playlist_mincount': 8, + 'expected_warnings': ['HTTP Error 404: Not Found'], }] @classmethod @@ -748,7 +770,7 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): # New layout (e.g. 
https://tv.nrk.no/serie/backstage) if series: entries = [] - entries.extend(self._extract_seasons(series.get('seasons'))) + entries.extend(self._extract_seasons(domain, series_id, series.get('seasons'))) entries.extend(self._extract_entries(series.get('instalments'))) entries.extend(self._extract_episodes(series.get('extraMaterial'))) return self.playlist_result(entries, series_id, title, description) -- GitLab From 05fae5e182a41cacc8c1c7f2a09dd16c4260cbb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:44:39 +0100 Subject: [PATCH 090/384] [nrktv] Relax _VALID_URL --- haruhi_dl/extractor/nrk.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 4a82b11fd..08e331893 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -148,14 +148,7 @@ class NRKIE(NRKBaseIE): class NRKTVIE(NRKBaseIE): IE_DESC = 'NRK TV and NRK Radio' _EPISODE_RE = r'(?P<id>[a-zA-Z]{4}\d{8})' - _VALID_URL = r'''(?x) - https?:// - (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie(?:/[^/]+){1,}|program)/ - (?![Ee]pisodes)%s - (?:/\d{2}-\d{2}-\d{4})? - (?:\#del=(?P<part_id>\d+))? - ''' % _EPISODE_RE + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/program/MDDP12000117', -- GitLab From 0ef2cc2a31abe0595fb1e9788e5e91911caf78bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:44:43 +0100 Subject: [PATCH 091/384] [nrk] Improve error extraction --- haruhi_dl/extractor/nrk.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 08e331893..f5e964753 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -31,6 +31,22 @@ class NRKBaseIE(InfoExtractor): re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url), video_id, 'mp4', 'm3u8_native', fatal=False) + def _raise_error(self, data): + MESSAGES = { + 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', + 'ProgramRightsHasExpired': 'Programmet har gått ut', + 'NoProgramRights': 'Ikke tilgjengelig', + 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', + } + message_type = data.get('messageType', '') + # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* + if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True: + self.raise_geo_restricted( + msg=MESSAGES.get('ProgramIsGeoBlocked'), + countries=self._GEO_COUNTRIES) + message = data.get('endUserMessage') or MESSAGES.get(message_type, message_type) + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + class NRKIE(NRKBaseIE): _VALID_URL = r'''(?x) @@ -89,6 +105,9 @@ class NRKIE(NRKBaseIE): 'http://psapi.nrk.no/playback/manifest/%s' % video_id, video_id, 'Downloading manifest JSON') + if manifest.get('playability') == 'nonPlayable': + self._raise_error(manifest['nonPlayable']) + playable = manifest['playable'] formats = [] @@ -352,22 +371,7 @@ class NRKTVIE(NRKBaseIE): }] if not entries: - MESSAGES = { - 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', - 'ProgramRightsHasExpired': 'Programmet har gått ut', - 'NoProgramRights': 
'Ikke tilgjengelig',
-                'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
-            }
-            message_type = data.get('messageType', '')
-            # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked*
-            if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True:
-                self.raise_geo_restricted(
-                    msg=MESSAGES.get('ProgramIsGeoBlocked'),
-                    countries=self._GEO_COUNTRIES)
-            raise ExtractorError(
-                '%s said: %s' % (self.IE_NAME, MESSAGES.get(
-                    message_type, message_type)),
-                expected=True)
+            self._raise_error(data)

         series = conviva.get('seriesName') or data.get('seriesTitle')
         episode = conviva.get('episodeName') or data.get('episodeNumberOrDate')

--
GitLab

From 8e06fa07b9df4c72a51509328bfbe91ecf355692 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 14:46:00 +0100
Subject: [PATCH 092/384] [teachable:course] Improve extraction (closes
 #24507, closes #27286)

---
 haruhi_dl/extractor/teachable.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/teachable.py b/haruhi_dl/extractor/teachable.py
index 5557a9925..df305e38a 100644
--- a/haruhi_dl/extractor/teachable.py
+++ b/haruhi_dl/extractor/teachable.py
@@ -269,7 +269,7 @@ class TeachableCourseIE(TeachableBaseIE):
                 r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)',
                 webpage):
             li = mobj.group('li')
-            if 'fa-youtube-play' not in li:
+            if 'fa-youtube-play' not in li and not re.search(r'\d{1,2}:\d{2}', li):
                 continue
             lecture_url = self._search_regex(
                 r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li,

--
GitLab

From b79c74dad91f5ac0915e9b4b55f353911ea27980 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 14:46:08 +0100
Subject: [PATCH 093/384] [peertube] Recognize audio-only formats (closes
 #27295)

---
 haruhi_dl/extractor/peertube.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/haruhi_dl/extractor/peertube.py b/haruhi_dl/extractor/peertube.py
index f89ccda7f..b5ee25fff 100644
--- a/haruhi_dl/extractor/peertube.py
+++ b/haruhi_dl/extractor/peertube.py
@@ -132,6 +132,8 @@ class PeerTubeSHIE(SelfhostedInfoExtractor):
                     'format_id': format_id,
                     'filesize': file_size,
                 })
+                if format_id == '0p':
+                    f['vcodec'] = 'none'
                 formats.append(f)
         self._sort_formats(formats)

--
GitLab

From 93e7c99ad6da5f254e3424f27b53a7dfdac5daca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 14:46:16 +0100
Subject: [PATCH 094/384] [peertube] Extract fps

---
 haruhi_dl/extractor/peertube.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/haruhi_dl/extractor/peertube.py b/haruhi_dl/extractor/peertube.py
index b5ee25fff..66aab5c90 100644
--- a/haruhi_dl/extractor/peertube.py
+++ b/haruhi_dl/extractor/peertube.py
@@ -134,6 +134,8 @@ class PeerTubeSHIE(SelfhostedInfoExtractor):
                 })
                 if format_id == '0p':
                     f['vcodec'] = 'none'
+                else:
+                    f['fps'] = int_or_none(file_.get('fps'))
                 formats.append(f)
         self._sort_formats(formats)

--
GitLab

From b8975995efcc5354443833aee0f411408d1f4713 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:46:24 +0100
Subject: [PATCH 095/384] [nrk] improve extraction

- improve format extraction for old akamai formats
- update some of the tests
- add is_live value to entry info dict
- request instalments only when they're available
- fix skole
extraction --- haruhi_dl/extractor/nrk.py | 252 ++++++++++++------------------------- 1 file changed, 81 insertions(+), 171 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index f5e964753..8b31a6ad2 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -13,8 +13,6 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, - js_to_json, - NO_DEFAULT, parse_age_limit, parse_duration, try_get, @@ -24,9 +22,10 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): - _GEO_COUNTRIES = ['NO'] - def _extract_nrk_formats(self, asset_url, video_id): + if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): + return self._extract_akamai_formats( + re.sub(r'(?:b=\d+-\d+|__a__=off)&?', '', asset_url), video_id) return self._extract_m3u8_formats( re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url), video_id, 'mp4', 'm3u8_native', fatal=False) @@ -47,6 +46,12 @@ class NRKBaseIE(InfoExtractor): message = data.get('endUserMessage') or MESSAGES.get(message_type, message_type) raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None): + return self._download_json( + urljoin('http://psapi.nrk.no/', path), + video_id, note or 'Downloading %s JSON' % item, + fatal=fatal, query=query) + class NRKIE(NRKBaseIE): _VALID_URL = r'''(?x) @@ -64,7 +69,7 @@ class NRKIE(NRKBaseIE): _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': '706f34cdf1322577589e369e522b50ef', + 'md5': 'f46be075326e23ad0e524edfcb06aeb6', 'info_dict': { 'id': '150533', 'ext': 'mp4', @@ -78,7 +83,7 @@ class NRKIE(NRKBaseIE): # MD5 is unstable 'info_dict': { 'id': '154915', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Slik høres internett ut når du er blind', 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', 'duration': 20, @@ -101,9 +106,9 @@ class NRKIE(NRKBaseIE): }] def _extract_from_playback(self, video_id): - manifest = self._download_json( - 'http://psapi.nrk.no/playback/manifest/%s' % video_id, - video_id, 'Downloading manifest JSON') + path_templ = 'playback/%s/' + video_id + call_playback_api = lambda x: self._call_api(path_templ % x, video_id, x) + manifest = call_playback_api('manifest') if manifest.get('playability') == 'nonPlayable': self._raise_error(manifest['nonPlayable']) @@ -123,9 +128,7 @@ class NRKIE(NRKBaseIE): formats.extend(self._extract_nrk_formats(format_url, video_id)) self._sort_formats(formats) - data = self._download_json( - 'http://psapi.nrk.no/playback/metadata/%s' % video_id, - video_id, 'Downloading metadata JSON') + data = call_playback_api('metadata') preplay = data['preplay'] titles = preplay['titles'] @@ -171,18 +174,18 @@ class NRKTVIE(NRKBaseIE): _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/program/MDDP12000117', - 'md5': '8270824df46ec629b66aeaa5796b36fb', + 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', 'info_dict': { 'id': 'MDDP12000117AA', 'ext': 'mp4', 'title': 'Alarm Trolltunga', 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', - 'duration': 2223, + 'duration': 2223.44, 'age_limit': 6, }, }, { 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '9a167e54d04671eb6317a37b7bc8a280', + 'md5': '8d40dab61cea8ab0114e090b029a0565', 'info_dict': { 'id': 'MUHH48000314AA', 'ext': 'mp4', @@ -200,7 +203,7 @@ class NRKTVIE(NRKBaseIE): 'ext': 'mp4', 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', 'description': 
'md5:89290c5ccde1b3a24bb8050ab67fe1db', - 'duration': 4605, + 'duration': 4605.08, 'series': 'Kunnskapskanalen', 'episode': '24.05.2014', }, @@ -223,39 +226,13 @@ class NRKTVIE(NRKBaseIE): 'skip': 'particular part is not supported currently', }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', - 'playlist': [{ - 'info_dict': { - 'id': 'MSPO40010515AH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 772, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'info_dict': { - 'id': 'MSPO40010515BH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 6175, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }], 'info_dict': { - 'id': 'MSPO40010515', + 'id': 'MSPO40010515AH', + 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + 'description': 'md5:c03aba1e917561eface5214020551b7a', }, - 'expected_warnings': ['Video is geo restricted'], + 'skip': 'Video is geo restricted', }, { 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', 'info_dict': { @@ -286,6 +263,7 @@ class NRKTVIE(NRKBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'ProgramRightsHasExpired', }, { 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', 'only_matching': True, @@ -354,6 +332,7 @@ class NRKTVIE(NRKBaseIE): 'duration': duration, 'subtitles': subtitles, 'formats': formats, + 'is_live': live, }) if not entries: @@ -368,6 +347,7 @@ class NRKTVIE(NRKBaseIE): 'title': make_title(title), 'duration': duration, 'formats': formats, + 'is_live': live, }] if not entries: @@ -513,49 +493,7 @@ class NRKTVEpisodeIE(InfoExtractor): return info -class NRKTVSerieBaseIE(InfoExtractor): - def _extract_series(self, webpage, display_id, fatal=True): - config = self._parse_json( - self._search_regex( - (r'INITIAL_DATA(?:_V\d)?_*\s*=\s*({.+?})\s*;', - r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>', - r'PRELOADED_STATE_*\s*=\s*({.+?})\s*\n'), - webpage, 'config', default='{}' if not fatal else NO_DEFAULT), - display_id, fatal=False, transform_source=js_to_json) - if not config: - return - return try_get( - config, - (lambda x: x['initialState']['series'], lambda x: x['series']), - dict) - - def _extract_seasons(self, domain, series_id, seasons): - if isinstance(seasons, dict): - seasons = seasons.get('seasons') - if not isinstance(seasons, list): - return [] - entries = [] - for season in seasons: - if not isinstance(season, dict): - continue - episodes = self._extract_episodes(season) - if episodes: - entries.extend(episodes) - continue - season_name = season.get('name') - if season_name and isinstance(season_name, compat_str): - entries.append(self.url_result( - 'https://%s.nrk.no/serie/%s/sesong/%s' - % (domain, series_id, season_name), - ie=NRKTVSeasonIE.ie_key(), - video_title=season.get('title'))) - return entries - - def _extract_episodes(self, season): - if not isinstance(season, dict): - return [] - return self._extract_entries(season.get('episodes')) - +class NRKTVSerieBaseIE(NRKBaseIE): def _extract_entries(self, entry_list): if not isinstance(entry_list, list): return [] @@ -579,7 +517,7 @@ class NRKTVSerieBaseIE(InfoExtractor): def _entries(self, data, display_id): for 
page_num in itertools.count(1): - embedded = data.get('_embedded') + embedded = data.get('_embedded') or data if not isinstance(embedded, dict): break assets_key = self._extract_assets_key(embedded) @@ -594,18 +532,16 @@ class NRKTVSerieBaseIE(InfoExtractor): for e in self._extract_entries(entries): yield e # Find next URL - next_url = urljoin( - 'https://psapi.nrk.no/', - try_get( - data, - (lambda x: x['_links']['next']['href'], - lambda x: x['_embedded'][assets_key]['_links']['next']['href']), - compat_str)) - if not next_url: + next_url_path = try_get( + data, + (lambda x: x['_links']['next']['href'], + lambda x: x['_embedded'][assets_key]['_links']['next']['href']), + compat_str) + if not next_url_path: break - data = self._download_json( - next_url, display_id, - 'Downloading %s JSON page %d' % (assets_key, page_num), + data = self._call_api( + next_url_path, display_id, + note='Downloading %s JSON page %d' % (assets_key, page_num), fatal=False) if not data: break @@ -656,15 +592,12 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): else super(NRKTVSeasonIE, cls).suitable(url)) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - domain = mobj.group('domain') - serie = mobj.group('serie') - season_id = mobj.group('id') + domain, serie, season_id = re.match(self._VALID_URL, url).groups() display_id = '%s/%s' % (serie, season_id) - data = self._download_json( - 'https://psapi.nrk.no/%s/catalog/series/%s/seasons/%s' - % (domain, serie, season_id), display_id, query={'pageSize': 50}) + data = self._call_api( + '%s/catalog/series/%s/seasons/%s' % (domain, serie, season_id), + display_id, 'season', query={'pageSize': 50}) title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id return self.playlist_result( @@ -673,8 +606,7 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): class NRKTVSeriesIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?P<domain>tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' - _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)' + _VALID_URL = r'https?://(?P<domain>(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/serie/(?P<id>[^/]+)' _TESTS = [{ # new layout, instalments 'url': 'https://tv.nrk.no/serie/groenn-glede', @@ -696,7 +628,6 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'description': 'md5:7664b4e7e77dc6810cd3bca367c25b6e', }, 'playlist_mincount': 30, - 'expected_warnings': ['HTTP Error 404: Not Found'], }, { # new layout, seasons 'url': 'https://tv.nrk.no/serie/backstage', @@ -706,14 +637,13 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'description': 'md5:63692ceb96813d9a207e9910483d948b', }, 'playlist_mincount': 60, - 'expected_warnings': ['HTTP Error 404: Not Found'], }, { # old layout 'url': 'https://tv.nrksuper.no/serie/labyrint', 'info_dict': { 'id': 'labyrint', 'title': 'Labyrint', - 'description': 'md5:318b597330fdac5959247c9b69fdb1ec', + 'description': 'I Daidalos sin undersjøiske Labyrint venter spennende oppgaver, skumle robotskapninger og slim.', }, 'playlist_mincount': 3, }, { @@ -729,9 +659,13 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens', 'info_dict': { 'id': 'dickie-dick-dickens', + 'title': 'Dickie Dick Dickens', + 'description': 'md5:19e67411ffe57f7dce08a943d7a0b91f', }, 'playlist_mincount': 8, - 'expected_warnings': ['HTTP Error 404: Not Found'], + }, { + 'url': 'https://nrksuper.no/serie/labyrint', + 'only_matching': True, }] @classmethod @@ -742,57 +676,39 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): else super(NRKTVSeriesIE, cls).suitable(url)) def _real_extract(self, 
url): - mobj = re.match(self._VALID_URL, url) - domain = mobj.group('domain') - series_id = mobj.group('id') - - title = description = None - - webpage = self._download_webpage(url, series_id) - - series = self._extract_series(webpage, series_id, fatal=False) - if series: - title = try_get(series, lambda x: x['titles']['title'], compat_str) - description = try_get( - series, lambda x: x['titles']['subtitle'], compat_str) - - data = self._download_json( - 'https://psapi.nrk.no/%s/catalog/series/%s/instalments' - % (domain, series_id), series_id, query={'pageSize': 50}, - fatal=False) - if data: - return self.playlist_result( - self._entries(data, series_id), series_id, title, description) - - # New layout (e.g. https://tv.nrk.no/serie/backstage) - if series: - entries = [] - entries.extend(self._extract_seasons(domain, series_id, series.get('seasons'))) - entries.extend(self._extract_entries(series.get('instalments'))) - entries.extend(self._extract_episodes(series.get('extraMaterial'))) - return self.playlist_result(entries, series_id, title, description) - - # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint) - entries = [ - self.url_result( - 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( - series=series_id, season=season_id)) - for season_id in re.findall(self._ITEM_RE, webpage) - ] + site, series_id = re.match(self._VALID_URL, url).groups() + domain = 'radio' if site == 'radio.nrk' else 'tv' - title = self._html_search_meta( - 'seriestitle', webpage, - 'title', default=None) or self._og_search_title( - webpage, fatal=False) - if title: - title = self._search_regex( - r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title) + series = self._call_api( + '%s/catalog/series/%s' % (domain, series_id), series_id, 'serie') + titles = try_get(series, [ + lambda x: x['titles'], + lambda x: x[x['type']]['titles'], + lambda x: x[x['seriesType']]['titles'], + ]) or {} - description = self._html_search_meta( - 'series_description', webpage, - 'description', default=None) or self._og_search_description(webpage) + entries = [] + entries.extend(self._entries(series, series_id)) + embedded = series.get('_embedded') or {} + linked_seasons = try_get(series, lambda x: x['_links']['seasons']) or [] + embedded_seasons = embedded.get('seasons') or [] + if len(linked_seasons) > len(embedded_seasons): + for season in linked_seasons: + season_name = season.get('name') + if season_name and isinstance(season_name, compat_str): + entries.append(self.url_result( + 'https://%s.nrk.no/serie/%s/sesong/%s' + % (domain, series_id, season_name), + ie=NRKTVSeasonIE.ie_key(), + video_title=season.get('title'))) + else: + for season in embedded_seasons: + entries.extend(self._entries(season, series_id)) + entries.extend(self._entries( + embedded.get('extraMaterial') or {}, series_id)) - return self.playlist_result(entries, series_id, title, description) + return self.playlist_result( + entries, series_id, titles.get('title'), titles.get('subtitle')) class NRKTVDirekteIE(NRKTVIE): @@ -896,14 +812,8 @@ class NRKSkoleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://mimir.nrk.no/plugin/1.0/static?mediaId=%s' % video_id, - video_id) - - nrk_id = self._parse_json( - self._search_regex( - r'<script[^>]+type=["\']application/json["\'][^>]*>({.+?})</script>', - webpage, 'application json'), - video_id)['activeMedia']['psId'] + nrk_id = self._download_json( + 'https://nrkno-skole-prod.kube.nrk.no/skole/api/media/%s' % 
video_id, + video_id)['psId'] return self.url_result('nrk:%s' % nrk_id) -- GitLab From d88959f3b3a880eb18055d1ace630a080bf7050d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:46:29 +0100 Subject: [PATCH 096/384] [nrk] improve format extraction --- haruhi_dl/extractor/nrk.py | 40 ++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 8b31a6ad2..289a0a3a4 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import itertools +import random import re from .common import InfoExtractor @@ -22,13 +23,26 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['NO'] + _CDN_REPL_REGEX = r'''(?x):// + (?: + nrkod\d{1,2}-httpcache0-47115-cacheod0\.dna\.ip-only\.net/47115-cacheod0| + nrk-od-no\.telenorcdn\.net| + minicdn-od\.nrk\.no/od/nrkhd-osl-rr\.netwerk\.no/no + )/''' + def _extract_nrk_formats(self, asset_url, video_id): if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): return self._extract_akamai_formats( re.sub(r'(?:b=\d+-\d+|__a__=off)&?', '', asset_url), video_id) - return self._extract_m3u8_formats( - re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url), - video_id, 'mp4', 'm3u8_native', fatal=False) + asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url) + formats = self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', fatal=False) + if not formats and re.search(self._CDN_REPL_REGEX, asset_url): + formats = self._extract_m3u8_formats( + re.sub(self._CDN_REPL_REGEX, '://nrk-od-%02d.akamaized.net/no/' % random.randint(0, 99), asset_url), + video_id, 'mp4', 'm3u8_native', fatal=False) + return formats def _raise_error(self, data): MESSAGES = { @@ -107,8 +121,10 @@ class NRKIE(NRKBaseIE): def _extract_from_playback(self, video_id): path_templ = 'playback/%s/' + video_id - call_playback_api = lambda x: self._call_api(path_templ % x, video_id, x) - manifest = call_playback_api('manifest') + def call_playback_api(item, query=None): + return self._call_api(path_templ % item, video_id, item, query=query) + # known values for preferredCdn: akamai, iponly, minicdn and telenor + manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) if manifest.get('playability') == 'nonPlayable': self._raise_error(manifest['nonPlayable']) @@ -195,7 +211,6 @@ class NRKTVIE(NRKBaseIE): 'series': '20 spørsmål', 'episode': '23.05.2014', }, - 'skip': 'NoProgramRights', }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', 'info_dict': { @@ -214,15 +229,15 @@ class NRKTVIE(NRKBaseIE): # single playlist video 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'id': 'MSPO40010515AH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:c03aba1e917561eface5214020551b7a', }, 'params': { 'skip_download': True, }, - 'expected_warnings': ['Video is geo restricted'], + 'expected_warnings': ['Failed to download m3u8 information'], 'skip': 'particular part is not supported currently', }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', @@ -232,7 +247,7 @@ class NRKTVIE(NRKBaseIE): 'title': 'Sprint fri teknikk, 
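
Worth spelling out the retry trick introduced above: when the primary manifest
URL yields no formats and its host matches one of the known problematic CDNs in
`_CDN_REPL_REGEX`, the host is rewritten to a randomly numbered Akamai edge and
the m3u8 probe is repeated. A condensed sketch of just the rewrite, with the
host list trimmed to one alternative:

import random
import re

_CDN_REPL_REGEX = r'://nrk-od-no\.telenorcdn\.net/'  # trimmed for illustration

def rewrite_cdn(asset_url):
    # Pick one of 100 akamaized edges (00-99) as a fallback host.
    return re.sub(_CDN_REPL_REGEX,
                  '://nrk-od-%02d.akamaized.net/no/' % random.randint(0, 99),
                  asset_url)
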
kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', }, - 'skip': 'Video is geo restricted', + 'expected_warnings': ['Failed to download m3u8 information'], }, { 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', 'info_dict': { @@ -312,6 +327,7 @@ class NRKTVIE(NRKBaseIE): asset_url = asset.get('url') if not asset_url or asset_url in urls: continue + urls.append(asset_url) formats = self._extract_nrk_formats(asset_url, video_id) if not formats: continue -- GitLab From 04ac0950aa22aee204820aa155f2e2df1cc39ef4 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:46:34 +0100 Subject: [PATCH 097/384] [nrk] reduce the number of instalments requests --- haruhi_dl/extractor/nrk.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 289a0a3a4..24993b1c8 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -121,6 +121,7 @@ class NRKIE(NRKBaseIE): def _extract_from_playback(self, video_id): path_templ = 'playback/%s/' + video_id + def call_playback_api(item, query=None): return self._call_api(path_templ % item, video_id, item, query=query) # known values for preferredCdn: akamai, iponly, minicdn and telenor @@ -696,7 +697,8 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): domain = 'radio' if site == 'radio.nrk' else 'tv' series = self._call_api( - '%s/catalog/series/%s' % (domain, series_id), series_id, 'serie') + '%s/catalog/series/%s' % (domain, series_id), + series_id, 'serie', query={'embeddedInstalmentsPageSize': 50}) titles = try_get(series, [ lambda x: x['titles'], lambda x: x[x['type']]['titles'], -- GitLab From 5c239bfc6547c85ee466d6b48f5385562ee9abdf Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:48:24 +0100 Subject: [PATCH 098/384] [nrk] reduce requests for Radio series --- haruhi_dl/extractor/nrk.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 24993b1c8..fdf2d7407 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -694,11 +694,13 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): def _real_extract(self, url): site, series_id = re.match(self._VALID_URL, url).groups() - domain = 'radio' if site == 'radio.nrk' else 'tv' + is_radio = site == 'radio.nrk' + domain = 'radio' if is_radio else 'tv' + size_prefix = 'p' if is_radio else 'embeddedInstalmentsP' series = self._call_api( '%s/catalog/series/%s' % (domain, series_id), - series_id, 'serie', query={'embeddedInstalmentsPageSize': 50}) + series_id, 'serie', query={size_prefix + 'ageSize': 50}) titles = try_get(series, [ lambda x: x['titles'], lambda x: x[x['type']]['titles'], -- GitLab From e76a3363ba73b82b0ca616f4a235a94934ad24b8 Mon Sep 17 00:00:00 2001 From: renalid <renalid@gmail.com> Date: Fri, 26 Feb 2021 14:48:29 +0100 Subject: [PATCH 099/384] [generic] Extract RSS video description (#27177) --- haruhi_dl/extractor/generic.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index babc59dcc..0a6bc25c4 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -204,11 +204,19 @@ class GenericIE(InfoExtractor): { 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', 'info_dict': { - 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - 'ext': 'm4v', - 
'upload_date': '20150228',
-                'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
-            }
+                'id': 'http://podcastfeeds.nbcnews.com/nbcnews/video/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+                'title': 'MSNBC Rachel Maddow (video)',
+                'description': 're:.*her unique approach to storytelling.*',
+            },
+            'playlist': [{
+                'info_dict': {
+                    'ext': 'mov',
+                    'id': 'pdv_maddow_netcast_mov-12-03-2020-223726',
+                    'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726',
+                    'description': 're:.*her unique approach to storytelling.*',
+                    'upload_date': '20201204',
+                },
+            }],
         },
         # RSS feed with enclosures and unsupported link URLs
         {
@@ -2236,6 +2244,7 @@ class GenericIE(InfoExtractor):
                 '_type': 'url_transparent',
                 'url': next_url,
                 'title': it.find('title').text,
+                'description': xpath_text(it, 'description', default=None),
             })
 
         return {
-- 
GitLab


From 0257cb6e427c8291f07d80a5177addef81e64da7 Mon Sep 17 00:00:00 2001
From: Sergey M․ <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 14:48:35 +0100
Subject: [PATCH 100/384] [generic] Extract RSS video timestamp

---
 haruhi_dl/extractor/generic.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py
index 0a6bc25c4..bdc6271aa 100644
--- a/haruhi_dl/extractor/generic.py
+++ b/haruhi_dl/extractor/generic.py
@@ -30,6 +30,7 @@ from ..utils import (
     smuggle_url,
     unescapeHTML,
     unified_strdate,
+    unified_timestamp,
     unsmuggle_url,
     UnsupportedError,
     xpath_text,
@@ -2245,6 +2246,8 @@ class GenericIE(InfoExtractor):
                 'url': next_url,
                 'title': it.find('title').text,
                 'description': xpath_text(it, 'description', default=None),
+                'timestamp': unified_timestamp(
+                    xpath_text(it, 'pubDate', default=None)),
             })
 
         return {
-- 
GitLab


From 1744410baa2b705aaacf4bbbc1adb61333a6ef7f Mon Sep 17 00:00:00 2001
From: Sergey M․ <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 14:48:44 +0100
Subject: [PATCH 101/384] [generic] Extract RSS video itunes metadata

---
 haruhi_dl/extractor/generic.py | 36 +++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py
index bdc6271aa..0b9ec2b74 100644
--- a/haruhi_dl/extractor/generic.py
+++ b/haruhi_dl/extractor/generic.py
@@ -20,12 +20,14 @@ from ..utils import (
     ExtractorError,
     float_or_none,
     HEADRequest,
+    int_or_none,
     is_html,
     js_to_json,
     KNOWN_EXTENSIONS,
     merge_dicts,
     mimetype2ext,
     orderedSet,
+    parse_duration,
     sanitized_Request,
     smuggle_url,
     unescapeHTML,
@@ -33,7 +35,9 @@ from ..utils import (
     unified_timestamp,
     unsmuggle_url,
     UnsupportedError,
+    url_or_none,
     xpath_text,
+    xpath_with_ns,
 )
 from .commonprotocols import RtmpIE
 from .brightcove import (
@@ -212,10 +216,12 @@ class GenericIE(InfoExtractor):
             'playlist': [{
                 'info_dict': {
                     'ext': 'mov',
-                    'id': 'pdv_maddow_netcast_mov-12-03-2020-223726',
-                    'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726',
+                    'id': 'pdv_maddow_netcast_mov-12-04-2020-224335',
+                    'title': 're:MSNBC Rachel Maddow',
                     'description': 're:.*her unique approach to storytelling.*',
-                    'upload_date': '20201204',
+                    'timestamp': int,
+                    'upload_date': compat_str,
+                    'duration': float,
                 },
             }],
         },
@@ -2226,6 +2232,10 @@ class GenericIE(InfoExtractor):
         playlist_desc_el = doc.find('./channel/description')
         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
 
+        NS_MAP = {
+            'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
+        }
+
         entries = []
         for it in
doc.findall('./channel/item'):
             next_url = None
             enclosure = it.find('enclosure')
             if enclosure is not None:
@@ -2241,6 +2251,20 @@ class GenericIE(InfoExtractor):
             if not next_url:
                 continue
 
+            def itunes(key):
+                return xpath_text(
+                    it, xpath_with_ns('./itunes:%s' % key, NS_MAP),
+                    default=None)
+
+            duration = itunes('duration')
+            explicit = itunes('explicit')
+            if explicit == 'true':
+                age_limit = 18
+            elif explicit == 'false':
+                age_limit = 0
+            else:
+                age_limit = None
+
             entries.append({
                 '_type': 'url_transparent',
                 'url': next_url,
@@ -2248,6 +2272,12 @@ class GenericIE(InfoExtractor):
                 'description': xpath_text(it, 'description', default=None),
                 'timestamp': unified_timestamp(
                     xpath_text(it, 'pubDate', default=None)),
+                'duration': int_or_none(duration) or parse_duration(duration),
+                'thumbnail': url_or_none(itunes('image')),
+                'episode': itunes('title'),
+                'episode_number': int_or_none(itunes('episode')),
+                'season_number': int_or_none(itunes('season')),
+                'age_limit': age_limit,
             })
 
         return {
-- 
GitLab


From 371904a4d994d26f751b6e7a25b2f217de5a78f8 Mon Sep 17 00:00:00 2001
From: Sergey M․ <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 14:50:42 +0100
Subject: [PATCH 102/384] [extractor/common] Extract timestamp from Last-Modified header

---
 haruhi_dl/extractor/generic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py
index 0b9ec2b74..e67e883b0 100644
--- a/haruhi_dl/extractor/generic.py
+++ b/haruhi_dl/extractor/generic.py
@@ -2397,7 +2397,7 @@ class GenericIE(InfoExtractor):
         info_dict = {
             'id': video_id,
             'title': self._generic_title(url),
-            'upload_date': unified_strdate(head_response.headers.get('Last-Modified'))
+            'timestamp': unified_timestamp(head_response.headers.get('Last-Modified'))
         }
 
         # Check for direct link to a video
-- 
GitLab


From 96e01843779dd0e49b252280ed55eaaa73229bca Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:50:53 +0100
Subject: [PATCH 103/384] [aenetworks] Fix extraction

- Fix Fastly format extraction
- Add support for play and watch subdomains
- Extract series metadata

closes #23363
closes #23390
closes #26795
closes #26985
---
 haruhi_dl/extractor/aenetworks.py | 271 ++++++++++++++++++------------
 haruhi_dl/extractor/extractors.py |   2 +
 2 files changed, 162 insertions(+), 111 deletions(-)

diff --git a/haruhi_dl/extractor/aenetworks.py b/haruhi_dl/extractor/aenetworks.py
index 611b948f5..3d0cf1208 100644
--- a/haruhi_dl/extractor/aenetworks.py
+++ b/haruhi_dl/extractor/aenetworks.py
@@ -5,20 +5,30 @@ import re
 
 from .theplatform import ThePlatformIE
 from ..utils import (
-    extract_attributes,
     ExtractorError,
     int_or_none,
-    smuggle_url,
     update_url_query,
-)
-from ..compat import (
-    compat_urlparse,
+    urlencode_postdata,
 )
 
 
 class AENetworksBaseIE(ThePlatformIE):
+    _BASE_URL_REGEX = r'''(?x)https?://
+        (?:(?:www|play|watch)\.)?
+ (?P<domain> + (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| + fyi\.tv + )/''' _THEPLATFORM_KEY = 'crazyjava' _THEPLATFORM_SECRET = 's3cr3t' + _DOMAIN_MAP = { + 'history.com': ('HISTORY', 'history'), + 'aetv.com': ('AETV', 'aetv'), + 'mylifetime.com': ('LIFETIME', 'lifetime'), + 'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'), + 'fyi.tv': ('FYI', 'fyi'), + 'historyvault.com': (None, 'historyvault'), + } def _extract_aen_smil(self, smil_url, video_id, auth=None): query = {'mbr': 'true'} @@ -31,7 +41,7 @@ class AENetworksBaseIE(ThePlatformIE): 'assetTypes': 'high_video_s3' }, { 'assetTypes': 'high_video_s3', - 'switch': 'hls_ingest_fastly' + 'switch': 'hls_high_fastly', }] formats = [] subtitles = {} @@ -61,20 +71,13 @@ class AENetworksBaseIE(ThePlatformIE): class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?P<domain> - (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| - fyi\.tv - )/ - (?: - shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})| - movies/(?P<movie_display_id>[^/]+)(?:/full-movie)?| - specials/(?P<special_display_id>[^/]+)/(?:full-special|preview-)| - collections/[^/]+/(?P<collection_display_id>[^/]+) - ) - ''' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'''(?P<id> + shows/[^/]+/season-\d+/episode-\d+| + (?: + (?:movie|special)s/[^/]+| + (?:shows/[^/]+/)?videos + )/[^/?#&]+ + )''' _TESTS = [{ 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', 'info_dict': { @@ -91,22 +94,23 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], + 'skip': 'This video is only available for users of participating TV providers.', }, { - 'url': 'http://www.history.com/shows/ancient-aliens/season-1', + 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', 'info_dict': { - 'id': '71889446852', + 'id': '600587331957', + 'ext': 'mp4', + 'title': 'Inlawful Entry', + 'description': 'md5:57c12115a2b384d883fe64ca50529e08', + 'timestamp': 1452634428, + 'upload_date': '20160112', + 'uploader': 'AENE-NEW', }, - 'playlist_mincount': 5, - }, { - 'url': 'http://www.mylifetime.com/shows/atlanta-plastic', - 'info_dict': { - 'id': 'SERIES4317', - 'title': 'Atlanta Plastic', + 'params': { + # m3u8 download + 'skip_download': True, }, - 'playlist_mincount': 2, - }, { - 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', - 'only_matching': True + 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'only_matching': True @@ -117,80 +121,152 @@ class AENetworksIE(AENetworksBaseIE): 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', 'only_matching': True }, { - 'url': 'https://www.lifetimemovieclub.com/movies/a-killer-among-us', + 'url': 'https://watch.lifetimemovieclub.com/movies/10-year-reunion/full-movie', 'only_matching': True }, { 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special', 'only_matching': True }, { - 'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward', + 'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story', 'only_matching': True }, { - 'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story', + 'url': 'http://www.history.com/videos/history-of-valentines-day', + 
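
The breadth of URLs in these tests comes straight from `_BASE_URL_REGEX` plus
`_DOMAIN_MAP` above: the matched domain resolves to both the Adobe Pass
requestor id and the "brand" slug used by the feeds API. Roughly (the map
values are copied from the hunk; the lookup itself is the only logic):

_DOMAIN_MAP = {
    'history.com': ('HISTORY', 'history'),
    'aetv.com': ('AETV', 'aetv'),
}

def endpoints(domain):
    requestor_id, brand = _DOMAIN_MAP[domain]
    # brand picks the metadata feed; requestor_id is only needed for Adobe Pass.
    return 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, requestor_id
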
'only_matching': True + }, { + 'url': 'https://play.aetv.com/shows/duck-dynasty/videos/best-of-duck-dynasty-getting-quack-in-shape', 'only_matching': True }] - _DOMAIN_TO_REQUESTOR_ID = { - 'history.com': 'HISTORY', - 'aetv.com': 'AETV', - 'mylifetime.com': 'LIFETIME', - 'lifetimemovieclub.com': 'LIFETIMEMOVIECLUB', - 'fyi.tv': 'FYI', - } def _real_extract(self, url): - domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups() - display_id = show_path or movie_display_id or special_display_id or collection_display_id - webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers()) - if show_path: - url_parts = show_path.split('/') - url_parts_len = len(url_parts) - if url_parts_len == 1: - entries = [] - for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): - entries.append(self.url_result( - compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) - if entries: - return self.playlist_result( - entries, self._html_search_meta('aetn:SeriesId', webpage), - self._html_search_meta('aetn:SeriesTitle', webpage)) - else: - # single season - url_parts_len = 2 - if url_parts_len == 2: - entries = [] - for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage): - episode_attributes = extract_attributes(episode_item) - episode_url = compat_urlparse.urljoin( - url, episode_attributes['data-canonical']) - entries.append(self.url_result( - episode_url, 'AENetworks', - episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id'))) - return self.playlist_result( - entries, self._html_search_meta('aetn:SeasonId', webpage)) - - video_id = self._html_search_meta('aetn:VideoID', webpage) - media_url = self._search_regex( - [r"media_url\s*=\s*'(?P<url>[^']+)'", - r'data-media-url=(?P<url>(?:https?:)?//[^\s>]+)', - r'data-media-url=(["\'])(?P<url>(?:(?!\1).)+?)\1'], - webpage, 'video url', group='url') + domain, canonical = re.match(self._VALID_URL, url).groups() + requestor_id, brand = self._DOMAIN_MAP[domain] + result = self._download_json( + 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, + canonical, query={'filter[canonical]': '/' + canonical})['results'][0] + title = result['title'] + video_id = result['id'] + media_url = result['publicUrl'] theplatform_metadata = self._download_theplatform_metadata(self._search_regex( r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) info = self._parse_theplatform_metadata(theplatform_metadata) auth = None if theplatform_metadata.get('AETN$isBehindWall'): - requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] resource = self._get_mvpd_resource( requestor_id, theplatform_metadata['title'], theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), theplatform_metadata['ratings'][0]['rating']) auth = self._extract_mvpd_auth( url, video_id, requestor_id, resource) - info.update(self._search_json_ld(webpage, video_id, fatal=False)) info.update(self._extract_aen_smil(media_url, video_id, auth)) + info.update({ + 'title': title, + 'series': result.get('seriesName'), + 'season_number': int_or_none(result.get('tvSeasonNumber')), + 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), + }) return info +class AENetworksListBaseIE(AENetworksBaseIE): + def _call_api(self, resource, slug, brand, fields): + return self._download_json( + 
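
The `_call_api` helper being introduced here issues a plain form-encoded POST
whose body is a GraphQL document built by string interpolation. A standalone
sketch of the same request, with a trimmed field list (whether the endpoint
accepts this exact selection is an assumption, not something this patch
establishes):

import json
import urllib.parse
import urllib.request

def call_graphql(resource, slug, brand):
    # Mirrors _call_api: the GraphQL document travels as an ordinary form field.
    query = '{ %s(slug: "%s") { id title } }' % (resource, slug)
    data = urllib.parse.urlencode({'query': query}).encode()
    req = urllib.request.Request(
        'https://yoga.appsvcs.aetnd.com/graphql?brand=' + brand, data=data)
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)['data'][resource]
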
'https://yoga.appsvcs.aetnd.com/graphql', + slug, query={'brand': brand}, data=urlencode_postdata({ + 'query': '''{ + %s(slug: "%s") { + %s + } +}''' % (resource, slug, fields), + }))['data'][resource] + + def _real_extract(self, url): + domain, slug = re.match(self._VALID_URL, url).groups() + _, brand = self._DOMAIN_MAP[domain] + playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS) + base_url = 'http://watch.%s' % domain + + entries = [] + for item in (playlist.get(self._ITEMS_KEY) or []): + doc = self._get_doc(item) + canonical = doc.get('canonical') + if not canonical: + continue + entries.append(self.url_result( + base_url + canonical, AENetworksIE.ie_key(), doc.get('id'))) + + description = None + if self._PLAYLIST_DESCRIPTION_KEY: + description = playlist.get(self._PLAYLIST_DESCRIPTION_KEY) + + return self.playlist_result( + entries, playlist.get('id'), + playlist.get(self._PLAYLIST_TITLE_KEY), description) + + +class AENetworksCollectionIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:collection' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'(?:[^/]+/)*(?:list|collections)/(?P<id>[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://watch.historyvault.com/list/america-the-story-of-us', + 'info_dict': { + 'id': '282', + 'title': 'America The Story of Us', + }, + 'playlist_mincount': 12, + }, { + 'url': 'https://watch.historyvault.com/shows/america-the-story-of-us-2/season-1/list/america-the-story-of-us', + 'only_matching': True + }, { + 'url': 'https://www.historyvault.com/collections/mysteryquest', + 'only_matching': True + }] + _RESOURCE = 'list' + _ITEMS_KEY = 'items' + _PLAYLIST_TITLE_KEY = 'display_title' + _PLAYLIST_DESCRIPTION_KEY = None + _FIELDS = '''id + display_title + items { + ... on ListVideoItem { + doc { + canonical + id + } + } + }''' + + def _get_doc(self, item): + return item.get('doc') or {} + + +class AENetworksShowIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:show' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'shows/(?P<id>[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'http://www.history.com/shows/ancient-aliens', + 'info_dict': { + 'id': 'SH012427480000', + 'title': 'Ancient Aliens', + 'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f', + }, + 'playlist_mincount': 168, + }] + _RESOURCE = 'series' + _ITEMS_KEY = 'episodes' + _PLAYLIST_TITLE_KEY = 'title' + _PLAYLIST_DESCRIPTION_KEY = 'description' + _FIELDS = '''description + id + title + episodes { + canonical + id + }''' + + def _get_doc(self, item): + return item + + class HistoryTopicIE(AENetworksBaseIE): IE_NAME = 'history:topic' IE_DESC = 'History.com Topic' @@ -204,6 +280,7 @@ class HistoryTopicIE(AENetworksBaseIE): 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', 'timestamp': 1375819729, 'upload_date': '20130806', + 'uploader': 'AENE-NEW', }, 'params': { # m3u8 download @@ -212,36 +289,8 @@ class HistoryTopicIE(AENetworksBaseIE): 'add_ie': ['ThePlatform'], }] - def theplatform_url_result(self, theplatform_url, video_id, query): - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': smuggle_url( - update_url_query(theplatform_url, query), - { - 'sig': { - 'key': self._THEPLATFORM_KEY, - 'secret': self._THEPLATFORM_SECRET, - }, - 'force_smil_url': True - }), - 'ie_key': 'ThePlatform', - } - def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'<phoenix-iframe[^>]+src="[^"]+\btpid=(\d+)', webpage, 'tpid') - result = self._download_json( - 
'https://feeds.video.aetnd.com/api/v2/history/videos',
-            video_id, query={'filter[id]': video_id})['results'][0]
-        title = result['title']
-        info = self._extract_aen_smil(result['publicUrl'], video_id)
-        info.update({
-            'title': title,
-            'description': result.get('description'),
-            'duration': int_or_none(result.get('duration')),
-            'timestamp': int_or_none(result.get('added'), 1000),
-        })
-        return info
+        return self.url_result(
+            'http://www.history.com/videos/' + display_id,
+            AENetworksIE.ie_key())
diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index 46f3b604c..7a0706532 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -30,6 +30,8 @@ from .adobetv import (
 from .adultswim import AdultSwimIE
 from .aenetworks import (
     AENetworksIE,
+    AENetworksCollectionIE,
+    AENetworksShowIE,
     HistoryTopicIE,
 )
 from .afreecatv import AfreecaTVIE
-- 
GitLab


From f717a3cc82662cc677ffede2ccf30a1709bbf007 Mon Sep 17 00:00:00 2001
From: Sergey M․ <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 14:51:01 +0100
Subject: [PATCH 104/384] [extractor/generic] Remove unused import

---
 haruhi_dl/extractor/generic.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py
index e67e883b0..037fc4d7a 100644
--- a/haruhi_dl/extractor/generic.py
+++ b/haruhi_dl/extractor/generic.py
@@ -31,7 +31,6 @@ from ..utils import (
     sanitized_Request,
     smuggle_url,
     unescapeHTML,
-    unified_strdate,
     unified_timestamp,
     unsmuggle_url,
     UnsupportedError,
-- 
GitLab


From 32e8c82a3b1f2135cb25ad796102d83a3f0ec69d Mon Sep 17 00:00:00 2001
From: Sergey M․ <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 14:51:20 +0100
Subject: =?UTF-8?q?[slideslive]=20Add=20support=20for=20yo?=
 =?UTF-8?q?da=20service=20videos=20and=20extract=20subtitle=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…s (closes #27323)
---
 haruhi_dl/extractor/slideslive.py | 55 ++++++++++++++++++++++++++++---
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/haruhi_dl/extractor/slideslive.py b/haruhi_dl/extractor/slideslive.py
index d9ea76831..cd70841a9 100644
--- a/haruhi_dl/extractor/slideslive.py
+++ b/haruhi_dl/extractor/slideslive.py
@@ -2,7 +2,12 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import smuggle_url
+from ..utils import (
+    bool_or_none,
+    smuggle_url,
+    try_get,
+    url_or_none,
+)
 
 
 class SlidesLiveIE(InfoExtractor):
@@ -18,8 +23,21 @@ class SlidesLiveIE(InfoExtractor):
             'description': 'Watch full version of this video at https://slideslive.com/38902413.',
             'uploader': 'SlidesLive Videos - A',
             'uploader_id': 'UC62SdArr41t_-_fX40QCLRw',
+            'timestamp': 1597615266,
             'upload_date': '20170925',
         }
+    }, {
+        # video_service_name = yoda
+        'url': 'https://slideslive.com/38935785',
+        'md5': '575cd7a6c0acc6e28422fe76dd4bcb1a',
+        'info_dict': {
+            'id': 'RMraDYN5ozA_',
+            'ext': 'mp4',
+            'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges',
+        },
+        'params': {
+            'format': 'bestvideo',
+        },
     }, {
         # video_service_name = youtube
         'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend',
@@ -39,18 +57,47 @@ class SlidesLiveIE(InfoExtractor):
         video_data = self._download_json(
             'https://ben.slideslive.com/player/' + video_id, video_id)
         service_name =
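
The yoda branch added just below treats that service as self-hosted: the
service id expands into both an HLS and a DASH master manifest on the same CDN
path, each probed with fatal=False so a missing variant merely narrows the
format list instead of aborting. The URL pairing in isolation (pattern copied
from the hunk):

def yoda_manifest_urls(service_id):
    # Same path, two container formats; the extractor feeds the first to the
    # HLS parser and the second to the DASH parser.
    pattern = 'https://01.cdn.yoda.slideslive.com/%s/master.%s'
    return pattern % (service_id, 'm3u8'), pattern % (service_id, 'mpd')
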
video_data['video_service_name'].lower() - assert service_name in ('url', 'vimeo', 'youtube') + assert service_name in ('url', 'yoda', 'vimeo', 'youtube') service_id = video_data['video_service_id'] + subtitles = {} + for sub in try_get(video_data, lambda x: x['subtitles'], list) or []: + if not isinstance(sub, dict): + continue + webvtt_url = url_or_none(sub.get('webvtt_url')) + if not webvtt_url: + continue + lang = sub.get('language') or 'en' + subtitles.setdefault(lang, []).append({ + 'url': webvtt_url, + }) info = { 'id': video_id, 'thumbnail': video_data.get('thumbnail'), - 'url': service_id, + 'is_live': bool_or_none(video_data.get('is_live')), + 'subtitles': subtitles, } - if service_name == 'url': + if service_name in ('url', 'yoda'): info['title'] = video_data['title'] + if service_name == 'url': + info['url'] = service_id + else: + formats = [] + _MANIFEST_PATTERN = 'https://01.cdn.yoda.slideslive.com/%s/master.%s' + formats.extend(self._extract_m3u8_formats( + _MANIFEST_PATTERN % (service_id, 'm3u8'), service_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_mpd_formats( + _MANIFEST_PATTERN % (service_id, 'mpd'), service_id, + mpd_id='dash', fatal=False)) + self._sort_formats(formats) + info.update({ + 'id': service_id, + 'formats': formats, + }) else: info.update({ '_type': 'url_transparent', + 'url': service_id, 'ie_key': service_name.capitalize(), 'title': video_data.get('title'), }) -- GitLab From 96e0370bb26470928eefd2053f7735c094cdf077 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:51:38 +0100 Subject: [PATCH 106/384] =?UTF-8?q?[americastestkitchen]=20Fix=20Extractio?= =?UTF-8?q?n=20and=20add=20support=20for=20Cook's=20Count=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ry and Cook's Illustrated closes #17234 closes #27322 --- haruhi_dl/extractor/americastestkitchen.py | 68 +++++++++------------- 1 file changed, 26 insertions(+), 42 deletions(-) diff --git a/haruhi_dl/extractor/americastestkitchen.py b/haruhi_dl/extractor/americastestkitchen.py index 9c9d77ae1..e20f00fc3 100644 --- a/haruhi_dl/extractor/americastestkitchen.py +++ b/haruhi_dl/extractor/americastestkitchen.py @@ -1,33 +1,33 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( clean_html, - int_or_none, - js_to_json, try_get, unified_strdate, ) class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?P<resource_type>episode|videos)/(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', 'info_dict': { 'id': '5b400b9ee338f922cb06450c', - 'title': 'Weeknight Japanese Suppers', + 'title': 'Japanese Suppers', 'ext': 'mp4', - 'description': 'md5:3d0c1a44bb3b27607ce82652db25b4a8', + 'description': 'md5:64e606bfee910627efc4b5f050de92b3', 'thumbnail': r're:^https?://', 'timestamp': 1523664000, 'upload_date': '20180414', - 'release_date': '20180414', + 'release_date': '20180410', 'series': "America's Test Kitchen", 'season_number': 18, - 'episode': 'Weeknight Japanese Suppers', + 'episode': 'Japanese Suppers', 'episode_number': 15, }, 'params': { @@ -36,47 +36,31 @@ class AmericasTestKitchenIE(InfoExtractor): }, { 'url': 
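
The API-based rewrite of `_real_extract` just below swaps webpage scraping for
the site's own JSON endpoints: `/api/v6/episodes/<id>` wraps the playable clip
in a `video` key plus episode metadata, while `/api/v6/videos/<id>` returns the
clip object directly (a shape inferred from the hunk, not independently
verified). Sketch:

import json
import urllib.request

def atk_video(resource_type, video_id):
    url = ('https://www.americastestkitchen.com/api/v6/%s/%s'
           % (resource_type, video_id))
    with urllib.request.urlopen(url) as resp:
        resource = json.load(resp)
    # Episodes nest the playable clip; bare videos are the clip itself.
    return resource['video'] if resource_type == 'episodes' else resource
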
'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) + resource_type, video_id = re.match(self._VALID_URL, url).groups() + is_episode = resource_type == 'episode' + if is_episode: + resource_type = 'episodes' - video_data = self._parse_json( - self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*</script>', - webpage, 'initial context'), - video_id, js_to_json) - - ep_data = try_get( - video_data, - (lambda x: x['episodeDetail']['content']['data'], - lambda x: x['videoDetail']['content']['data']), dict) - ep_meta = ep_data.get('full_video', {}) - - zype_id = ep_data.get('zype_id') or ep_meta['zype_id'] - - title = ep_data.get('title') or ep_meta.get('title') - description = clean_html(ep_meta.get('episode_description') or ep_data.get( - 'description') or ep_meta.get('description')) - thumbnail = try_get(ep_meta, lambda x: x['photo']['image_url']) - release_date = unified_strdate(ep_data.get('aired_at')) - - season_number = int_or_none(ep_meta.get('season_number')) - episode = ep_meta.get('title') - episode_number = int_or_none(ep_meta.get('episode_number')) + resource = self._download_json( + 'https://www.americastestkitchen.com/api/v6/%s/%s' % (resource_type, video_id), video_id) + video = resource['video'] if is_episode else resource + episode = resource if is_episode else resource.get('episode') or {} return { '_type': 'url_transparent', - 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id, + 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'], 'ie_key': 'Zype', - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'release_date': release_date, - 'series': "America's Test Kitchen", - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, + 'description': clean_html(video.get('description')), + 'release_date': unified_strdate(video.get('publishDate')), + 'series': try_get(episode, lambda x: x['show']['title']), + 'episode': episode.get('title'), } -- GitLab From 7e83a9d619561302e2d4fac857ba17b28b9fec6b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:51:45 +0100 Subject: [PATCH 107/384] [tvplay:home] Fix extraction(closes #21153) --- haruhi_dl/extractor/tvplay.py | 90 ++++++++++++++--------------------- 1 file changed, 35 insertions(+), 55 deletions(-) diff --git a/haruhi_dl/extractor/tvplay.py b/haruhi_dl/extractor/tvplay.py index 3c2450dd0..0d858c025 100644 --- a/haruhi_dl/extractor/tvplay.py +++ b/haruhi_dl/extractor/tvplay.py @@ -12,11 +12,13 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + parse_duration, parse_iso8601, qualities, try_get, update_url_query, url_or_none, + urljoin, ) @@ -414,7 +416,7 @@ class ViafreeIE(InfoExtractor): class TVPlayHomeIE(InfoExtractor): - _VALID_URL = r'https?://tvplay\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/[^/]+/[^/?#&]+-(?P<id>\d+)' + _VALID_URL = r'https?://(?:tv3?)?play\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/(?:[^/]+/)*[^/?#&]+-(?P<id>\d+)' _TESTS = [{ 'url': 
'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/', 'info_dict': { @@ -433,80 +435,58 @@ class TVPlayHomeIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [TVPlayIE.ie_key()], }, { 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/', 'only_matching': True, }, { 'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/', 'only_matching': True, + }, { + 'url': 'https://play.tv3.lt/aferistai-10047125', + 'only_matching': True, + }, { + 'url': 'https://tv3play.skaties.lv/vinas-melo-labak-10280317', + 'only_matching': True, + }, { + 'url': 'https://play.tv3.ee/cool-d-ga-mehhikosse-10044354', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_id = self._search_regex( - r'data-asset-id\s*=\s*["\'](\d{5,})\b', webpage, 'video id') - - if len(video_id) < 8: - return self.url_result( - 'mtg:%s' % video_id, ie=TVPlayIE.ie_key(), video_id=video_id) + asset = self._download_json( + urljoin(url, '/sb/public/asset/' + video_id), video_id) - m3u8_url = self._search_regex( - r'data-file\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'm3u8 url', group='url') + m3u8_url = asset['movie']['contentUrl'] + video_id = asset['assetId'] + asset_title = asset['title'] + title = asset_title['title'] formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) - title = self._search_regex( - r'data-title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, - 'title', default=None, group='value') or self._html_search_meta( - 'title', webpage, default=None) or self._og_search_title( - webpage) - - description = self._html_search_meta( - 'description', webpage, - default=None) or self._og_search_description(webpage) - - thumbnail = self._search_regex( - r'data-image\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'thumbnail', default=None, group='url') or self._html_search_meta( - 'thumbnail', webpage, default=None) or self._og_search_thumbnail( - webpage) - - duration = int_or_none(self._search_regex( - r'data-duration\s*=\s*["\'](\d+)', webpage, 'duration', - fatal=False)) + thumbnails = None + image_url = asset.get('imageUrl') + if image_url: + thumbnails = [{ + 'url': urljoin(url, image_url), + 'ext': 'jpg', + }] - season = self._search_regex( - (r'data-series-title\s*=\s*(["\'])[^/]+/(?P<value>(?:(?!\1).)+)\1', - r'\bseason\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, - 'season', default=None, group='value') - season_number = int_or_none(self._search_regex( - r'(\d+)(?:[.\s]+sezona|\s+HOOAEG)', season or '', 'season number', - default=None)) - episode = self._search_regex( - (r'\bepisode\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', - r'data-subtitle\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, - 'episode', default=None, group='value') - episode_number = int_or_none(self._search_regex( - r'(?:S[eē]rija|Osa)\s+(\d+)', episode or '', 'episode number', - default=None)) + metadata = asset.get('metadata') or {} return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'season': season, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, + 'description': asset_title.get('summaryLong') or asset_title.get('summaryShort'), + 'thumbnails': thumbnails, + 'duration': 
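
Since the asset lookup above is built with urljoin against the page URL, the
one endpoint path serves all three national sites; only the origin changes.
For instance (a pure URL computation, safe to run offline):

from urllib.parse import urljoin

def asset_endpoint(page_url, video_id):
    return urljoin(page_url, '/sb/public/asset/' + video_id)

# asset_endpoint('https://play.tv3.lt/aferistai-10047125', '10047125')
#   -> 'https://play.tv3.lt/sb/public/asset/10047125'
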
parse_duration(asset_title.get('runTime')), + 'series': asset.get('tvSeriesTitle'), + 'season': asset.get('tvSeasonTitle'), + 'season_number': int_or_none(metadata.get('seasonNumber')), + 'episode': asset_title.get('titleBrief'), + 'episode_number': int_or_none(metadata.get('episodeNumber')), 'formats': formats, } -- GitLab From 8b9bc4eeeeaa7cf167d4a3ed8c09f4d6fc77c8ed Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:51:54 +0100 Subject: [PATCH 108/384] [generic] comment a test covered now by AmericasTestKitchenIE --- haruhi_dl/extractor/generic.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index 037fc4d7a..7cfc7464e 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -2126,23 +2126,23 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, - { - # Zype embed - 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', - 'info_dict': { - 'id': '5b400b834b32992a310622b9', - 'ext': 'mp4', - 'title': 'Smoky Barbecue Favorites', - 'thumbnail': r're:^https?://.*\.jpe?g', - 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', - 'upload_date': '20170909', - 'timestamp': 1504915200, - }, - 'add_ie': [ZypeIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, + # { + # # Zype embed + # 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', + # 'info_dict': { + # 'id': '5b400b834b32992a310622b9', + # 'ext': 'mp4', + # 'title': 'Smoky Barbecue Favorites', + # 'thumbnail': r're:^https?://.*\.jpe?g', + # 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', + # 'upload_date': '20170909', + # 'timestamp': 1504915200, + # }, + # 'add_ie': [ZypeIE.ie_key()], + # 'params': { + # 'skip_download': True, + # }, + # }, { # videojs embed 'url': 'https://video.sibnet.ru/shell.php?videoid=3422904', -- GitLab From e754d9d1a52762170bb52bbe6b45a502902fb947 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:52:02 +0100 Subject: [PATCH 109/384] [telequebec] Fix Extraction and Add Support for video.telequebec.tv closes #25733 closes #26883 closes #27339 --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/telequebec.py | 160 ++++++++++++++++-------------- 2 files changed, 88 insertions(+), 73 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 7a0706532..bbbbadd8a 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1199,6 +1199,7 @@ from .telequebec import ( TeleQuebecSquatIE, TeleQuebecEmissionIE, TeleQuebecLiveIE, + TeleQuebecVideoIE, ) from .teletask import TeleTaskIE from .telewebion import TelewebionIE diff --git a/haruhi_dl/extractor/telequebec.py b/haruhi_dl/extractor/telequebec.py index b4c485b9b..800d87b70 100644 --- a/haruhi_dl/extractor/telequebec.py +++ b/haruhi_dl/extractor/telequebec.py @@ -12,25 +12,16 @@ from ..utils import ( class TeleQuebecBaseIE(InfoExtractor): + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + @staticmethod - def _result(url, ie_key): + def _brightcove_result(brightcove_id, player_id, account_id='6150020952001'): return { '_type': 'url_transparent', - 'url': smuggle_url(url, {'geo_countries': ['CA']}), - 'ie_key': ie_key, + 'url': smuggle_url(TeleQuebecBaseIE.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, brightcove_id), {'geo_countries': 
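
The `geo_countries` hint being smuggled here rides along in the URL fragment
and is unpacked by the downstream BrightcoveNew extractor, which can then fake
its client country (typically via an X-Forwarded-For header) for CA-only
streams. The round trip, using the project's own helpers; the player URL is a
placeholder:

from haruhi_dl.utils import smuggle_url, unsmuggle_url

url = smuggle_url('http://players.example.invalid/index.html?videoId=1',
                  {'geo_countries': ['CA']})
plain_url, data = unsmuggle_url(url)
# plain_url is the original URL; data == {'geo_countries': ['CA']}
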
['CA']}), + 'ie_key': 'BrightcoveNew', } - @staticmethod - def _limelight_result(media_id): - return TeleQuebecBaseIE._result( - 'limelight:media:' + media_id, 'LimelightMedia') - - @staticmethod - def _brightcove_result(brightcove_id): - return TeleQuebecBaseIE._result( - 'http://players.brightcove.net/6150020952001/default_default/index.html?videoId=%s' - % brightcove_id, 'BrightcoveNew') - class TeleQuebecIE(TeleQuebecBaseIE): _VALID_URL = r'''(?x) @@ -44,14 +35,18 @@ class TeleQuebecIE(TeleQuebecBaseIE): # available till 01.01.2023 'url': 'http://zonevideo.telequebec.tv/media/37578/un-petit-choc-et-puis-repart/un-chef-a-la-cabane', 'info_dict': { - 'id': '577116881b4b439084e6b1cf4ef8b1b3', + 'id': '6155972771001', 'ext': 'mp4', 'title': 'Un petit choc et puis repart!', - 'description': 'md5:067bc84bd6afecad85e69d1000730907', + 'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374', + 'timestamp': 1589262469, + 'uploader_id': '6150020952001', + 'upload_date': '20200512', }, 'params': { - 'skip_download': True, + 'format': 'bestvideo', }, + 'add_ie': ['BrightcoveNew'], }, { 'url': 'https://zonevideo.telequebec.tv/media/55267/le-soleil/passe-partout', 'info_dict': { @@ -65,7 +60,6 @@ class TeleQuebecIE(TeleQuebecBaseIE): }, 'params': { 'format': 'bestvideo', - 'skip_download': True, }, 'add_ie': ['BrightcoveNew'], }, { @@ -79,25 +73,20 @@ class TeleQuebecIE(TeleQuebecBaseIE): def _real_extract(self, url): media_id = self._match_id(url) - - media_data = self._download_json( - 'https://mnmedias.api.telequebec.tv/api/v2/media/' + media_id, + media = self._download_json( + 'https://mnmedias.api.telequebec.tv/api/v3/media/' + media_id, media_id)['media'] - - source_id = media_data['streamInfo']['sourceId'] - source = (try_get( - media_data, lambda x: x['streamInfo']['source'], - compat_str) or 'limelight').lower() - if source == 'brightcove': - info = self._brightcove_result(source_id) - else: - info = self._limelight_result(source_id) + source_id = next(source_info['sourceId'] for source_info in media['streamInfos'] if source_info.get('source') == 'Brightcove') + info = self._brightcove_result(source_id, '22gPKdt7f') + product = media.get('product') or {} + season = product.get('season') or {} info.update({ - 'title': media_data.get('title'), - 'description': try_get( - media_data, lambda x: x['descriptions'][0]['text'], compat_str), - 'duration': int_or_none( - media_data.get('durationInMilliseconds'), 1000), + 'description': try_get(media, lambda x: x['descriptions'][-1]['text'], compat_str), + 'series': try_get(season, lambda x: x['serie']['titre']), + 'season': season.get('name'), + 'season_number': int_or_none(season.get('seasonNo')), + 'episode': product.get('titre'), + 'episode_number': int_or_none(product.get('episodeNo')), }) return info @@ -148,7 +137,7 @@ class TeleQuebecSquatIE(InfoExtractor): } -class TeleQuebecEmissionIE(TeleQuebecBaseIE): +class TeleQuebecEmissionIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: @@ -160,15 +149,16 @@ class TeleQuebecEmissionIE(TeleQuebecBaseIE): _TESTS = [{ 'url': 'http://lindicemcsween.telequebec.tv/emissions/100430013/des-soins-esthetiques-a-377-d-interets-annuels-ca-vous-tente', 'info_dict': { - 'id': '66648a6aef914fe3badda25e81a4d50a', + 'id': '6154476028001', 'ext': 'mp4', - 'title': "Des soins esthétiques à 377 % d'intérêts annuels, ça vous tente?", - 'description': 'md5:369e0d55d0083f1fc9b71ffb640ea014', - 'upload_date': '20171024', - 'timestamp': 1508862118, + 'title': 'Des soins esthétiques à 377 % d’intérêts annuels, ça vous 
tente?', + 'description': 'md5:cb4d378e073fae6cce1f87c00f84ae9f', + 'upload_date': '20200505', + 'timestamp': 1588713424, + 'uploader_id': '6150020952001', }, 'params': { - 'skip_download': True, + 'format': 'bestvideo', }, }, { 'url': 'http://bancpublic.telequebec.tv/emissions/emission-49/31986/jeunes-meres-sous-pression', @@ -187,26 +177,26 @@ class TeleQuebecEmissionIE(TeleQuebecBaseIE): webpage = self._download_webpage(url, display_id) media_id = self._search_regex( - r'mediaUID\s*:\s*["\'][Ll]imelight_(?P<id>[a-z0-9]{32})', webpage, - 'limelight id') + r'mediaId\s*:\s*(?P<id>\d+)', webpage, 'media id') - info = self._limelight_result(media_id) - info.update({ - 'title': self._og_search_title(webpage, default=None), - 'description': self._og_search_description(webpage, default=None), - }) - return info + return self.url_result( + 'http://zonevideo.telequebec.tv/media/' + media_id, + TeleQuebecIE.ie_key()) -class TeleQuebecLiveIE(InfoExtractor): +class TeleQuebecLiveIE(TeleQuebecBaseIE): _VALID_URL = r'https?://zonevideo\.telequebec\.tv/(?P<id>endirect)' _TEST = { 'url': 'http://zonevideo.telequebec.tv/endirect/', 'info_dict': { - 'id': 'endirect', + 'id': '6159095684001', 'ext': 'mp4', - 'title': 're:^Télé-Québec - En direct [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^Télé-Québec [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, + 'description': 'Canal principal de Télé-Québec', + 'uploader_id': '6150020952001', + 'timestamp': 1590439901, + 'upload_date': '20200525', }, 'params': { 'skip_download': True, @@ -214,25 +204,49 @@ class TeleQuebecLiveIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + return self._brightcove_result('6159095684001', 'skCsmi2Uw') - m3u8_url = None - webpage = self._download_webpage( - 'https://player.telequebec.tv/Tq_VideoPlayer.js', video_id, - fatal=False) - if webpage: - m3u8_url = self._search_regex( - r'm3U8Url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'm3u8 url', default=None, group='url') - if not m3u8_url: - m3u8_url = 'https://teleqmmd.mmdlive.lldns.net/teleqmmd/f386e3b206814e1f8c8c1c71c0f8e748/manifest.m3u8' - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', m3u8_id='hls') - self._sort_formats(formats) - return { - 'id': video_id, - 'title': self._live_title('Télé-Québec - En direct'), - 'is_live': True, - 'formats': formats, - } +class TeleQuebecVideoIE(TeleQuebecBaseIE): + _VALID_URL = r'https?://video\.telequebec\.tv/player(?:-live)?/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://video.telequebec.tv/player/31110/stream', + 'info_dict': { + 'id': '6202570652001', + 'ext': 'mp4', + 'title': 'Le coût du véhicule le plus vendu au Canada / Tous les frais liés à la procréation assistée', + 'description': 'md5:685a7e4c450ba777c60adb6e71e41526', + 'upload_date': '20201019', + 'timestamp': 1603115930, + 'uploader_id': '6101674910001', + }, + 'params': { + 'format': 'bestvideo', + }, + }, { + 'url': 'https://video.telequebec.tv/player-live/28527', + 'only_matching': True, + }] + + def _call_api(self, path, video_id): + return self._download_json( + 'http://beacon.playback.api.brightcove.com/telequebec/api/assets/' + path, + video_id, query={'device_layout': 'web', 'device_type': 'web'})['data'] + + def _real_extract(self, url): + asset_id = self._match_id(url) + asset = self._call_api(asset_id, asset_id)['asset'] + stream = self._call_api( + asset_id + '/streams/' + asset['streams'][0]['id'], asset_id)['stream'] + stream_url = stream['url'] + account_id = try_get( 
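
try_get swallows the whole KeyError/TypeError chain raised by its lambda, so
when video_provider_details is absent the expression below simply yields None
and the hard-coded account id takes over via `or`. A self-contained
illustration with a hypothetical stream dict:

from haruhi_dl.utils import try_get

stream = {'url': 'https://example.invalid/master.m3u8'}  # no provider details
account_id = try_get(
    stream, lambda x: x['video_provider_details']['account_id']) or '6101674910001'
assert account_id == '6101674910001'
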
+ stream, lambda x: x['video_provider_details']['account_id']) or '6101674910001' + info = self._brightcove_result(stream_url, 'default', account_id) + info.update({ + 'description': asset.get('long_description') or asset.get('short_description'), + 'series': asset.get('series_original_name'), + 'season_number': int_or_none(asset.get('season_number')), + 'episode': asset.get('original_name'), + 'episode_number': int_or_none(asset.get('episode_number')), + }) + return info -- GitLab From 228d41686d835e4c660131e936b2df862af0cf49 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:52:17 +0100 Subject: [PATCH 110/384] [amcnetworks] Fix free content extraction(closes #20354) --- haruhi_dl/extractor/amcnetworks.py | 50 +++++++++++++++--------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/haruhi_dl/extractor/amcnetworks.py b/haruhi_dl/extractor/amcnetworks.py index 6fb3d6c53..12b6de0bf 100644 --- a/haruhi_dl/extractor/amcnetworks.py +++ b/haruhi_dl/extractor/amcnetworks.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .theplatform import ThePlatformIE from ..utils import ( int_or_none, @@ -11,25 +13,22 @@ from ..utils import ( class AMCNetworksIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<site>amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P<id>(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1', - 'md5': '', + 'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631', 'info_dict': { - 'id': 's3MX01Nl4vPH', + 'id': '4Lq1dzOnZGt0', 'ext': 'mp4', - 'title': 'Maron - Season 4 - Step 1', - 'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. Starring Marc Maron and Constance Zimmer.', - 'age_limit': 17, - 'upload_date': '20160505', - 'timestamp': 1462468831, + 'title': "The Graham Norton Show - Season 28 - Tina Fey's Adorable Airline-Themed Family Dinner", + 'description': "It turns out child stewardesses are very generous with the wine! 
All-new episodes of 'The Graham Norton Show' premiere Fridays at 11/10c on BBC America.", + 'upload_date': '20201120', + 'timestamp': 1605904350, 'uploader': 'AMCN', }, 'params': { # m3u8 download 'skip_download': True, }, - 'skip': 'Requires TV provider accounts', }, { 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', 'only_matching': True, @@ -55,32 +54,33 @@ class AMCNetworksIE(ThePlatformIE): 'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1', 'only_matching': True, }] + _REQUESTOR_ID_MAP = { + 'amc': 'AMC', + 'bbcamerica': 'BBCA', + 'ifc': 'IFC', + 'sundancetv': 'SUNDANCE', + 'wetv': 'WETV', + } def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + site, display_id = re.match(self._VALID_URL, url).groups() + requestor_id = self._REQUESTOR_ID_MAP[site] + properties = self._download_json( + 'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/%s/url/%s' % (requestor_id.lower(), display_id), + display_id)['data']['properties'] query = { 'mbr': 'true', 'manifest': 'm3u', } - media_url = self._search_regex( - r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)', - webpage, 'media url') - theplatform_metadata = self._download_theplatform_metadata(self._search_regex( - r'link\.theplatform\.com/s/([^?]+)', - media_url, 'theplatform_path'), display_id) + tp_path = 'M_UwQC/media/' + properties['videoPid'] + media_url = 'https://link.theplatform.com/s/' + tp_path + theplatform_metadata = self._download_theplatform_metadata(tp_path, display_id) info = self._parse_theplatform_metadata(theplatform_metadata) video_id = theplatform_metadata['pid'] title = theplatform_metadata['title'] rating = try_get( theplatform_metadata, lambda x: x['ratings'][0]['rating']) - auth_required = self._search_regex( - r'window\.authRequired\s*=\s*(true|false);', - webpage, 'auth required') - if auth_required == 'true': - requestor_id = self._search_regex( - r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)', - webpage, 'requestor id') + if properties.get('videoCategory') == 'TVE-Auth': resource = self._get_mvpd_resource( requestor_id, title, video_id, rating) query['auth'] = self._extract_mvpd_auth( -- GitLab From eefe89651dccc952d48413da555a658da325ba16 Mon Sep 17 00:00:00 2001 From: EntranceJew <EntranceJew@gmail.com> Date: Fri, 26 Feb 2021 14:53:07 +0100 Subject: [PATCH 111/384] [tubitv] Extract release year (#27317) --- haruhi_dl/extractor/tubitv.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/haruhi_dl/extractor/tubitv.py b/haruhi_dl/extractor/tubitv.py index a51fa6515..ebfb05c63 100644 --- a/haruhi_dl/extractor/tubitv.py +++ b/haruhi_dl/extractor/tubitv.py @@ -33,6 +33,19 @@ class TubiTvIE(InfoExtractor): }, { 'url': 'http://tubitv.com/movies/383676/tracker', 'only_matching': True, + }, { + 'url': 'https://tubitv.com/movies/560057/penitentiary?start=true', + 'info_dict': { + 'id': '560057', + 'ext': 'mp4', + 'title': 'Penitentiary', + 'description': 'md5:8d2fc793a93cc1575ff426fdcb8dd3f9', + 'uploader_id': 'd8fed30d4f24fcb22ec294421b9defc2', + 'release_year': 1979, + }, + 'params': { + 'skip_download': True, + }, }] def _login(self): @@ -93,4 +106,5 @@ class TubiTvIE(InfoExtractor): 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), 'uploader_id': video_data.get('publisher_id'), + 'release_year': int_or_none(video_data.get('year')), } -- GitLab From 
325ff4c628d0fb57550682554641d461d287d94e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:53:18 +0100 Subject: [PATCH 112/384] [beampro] Remove Extractor closes #17290 closes #22871 closes #23020 closes #23061 closes #26099 --- haruhi_dl/extractor/beampro.py | 194 ------------------------------ haruhi_dl/extractor/extractors.py | 4 - 2 files changed, 198 deletions(-) delete mode 100644 haruhi_dl/extractor/beampro.py diff --git a/haruhi_dl/extractor/beampro.py b/haruhi_dl/extractor/beampro.py deleted file mode 100644 index 86abdae00..000000000 --- a/haruhi_dl/extractor/beampro.py +++ /dev/null @@ -1,194 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - clean_html, - compat_str, - float_or_none, - int_or_none, - parse_iso8601, - try_get, - urljoin, -) - - -class BeamProBaseIE(InfoExtractor): - _API_BASE = 'https://mixer.com/api/v1' - _RATINGS = {'family': 0, 'teen': 13, '18+': 18} - - def _extract_channel_info(self, chan): - user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id']) - return { - 'uploader': chan.get('token') or try_get( - chan, lambda x: x['user']['username'], compat_str), - 'uploader_id': compat_str(user_id) if user_id else None, - 'age_limit': self._RATINGS.get(chan.get('audience')), - } - - -class BeamProLiveIE(BeamProBaseIE): - IE_NAME = 'Mixer:live' - _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'http://mixer.com/niterhayven', - 'info_dict': { - 'id': '261562', - 'ext': 'mp4', - 'title': 'Introducing The Witcher 3 // The Grind Starts Now!', - 'description': 'md5:0b161ac080f15fe05d18a07adb44a74d', - 'thumbnail': r're:https://.*\.jpg$', - 'timestamp': 1483477281, - 'upload_date': '20170103', - 'uploader': 'niterhayven', - 'uploader_id': '373396', - 'age_limit': 18, - 'is_live': True, - 'view_count': int, - }, - 'skip': 'niterhayven is offline', - 'params': { - 'skip_download': True, - }, - } - - _MANIFEST_URL_TEMPLATE = '%s/channels/%%s/manifest.%%s' % BeamProBaseIE._API_BASE - - @classmethod - def suitable(cls, url): - return False if BeamProVodIE.suitable(url) else super(BeamProLiveIE, cls).suitable(url) - - def _real_extract(self, url): - channel_name = self._match_id(url) - - chan = self._download_json( - '%s/channels/%s' % (self._API_BASE, channel_name), channel_name) - - if chan.get('online') is False: - raise ExtractorError( - '{0} is offline'.format(channel_name), expected=True) - - channel_id = chan['id'] - - def manifest_url(kind): - return self._MANIFEST_URL_TEMPLATE % (channel_id, kind) - - formats = self._extract_m3u8_formats( - manifest_url('m3u8'), channel_name, ext='mp4', m3u8_id='hls', - fatal=False) - formats.extend(self._extract_smil_formats( - manifest_url('smil'), channel_name, fatal=False)) - self._sort_formats(formats) - - info = { - 'id': compat_str(chan.get('id') or channel_name), - 'title': self._live_title(chan.get('name') or channel_name), - 'description': clean_html(chan.get('description')), - 'thumbnail': try_get( - chan, lambda x: x['thumbnail']['url'], compat_str), - 'timestamp': parse_iso8601(chan.get('updatedAt')), - 'is_live': True, - 'view_count': int_or_none(chan.get('viewersTotal')), - 'formats': formats, - } - info.update(self._extract_channel_info(chan)) - - return info - - -class BeamProVodIE(BeamProBaseIE): - IE_NAME = 'Mixer:vod' - _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/[^/?#&]+\?.*?\bvod=(?P<id>[^?#&]+)' - 
_TESTS = [{ - 'url': 'https://mixer.com/willow8714?vod=2259830', - 'md5': 'b2431e6e8347dc92ebafb565d368b76b', - 'info_dict': { - 'id': '2259830', - 'ext': 'mp4', - 'title': 'willow8714\'s Channel', - 'duration': 6828.15, - 'thumbnail': r're:https://.*source\.png$', - 'timestamp': 1494046474, - 'upload_date': '20170506', - 'uploader': 'willow8714', - 'uploader_id': '6085379', - 'age_limit': 13, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://mixer.com/streamer?vod=IxFno1rqC0S_XJ1a2yGgNw', - 'only_matching': True, - }, { - 'url': 'https://mixer.com/streamer?vod=Rh3LY0VAqkGpEQUe2pN-ig', - 'only_matching': True, - }] - - @staticmethod - def _extract_format(vod, vod_type): - if not vod.get('baseUrl'): - return [] - - if vod_type == 'hls': - filename, protocol = 'manifest.m3u8', 'm3u8_native' - elif vod_type == 'raw': - filename, protocol = 'source.mp4', 'https' - else: - assert False - - data = vod.get('data') if isinstance(vod.get('data'), dict) else {} - - format_id = [vod_type] - if isinstance(data.get('Height'), compat_str): - format_id.append('%sp' % data['Height']) - - return [{ - 'url': urljoin(vod['baseUrl'], filename), - 'format_id': '-'.join(format_id), - 'ext': 'mp4', - 'protocol': protocol, - 'width': int_or_none(data.get('Width')), - 'height': int_or_none(data.get('Height')), - 'fps': int_or_none(data.get('Fps')), - 'tbr': int_or_none(data.get('Bitrate'), 1000), - }] - - def _real_extract(self, url): - vod_id = self._match_id(url) - - vod_info = self._download_json( - '%s/recordings/%s' % (self._API_BASE, vod_id), vod_id) - - state = vod_info.get('state') - if state != 'AVAILABLE': - raise ExtractorError( - 'VOD %s is not available (state: %s)' % (vod_id, state), - expected=True) - - formats = [] - thumbnail_url = None - - for vod in vod_info['vods']: - vod_type = vod.get('format') - if vod_type in ('hls', 'raw'): - formats.extend(self._extract_format(vod, vod_type)) - elif vod_type == 'thumbnail': - thumbnail_url = urljoin(vod.get('baseUrl'), 'source.png') - - self._sort_formats(formats) - - info = { - 'id': vod_id, - 'title': vod_info.get('name') or vod_id, - 'duration': float_or_none(vod_info.get('duration')), - 'thumbnail': thumbnail_url, - 'timestamp': parse_iso8601(vod_info.get('createdAt')), - 'view_count': int_or_none(vod_info.get('viewsTotal')), - 'formats': formats, - } - info.update(self._extract_channel_info(vod_info.get('channel') or {})) - - return info diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index bbbbadd8a..cb9c69e8b 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -101,10 +101,6 @@ from .bbc import ( BBCCoUkPlaylistIE, BBCIE, ) -from .beampro import ( - BeamProLiveIE, - BeamProVodIE, -) from .beeg import BeegIE from .behindkink import BehindKinkIE from .bellmedia import BellMediaIE -- GitLab From ec7e1e27c27bd6d71d9743e0c51509ed07702a19 Mon Sep 17 00:00:00 2001 From: Andrey Smirnoff <37037851+mashed-potatoes@users.noreply.github.com> Date: Fri, 26 Feb 2021 14:53:29 +0100 Subject: [PATCH 113/384] [smotri] Remove extractor (#27358) --- haruhi_dl/extractor/extractors.py | 6 - haruhi_dl/extractor/generic.py | 6 - haruhi_dl/extractor/smotri.py | 416 ------------------------------ haruhi_dl/options.py | 2 +- 4 files changed, 1 insertion(+), 429 deletions(-) delete mode 100644 haruhi_dl/extractor/smotri.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index cb9c69e8b..297a5e02b 100644 --- 
a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1085,12 +1085,6 @@ from .sky import ( from .slideshare import SlideshareIE from .slideslive import SlidesLiveIE from .slutload import SlutloadIE -from .smotri import ( - SmotriIE, - SmotriCommunityIE, - SmotriUserIE, - SmotriBroadcastIE, -) from .snotr import SnotrIE from .sohu import SohuIE from .sonyliv import SonyLIVIE diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index 7cfc7464e..a321bcd6d 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -52,7 +52,6 @@ from .ooyala import OoyalaIE from .rutv import RUTVIE from .tvc import TVCIE from .sportbox import SportBoxIE -from .smotri import SmotriIE from .myvi import MyviIE from .condenast import CondeNastIE from .udn import UDNEmbedIE @@ -2804,11 +2803,6 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url')) - # Look for embedded smotri.com player - smotri_url = SmotriIE._extract_url(webpage) - if smotri_url: - return self.url_result(smotri_url, 'Smotri') - # Look for embedded Myvi.ru player myvi_url = MyviIE._extract_url(webpage) if myvi_url: diff --git a/haruhi_dl/extractor/smotri.py b/haruhi_dl/extractor/smotri.py deleted file mode 100644 index 45995f30f..000000000 --- a/haruhi_dl/extractor/smotri.py +++ /dev/null @@ -1,416 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import json -import hashlib -import uuid - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - sanitized_Request, - unified_strdate, - urlencode_postdata, - xpath_text, -) - - -class SmotriIE(InfoExtractor): - IE_DESC = 'Smotri.com' - IE_NAME = 'smotri' - _VALID_URL = r'https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})' - _NETRC_MACHINE = 'smotri' - - _TESTS = [ - # real video id 2610366 - { - 'url': 'http://smotri.com/video/view/?id=v261036632ab', - 'md5': '02c0dfab2102984e9c5bb585cc7cc321', - 'info_dict': { - 'id': 'v261036632ab', - 'ext': 'mp4', - 'title': 'катастрофа с камер видеонаблюдения', - 'uploader': 'rbc2008', - 'uploader_id': 'rbc08', - 'upload_date': '20131118', - 'thumbnail': 'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg', - }, - }, - # real video id 57591 - { - 'url': 'http://smotri.com/video/view/?id=v57591cb20', - 'md5': '830266dfc21f077eac5afd1883091bcd', - 'info_dict': { - 'id': 'v57591cb20', - 'ext': 'flv', - 'title': 'test', - 'uploader': 'Support Photofile@photofile', - 'uploader_id': 'support-photofile', - 'upload_date': '20070704', - 'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg', - }, - }, - # video-password, not approved by moderator - { - 'url': 'http://smotri.com/video/view/?id=v1390466a13c', - 'md5': 'f6331cef33cad65a0815ee482a54440b', - 'info_dict': { - 'id': 'v1390466a13c', - 'ext': 'mp4', - 'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', - 'uploader': 'timoxa40', - 'uploader_id': 'timoxa40', - 'upload_date': '20100404', - 'thumbnail': 'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg', - }, - 'params': { - 'videopassword': 'qwerty', - }, - 'skip': 'Video is not approved by moderator', - }, - # video-password - { - 'url': 'http://smotri.com/video/view/?id=v6984858774#', - 'md5': 'f11e01d13ac676370fc3b95b9bda11b0', - 'info_dict': { - 'id': 'v6984858774', - 'ext': 'mp4', - 'title': 'Дача Солженицина ПАРОЛЬ 223322', - 'uploader': 'psavari1', - 'uploader_id': 
'psavari1', - 'upload_date': '20081103', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - 'videopassword': '223322', - }, - }, - # age limit + video-password, not approved by moderator - { - 'url': 'http://smotri.com/video/view/?id=v15408898bcf', - 'md5': '91e909c9f0521adf5ee86fbe073aad70', - 'info_dict': { - 'id': 'v15408898bcf', - 'ext': 'flv', - 'title': 'этот ролик не покажут по ТВ', - 'uploader': 'zzxxx', - 'uploader_id': 'ueggb', - 'upload_date': '20101001', - 'thumbnail': 'http://frame3.loadup.ru/75/75/1540889.1.3.jpg', - 'age_limit': 18, - }, - 'params': { - 'videopassword': '333' - }, - 'skip': 'Video is not approved by moderator', - }, - # age limit + video-password - { - 'url': 'http://smotri.com/video/view/?id=v7780025814', - 'md5': 'b4599b068422559374a59300c5337d72', - 'info_dict': { - 'id': 'v7780025814', - 'ext': 'mp4', - 'title': 'Sexy Beach (пароль 123)', - 'uploader': 'вАся', - 'uploader_id': 'asya_prosto', - 'upload_date': '20081218', - 'thumbnail': r're:^https?://.*\.jpg$', - 'age_limit': 18, - }, - 'params': { - 'videopassword': '123' - }, - }, - # swf player - { - 'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500', - 'md5': '31099eeb4bc906712c5f40092045108d', - 'info_dict': { - 'id': 'v9188090500', - 'ext': 'mp4', - 'title': 'Shakira - Don\'t Bother', - 'uploader': 'HannahL', - 'uploader_id': 'lisaha95', - 'upload_date': '20090331', - 'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg', - }, - }, - ] - - @classmethod - def _extract_url(cls, webpage): - mobj = re.search( - r'<embed[^>]src=(["\'])(?P<url>http://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)', - webpage) - if mobj is not None: - return mobj.group('url') - - mobj = re.search( - r'''(?x)<div\s+class="video_file">http://smotri\.com/video/download/file/[^<]+</div>\s* - <div\s+class="video_image">[^<]+</div>\s* - <div\s+class="video_id">(?P<id>[^<]+)</div>''', webpage) - if mobj is not None: - return 'http://smotri.com/video/view/?id=%s' % mobj.group('id') - - def _search_meta(self, name, html, display_name=None): - if display_name is None: - display_name = name - return self._html_search_meta(name, html, display_name) - - def _real_extract(self, url): - video_id = self._match_id(url) - - video_form = { - 'ticket': video_id, - 'video_url': '1', - 'frame_url': '1', - 'devid': 'LoadupFlashPlayer', - 'getvideoinfo': '1', - } - - video_password = self._downloader.params.get('videopassword') - if video_password: - video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() - - video = self._download_json( - 'http://smotri.com/video/view/url/bot/', - video_id, 'Downloading video JSON', - data=urlencode_postdata(video_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - video_url = video.get('_vidURL') or video.get('_vidURL_mp4') - - if not video_url: - if video.get('_moderate_no'): - raise ExtractorError( - 'Video %s has not been approved by moderator' % video_id, expected=True) - - if video.get('error'): - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - if video.get('_pass_protected') == 1: - msg = ('Invalid video password' if video_password - else 'This video is protected by a password, use the --video-password option') - raise ExtractorError(msg, expected=True) - - title = video['title'] - thumbnail = video.get('_imgURL') - upload_date = unified_strdate(video.get('added')) - uploader = video.get('userNick') - uploader_id = video.get('userLogin') - duration = 
int_or_none(video.get('duration')) - - # Video JSON does not provide enough meta data - # We will extract some from the video web page instead - webpage_url = 'http://smotri.com/video/view/?id=%s' % video_id - webpage = self._download_webpage(webpage_url, video_id, 'Downloading video page') - - # Warning if video is unavailable - warning = self._html_search_regex( - r'<div[^>]+class="videoUnModer"[^>]*>(.+?)</div>', webpage, - 'warning message', default=None) - if warning is not None: - self._downloader.report_warning( - 'Video %s may not be available; smotri said: %s ' % - (video_id, warning)) - - # Adult content - if 'EroConfirmText">' in webpage: - self.report_age_confirmation() - confirm_string = self._html_search_regex( - r'<a[^>]+href="/video/view/\?id=%s&confirm=([^"]+)"' % video_id, - webpage, 'confirm string') - confirm_url = webpage_url + '&confirm=%s' % confirm_string - webpage = self._download_webpage( - confirm_url, video_id, - 'Downloading video page (age confirmed)') - adult_content = True - else: - adult_content = False - - view_count = self._html_search_regex( - r'(?s)Общее количество просмотров.*?<span class="Number">(\d+)</span>', - webpage, 'view count', fatal=False) - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, - 'uploader_id': uploader_id, - 'duration': duration, - 'view_count': int_or_none(view_count), - 'age_limit': 18 if adult_content else 0, - } - - -class SmotriCommunityIE(InfoExtractor): - IE_DESC = 'Smotri.com community videos' - IE_NAME = 'smotri:community' - _VALID_URL = r'https?://(?:www\.)?smotri\.com/community/video/(?P<id>[0-9A-Za-z_\'-]+)' - _TEST = { - 'url': 'http://smotri.com/community/video/kommuna', - 'info_dict': { - 'id': 'kommuna', - }, - 'playlist_mincount': 4, - } - - def _real_extract(self, url): - community_id = self._match_id(url) - - rss = self._download_xml( - 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id, - community_id, 'Downloading community RSS') - - entries = [ - self.url_result(video_url.text, SmotriIE.ie_key()) - for video_url in rss.findall('./channel/item/link')] - - return self.playlist_result(entries, community_id) - - -class SmotriUserIE(InfoExtractor): - IE_DESC = 'Smotri.com user videos' - IE_NAME = 'smotri:user' - _VALID_URL = r'https?://(?:www\.)?smotri\.com/user/(?P<id>[0-9A-Za-z_\'-]+)' - _TESTS = [{ - 'url': 'http://smotri.com/user/inspector', - 'info_dict': { - 'id': 'inspector', - 'title': 'Inspector', - }, - 'playlist_mincount': 9, - }] - - def _real_extract(self, url): - user_id = self._match_id(url) - - rss = self._download_xml( - 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id, - user_id, 'Downloading user RSS') - - entries = [self.url_result(video_url.text, 'Smotri') - for video_url in rss.findall('./channel/item/link')] - - description_text = xpath_text(rss, './channel/description') or '' - user_nickname = self._search_regex( - '^Видео режиссера (.+)$', description_text, - 'user nickname', fatal=False) - - return self.playlist_result(entries, user_id, user_nickname) - - -class SmotriBroadcastIE(InfoExtractor): - IE_DESC = 'Smotri.com broadcasts' - IE_NAME = 'smotri:broadcast' - _VALID_URL = r'https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<id>[^/]+))/?.*' - _NETRC_MACHINE = 'smotri' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - broadcast_id = mobj.group('id') - - broadcast_url = 'http://' + mobj.group('url') - broadcast_page = 
self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page') - - if re.search('>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None: - raise ExtractorError( - 'Broadcast %s does not exist' % broadcast_id, expected=True) - - # Adult content - if re.search('EroConfirmText">', broadcast_page) is not None: - - (username, password) = self._get_login_info() - if username is None: - self.raise_login_required( - 'Erotic broadcasts allowed only for registered users') - - login_form = { - 'login-hint53': '1', - 'confirm_erotic': '1', - 'login': username, - 'password': password, - } - - request = sanitized_Request( - broadcast_url + '/?no_redirect=1', urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - broadcast_page = self._download_webpage( - request, broadcast_id, 'Logging in and confirming age') - - if '>Неверный логин или пароль<' in broadcast_page: - raise ExtractorError( - 'Unable to log in: bad username or password', expected=True) - - adult_content = True - else: - adult_content = False - - ticket = self._html_search_regex( - (r'data-user-file=(["\'])(?P<ticket>(?!\1).+)\1', - r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'(?P<ticket>[^']+)'\)"), - broadcast_page, 'broadcast ticket', group='ticket') - - broadcast_url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket - - broadcast_password = self._downloader.params.get('videopassword') - if broadcast_password: - broadcast_url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() - - broadcast_json_page = self._download_webpage( - broadcast_url, broadcast_id, 'Downloading broadcast JSON') - - try: - broadcast_json = json.loads(broadcast_json_page) - - protected_broadcast = broadcast_json['_pass_protected'] == 1 - if protected_broadcast and not broadcast_password: - raise ExtractorError( - 'This broadcast is protected by a password, use the --video-password option', - expected=True) - - broadcast_offline = broadcast_json['is_play'] == 0 - if broadcast_offline: - raise ExtractorError('Broadcast %s is offline' % broadcast_id, expected=True) - - rtmp_url = broadcast_json['_server'] - mobj = re.search(r'^rtmp://[^/]+/(?P<app>.+)/?$', rtmp_url) - if not mobj: - raise ExtractorError('Unexpected broadcast rtmp URL') - - broadcast_playpath = broadcast_json['_streamName'] - broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL']) - broadcast_thumbnail = broadcast_json.get('_imgURL') - broadcast_title = self._live_title(broadcast_json['title']) - broadcast_description = broadcast_json.get('description') - broadcaster_nick = broadcast_json.get('nick') - broadcaster_login = broadcast_json.get('login') - rtmp_conn = 'S:%s' % uuid.uuid4().hex - except KeyError: - if protected_broadcast: - raise ExtractorError('Bad broadcast password', expected=True) - raise ExtractorError('Unexpected broadcast JSON') - - return { - 'id': broadcast_id, - 'url': rtmp_url, - 'title': broadcast_title, - 'thumbnail': broadcast_thumbnail, - 'description': broadcast_description, - 'uploader': broadcaster_nick, - 'uploader_id': broadcaster_login, - 'age_limit': 18 if adult_content else 0, - 'ext': 'flv', - 'play_path': broadcast_playpath, - 'player_url': 'http://pics.smotri.com/broadcast_play.swf', - 'app': broadcast_app, - 'rtmp_live': True, - 'rtmp_conn': rtmp_conn, - 'is_live': True, - } diff --git a/haruhi_dl/options.py b/haruhi_dl/options.py index acbef1584..76f97f452 100644 --- 
a/haruhi_dl/options.py +++ b/haruhi_dl/options.py @@ -369,7 +369,7 @@ def parseOpts(overrideArguments=None): authentication.add_option( '--video-password', dest='videopassword', metavar='PASSWORD', - help='Video password (vimeo, smotri, youku)') + help='Video password (vimeo, youku)') adobe_pass = optparse.OptionGroup(parser, 'Adobe Pass Options') adobe_pass.add_option( -- GitLab From e62320f70a3199917eeb580d43fddd1d594599ba Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:53:36 +0100 Subject: [PATCH 114/384] [facebook] remove hardcoded chrome user-agent closes #18974 closes #25411 closes #26958 closes #27329 --- haruhi_dl/extractor/facebook.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py index 72781bd80..2143d41a7 100644 --- a/haruhi_dl/extractor/facebook.py +++ b/haruhi_dl/extractor/facebook.py @@ -54,8 +54,6 @@ class FacebookIE(InfoExtractor): _NETRC_MACHINE = 'facebook' IE_NAME = 'facebook' - _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' - _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' @@ -306,9 +304,7 @@ class FacebookIE(InfoExtractor): self._login() def _extract_from_url(self, url, video_id, fatal_if_no_video=True): - req = sanitized_Request(url) - req.add_header('User-Agent', self._CHROME_USER_AGENT) - webpage = self._download_webpage(req, video_id) + webpage = self._download_webpage(url, video_id) video_data = None -- GitLab From feac903afbd615fa3390a8c7b86e5979f750de48 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:53:49 +0100 Subject: [PATCH 115/384] [facebook] try to reduce unnecessary tahoe requests --- haruhi_dl/extractor/facebook.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py index 2143d41a7..6045058c1 100644 --- a/haruhi_dl/extractor/facebook.py +++ b/haruhi_dl/extractor/facebook.py @@ -328,11 +328,10 @@ class FacebookIE(InfoExtractor): js_data, lambda x: x['jsmods']['instances'], list) or []) if not video_data: - server_js_data = self._parse_json( - self._search_regex( - r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)', - webpage, 'js data', default='{}'), - video_id, transform_source=js_to_json, fatal=False) + server_js_data = self._parse_json(self._search_regex([ + r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)', + r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"permalink_video_pagelet".*?})\);' + ], webpage, 'js data', default='{}'), video_id, js_to_json, False) video_data = extract_from_jsmods_instances(server_js_data) if not video_data: -- GitLab From fa06aa76adc1d20bd39d20b3e378df241daae893 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:53:53 +0100 Subject: [PATCH 116/384] [facebook] Add support for Relay based pages(closes #26823) --- haruhi_dl/extractor/facebook.py | 71 ++++++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/facebook.py 
b/haruhi_dl/extractor/facebook.py index 6045058c1..0d3a86b6c 100644 --- a/haruhi_dl/extractor/facebook.py +++ b/haruhi_dl/extractor/facebook.py @@ -16,11 +16,13 @@ from ..utils import ( clean_html, error_to_compat_str, ExtractorError, + float_or_none, get_element_by_id, int_or_none, js_to_json, limit_length, parse_count, + qualities, sanitized_Request, try_get, urlencode_postdata, @@ -327,6 +329,14 @@ class FacebookIE(InfoExtractor): return extract_video_data(try_get( js_data, lambda x: x['jsmods']['instances'], list) or []) + formats = [] + + def extract_dash_manifest(video): + dash_manifest = video.get('dash_manifest') + if dash_manifest: + formats.extend(self._parse_mpd_formats( + compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) + if not video_data: server_js_data = self._parse_json(self._search_regex([ r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)', @@ -334,6 +344,61 @@ class FacebookIE(InfoExtractor): ], webpage, 'js data', default='{}'), video_id, js_to_json, False) video_data = extract_from_jsmods_instances(server_js_data) + if not video_data: + graphql_data = self._parse_json(self._search_regex( + r'handleWithCustomApplyEach\([^,]+,\s*({.*?"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+".*?})\);', + webpage, 'graphql data', default='{}'), video_id, fatal=False) or {} + for require in (graphql_data.get('require') or []): + if require[0] == 'RelayPrefetchedStreamCache': + def parse_graphql_video(video): + q = qualities(['sd', 'hd']) + for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]: + playable_url = video.get('playable_url' + suffix) + if not playable_url: + continue + formats.append({ + 'format_id': format_id, + 'quality': q(format_id), + 'url': playable_url, + }) + extract_dash_manifest(video) + self._sort_formats(formats) + v_id = video.get('videoId') or video.get('id') or video_id + info = { + 'id': v_id, + 'formats': formats, + 'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']), + 'uploader_id': try_get(video, lambda x: x['owner']['id']), + 'timestamp': int_or_none(video.get('publish_time')), + 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), + } + description = try_get(video, lambda x: x['savable_description']['text']) + title = video.get('name') + if title: + info.update({ + 'title': title, + 'description': description, + }) + else: + info['title'] = description or 'Facebook video #%s' % v_id + return webpage, info + + data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} + + attachments = try_get(data, [ + lambda x: x['video']['story']['attachments'], + lambda x: x['video']['creation_story']['attachments'], + lambda x: x['node']['comet_sections']['content']['story']['attachments'] + ], list) or [] + for attachment in attachments: + media = attachment.get('media') or try_get(attachment, lambda x: x['style_type_renderer']['attachment']['media'], dict) or {} + if media.get('__typename') == 'Video': + return parse_graphql_video(media) + + video = data.get('video') or {} + if video: + return parse_graphql_video(video) + if not video_data: if not fatal_if_no_video: return webpage, False @@ -375,7 +440,6 @@ class FacebookIE(InfoExtractor): raise ExtractorError('Cannot parse data') subtitles = {} - formats = [] for f in video_data: format_id = f['stream_type'] if f and isinstance(f, dict): @@ -394,10 +458,7 @@ class FacebookIE(InfoExtractor): 'url': src, 
'preference': preference, }) - dash_manifest = f[0].get('dash_manifest') - if dash_manifest: - formats.extend(self._parse_mpd_formats( - compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) + extract_dash_manifest(f[0]) subtitles_src = f[0].get('subtitles_src') if subtitles_src: subtitles.setdefault('en', []).append({'url': subtitles_src}) -- GitLab From 91f1af44a1c740047cdd7929b8f4b2d681994ee4 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:53:58 +0100 Subject: [PATCH 117/384] [facebook] redirect Mobile URLs to Desktop URLs closes #24831 closes #25624 --- haruhi_dl/extractor/facebook.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py index 0d3a86b6c..bcb224c03 100644 --- a/haruhi_dl/extractor/facebook.py +++ b/haruhi_dl/extractor/facebook.py @@ -306,7 +306,8 @@ class FacebookIE(InfoExtractor): self._login() def _extract_from_url(self, url, video_id, fatal_if_no_video=True): - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) video_data = None -- GitLab From c9b7b7dd04cd854d61350bbd41837fdf6c2aa230 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:54:03 +0100 Subject: [PATCH 118/384] [itv] remove old extraction method and fix series metadata extraction closes #23177 closes #26897 --- haruhi_dl/extractor/itv.py | 307 +++++++++++-------------------------- 1 file changed, 90 insertions(+), 217 deletions(-) diff --git a/haruhi_dl/extractor/itv.py b/haruhi_dl/extractor/itv.py index ad2f4eca5..08bcc8b68 100644 --- a/haruhi_dl/extractor/itv.py +++ b/haruhi_dl/extractor/itv.py @@ -1,29 +1,21 @@ # coding: utf-8 from __future__ import unicode_literals -import uuid -import xml.etree.ElementTree as etree import json import re from .common import InfoExtractor from .brightcove import BrightcoveNewIE -from ..compat import ( - compat_str, - compat_etree_register_namespace, -) from ..utils import ( determine_ext, - ExtractorError, extract_attributes, - int_or_none, + get_element_by_class, + JSON_LD_RE, merge_dicts, parse_duration, smuggle_url, + strip_or_none, url_or_none, - xpath_with_ns, - xpath_element, - xpath_text, ) @@ -31,14 +23,18 @@ class ITVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)' _GEO_COUNTRIES = ['GB'] _TESTS = [{ - 'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053', + 'url': 'https://www.itv.com/hub/liar/2a4547a0012', 'info_dict': { - 'id': '2a2936a0053', - 'ext': 'flv', - 'title': 'Home Movie', + 'id': '2a4547a0012', + 'ext': 'mp4', + 'title': 'Liar - Series 2 - Episode 6', + 'description': 'md5:d0f91536569dec79ea184f0a44cca089', + 'series': 'Liar', + 'season_number': 2, + 'episode_number': 6, }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, }, { @@ -61,220 +57,97 @@ class ITVIE(InfoExtractor): params = extract_attributes(self._search_regex( r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params')) - ns_map = { - 'soapenv': 'http://schemas.xmlsoap.org/soap/envelope/', - 'tem': 'http://tempuri.org/', - 'itv': 'http://schemas.datacontract.org/2004/07/Itv.BB.Mercury.Common.Types', - 'com': 'http://schemas.itv.com/2009/05/Common', - } - for ns, full_ns in ns_map.items(): - compat_etree_register_namespace(ns, full_ns) - - def _add_ns(name): - return xpath_with_ns(name, ns_map) - - def _add_sub_element(element, 
name): - return etree.SubElement(element, _add_ns(name)) - - production_id = ( - params.get('data-video-autoplay-id') - or '%s#001' % ( - params.get('data-video-episode-id') - or video_id.replace('a', '/'))) - - req_env = etree.Element(_add_ns('soapenv:Envelope')) - _add_sub_element(req_env, 'soapenv:Header') - body = _add_sub_element(req_env, 'soapenv:Body') - get_playlist = _add_sub_element(body, ('tem:GetPlaylist')) - request = _add_sub_element(get_playlist, 'tem:request') - _add_sub_element(request, 'itv:ProductionId').text = production_id - _add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper() - vodcrid = _add_sub_element(request, 'itv:Vodcrid') - _add_sub_element(vodcrid, 'com:Id') - _add_sub_element(request, 'itv:Partition') - user_info = _add_sub_element(get_playlist, 'tem:userInfo') - _add_sub_element(user_info, 'itv:Broadcaster').text = 'Itv' - _add_sub_element(user_info, 'itv:DM') - _add_sub_element(user_info, 'itv:RevenueScienceValue') - _add_sub_element(user_info, 'itv:SessionId') - _add_sub_element(user_info, 'itv:SsoToken') - _add_sub_element(user_info, 'itv:UserToken') - site_info = _add_sub_element(get_playlist, 'tem:siteInfo') - _add_sub_element(site_info, 'itv:AdvertisingRestriction').text = 'None' - _add_sub_element(site_info, 'itv:AdvertisingSite').text = 'ITV' - _add_sub_element(site_info, 'itv:AdvertisingType').text = 'Any' - _add_sub_element(site_info, 'itv:Area').text = 'ITVPLAYER.VIDEO' - _add_sub_element(site_info, 'itv:Category') - _add_sub_element(site_info, 'itv:Platform').text = 'DotCom' - _add_sub_element(site_info, 'itv:Site').text = 'ItvCom' - device_info = _add_sub_element(get_playlist, 'tem:deviceInfo') - _add_sub_element(device_info, 'itv:ScreenSize').text = 'Big' - player_info = _add_sub_element(get_playlist, 'tem:playerInfo') - _add_sub_element(player_info, 'itv:Version').text = '2' - + ios_playlist_url = params.get('data-video-playlist') or params['data-video-id'] + hmac = params['data-video-hmac'] headers = self.geo_verification_headers() headers.update({ - 'Content-Type': 'text/xml; charset=utf-8', - 'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist', + 'Accept': 'application/vnd.itv.vod.playlist.v2+json', + 'Content-Type': 'application/json', + 'hmac': hmac.upper(), }) + ios_playlist = self._download_json( + ios_playlist_url, video_id, data=json.dumps({ + 'user': { + 'itvUserId': '', + 'entitlements': [], + 'token': '' + }, + 'device': { + 'manufacturer': 'Safari', + 'model': '5', + 'os': { + 'name': 'Windows NT', + 'version': '6.1', + 'type': 'desktop' + } + }, + 'client': { + 'version': '4.1', + 'id': 'browser' + }, + 'variantAvailability': { + 'featureset': { + 'min': ['hls', 'aes', 'outband-webvtt'], + 'max': ['hls', 'aes', 'outband-webvtt'] + }, + 'platformTag': 'dotcom' + } + }).encode(), headers=headers) + video_data = ios_playlist['Playlist']['Video'] + ios_base_url = video_data.get('Base') - info = self._search_json_ld(webpage, video_id, default={}) formats = [] - subtitles = {} - - def extract_subtitle(sub_url): - ext = determine_ext(sub_url, 'ttml') - subtitles.setdefault('en', []).append({ - 'url': sub_url, - 'ext': 'ttml' if ext == 'xml' else ext, - }) - - resp_env = self._download_xml( - params['data-playlist-url'], video_id, - headers=headers, data=etree.tostring(req_env), fatal=False) - if resp_env: - playlist = xpath_element(resp_env, './/Playlist') - if playlist is None: - fault_code = xpath_text(resp_env, './/faultcode') - fault_string = xpath_text(resp_env, './/faultstring') - if 
fault_code == 'InvalidGeoRegion': - self.raise_geo_restricted( - msg=fault_string, countries=self._GEO_COUNTRIES) - elif fault_code not in ( - 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, fault_string), expected=True) - info.update({ - 'title': self._og_search_title(webpage), - 'episode_title': params.get('data-video-episode'), - 'series': params.get('data-video-title'), - }) + for media_file in (video_data.get('MediaFiles') or []): + href = media_file.get('Href') + if not href: + continue + if ios_base_url: + href = ios_base_url + href + ext = determine_ext(href) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + href, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) else: - title = xpath_text(playlist, 'EpisodeTitle', default=None) - info.update({ - 'title': title, - 'episode_title': title, - 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), - 'series': xpath_text(playlist, 'ProgrammeTitle'), - 'duration': parse_duration(xpath_text(playlist, 'Duration')), + formats.append({ + 'url': href, }) - video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) - media_files = xpath_element(video_element, 'MediaFiles', fatal=True) - rtmp_url = media_files.attrib['base'] - - for media_file in media_files.findall('MediaFile'): - play_path = xpath_text(media_file, 'URL') - if not play_path: - continue - tbr = int_or_none(media_file.get('bitrate'), 1000) - f = { - 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), - 'play_path': play_path, - # Providing this swfVfy allows to avoid truncated downloads - 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', - 'page_url': url, - 'tbr': tbr, - 'ext': 'flv', - } - app = self._search_regex( - 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) - if app: - f.update({ - 'url': rtmp_url.split('?', 1)[0], - 'app': app, - }) - else: - f['url'] = rtmp_url - formats.append(f) - - for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): - if caption_url.text: - extract_subtitle(caption_url.text) + self._sort_formats(formats) - ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id') - hmac = params.get('data-video-hmac') - if ios_playlist_url and hmac and re.match(r'https?://', ios_playlist_url): - headers = self.geo_verification_headers() - headers.update({ - 'Accept': 'application/vnd.itv.vod.playlist.v2+json', - 'Content-Type': 'application/json', - 'hmac': hmac.upper(), + subtitles = {} + subs = video_data.get('Subtitles') or [] + for sub in subs: + if not isinstance(sub, dict): + continue + href = url_or_none(sub.get('Href')) + if not href: + continue + subtitles.setdefault('en', []).append({ + 'url': href, + 'ext': determine_ext(href, 'vtt'), }) - ios_playlist = self._download_json( - ios_playlist_url, video_id, data=json.dumps({ - 'user': { - 'itvUserId': '', - 'entitlements': [], - 'token': '' - }, - 'device': { - 'manufacturer': 'Safari', - 'model': '5', - 'os': { - 'name': 'Windows NT', - 'version': '6.1', - 'type': 'desktop' - } - }, - 'client': { - 'version': '4.1', - 'id': 'browser' - }, - 'variantAvailability': { - 'featureset': { - 'min': ['hls', 'aes', 'outband-webvtt'], - 'max': ['hls', 'aes', 'outband-webvtt'] - }, - 'platformTag': 'dotcom' - } - }).encode(), headers=headers, fatal=False) - if ios_playlist: - video_data = ios_playlist.get('Playlist', {}).get('Video', {}) - ios_base_url = video_data.get('Base') - for media_file in 
video_data.get('MediaFiles', []): - href = media_file.get('Href') - if not href: - continue - if ios_base_url: - href = ios_base_url + href - ext = determine_ext(href) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - href, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': href, - }) - subs = video_data.get('Subtitles') - if isinstance(subs, list): - for sub in subs: - if not isinstance(sub, dict): - continue - href = url_or_none(sub.get('Href')) - if href: - extract_subtitle(href) - if not info.get('duration'): - info['duration'] = parse_duration(video_data.get('Duration')) - - self._sort_formats(formats) - info.update({ + info = self._search_json_ld(webpage, video_id, default={}) + if not info: + json_ld = self._parse_json(self._search_regex( + JSON_LD_RE, webpage, 'JSON-LD', '{}', + group='json_ld'), video_id, fatal=False) + if json_ld and json_ld.get('@type') == 'BreadcrumbList': + for ile in (json_ld.get('itemListElement:') or []): + item = ile.get('item:') or {} + if item.get('@type') == 'TVEpisode': + item['@context'] = 'http://schema.org' + info = self._json_ld(item, video_id, fatal=False) or {} + break + + return merge_dicts({ 'id': video_id, + 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage), 'formats': formats, 'subtitles': subtitles, - }) - - webpage_info = self._search_json_ld(webpage, video_id, default={}) - if not webpage_info.get('title'): - webpage_info['title'] = self._html_search_regex( - r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<', - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', - default=None) or webpage_info['episode'] - - return merge_dicts(info, webpage_info) + 'duration': parse_duration(video_data.get('Duration')), + 'description': strip_or_none(get_element_by_class('episode-info__synopsis', webpage)), + }, info) class ITVBTCCIE(InfoExtractor): -- GitLab From 1765b2f8706a13502ee9788ea3b07c780462be72 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:54:08 +0100 Subject: [PATCH 119/384] [facebook] add support for group posts with multiple videos(closes #19131) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- haruhi_dl/extractor/facebook.py | 57 ++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py index bcb224c03..2c3e4b251 100644 --- a/haruhi_dl/extractor/facebook.py +++ b/haruhi_dl/extractor/facebook.py @@ -72,6 +72,7 @@ class FacebookIE(InfoExtractor): }, 'skip': 'Requires logging in', }, { + # data.video 'url': 'https://www.facebook.com/video.php?v=274175099429670', 'info_dict': { 'id': '274175099429670', @@ -133,6 +134,7 @@ class FacebookIE(InfoExtractor): }, }, { # have 1080P, but only up to 720p in swf params + # data.video.story.attachments[].media 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', 'md5': '9571fae53d4165bbbadb17a94651dcdc', 'info_dict': { @@ -147,6 +149,7 @@ class FacebookIE(InfoExtractor): }, }, { # bigPipe.onPageletArrive ... 
onPageletArrive pagelet_group_mall + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/', 'info_dict': { 'id': '1417995061575415', @@ -174,6 +177,7 @@ class FacebookIE(InfoExtractor): 'skip_download': True, }, }, { + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', 'info_dict': { 'id': '1396382447100162', @@ -193,18 +197,23 @@ class FacebookIE(InfoExtractor): 'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf', 'only_matching': True, }, { + # data.mediaset.currMedia.edges 'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater', 'only_matching': True, }, { + # data.video.story.attachments[].media 'url': 'facebook:544765982287235', 'only_matching': True, }, { + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', 'only_matching': True, }, { + # data.video.creation_story.attachments[].media 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', 'only_matching': True, }, { + # data.video 'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670', 'only_matching': True, }, { @@ -212,6 +221,7 @@ class FacebookIE(InfoExtractor): 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', 'only_matching': True, }, { + # data.video 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', 'info_dict': { 'id': '359649331226507', @@ -222,6 +232,13 @@ class FacebookIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media + 'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/', + 'info_dict': { + 'id': '106560053808006', + }, + 'playlist_count': 2, }] @staticmethod @@ -330,9 +347,7 @@ class FacebookIE(InfoExtractor): return extract_video_data(try_get( js_data, lambda x: x['jsmods']['instances'], list) or []) - formats = [] - - def extract_dash_manifest(video): + def extract_dash_manifest(video, formats): dash_manifest = video.get('dash_manifest') if dash_manifest: formats.extend(self._parse_mpd_formats( @@ -351,7 +366,10 @@ class FacebookIE(InfoExtractor): webpage, 'graphql data', default='{}'), video_id, fatal=False) or {} for require in (graphql_data.get('require') or []): if require[0] == 'RelayPrefetchedStreamCache': + entries = [] + def parse_graphql_video(video): + formats = [] q = qualities(['sd', 'hd']) for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]: playable_url = video.get('playable_url' + suffix) @@ -362,7 +380,7 @@ class FacebookIE(InfoExtractor): 'quality': q(format_id), 'url': playable_url, }) - extract_dash_manifest(video) + extract_dash_manifest(video, formats) self._sort_formats(formats) v_id = video.get('videoId') or video.get('id') or video_id info = { @@ -382,7 +400,12 @@ class FacebookIE(InfoExtractor): }) else: info['title'] = description or 'Facebook video #%s' % v_id - return webpage, info + entries.append(info) + + def parse_attachment(attachment, key='media'): + media = attachment.get(key) or {} + if media.get('__typename') == 'Video': + return parse_graphql_video(media) data = try_get(require, lambda x: 
x[3][1]['__bbox']['result']['data'], dict) or {} @@ -392,13 +415,22 @@ class FacebookIE(InfoExtractor): lambda x: x['node']['comet_sections']['content']['story']['attachments'] ], list) or [] for attachment in attachments: - media = attachment.get('media') or try_get(attachment, lambda x: x['style_type_renderer']['attachment']['media'], dict) or {} - if media.get('__typename') == 'Video': - return parse_graphql_video(media) + attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) or attachment + nodes = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] + for node in nodes: + parse_attachment(node) + parse_attachment(attachment) - video = data.get('video') or {} - if video: - return parse_graphql_video(video) + edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] + for edge in edges: + parse_attachment(edge, key='node') + + if not entries: + video = data.get('video') or {} + if video: + parse_graphql_video(video) + + return webpage, self.playlist_result(entries, video_id) if not video_data: if not fatal_if_no_video: @@ -440,6 +472,7 @@ class FacebookIE(InfoExtractor): if not video_data: raise ExtractorError('Cannot parse data') + formats = [] subtitles = {} for f in video_data: format_id = f['stream_type'] @@ -459,7 +492,7 @@ class FacebookIE(InfoExtractor): 'url': src, 'preference': preference, }) - extract_dash_manifest(f[0]) + extract_dash_manifest(f[0], formats) subtitles_src = f[0].get('subtitles_src') if subtitles_src: subtitles.setdefault('en', []).append({'url': subtitles_src}) -- GitLab From 493a5245dc16750e094fb228c40d1fe0614edfbe Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:54:13 +0100 Subject: [PATCH 120/384] [facebook] add support for watch videos(closes #22795) --- haruhi_dl/extractor/facebook.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py index 2c3e4b251..417d7a370 100644 --- a/haruhi_dl/extractor/facebook.py +++ b/haruhi_dl/extractor/facebook.py @@ -41,7 +41,8 @@ class FacebookIE(InfoExtractor): photo\.php| video\.php| video/embed| - story\.php + story\.php| + watch/? 
)\?(?:.*?)(?:v|video_id|story_fbid)=| [^/]+/videos/(?:[^/]+/)?| [^/]+/posts/| @@ -239,6 +240,20 @@ class FacebookIE(InfoExtractor): 'id': '106560053808006', }, 'playlist_count': 2, + }, { + # data.video_home_www_feed.video_home_sections.edges[].node.feed_section_renderer.section.section_components.edges[].node.feed_unit.attachments + 'url': 'https://www.facebook.com/watch/?v=125475412191640', + 'md5': 'a38bed45dd1b2881ea230f3561c914b7', + 'info_dict': { + 'id': '373249263226147', + 'ext': 'mp4', + 'title': 'شوف بعينيك ماذا يحدث...ماناش نخوف فيكم رانا ننقل لكم مايحدث...', + 'description': 'شوف بعينيك ماذا يحدث خويا العزيز...ماناش نخوف فيكم رانا ننقل لكم مايحدث...\nتذكروا جيدا ماكنا نقوله لكم منذ سنوات وماكنا نحذركم .', + 'timestamp': 1550353963, + 'upload_date': '20190216', + 'uploader_id': '176917942440142', + }, + 'skip': 'Requires logging in', }] @staticmethod @@ -425,6 +440,14 @@ class FacebookIE(InfoExtractor): for edge in edges: parse_attachment(edge, key='node') + video_home_sections = try_get(data, lambda x: x['video_home_www_feed']['video_home_sections']['edges'], list) or [] + for video_home_section in video_home_sections: + section_components = try_get(video_home_section, lambda x: x['node']['feed_section_renderer']['section']['section_components']['edges'], list) or [] + for section_component in section_components: + attachments = try_get(section_component, lambda x: x['node']['feed_unit']['attachments'], list) or [] + for attachment in attachments: + parse_attachment(attachment) + if not entries: video = data.get('video') or {} if video: -- GitLab From e51e641c6c80a9d3347a02c52401a21220c4768a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:54:19 +0100 Subject: [PATCH 121/384] Revert "[facebook] add support for watch videos(closes #22795)" This reverts commit dc65041c224497f46b2984df02c234ce54bdedfd. --- haruhi_dl/extractor/facebook.py | 25 +------------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py index 417d7a370..2c3e4b251 100644 --- a/haruhi_dl/extractor/facebook.py +++ b/haruhi_dl/extractor/facebook.py @@ -41,8 +41,7 @@ class FacebookIE(InfoExtractor): photo\.php| video\.php| video/embed| - story\.php| - watch/? 
+ story\.php )\?(?:.*?)(?:v|video_id|story_fbid)=| [^/]+/videos/(?:[^/]+/)?| [^/]+/posts/| @@ -240,20 +239,6 @@ class FacebookIE(InfoExtractor): 'id': '106560053808006', }, 'playlist_count': 2, - }, { - # data.video_home_www_feed.video_home_sections.edges[].node.feed_section_renderer.section.section_components.edges[].node.feed_unit.attachments - 'url': 'https://www.facebook.com/watch/?v=125475412191640', - 'md5': 'a38bed45dd1b2881ea230f3561c914b7', - 'info_dict': { - 'id': '373249263226147', - 'ext': 'mp4', - 'title': 'شوف بعينيك ماذا يحدث...ماناش نخوف فيكم رانا ننقل لكم مايحدث...', - 'description': 'شوف بعينيك ماذا يحدث خويا العزيز...ماناش نخوف فيكم رانا ننقل لكم مايحدث...\nتذكروا جيدا ماكنا نقوله لكم منذ سنوات وماكنا نحذركم .', - 'timestamp': 1550353963, - 'upload_date': '20190216', - 'uploader_id': '176917942440142', - }, - 'skip': 'Requires logging in', }] @staticmethod @@ -440,14 +425,6 @@ class FacebookIE(InfoExtractor): for edge in edges: parse_attachment(edge, key='node') - video_home_sections = try_get(data, lambda x: x['video_home_www_feed']['video_home_sections']['edges'], list) or [] - for video_home_section in video_home_sections: - section_components = try_get(video_home_section, lambda x: x['node']['feed_section_renderer']['section']['section_components']['edges'], list) or [] - for section_component in section_components: - attachments = try_get(section_component, lambda x: x['node']['feed_unit']['attachments'], list) or [] - for attachment in attachments: - parse_attachment(attachment) - if not entries: video = data.get('video') or {} if video: -- GitLab From 9f4416afd729b1b552a1247f09e710e42a4c5a39 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:54:28 +0100 Subject: [PATCH 122/384] [facebook] proper support for watch videos(closes #22795)(#27062) --- haruhi_dl/extractor/facebook.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py index 2c3e4b251..c16453776 100644 --- a/haruhi_dl/extractor/facebook.py +++ b/haruhi_dl/extractor/facebook.py @@ -41,7 +41,8 @@ class FacebookIE(InfoExtractor): photo\.php| video\.php| video/embed| - story\.php + story\.php| + watch/? 
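+ # /watch/?v=<video-id> pages keep the video id in the query string, so
+ # the existing (?:v|video_id|story_fbid)= branch just below picks it up
+ # (an inline comment is fine assuming the enclosing _VALID_URL keeps its
+ # verbose (?x) flag)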
)\?(?:.*?)(?:v|video_id|story_fbid)=| [^/]+/videos/(?:[^/]+/)?| [^/]+/posts/| @@ -239,6 +240,10 @@ class FacebookIE(InfoExtractor): 'id': '106560053808006', }, 'playlist_count': 2, + }, { + # data.video.story.attachments[].media + 'url': 'https://www.facebook.com/watch/?v=647537299265662', + 'only_matching': True, }] @staticmethod -- GitLab From 96b2d8bb346554b2f75fef9115dfb6db7c69237c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:08:54 +0100 Subject: [PATCH 123/384] [facebook] add support for Relay post pages(closes #26935) --- haruhi_dl/extractor/facebook.py | 125 +++++++++++++++++++------------- 1 file changed, 73 insertions(+), 52 deletions(-) diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py index c16453776..82f90d2ac 100644 --- a/haruhi_dl/extractor/facebook.py +++ b/haruhi_dl/extractor/facebook.py @@ -26,6 +26,7 @@ from ..utils import ( sanitized_Request, try_get, urlencode_postdata, + urljoin, ) @@ -244,7 +245,28 @@ class FacebookIE(InfoExtractor): # data.video.story.attachments[].media 'url': 'https://www.facebook.com/watch/?v=647537299265662', 'only_matching': True, + }, { + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media + 'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271', + 'info_dict': { + 'id': '10157667649866271', + }, + 'playlist_count': 3, + }, { + # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media + 'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330', + 'info_dict': { + 'id': '117576630041613', + 'ext': 'mp4', + # TODO: title can be extracted from video page + 'title': 'Facebook video #117576630041613', + 'uploader_id': '189393014416438', + 'upload_date': '20201123', + 'timestamp': 1606162592, + }, + 'skip': 'Requires logging in', }] + _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)' @staticmethod def _extract_urls(webpage, **kwargs): @@ -327,18 +349,20 @@ class FacebookIE(InfoExtractor): def _real_initialize(self): self._login() - def _extract_from_url(self, url, video_id, fatal_if_no_video=True): + def _extract_from_url(self, url, video_id): webpage = self._download_webpage( url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) video_data = None def extract_video_data(instances): + video_data = [] for item in instances: if item[1][0] == 'VideoConfig': video_item = item[2][0] if video_item.get('video_id'): - return video_item['videoData'] + video_data.append(video_item['videoData']) + return video_data server_js_data = self._parse_json(self._search_regex( r'handleServerJS\(({.+})(?:\);|,")', webpage, @@ -358,10 +382,18 @@ class FacebookIE(InfoExtractor): formats.extend(self._parse_mpd_formats( compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) + def process_formats(formats): + # Downloads with browser's User-Agent are rate limited. Working around + # with non-browser User-Agent. 
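+ # i.e. every format dict ends up carrying
+ # http_headers={'User-Agent': 'facebookexternalhit/1.1'}; the crawler UA
+ # is presumably exempt from the per-browser-UA throttling mentioned above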
+ for f in formats: + f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' + + self._sort_formats(formats) + if not video_data: server_js_data = self._parse_json(self._search_regex([ - r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)', - r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"permalink_video_pagelet".*?})\);' + r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX, + r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"%s".*?})\);' % self._SUPPORTED_PAGLETS_REGEX ], webpage, 'js data', default='{}'), video_id, js_to_json, False) video_data = extract_from_jsmods_instances(server_js_data) @@ -386,7 +418,7 @@ class FacebookIE(InfoExtractor): 'url': playable_url, }) extract_dash_manifest(video, formats) - self._sort_formats(formats) + process_formats(formats) v_id = video.get('videoId') or video.get('id') or video_id info = { 'id': v_id, @@ -414,32 +446,37 @@ class FacebookIE(InfoExtractor): data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} - attachments = try_get(data, [ - lambda x: x['video']['story']['attachments'], - lambda x: x['video']['creation_story']['attachments'], - lambda x: x['node']['comet_sections']['content']['story']['attachments'] - ], list) or [] - for attachment in attachments: - attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) or attachment - nodes = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] - for node in nodes: - parse_attachment(node) - parse_attachment(attachment) + nodes = data.get('nodes') or [] + node = data.get('node') or {} + if not nodes and node: + nodes.append(node) + for node in nodes: + attachments = try_get(node, lambda x: x['comet_sections']['content']['story']['attachments'], list) or [] + for attachment in attachments: + attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) + ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] + for n in ns: + parse_attachment(n) + parse_attachment(attachment) edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] for edge in edges: parse_attachment(edge, key='node') - if not entries: - video = data.get('video') or {} - if video: + video = data.get('video') or {} + if video: + attachments = try_get(video, [ + lambda x: x['story']['attachments'], + lambda x: x['creation_story']['attachments'] + ], list) or [] + for attachment in attachments: + parse_attachment(attachment) + if not entries: parse_graphql_video(video) - return webpage, self.playlist_result(entries, video_id) + return self.playlist_result(entries, video_id) if not video_data: - if not fatal_if_no_video: - return webpage, False m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage) if m_msg is not None: raise ExtractorError( @@ -477,6 +514,17 @@ class FacebookIE(InfoExtractor): if not video_data: raise ExtractorError('Cannot parse data') + if len(video_data) > 1: + entries = [] + for v in video_data: + video_url = v[0].get('video_url') + if not video_url: + continue + entries.append(self.url_result(urljoin( + url, video_url), self.ie_key(), v[0].get('video_id'))) + return self.playlist_result(entries, video_id) + video_data = video_data[0] + formats = [] subtitles = {} for f in video_data: @@ -504,12 +552,7 @@ class FacebookIE(InfoExtractor): if not formats: 
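# the 'preference' value set here feeds _sort_formats(), which this patch
# now runs inside process_formats() after tagging the User-Agent header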
raise ExtractorError('Cannot find video formats') - # Downloads with browser's User-Agent are rate limited. Working around - # with non-browser User-Agent. - for f in formats: - f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' - - self._sort_formats(formats) + process_formats(formats) video_title = self._html_search_regex( r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, @@ -549,35 +592,13 @@ class FacebookIE(InfoExtractor): 'subtitles': subtitles, } - return webpage, info_dict + return info_dict def _real_extract(self, url): video_id = self._match_id(url) real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url - webpage, info_dict = self._extract_from_url(real_url, video_id, fatal_if_no_video=False) - - if info_dict: - return info_dict - - if '/posts/' in url: - video_id_json = self._search_regex( - r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])', webpage, 'video ids', group='ids', - default='') - if video_id_json: - entries = [ - self.url_result('facebook:%s' % vid, FacebookIE.ie_key()) - for vid in self._parse_json(video_id_json, video_id)] - return self.playlist_result(entries, video_id) - - # Single Video? - video_id = self._search_regex(r'video_id:\s*"([0-9]+)"', webpage, 'single video id') - return self.url_result('facebook:%s' % video_id, FacebookIE.ie_key()) - else: - _, info_dict = self._extract_from_url( - self._VIDEO_PAGE_TEMPLATE % video_id, - video_id, fatal_if_no_video=True) - return info_dict + return self._extract_from_url(real_url, video_id) class FacebookPluginsVideoIE(InfoExtractor): -- GitLab From 7818a5cbb613f3a279f7c82355f79b60a624dfd2 Mon Sep 17 00:00:00 2001 From: compujo <2576634+compujo@users.noreply.github.com> Date: Fri, 26 Feb 2021 15:11:12 +0100 Subject: [PATCH 124/384] =?UTF-8?q?[YoutubeDL]=20Improve=20thumbnails'=20f?= =?UTF-8?q?ilenames=20deducing=20(closes=20#26010)=20(#=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …27244) --- haruhi_dl/HaruhiDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/HaruhiDL.py b/haruhi_dl/HaruhiDL.py index ffc583e82..813d32d76 100755 --- a/haruhi_dl/HaruhiDL.py +++ b/haruhi_dl/HaruhiDL.py @@ -2414,7 +2414,7 @@ class HaruhiDL(object): thumb_ext = determine_ext(t['url'], 'jpg') suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else '' - t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' 
+ thumb_ext + t['filename'] = thumb_filename = replace_extension(filename + suffix, thumb_ext, info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): self.to_screen('[%s] %s: Thumbnail %sis already present' % -- GitLab From cba73be180a795a550eaf8034c3fc68c41bbf233 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:11:21 +0100 Subject: [PATCH 125/384] [facebook] fix embed page extraction --- haruhi_dl/extractor/facebook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py index 82f90d2ac..c2e2155f5 100644 --- a/haruhi_dl/extractor/facebook.py +++ b/haruhi_dl/extractor/facebook.py @@ -358,7 +358,7 @@ class FacebookIE(InfoExtractor): def extract_video_data(instances): video_data = [] for item in instances: - if item[1][0] == 'VideoConfig': + if try_get(item, lambda x: x[1][0]) == 'VideoConfig': video_item = item[2][0] if video_item.get('video_id'): video_data.append(video_item['videoData']) -- GitLab From fe9f5a795d06dc2dd422f5ff002fa13f021fd8b9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:11:26 +0100 Subject: [PATCH 126/384] [facebook] Add another regex for handleServerJS --- haruhi_dl/extractor/facebook.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py index c2e2155f5..9113678c4 100644 --- a/haruhi_dl/extractor/facebook.py +++ b/haruhi_dl/extractor/facebook.py @@ -365,8 +365,8 @@ class FacebookIE(InfoExtractor): return video_data server_js_data = self._parse_json(self._search_regex( - r'handleServerJS\(({.+})(?:\);|,")', webpage, - 'server js data', default='{}'), video_id, fatal=False) + [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'], + webpage, 'server js data', default='{}'), video_id, fatal=False) if server_js_data: video_data = extract_video_data(server_js_data.get('instances', [])) -- GitLab From ba0f2c14da7ec1722858342059fe05b26e426047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:11:31 +0100 Subject: [PATCH 127/384] [wdr:page] Add support for kinder.wdr.de (closes #27350) --- haruhi_dl/extractor/wdr.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/wdr.py b/haruhi_dl/extractor/wdr.py index cf6f7c7ed..ba97d983b 100644 --- a/haruhi_dl/extractor/wdr.py +++ b/haruhi_dl/extractor/wdr.py @@ -105,7 +105,7 @@ class WDRIE(InfoExtractor): class WDRPageIE(InfoExtractor): _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html' - _VALID_URL = r'https?://(?:www\d?\.)?(?:wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL + _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL _TESTS = [ { @@ -212,7 +212,11 @@ class WDRPageIE(InfoExtractor): { 'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html', 'only_matching': True, - } + }, + { + 'url': 'https://kinder.wdr.de/tv/die-sendung-mit-dem-elefanten/av/video-folge---astronaut-100.html', + 'only_matching': True, + }, ] def _real_extract(self, url): -- GitLab From 7cebd30677f6e1322be8af37ac448740b077a0f4 Mon Sep 17 00:00:00 2001 From: Remita 
Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:11:35 +0100
Subject: [PATCH 128/384] =?UTF-8?q?[facebook]=20add=20support=20for=20vide?=
 =?UTF-8?q?os=20attached=20to=20Relay=20based=20story=20pages=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…(#10795)
---
 haruhi_dl/extractor/facebook.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py
index 9113678c4..370365ab8 100644
--- a/haruhi_dl/extractor/facebook.py
+++ b/haruhi_dl/extractor/facebook.py
@@ -265,6 +265,17 @@ class FacebookIE(InfoExtractor):
             'timestamp': 1606162592,
         },
         'skip': 'Requires logging in',
+    }, {
+        # node.comet_sections.content.story.attached_story.attachments.style_type_renderer.attachment.media
+        'url': 'https://www.facebook.com/groups/ateistiskselskab/permalink/10154930137678856/',
+        'info_dict': {
+            'id': '211567722618337',
+            'ext': 'mp4',
+            'title': 'Facebook video #211567722618337',
+            'uploader_id': '127875227654254',
+            'upload_date': '20161122',
+            'timestamp': 1479793574,
+        },
     }]

     _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
@@ -451,7 +462,11 @@ class FacebookIE(InfoExtractor):
             if not nodes and node:
                 nodes.append(node)
             for node in nodes:
-                attachments = try_get(node, lambda x: x['comet_sections']['content']['story']['attachments'], list) or []
+                story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {}
+                attachments = try_get(story, [
+                    lambda x: x['attached_story']['attachments'],
+                    lambda x: x['attachments']
+                ], list) or []
                 for attachment in attachments:
                     attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict)
                     ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
-- 
GitLab


From a8573bb5b2586b49db30803f6d5236d263bc6545 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?=
 <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 15:11:39 +0100
Subject: [PATCH 129/384] =?UTF-8?q?[wdr]=20Extend=20subtitles=20extraction?=
 =?UTF-8?q?=20and=20improve=20overall=20extraction=20(clo=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ses #22672, closes #22723)
---
 haruhi_dl/extractor/wdr.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/haruhi_dl/extractor/wdr.py b/haruhi_dl/extractor/wdr.py
index ba97d983b..2903d189e 100644
--- a/haruhi_dl/extractor/wdr.py
+++ b/haruhi_dl/extractor/wdr.py
@@ -17,6 +17,7 @@ from ..utils import (
     unified_strdate,
     update_url_query,
     urlhandle_detect_ext,
+    url_or_none,
 )


@@ -42,16 +43,20 @@ class WDRIE(InfoExtractor):
         is_live = metadata.get('mediaType') == 'live'

         tracker_data = metadata['trackerData']
+        title = tracker_data['trackerClipTitle']
+
         media_resource = metadata['mediaResource']

         formats = []

         # check if the metadata contains a direct URL to a file
-        for kind, media_resource in media_resource.items():
+        for kind, media in media_resource.items():
+            if not isinstance(media, dict):
+                continue
             if kind not in ('dflt', 'alt'):
                 continue

-            for tag_name, medium_url in media_resource.items():
+            for tag_name, medium_url in media.items():
                 if tag_name not in ('videoURL', 'audioURL'):
                     continue

@@ -88,8 +93,16 @@ class WDRIE(InfoExtractor):
                     'url': caption_url,
                     'ext': 'ttml',
                 }]
-
-        title = tracker_data['trackerClipTitle']
+        captions_hash = media_resource.get('captionsHash')
+        if
isinstance(captions_hash, dict): + for ext, format_url in captions_hash.items(): + format_url = url_or_none(format_url) + if not format_url: + continue + subtitles.setdefault('de', []).append({ + 'url': format_url, + 'ext': determine_ext(format_url, None) or ext, + }) return { 'id': tracker_data.get('trackerClipId', video_id), -- GitLab From 19d8f8301369d8b3641299e611220dc79e5d9fb3 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:11:44 +0100 Subject: [PATCH 130/384] [facebook] Add support archived live video URLs(closes #15859) --- haruhi_dl/extractor/facebook.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py index 370365ab8..d5afd0051 100644 --- a/haruhi_dl/extractor/facebook.py +++ b/haruhi_dl/extractor/facebook.py @@ -43,7 +43,7 @@ class FacebookIE(InfoExtractor): video\.php| video/embed| story\.php| - watch/? + watch(?:/live)?/? )\?(?:.*?)(?:v|video_id|story_fbid)=| [^/]+/videos/(?:[^/]+/)?| [^/]+/posts/| @@ -276,6 +276,10 @@ class FacebookIE(InfoExtractor): 'upload_date': '20161122', 'timestamp': 1479793574, }, + }, { + # data.video.creation_story.attachments[].media + 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275', + 'only_matching': True, }] _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)' -- GitLab From 1646a89d7105524cfbec594ae22a161148d86953 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:11:51 +0100 Subject: [PATCH 131/384] [ruutu] Extend _VALID_URL (closes #24839) --- haruhi_dl/extractor/ruutu.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/haruhi_dl/extractor/ruutu.py b/haruhi_dl/extractor/ruutu.py index f984040aa..4dbd144bc 100644 --- a/haruhi_dl/extractor/ruutu.py +++ b/haruhi_dl/extractor/ruutu.py @@ -13,7 +13,7 @@ from ..utils import ( class RuutuIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla|audio)/(?P<id>\d+)' _TESTS = [ { 'url': 'http://www.ruutu.fi/video/2058907', @@ -71,8 +71,15 @@ class RuutuIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'age_limit': 0, }, - 'expected_warnings': ['HTTP Error 502: Bad Gateway'], - } + 'expected_warnings': [ + 'HTTP Error 502: Bad Gateway', + 'Failed to download m3u8 information', + ], + }, + { + 'url': 'http://www.supla.fi/audio/2231370', + 'only_matching': True, + }, ] def _real_extract(self, url): -- GitLab From 39031fb5ac0c06f1f2e1b9fc668987763dbf1f87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:11:57 +0100 Subject: [PATCH 132/384] [ruutu] Add support for static.nelonenmedia.fi (closes #25412) --- haruhi_dl/extractor/ruutu.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/ruutu.py b/haruhi_dl/extractor/ruutu.py index 4dbd144bc..af42e3b12 100644 --- a/haruhi_dl/extractor/ruutu.py +++ b/haruhi_dl/extractor/ruutu.py @@ -13,7 +13,14 @@ from ..utils import ( class RuutuIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla|audio)/(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla|audio)/| + 
static\.nelonenmedia\.fi/player/misc/embed_player\.html\?.*?\bnid= + ) + (?P<id>\d+) + ''' _TESTS = [ { 'url': 'http://www.ruutu.fi/video/2058907', @@ -80,6 +87,10 @@ class RuutuIE(InfoExtractor): 'url': 'http://www.supla.fi/audio/2231370', 'only_matching': True, }, + { + 'url': 'https://static.nelonenmedia.fi/player/misc/embed_player.html?nid=3618790', + 'only_matching': True, + }, ] def _real_extract(self, url): -- GitLab From e9de74c42f6e342f1d7127e9b9fec95fd9468c24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:12:03 +0100 Subject: [PATCH 133/384] [ruutu] Authenticate format URLs (closes #21031, closes #26782) --- haruhi_dl/extractor/ruutu.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/ruutu.py b/haruhi_dl/extractor/ruutu.py index af42e3b12..561669bb2 100644 --- a/haruhi_dl/extractor/ruutu.py +++ b/haruhi_dl/extractor/ruutu.py @@ -7,6 +7,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + url_or_none, xpath_attr, xpath_text, ) @@ -92,12 +93,13 @@ class RuutuIE(InfoExtractor): 'only_matching': True, }, ] + _API_BASE = 'https://gatling.nelonenmedia.fi' def _real_extract(self, url): video_id = self._match_id(url) video_xml = self._download_xml( - 'https://gatling.nelonenmedia.fi/media-xml-cache', video_id, + '%s/media-xml-cache' % self._API_BASE, video_id, query={'id': video_id}) formats = [] @@ -114,9 +116,18 @@ class RuutuIE(InfoExtractor): continue processed_urls.append(video_url) ext = determine_ext(video_url) + auth_video_url = url_or_none(self._download_webpage( + '%s/auth/access/v2' % self._API_BASE, video_id, + note='Downloading authenticated %s stream URL' % ext, + fatal=False, query={'stream': video_url})) + if auth_video_url: + processed_urls.append(auth_video_url) + video_url = auth_video_url if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + video_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id='hds', fatal=False)) -- GitLab From c1f59f3fb6cb89b816ff905bcbd52832b800e097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:12:07 +0100 Subject: [PATCH 134/384] [ruutu] Extract more metadata and detect non-free videos (closes #21154) --- haruhi_dl/extractor/ruutu.py | 53 +++++++++++++++++++++++++++++++++--- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/ruutu.py b/haruhi_dl/extractor/ruutu.py index 561669bb2..c50cd3ecd 100644 --- a/haruhi_dl/extractor/ruutu.py +++ b/haruhi_dl/extractor/ruutu.py @@ -6,7 +6,9 @@ from ..compat import compat_urllib_parse_urlparse from ..utils import ( determine_ext, ExtractorError, + find_xpath_attr, int_or_none, + unified_strdate, url_or_none, xpath_attr, xpath_text, @@ -92,6 +94,32 @@ class RuutuIE(InfoExtractor): 'url': 'https://static.nelonenmedia.fi/player/misc/embed_player.html?nid=3618790', 'only_matching': True, }, + { + # episode + 'url': 'https://www.ruutu.fi/video/3401964', + 'info_dict': { + 'id': '3401964', + 'ext': 'mp4', + 'title': 'Temptation Island Suomi - Kausi 5 - Jakso 17', + 'description': 'md5:87cf01d5e1e88adf0c8a2937d2bd42ba', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2582, + 'age_limit': 12, + 'upload_date': '20190508', + 'series': 
'Temptation Island Suomi',
+                'season_number': 5,
+                'episode_number': 17,
+                'categories': ['Reality ja tositapahtumat', 'Kotimaiset suosikit', 'Romantiikka ja parisuhde'],
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # premium
+            'url': 'https://www.ruutu.fi/video/3618715',
+            'only_matching': True,
+        },
     ]
     _API_BASE = 'https://gatling.nelonenmedia.fi'
@@ -165,18 +193,35 @@ class RuutuIE(InfoExtractor):

         extract_formats(video_xml.find('./Clip'))

-        drm = xpath_text(video_xml, './Clip/DRM', default=None)
-        if not formats and drm:
-            raise ExtractorError('This video is DRM protected.', expected=True)
+        def pv(name):
+            node = find_xpath_attr(
+                video_xml, './Clip/PassthroughVariables/variable', 'name', name)
+            if node is not None:
+                return node.get('value')
+
+        if not formats:
+            drm = xpath_text(video_xml, './Clip/DRM', default=None)
+            if drm:
+                raise ExtractorError('This video is DRM protected.', expected=True)
+            ns_st_cds = pv('ns_st_cds')
+            if ns_st_cds != 'free':
+                raise ExtractorError('This video is %s.' % ns_st_cds, expected=True)

         self._sort_formats(formats)

+        themes = pv('themes')
+
         return {
             'id': video_id,
             'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True),
             'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'),
             'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'),
-            'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')),
+            'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')) or int_or_none(pv('runtime')),
             'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')),
+            'upload_date': unified_strdate(pv('date_start')),
+            'series': pv('series_name'),
+            'season_number': int_or_none(pv('season_number')),
+            'episode_number': int_or_none(pv('episode_number')),
+            'categories': themes.split(',') if themes else [],
             'formats': formats,
         }
-- 
GitLab


From d1a7ceb19afb372beae64e334e97f85b6322f663 Mon Sep 17 00:00:00 2001
From: toniz4 <cassioavila000@gmail.com>
Date: Fri, 26 Feb 2021 15:12:33 +0100
Subject: [PATCH 135/384] [youtube] Add some invidious instances (#27373)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Cássio <heyitscassio@cock.li>
---
 haruhi_dl/extractor/youtube.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/haruhi_dl/extractor/youtube.py b/haruhi_dl/extractor/youtube.py
index 2430fa180..dd58b2407 100644
--- a/haruhi_dl/extractor/youtube.py
+++ b/haruhi_dl/extractor/youtube.py
@@ -293,10 +293,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                             (?:www\.)?invidious\.kabi\.tk/|
                             (?:www\.)?invidious\.13ad\.de/|
                             (?:www\.)?invidious\.mastodon\.host/|
+                            (?:www\.)?invidious\.zapashcanon\.fr/|
+                            (?:www\.)?invidious\.kavin\.rocks/|
+                            (?:www\.)?invidious\.tube/|
+                            (?:www\.)?invidiou\.site/|
+                            (?:www\.)?invidious\.site/|
+                            (?:www\.)?invidious\.xyz/|
                             (?:www\.)?invidious\.nixnet\.xyz/|
                             (?:www\.)?invidious\.drycat\.fr/|
                             (?:www\.)?tube\.poal\.co/|
+                            (?:www\.)?tube\.connect\.cafe/|
                             (?:www\.)?vid\.wxzm\.sx/|
+                            (?:www\.)?vid\.mint\.lgbt/|
                             (?:www\.)?yewtu\.be/|
                             (?:www\.)?yt\.elukerio\.org/|
                             (?:www\.)?yt\.lelux\.fi/|
-- 
GitLab


From a1e744970320b513cb66b1fa38f589418978d099 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:12:42 +0100
Subject: [PATCH 136/384] [hotstar] fix and improve extraction

- fix format extraction (closes #26690)
- extract thumbnail URL (closes #16079, closes #20412)
- support country specific playlist URLs
(closes #23496) - select the last id in video URL (closes #26412) --- haruhi_dl/extractor/hotstar.py | 96 ++++++++++++++++++++++++---------- 1 file changed, 69 insertions(+), 27 deletions(-) diff --git a/haruhi_dl/extractor/hotstar.py b/haruhi_dl/extractor/hotstar.py index f97eefa3d..1620822b6 100644 --- a/haruhi_dl/extractor/hotstar.py +++ b/haruhi_dl/extractor/hotstar.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import hashlib import hmac +import json import re import time import uuid @@ -25,43 +26,50 @@ from ..utils import ( class HotStarBaseIE(InfoExtractor): _AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee' - def _call_api_impl(self, path, video_id, query): + def _call_api_impl(self, path, video_id, headers, query, data=None): st = int(time.time()) exp = st + 6000 auth = 'st=%d~exp=%d~acl=/*' % (st, exp) auth += '~hmac=' + hmac.new(self._AKAMAI_ENCRYPTION_KEY, auth.encode(), hashlib.sha256).hexdigest() - response = self._download_json( - 'https://api.hotstar.com/' + path, video_id, headers={ - 'hotstarauth': auth, - 'x-country-code': 'IN', - 'x-platform-code': 'JIO', - }, query=query) - if response['statusCode'] != 'OK': - raise ExtractorError( - response['body']['message'], expected=True) - return response['body']['results'] + h = {'hotstarauth': auth} + h.update(headers) + return self._download_json( + 'https://api.hotstar.com/' + path, + video_id, headers=h, query=query, data=data) def _call_api(self, path, video_id, query_name='contentId'): - return self._call_api_impl(path, video_id, { + response = self._call_api_impl(path, video_id, { + 'x-country-code': 'IN', + 'x-platform-code': 'JIO', + }, { query_name: video_id, 'tas': 10000, }) + if response['statusCode'] != 'OK': + raise ExtractorError( + response['body']['message'], expected=True) + return response['body']['results'] - def _call_api_v2(self, path, video_id): - return self._call_api_impl( - '%s/in/contents/%s' % (path, video_id), video_id, { - 'desiredConfig': 'encryption:plain;ladder:phone,tv;package:hls,dash', - 'client': 'mweb', - 'clientVersion': '6.18.0', - 'deviceId': compat_str(uuid.uuid4()), - 'osName': 'Windows', - 'osVersion': '10', - }) + def _call_api_v2(self, path, video_id, headers, query=None, data=None): + h = {'X-Request-Id': compat_str(uuid.uuid4())} + h.update(headers) + try: + return self._call_api_impl( + path, video_id, h, query, data) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + if e.cause.code == 402: + self.raise_login_required() + message = self._parse_json(e.cause.read().decode(), video_id)['message'] + if message in ('Content not available in region', 'Country is not supported'): + raise self.raise_geo_restricted(message) + raise ExtractorError(message) + raise e class HotStarIE(HotStarBaseIE): IE_NAME = 'hotstar' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P<id>\d{10})' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+[/-])?(?P<id>\d{10})' _TESTS = [{ # contentData 'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273', @@ -92,8 +100,13 @@ class HotStarIE(HotStarBaseIE): # only available via api v2 'url': 'https://www.hotstar.com/tv/ek-bhram-sarvagun-sampanna/s-2116/janhvi-targets-suman/1000234847', 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/tv/start-music/1260005217/cooks-vs-comalis/1100039717', + 'only_matching': True, }] _GEO_BYPASS = False + _DEVICE_ID = None + _USER_TOKEN = None def _real_extract(self, url): video_id = self._match_id(url) 
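
For context on the hunk above: the hotstarauth header produced by _call_api_impl is an Akamai-style access token, a plain-text validity window signed with HMAC-SHA256 under the hardcoded _AKAMAI_ENCRYPTION_KEY. A minimal standalone sketch of that construction follows; the helper name build_hotstarauth and the fixed timestamp are illustrative only, while the key, the 6000-second window and the string format are the ones visible in the patch.

import hashlib
import hmac
import time

AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee'


def build_hotstarauth(now=None, validity=6000):
    # Validity window: st (start) to exp (expiry), 6000 seconds in the patch.
    st = int(time.time() if now is None else now)
    exp = st + validity
    auth = 'st=%d~exp=%d~acl=/*' % (st, exp)
    # Sign the window+ACL string and append the hex digest.
    auth += '~hmac=' + hmac.new(
        AKAMAI_ENCRYPTION_KEY, auth.encode(), hashlib.sha256).hexdigest()
    return auth


print(build_hotstarauth(now=1614345600))
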
@@ -121,7 +134,30 @@ class HotStarIE(HotStarBaseIE): headers = {'Referer': url} formats = [] geo_restricted = False - playback_sets = self._call_api_v2('h/v2/play', video_id)['playBackSets'] + + if not self._USER_TOKEN: + self._DEVICE_ID = compat_str(uuid.uuid4()) + self._USER_TOKEN = self._call_api_v2('um/v3/users', video_id, { + 'X-HS-Platform': 'PCTV', + 'Content-Type': 'application/json', + }, data=json.dumps({ + 'device_ids': [{ + 'id': self._DEVICE_ID, + 'type': 'device_id', + }], + }).encode())['user_identity'] + + playback_sets = self._call_api_v2( + 'play/v2/playback/content/' + video_id, video_id, { + 'X-HS-Platform': 'web', + 'X-HS-AppVersion': '6.99.1', + 'X-HS-UserToken': self._USER_TOKEN, + }, query={ + 'device-id': self._DEVICE_ID, + 'desired-config': 'encryption:plain', + 'os-name': 'Windows', + 'os-version': '10', + })['data']['playBackSets'] for playback_set in playback_sets: if not isinstance(playback_set, dict): continue @@ -163,19 +199,22 @@ class HotStarIE(HotStarBaseIE): for f in formats: f.setdefault('http_headers', {}).update(headers) + image = try_get(video_data, lambda x: x['image']['h'], compat_str) + return { 'id': video_id, 'title': title, + 'thumbnail': 'https://img1.hotstarext.com/image/upload/' + image if image else None, 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')), 'formats': formats, 'channel': video_data.get('channelName'), - 'channel_id': video_data.get('channelId'), + 'channel_id': str_or_none(video_data.get('channelId')), 'series': video_data.get('showName'), 'season': video_data.get('seasonName'), 'season_number': int_or_none(video_data.get('seasonNo')), - 'season_id': video_data.get('seasonId'), + 'season_id': str_or_none(video_data.get('seasonId')), 'episode': title, 'episode_number': int_or_none(video_data.get('episodeNo')), } @@ -183,7 +222,7 @@ class HotStarIE(HotStarBaseIE): class HotStarPlaylistIE(HotStarBaseIE): IE_NAME = 'hotstar:playlist' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com/tv/[^/]+/s-\w+/list/[^/]+/t-(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:[a-z]{2}/)?tv/[^/]+/s-\w+/list/[^/]+/t-(?P<id>\w+)' _TESTS = [{ 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26', 'info_dict': { @@ -193,6 +232,9 @@ class HotStarPlaylistIE(HotStarBaseIE): }, { 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480', 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/us/tv/masterchef-india/s-830/list/episodes/t-1_2_830', + 'only_matching': True, }] def _real_extract(self, url): -- GitLab From 0311375dc5bc4c8294b612e61a69000bb1c0dd6b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:13:05 +0100 Subject: [PATCH 137/384] [itv] clean description from HTML tags (closes #27399) --- haruhi_dl/extractor/itv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/itv.py b/haruhi_dl/extractor/itv.py index 08bcc8b68..e86c40b42 100644 --- a/haruhi_dl/extractor/itv.py +++ b/haruhi_dl/extractor/itv.py @@ -7,6 +7,7 @@ import re from .common import InfoExtractor from .brightcove import BrightcoveNewIE from ..utils import ( + clean_html, determine_ext, extract_attributes, get_element_by_class, @@ -14,7 +15,6 @@ from ..utils import ( merge_dicts, parse_duration, smuggle_url, - strip_or_none, url_or_none, ) @@ -146,7 +146,7 @@ class ITVIE(InfoExtractor): 'formats': 
formats, 'subtitles': subtitles, 'duration': parse_duration(video_data.get('Duration')), - 'description': strip_or_none(get_element_by_class('episode-info__synopsis', webpage)), + 'description': clean_html(get_element_by_class('episode-info__synopsis', webpage)), }, info) -- GitLab From 28bbfdff53d2cb5db81d3de1c09518ecdc3e4c7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:13:12 +0100 Subject: [PATCH 138/384] =?UTF-8?q?[linuxacademy]=20Fix=20authentication?= =?UTF-8?q?=20and=20extraction=20(closes=20#21129,=20clos=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …es #26223, closes #27402) --- haruhi_dl/extractor/linuxacademy.py | 130 +++++++++++++++++++++------- 1 file changed, 100 insertions(+), 30 deletions(-) diff --git a/haruhi_dl/extractor/linuxacademy.py b/haruhi_dl/extractor/linuxacademy.py index 23ca965d9..7ec4a6557 100644 --- a/haruhi_dl/extractor/linuxacademy.py +++ b/haruhi_dl/extractor/linuxacademy.py @@ -8,11 +8,15 @@ from .common import InfoExtractor from ..compat import ( compat_b64decode, compat_HTTPError, + compat_str, ) from ..utils import ( + clean_html, ExtractorError, - orderedSet, - unescapeHTML, + js_to_json, + parse_duration, + try_get, + unified_timestamp, urlencode_postdata, urljoin, ) @@ -28,11 +32,15 @@ class LinuxAcademyIE(InfoExtractor): ) ''' _TESTS = [{ - 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2/module/154', + 'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675', 'info_dict': { - 'id': '1498-2', + 'id': '7971-2', 'ext': 'mp4', - 'title': "Introduction to the Practitioner's Brief", + 'title': 'What Is Data Science', + 'description': 'md5:c574a3c20607144fb36cb65bdde76c99', + 'timestamp': 1607387907, + 'upload_date': '20201208', + 'duration': 304, }, 'params': { 'skip_download': True, @@ -46,7 +54,8 @@ class LinuxAcademyIE(InfoExtractor): 'info_dict': { 'id': '154', 'title': 'AWS Certified Cloud Practitioner', - 'description': 'md5:039db7e60e4aac9cf43630e0a75fa834', + 'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c', + 'duration': 28835, }, 'playlist_count': 41, 'skip': 'Requires Linux Academy account credentials', @@ -74,6 +83,7 @@ class LinuxAcademyIE(InfoExtractor): self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ 'client_id': self._CLIENT_ID, 'response_type': 'token id_token', + 'response_mode': 'web_message', 'redirect_uri': self._ORIGIN_URL, 'scope': 'openid email user_impersonation profile', 'audience': self._ORIGIN_URL, @@ -129,7 +139,13 @@ class LinuxAcademyIE(InfoExtractor): access_token = self._search_regex( r'access_token=([^=&]+)', urlh.geturl(), - 'access token') + 'access token', default=None) + if not access_token: + access_token = self._parse_json( + self._search_regex( + r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page, + 'authorization response'), None, + transform_source=js_to_json)['response']['access_token'] self._download_webpage( 'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s' @@ -144,30 +160,84 @@ class LinuxAcademyIE(InfoExtractor): # course path if course_id: - entries = [ - self.url_result( - urljoin(url, lesson_url), ie=LinuxAcademyIE.ie_key()) - for lesson_url in orderedSet(re.findall( - r'<a[^>]+\bhref=["\'](/cp/courses/lesson/course/\d+/lesson/\d+/module/\d+)', - webpage))] - title = unescapeHTML(self._html_search_regex( - (r'class=["\']course-title["\'][^>]*>(?P<value>[^<]+)', - 
r'var\s+title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), - webpage, 'title', default=None, group='value')) - description = unescapeHTML(self._html_search_regex( - r'var\s+description\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', - webpage, 'description', default=None, group='value')) - return self.playlist_result(entries, course_id, title, description) + module = self._parse_json( + self._search_regex( + r'window\.module\s*=\s*({.+?})\s*;', webpage, 'module'), + item_id) + entries = [] + chapter_number = None + chapter = None + chapter_id = None + for item in module['items']: + if not isinstance(item, dict): + continue + + def type_field(key): + return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower() + type_fields = (type_field('name'), type_field('slug')) + # Move to next module section + if 'section' in type_fields: + chapter = item.get('course_name') + chapter_id = item.get('course_module') + chapter_number = 1 if not chapter_number else chapter_number + 1 + continue + # Skip non-lessons + if 'lesson' not in type_fields: + continue + lesson_url = urljoin(url, item.get('url')) + if not lesson_url: + continue + title = item.get('title') or item.get('lesson_name') + description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text')) + entries.append({ + '_type': 'url_transparent', + 'url': lesson_url, + 'ie_key': LinuxAcademyIE.ie_key(), + 'title': title, + 'description': description, + 'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')), + 'duration': parse_duration(item.get('duration')), + 'chapter': chapter, + 'chapter_id': chapter_id, + 'chapter_number': chapter_number, + }) + return { + '_type': 'playlist', + 'entries': entries, + 'id': course_id, + 'title': module.get('title'), + 'description': module.get('md_desc') or clean_html(module.get('desc')), + 'duration': parse_duration(module.get('duration')), + } # single video path - info = self._extract_jwplayer_data( - webpage, item_id, require_title=False, m3u8_id='hls',) - title = self._search_regex( - (r'>Lecture\s*:\s*(?P<value>[^<]+)', - r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, - 'title', group='value') - info.update({ + m3u8_url = self._parse_json( + self._search_regex( + r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'), + item_id)[0]['file'] + formats = self._extract_m3u8_formats( + m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + info = { 'id': item_id, - 'title': title, - }) + 'formats': formats, + } + lesson = self._parse_json( + self._search_regex( + (r'window\.lesson\s*=\s*({.+?})\s*;', + r'player\.lesson\s*=\s*({.+?})\s*;'), + webpage, 'lesson', default='{}'), item_id, fatal=False) + if lesson: + info.update({ + 'title': lesson.get('lesson_name'), + 'description': lesson.get('md_desc') or clean_html(lesson.get('desc')), + 'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')), + 'duration': parse_duration(lesson.get('duration')), + }) + if not info.get('title'): + info['title'] = self._search_regex( + (r'>Lecture\s*:\s*(?P<value>[^<]+)', + r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, + 'title', group='value') return info -- GitLab From 51d290f5d7d0e339fa72edbb198dd0113dbd1fe4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:13:21 +0100 Subject: [PATCH 139/384] [extractor/common] Document 
duration meta field for playlists --- haruhi_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py index 2db95d592..3353987a3 100644 --- a/haruhi_dl/extractor/common.py +++ b/haruhi_dl/extractor/common.py @@ -336,8 +336,8 @@ class InfoExtractor(object): object, each element of which is a valid dictionary by this specification. Additionally, playlists can have "id", "title", "description", "uploader", - "uploader_id", "uploader_url" attributes with the same semantics as videos - (see above). + "uploader_id", "uploader_url", "duration" attributes with the same semantics + as videos (see above). _type "multi_video" indicates that there are multiple videos that -- GitLab From 0f3f3e90466380e73cbe5c85ea2ef95f821843d9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:13:28 +0100 Subject: [PATCH 140/384] =?UTF-8?q?[twitcasting]=20fix=20format=20extracti?= =?UTF-8?q?on=20and=20improve=20info=20extraction(close=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …s #24868) --- haruhi_dl/extractor/twitcasting.py | 72 +++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/haruhi_dl/extractor/twitcasting.py b/haruhi_dl/extractor/twitcasting.py index 2dbe89f5b..6596eef9f 100644 --- a/haruhi_dl/extractor/twitcasting.py +++ b/haruhi_dl/extractor/twitcasting.py @@ -1,11 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import urlencode_postdata - import re +from .common import InfoExtractor +from ..utils import ( + clean_html, + float_or_none, + get_element_by_class, + get_element_by_id, + parse_duration, + str_to_int, + unified_timestamp, + urlencode_postdata, +) + class TwitCastingIE(InfoExtractor): _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/movie/(?P<id>\d+)' @@ -17,8 +26,12 @@ class TwitCastingIE(InfoExtractor): 'ext': 'mp4', 'title': 'Live #2357609', 'uploader_id': 'ivetesangalo', - 'description': "Moi! I'm live on TwitCasting from my iPhone.", + 'description': 'Twitter Oficial da cantora brasileira Ivete Sangalo.', 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20110822', + 'timestamp': 1314010824, + 'duration': 32, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -30,8 +43,12 @@ class TwitCastingIE(InfoExtractor): 'ext': 'mp4', 'title': 'Live playing something #3689740', 'uploader_id': 'mttbernardini', - 'description': "I'm live on TwitCasting from my iPad. password: abc (Santa Marinella/Lazio, Italia)", + 'description': 'Salve, io sono Matto (ma con la e). 
Questa è la mia presentazione, in quanto sono letteralmente matto (nel senso di strano), con qualcosa in più.', 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20120212', + 'timestamp': 1329028024, + 'duration': 681, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -40,9 +57,7 @@ class TwitCastingIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - uploader_id = mobj.group('uploader_id') + uploader_id, video_id = re.match(self._VALID_URL, url).groups() video_password = self._downloader.params.get('videopassword') request_data = None @@ -52,30 +67,45 @@ class TwitCastingIE(InfoExtractor): }) webpage = self._download_webpage(url, video_id, data=request_data) - title = self._html_search_regex( - r'(?s)<[^>]+id=["\']movietitle[^>]+>(.+?)</', - webpage, 'title', default=None) or self._html_search_meta( - 'twitter:title', webpage, fatal=True) + title = clean_html(get_element_by_id( + 'movietitle', webpage)) or self._html_search_meta( + ['og:title', 'twitter:title'], webpage, fatal=True) + video_js_data = {} m3u8_url = self._search_regex( - (r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - r'(["\'])(?P<url>http.+?\.m3u8.*?)\1'), - webpage, 'm3u8 url', group='url') + r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'm3u8 url', group='url', default=None) + if not m3u8_url: + video_js_data = self._parse_json(self._search_regex( + r"data-movie-playlist='(\[[^']+\])'", + webpage, 'movie playlist'), video_id)[0] + m3u8_url = video_js_data['source']['url'] + # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + m3u8_url, video_id, 'mp4', m3u8_id='hls') - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'twitter:description', webpage) + thumbnail = video_js_data.get('thumbnailUrl') or self._og_search_thumbnail(webpage) + description = clean_html(get_element_by_id( + 'authorcomment', webpage)) or self._html_search_meta( + ['description', 'og:description', 'twitter:description'], webpage) + duration = float_or_none(video_js_data.get( + 'duration'), 1000) or parse_duration(clean_html( + get_element_by_class('tw-player-duration-time', webpage))) + view_count = str_to_int(self._search_regex( + r'Total\s*:\s*([\d,]+)\s*Views', webpage, 'views', None)) + timestamp = unified_timestamp(self._search_regex( + r'data-toggle="true"[^>]+datetime="([^"]+)"', + webpage, 'datetime', None)) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, + 'timestamp': timestamp, 'uploader_id': uploader_id, + 'duration': duration, + 'view_count': view_count, 'formats': formats, } -- GitLab From a7f325972c2a080e2b7457812f8dbad7b329be8a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:13:34 +0100 Subject: [PATCH 141/384] [downloader/hls] delegate manifests with media initialization to ffmpeg --- haruhi_dl/downloader/hls.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/haruhi_dl/downloader/hls.py b/haruhi_dl/downloader/hls.py index 0cd16db87..56c84e113 100644 --- a/haruhi_dl/downloader/hls.py +++ b/haruhi_dl/downloader/hls.py @@ -42,11 +42,13 @@ class HlsFD(FragmentFD): # no segments will definitely be appended to the end of the playlist. 
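
One of the entries added to this feature list below is r'#EXT-X-MAP:': manifests whose segments reference a shared media initialization section cannot be handled by the native downloader, so they are now delegated to ffmpeg. A rough sketch of how such a feature scan works; the sample manifest and the helper name is_unsupported are illustrative, while the two patterns are taken from hls.py.

import re

SAMPLE_MANIFEST = '''#EXTM3U
#EXT-X-TARGETDURATION:6
#EXT-X-MAP:URI="init.mp4"
#EXTINF:6.0,
segment0.m4s
#EXT-X-ENDLIST
'''


def is_unsupported(manifest):
    # One regex per unsupported feature; any match means the native
    # downloader must hand the manifest over to ffmpeg.
    unsupported_features = (
        r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)',  # encryption other than AES-128
        r'#EXT-X-MAP:',  # media initialization section (the case added here)
    )
    return any(re.search(feature, manifest) for feature in unsupported_features)


print(is_unsupported(SAMPLE_MANIFEST))  # True
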
# r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of # # event media playlists [4] + r'#EXT-X-MAP:', # media initialization [5] # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 + # 5. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.5 ) check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES] is_aes128_enc = '#EXT-X-KEY:METHOD=AES-128' in manifest -- GitLab From f42fea540291cb3e8ca551ef1d1d2ff2f22a109d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:13:40 +0100 Subject: [PATCH 142/384] [slideslive] use m3u8 entry protocol for m3u8 formats(closes #27400) --- haruhi_dl/extractor/slideslive.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/slideslive.py b/haruhi_dl/extractor/slideslive.py index cd70841a9..9409a0100 100644 --- a/haruhi_dl/extractor/slideslive.py +++ b/haruhi_dl/extractor/slideslive.py @@ -83,9 +83,10 @@ class SlidesLiveIE(InfoExtractor): else: formats = [] _MANIFEST_PATTERN = 'https://01.cdn.yoda.slideslive.com/%s/master.%s' + # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol formats.extend(self._extract_m3u8_formats( - _MANIFEST_PATTERN % (service_id, 'm3u8'), service_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + _MANIFEST_PATTERN % (service_id, 'm3u8'), + service_id, 'mp4', m3u8_id='hls', fatal=False)) formats.extend(self._extract_mpd_formats( _MANIFEST_PATTERN % (service_id, 'mpd'), service_id, mpd_id='dash', fatal=False)) -- GitLab From ff7c31e4f2c250b8e072879d7c5e9d6e7bbe3183 Mon Sep 17 00:00:00 2001 From: spvkgn <spvkgn@users.noreply.github.com> Date: Fri, 26 Feb 2021 15:13:48 +0100 Subject: [PATCH 143/384] [eporner] Fix hash extraction and extend _VALID_URL (#27396) Co-authored-by: Sergey M <dstftw@gmail.com> --- haruhi_dl/extractor/eporner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/eporner.py b/haruhi_dl/extractor/eporner.py index fe42821c7..709925471 100644 --- a/haruhi_dl/extractor/eporner.py +++ b/haruhi_dl/extractor/eporner.py @@ -16,7 +16,7 @@ from ..utils import ( class EpornerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?eporner\.com/(?:hd-porn|embed)/(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?' + _VALID_URL = r'https?://(?:www\.)?eporner\.com/(?:(?:hd-porn|embed)/|video-)(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?' _TESTS = [{ 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', 'md5': '39d486f046212d8e1b911c52ab4691f8', @@ -45,6 +45,9 @@ class EpornerIE(InfoExtractor): }, { 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', 'only_matching': True, + }, { + 'url': 'https://www.eporner.com/video-FJsA19J3Y3H/one-of-the-greats/', + 'only_matching': True, }] def _real_extract(self, url): @@ -57,7 +60,7 @@ class EpornerIE(InfoExtractor): video_id = self._match_id(urlh.geturl()) hash = self._search_regex( - r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash') + r'hash\s*[:=]\s*["\']([\da-f]{32})', webpage, 'hash') title = self._og_search_title(webpage, default=None) or self._html_search_regex( r'<title>(.+?) 
- EPORNER', webpage, 'title') -- GitLab From 2f63edb44aafa5cf0cc83432a7f7aea50045115a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:13:58 +0100 Subject: [PATCH 144/384] [eporner] Fix embed test URL --- haruhi_dl/extractor/eporner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/eporner.py b/haruhi_dl/extractor/eporner.py index 709925471..920fb417e 100644 --- a/haruhi_dl/extractor/eporner.py +++ b/haruhi_dl/extractor/eporner.py @@ -43,7 +43,7 @@ class EpornerIE(InfoExtractor): 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', 'only_matching': True, }, { - 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', + 'url': 'http://www.eporner.com/embed/3YRUtzMcWn0', 'only_matching': True, }, { 'url': 'https://www.eporner.com/video-FJsA19J3Y3H/one-of-the-greats/', -- GitLab From e4b993e9dbc8b7161a1df8593105d8695b2c0fce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:14:10 +0100 Subject: [PATCH 145/384] =?UTF-8?q?[extractor/common]=20Improve=20JSON-LD?= =?UTF-8?q?=20interaction=20statistic=20extraction=20(=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …refs #23306) --- haruhi_dl/extractor/common.py | 12 +++++++-- test/test_InfoExtractor.py | 50 +++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py index 3353987a3..f845688f5 100644 --- a/haruhi_dl/extractor/common.py +++ b/haruhi_dl/extractor/common.py @@ -1239,8 +1239,16 @@ class InfoExtractor(object): 'ViewAction': 'view', } + def extract_interaction_type(e): + interaction_type = e.get('interactionType') + if isinstance(interaction_type, dict): + interaction_type = interaction_type.get('@type') + return str_or_none(interaction_type) + def extract_interaction_statistic(e): interaction_statistic = e.get('interactionStatistic') + if isinstance(interaction_statistic, dict): + interaction_statistic = [interaction_statistic] if not isinstance(interaction_statistic, list): return for is_e in interaction_statistic: @@ -1248,8 +1256,8 @@ class InfoExtractor(object): continue if is_e.get('@type') != 'InteractionCounter': continue - interaction_type = is_e.get('interactionType') - if not isinstance(interaction_type, compat_str): + interaction_type = extract_interaction_type(is_e) + if not interaction_type: continue # For interaction count some sites provide string instead of # an integer (as per spec) with non digit characters (e.g. 
",") diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index a7a15b8ae..54ed446d5 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -98,6 +98,56 @@ class TestInfoExtractor(unittest.TestCase): self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) + def test_search_json_ld_realworld(self): + # https://github.com/hdl-org/haruhi-dl/issues/23306 + expect_dict( + self, + self.ie._search_json_ld(r'''<script type="application/ld+json"> +{ +"@context": "http://schema.org/", +"@type": "VideoObject", +"name": "1 On 1 With Kleio", +"url": "https://www.eporner.com/hd-porn/xN49A1cT3eB/1-On-1-With-Kleio/", +"duration": "PT0H12M23S", +"thumbnailUrl": ["https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg", "https://imggen.eporner.com/780814/1920/1080/9.jpg"], +"contentUrl": "https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4", +"embedUrl": "https://www.eporner.com/embed/xN49A1cT3eB/1-On-1-With-Kleio/", +"image": "https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg", +"width": "1920", +"height": "1080", +"encodingFormat": "mp4", +"bitrate": "6617kbps", +"isFamilyFriendly": "False", +"description": "Kleio Valentien", +"uploadDate": "2015-12-05T21:24:35+01:00", +"interactionStatistic": { +"@type": "InteractionCounter", +"interactionType": { "@type": "http://schema.org/WatchAction" }, +"userInteractionCount": 1120958 +}, "aggregateRating": { +"@type": "AggregateRating", +"ratingValue": "88", +"ratingCount": "630", +"bestRating": "100", +"worstRating": "0" +}, "actor": [{ +"@type": "Person", +"name": "Kleio Valentien", +"url": "https://www.eporner.com/pornstar/kleio-valentien/" +}]} +</script>''', None), + { + 'title': '1 On 1 With Kleio', + 'description': 'Kleio Valentien', + 'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4', + 'timestamp': 1449347075, + 'duration': 743.0, + 'view_count': 1120958, + 'width': 1920, + 'height': 1080, + }) + + def test_download_json(self): uri = encode_data_uri(b'{"foo": "blah"}', 'application/json') self.assertEqual(self.ie._download_json(uri, None), {'foo': 'blah'}) -- GitLab From c4dfcc3d9c02cdff8b778fdc16fd652c33279163 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:14:15 +0100 Subject: [PATCH 146/384] [eporner] Fix view count extraction and make optional (closes #23306) --- haruhi_dl/extractor/eporner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/eporner.py b/haruhi_dl/extractor/eporner.py index 920fb417e..bfecd3a41 100644 --- a/haruhi_dl/extractor/eporner.py +++ b/haruhi_dl/extractor/eporner.py @@ -118,8 +118,8 @@ class EpornerIE(InfoExtractor): duration = parse_duration(self._html_search_meta( 'duration', webpage, default=None)) view_count = str_to_int(self._search_regex( - r'id="cinemaviews">\s*([0-9,]+)\s*<small>views', - webpage, 'view count', fatal=False)) + r'id=["\']cinemaviews1["\'][^>]*>\s*([0-9,]+)', + webpage, 'view count', default=None)) return merge_dicts(json_ld, { 'id': video_id, -- GitLab From 1339530c44d92db7d2f9652f148af0be7f4ccba3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:15:27 +0100 Subject: [PATCH 147/384] [mdr] Improve extraction (closes #24346, closes #26873) --- 
haruhi_dl/extractor/mdr.py | 75 +++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/haruhi_dl/extractor/mdr.py b/haruhi_dl/extractor/mdr.py index 322e5b45a..38afdc789 100644 --- a/haruhi_dl/extractor/mdr.py +++ b/haruhi_dl/extractor/mdr.py @@ -2,12 +2,16 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( determine_ext, int_or_none, parse_duration, parse_iso8601, + url_or_none, xpath_text, ) @@ -66,6 +70,22 @@ class MDRIE(InfoExtractor): 'duration': 3239, 'uploader': 'MITTELDEUTSCHER RUNDFUNK', }, + }, { + # empty bitrateVideo and bitrateAudio + 'url': 'https://www.kika.de/filme/sendung128372_zc-572e3f45_zs-1d9fb70e.html', + 'info_dict': { + 'id': '128372', + 'ext': 'mp4', + 'title': 'Der kleine Wichtel kehrt zurück', + 'description': 'md5:f77fafdff90f7aa1e9dca14f662c052a', + 'duration': 4876, + 'timestamp': 1607823300, + 'upload_date': '20201213', + 'uploader': 'ZDF', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', 'only_matching': True, @@ -91,10 +111,13 @@ class MDRIE(InfoExtractor): title = xpath_text(doc, ['./title', './broadcast/broadcastName'], 'title', fatal=True) + type_ = xpath_text(doc, './type', default=None) + formats = [] processed_urls = [] for asset in doc.findall('./assets/asset'): for source in ( + 'download', 'progressiveDownload', 'dynamicHttpStreamingRedirector', 'adaptiveHttpStreamingRedirector'): @@ -102,63 +125,49 @@ class MDRIE(InfoExtractor): if url_el is None: continue - video_url = url_el.text - if video_url in processed_urls: + video_url = url_or_none(url_el.text) + if not video_url or video_url in processed_urls: continue processed_urls.append(video_url) - vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) - abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) - - ext = determine_ext(url_el.text) + ext = determine_ext(video_url) if ext == 'm3u8': - url_formats = self._extract_m3u8_formats( + formats.extend(self._extract_m3u8_formats( video_url, video_id, 'mp4', entry_protocol='m3u8_native', - preference=0, m3u8_id='HLS', fatal=False) + preference=0, m3u8_id='HLS', fatal=False)) elif ext == 'f4m': - url_formats = self._extract_f4m_formats( + formats.extend(self._extract_f4m_formats( video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, - preference=0, f4m_id='HDS', fatal=False) + preference=0, f4m_id='HDS', fatal=False)) else: media_type = xpath_text(asset, './mediaType', 'media type', default='MP4') vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) filesize = int_or_none(xpath_text(asset, './fileSize', 'file size')) + format_id = [media_type] + if vbr or abr: + format_id.append(compat_str(vbr or abr)) + f = { 'url': video_url, - 'format_id': '%s-%d' % (media_type, vbr or abr), + 'format_id': '-'.join(format_id), 'filesize': filesize, 'abr': abr, - 'preference': 1, + 'vbr': vbr, } if vbr: - width = int_or_none(xpath_text(asset, './frameWidth', 'width')) - height = int_or_none(xpath_text(asset, './frameHeight', 'height')) f.update({ - 'vbr': vbr, - 'width': width, - 'height': height, + 'width': int_or_none(xpath_text(asset, './frameWidth', 'width')), + 'height': int_or_none(xpath_text(asset, './frameHeight', 'height')), }) - url_formats = [f] - - if not 
url_formats: - continue - - if not vbr: - for f in url_formats: - abr = f.get('tbr') or abr - if 'tbr' in f: - del f['tbr'] - f.update({ - 'abr': abr, - 'vcodec': 'none', - }) + if type_ == 'audio': + f['vcodec'] = 'none' - formats.extend(url_formats) + formats.append(f) self._sort_formats(formats) -- GitLab From 2c85578a1fe929e0a71d7d66a1e07962d1988868 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:15:34 +0100 Subject: [PATCH 148/384] [mdr] Bypass geo restriction --- haruhi_dl/extractor/mdr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/haruhi_dl/extractor/mdr.py b/haruhi_dl/extractor/mdr.py index 38afdc789..dc6aa9819 100644 --- a/haruhi_dl/extractor/mdr.py +++ b/haruhi_dl/extractor/mdr.py @@ -20,6 +20,8 @@ class MDRIE(InfoExtractor): IE_DESC = 'MDR.DE and KiKA' _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z-]+-?(?P<id>\d+)(?:_.+?)?\.html' + _GEO_COUNTRIES = ['DE'] + _TESTS = [{ # MDR regularly deletes its videos 'url': 'http://www.mdr.de/fakt/video189002.html', -- GitLab From 541e22037b74a0c9a18f847a24e9fb5a46912f87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:15:40 +0100 Subject: [PATCH 149/384] [test_InfoExtractor] PEP 8 --- test/test_InfoExtractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 54ed446d5..d7f42a02d 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -147,7 +147,6 @@ class TestInfoExtractor(unittest.TestCase): 'height': 1080, }) - def test_download_json(self): uri = encode_data_uri(b'{"foo": "blah"}', 'application/json') self.assertEqual(self.ie._download_json(uri, None), {'foo': 'blah'}) -- GitLab From 27765ca68fc7ffd577b012ecb2ee6471105e7070 Mon Sep 17 00:00:00 2001 From: Matthew Rayermann <matthew.rayermann@gmail.com> Date: Fri, 26 Feb 2021 15:15:47 +0100 Subject: [PATCH 150/384] [nhk] Add support for NHK video programs (#27230) --- haruhi_dl/extractor/extractors.py | 5 +- haruhi_dl/extractor/nhk.py | 162 +++++++++++++++++++++--------- 2 files changed, 118 insertions(+), 49 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 297a5e02b..1b5f9b65e 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -756,7 +756,10 @@ from .nexx import ( NexxEmbedIE, ) from .nfl import NFLIE -from .nhk import NhkVodIE +from .nhk import ( + NhkVodIE, + NhkVodProgramIE, +) from .nhl import NHLIE from .nick import ( NickIE, diff --git a/haruhi_dl/extractor/nhk.py b/haruhi_dl/extractor/nhk.py index 6a61a47d2..907db4de9 100644 --- a/haruhi_dl/extractor/nhk.py +++ b/haruhi_dl/extractor/nhk.py @@ -3,14 +3,96 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ExtractorError -class NhkVodIE(InfoExtractor): +class NhkBaseIE(InfoExtractor): + _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json' + + def _get_clean_field(self, episode, key): + return episode.get(key + '_clean') or episode.get(key) + + def _list_episodes(self, m_id, lang, is_video, is_episode): + return self._download_json( + self._API_URL_TEMPLATE % ( + 'v' if is_video else 'r', + 'clip' if m_id[:4] == '9999' else 'esd', + 'episode' if is_episode else 'program', + m_id, lang, '/all' if is_video else ''), + m_id, query={'apikey': 
'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] + + def _parse_episode_json(self, episode, lang, is_video): + title = episode.get('sub_title_clean') or episode['sub_title'] + + episode_id = None + if is_video: + pgm_id = episode.get('pgm_id') + pgm_no = episode.get('pgm_no') + + if not (pgm_id and pgm_no): + missing_field = 'pgm_id' if not pgm_id else 'pgm_no' + raise ExtractorError('Cannot download episode. Field %s is missing from episode JSON.' % missing_field) + + episode_id = pgm_id + pgm_no + else: + pgm_gr_id = episode.get('pgm_gr_id') + first_onair_date = episode.get('first_onair_date') + first_onair_no = episode.get('first_onair_no') + + if not (pgm_gr_id and first_onair_date and first_onair_no): + missing_field = 'pgm_gr_id' if not pgm_gr_id else 'first_onair_date' if not first_onair_date else 'first_onair_no' + raise ExtractorError('Cannot download episode. Field %s is missing from episode JSON.' % missing_field) + + episode_id = pgm_gr_id + '-' + first_onair_date + '-' + first_onair_no + + series = self._get_clean_field(episode, 'title') + + thumbnails = [] + for s, w, h in [('', 640, 360), ('_l', 1280, 720)]: + img_path = episode.get('image' + s) + if not img_path: + continue + thumbnails.append({ + 'id': '%dp' % h, + 'height': h, + 'width': w, + 'url': 'https://www3.nhk.or.jp' + img_path, + }) + + info = { + 'id': episode_id + '-' + lang, + 'title': '%s - %s' % (series, title) if series and title else title, + 'description': self._get_clean_field(episode, 'description'), + 'thumbnails': thumbnails, + 'series': series, + 'episode': title, + } + + if is_video: + info.update({ + '_type': 'url_transparent', + 'ie_key': 'Piksel', + 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + episode['vod_id'], + }) + else: + audio = episode['audio'] + audio_path = audio['audio'] + info['formats'] = self._extract_m3u8_formats( + 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, + episode_id, 'm4a', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + for f in info['formats']: + f['language'] = lang + + return info + + +class NhkVodIE(NhkBaseIE): _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand/(?P<type>video|audio)/(?P<id>\d{7}|[^/]+?-\d{8}-\d+)' # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. 
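
To make the _API_URL_TEMPLATE interpolation in _list_episodes above concrete, this sketch composes the URL requested for the English clip test that follows; episode_list_url is an illustrative name, while the template, the argument order and the '9999' clip-ID convention are taken from the patch.

API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json'


def episode_list_url(m_id, lang, is_video, is_episode):
    # Same switches as _list_episodes: video ('v') vs. radio ('r') feed,
    # clip vs. regular ('esd') listing, and a trailing '/all' only for video.
    return API_URL_TEMPLATE % (
        'v' if is_video else 'r',
        'clip' if m_id[:4] == '9999' else 'esd',
        'episode' if is_episode else 'program',
        m_id, lang, '/all' if is_video else '')


# NhkVodIE first rewrites the 7-digit URL id 9999011 to '9999-011':
print(episode_list_url('9999-011', 'en', True, True))
# https://api.nhk.or.jp/nhkworld/vodcliplist/v7a/episode/9999-011/en/all/all.json
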
_TESTS = [{ - # video clip + # clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', 'md5': '256a1be14f48d960a7e61e2532d95ec3', 'info_dict': { @@ -47,60 +129,44 @@ class NhkVodIE(InfoExtractor): 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/', 'only_matching': True, }] - _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/episode/%s/%s/all%s.json' def _real_extract(self, url): lang, m_type, episode_id = re.match(self._VALID_URL, url).groups() + if episode_id.isdigit(): episode_id = episode_id[:4] + '-' + episode_id[4:] - is_video = m_type == 'video' - episode = self._download_json( - self._API_URL_TEMPLATE % ( - 'v' if is_video else 'r', - 'clip' if episode_id[:4] == '9999' else 'esd', - episode_id, lang, '/all' if is_video else ''), - episode_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'][0] - title = episode.get('sub_title_clean') or episode['sub_title'] + episode = self._list_episodes(episode_id, lang, m_type == 'video', True)[0] - def get_clean_field(key): - return episode.get(key + '_clean') or episode.get(key) + return self._parse_episode_json(episode, lang, m_type == 'video') - series = get_clean_field('title') - thumbnails = [] - for s, w, h in [('', 640, 360), ('_l', 1280, 720)]: - img_path = episode.get('image' + s) - if not img_path: - continue - thumbnails.append({ - 'id': '%dp' % h, - 'height': h, - 'width': w, - 'url': 'https://www3.nhk.or.jp' + img_path, - }) +class NhkVodProgramIE(NhkBaseIE): + _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand/(program/video)/(?P<id>\w+)' + # Content available only for a limited period of time. Visit + # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. + _TESTS = [{ + # video program + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway', + 'info_dict': { + 'id': 'japanrailway', + 'title': 'Japan Railway Journal', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/', + 'only_matching': True, + }] - info = { - 'id': episode_id + '-' + lang, - 'title': '%s - %s' % (series, title) if series and title else title, - 'description': get_clean_field('description'), - 'thumbnails': thumbnails, - 'series': series, - 'episode': title, - } - if is_video: - info.update({ - '_type': 'url_transparent', - 'ie_key': 'Piksel', - 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + episode['vod_id'], - }) + def _real_extract(self, url): + lang, m_type, program_id = re.match(self._VALID_URL, url).groups() + + episodes = self._list_episodes(program_id, lang, True, False) + + if episodes: + return self.playlist_result( + [self._parse_episode_json(episode, lang, True) + for episode in episodes], + self._get_clean_field(episodes[0], 'pgm_gr_id'), self._get_clean_field(episodes[0], 'title')) else: - audio = episode['audio'] - audio_path = audio['audio'] - info['formats'] = self._extract_m3u8_formats( - 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, - episode_id, 'm4a', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False) - for f in info['formats']: - f['language'] = lang - return info + raise ExtractorError('No episodes returned for program with ID: %s' % program_id, expected=True) -- GitLab From 1859fa8ac4cd64819ec3e0739d138ed129c784b7 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:15:53 +0100 Subject: [PATCH 151/384] [nhk:program] Add support 
for audio programs and program clips --- haruhi_dl/extractor/nhk.py | 134 +++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 64 deletions(-) diff --git a/haruhi_dl/extractor/nhk.py b/haruhi_dl/extractor/nhk.py index 907db4de9..c5b406573 100644 --- a/haruhi_dl/extractor/nhk.py +++ b/haruhi_dl/extractor/nhk.py @@ -3,49 +3,39 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import urljoin class NhkBaseIE(InfoExtractor): _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json' + _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand' + _TYPE_REGEX = r'/(?P<type>video|audio)/' - def _get_clean_field(self, episode, key): - return episode.get(key + '_clean') or episode.get(key) - - def _list_episodes(self, m_id, lang, is_video, is_episode): + def _call_api(self, m_id, lang, is_video, is_episode, is_clip): return self._download_json( self._API_URL_TEMPLATE % ( 'v' if is_video else 'r', - 'clip' if m_id[:4] == '9999' else 'esd', + 'clip' if is_clip else 'esd', 'episode' if is_episode else 'program', m_id, lang, '/all' if is_video else ''), - m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] - - def _parse_episode_json(self, episode, lang, is_video): - title = episode.get('sub_title_clean') or episode['sub_title'] - - episode_id = None - if is_video: - pgm_id = episode.get('pgm_id') - pgm_no = episode.get('pgm_no') - - if not (pgm_id and pgm_no): - missing_field = 'pgm_id' if not pgm_id else 'pgm_no' - raise ExtractorError('Cannot download episode. Field %s is missing from episode JSON.' % missing_field) + m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or [] - episode_id = pgm_id + pgm_no - else: - pgm_gr_id = episode.get('pgm_gr_id') - first_onair_date = episode.get('first_onair_date') - first_onair_no = episode.get('first_onair_no') + def _extract_episode_info(self, url, episode=None): + fetch_episode = episode is None + lang, m_type, episode_id = re.match(NhkVodIE._VALID_URL, url).groups() + if episode_id.isdigit(): + episode_id = episode_id[:4] + '-' + episode_id[4:] - if not (pgm_gr_id and first_onair_date and first_onair_no): - missing_field = 'pgm_gr_id' if not pgm_gr_id else 'first_onair_date' if not first_onair_date else 'first_onair_no' - raise ExtractorError('Cannot download episode. Field %s is missing from episode JSON.' 
% missing_field) + is_video = m_type == 'video' + if fetch_episode: + episode = self._call_api( + episode_id, lang, is_video, True, episode_id[:4] == '9999')[0] + title = episode.get('sub_title_clean') or episode['sub_title'] - episode_id = pgm_gr_id + '-' + first_onair_date + '-' + first_onair_no + def get_clean_field(key): + return episode.get(key + '_clean') or episode.get(key) - series = self._get_clean_field(episode, 'title') + series = get_clean_field('title') thumbnails = [] for s, w, h in [('', 640, 360), ('_l', 1280, 720)]: @@ -62,37 +52,43 @@ class NhkBaseIE(InfoExtractor): info = { 'id': episode_id + '-' + lang, 'title': '%s - %s' % (series, title) if series and title else title, - 'description': self._get_clean_field(episode, 'description'), + 'description': get_clean_field('description'), 'thumbnails': thumbnails, 'series': series, 'episode': title, } - if is_video: + vod_id = episode['vod_id'] info.update({ '_type': 'url_transparent', 'ie_key': 'Piksel', - 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + episode['vod_id'], + 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id, + 'id': vod_id, }) else: - audio = episode['audio'] - audio_path = audio['audio'] - info['formats'] = self._extract_m3u8_formats( - 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, - episode_id, 'm4a', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False) - for f in info['formats']: - f['language'] = lang - + if fetch_episode: + audio_path = episode['audio']['audio'] + info['formats'] = self._extract_m3u8_formats( + 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, + episode_id, 'm4a', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + for f in info['formats']: + f['language'] = lang + else: + info.update({ + '_type': 'url_transparent', + 'ie_key': NhkVodIE.ie_key(), + 'url': url, + }) return info class NhkVodIE(NhkBaseIE): - _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand/(?P<type>video|audio)/(?P<id>\d{7}|[^/]+?-\d{8}-\d+)' + _VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ - # clip + # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', 'md5': '256a1be14f48d960a7e61e2532d95ec3', 'info_dict': { @@ -131,42 +127,52 @@ class NhkVodIE(NhkBaseIE): }] def _real_extract(self, url): - lang, m_type, episode_id = re.match(self._VALID_URL, url).groups() - - if episode_id.isdigit(): - episode_id = episode_id[:4] + '-' + episode_id[4:] - - episode = self._list_episodes(episode_id, lang, m_type == 'video', True)[0] - - return self._parse_episode_json(episode, lang, m_type == 'video') + return self._extract_episode_info(url) class NhkVodProgramIE(NhkBaseIE): - _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand/(program/video)/(?P<id>\w+)' - # Content available only for a limited period of time. Visit - # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. + _VALID_URL = r'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' 
% (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX)
     _TESTS = [{
-        # video program
+        # video program episodes
         'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway',
         'info_dict': {
             'id': 'japanrailway',
             'title': 'Japan Railway Journal',
         },
         'playlist_mincount': 1,
+    }, {
+        # video program clips
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip',
+        'info_dict': {
+            'id': 'japanrailway',
+            'title': 'Japan Railway Journal',
+        },
+        'playlist_mincount': 5,
     }, {
         'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/',
         'only_matching': True,
+    }, {
+        # audio program
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        lang, m_type, program_id = re.match(self._VALID_URL, url).groups()
+        lang, m_type, program_id, episode_type = re.match(self._VALID_URL, url).groups()
 
-        episodes = self._list_episodes(program_id, lang, True, False)
+        episodes = self._call_api(
+            program_id, lang, m_type == 'video', False, episode_type == 'clip')
 
-        if episodes:
-            return self.playlist_result(
-                [self._parse_episode_json(episode, lang, True)
-                 for episode in episodes],
-                self._get_clean_field(episodes[0], 'pgm_gr_id'), self._get_clean_field(episodes[0], 'title'))
-        else:
-            raise ExtractorError('No episodes returned for program with ID: %s' % program_id, expected=True)
+        entries = []
+        for episode in episodes:
+            episode_path = episode.get('url')
+            if not episode_path:
+                continue
+            entries.append(self._extract_episode_info(
+                urljoin(url, episode_path), episode))
+
+        program_title = None
+        if entries:
+            program_title = entries[0].get('series')
+
+        return self.playlist_result(entries, program_id, program_title)
-- 
GitLab


From 1315296aedfd99c406e5f3bdfda93e16b7c827a9 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:16:07 +0100
Subject: [PATCH 152/384] [videomore] add support for more.tv (closes #27088)

---
 haruhi_dl/extractor/videomore.py | 255 ++++++++++++++++---------------
 1 file changed, 135 insertions(+), 120 deletions(-)

diff --git a/haruhi_dl/extractor/videomore.py b/haruhi_dl/extractor/videomore.py
index e3eda3327..e0c10aa5b 100644
--- a/haruhi_dl/extractor/videomore.py
+++ b/haruhi_dl/extractor/videomore.py
@@ -4,30 +4,50 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+    compat_parse_qs,
+    compat_str,
+    compat_urllib_parse_urlparse,
+)
 from ..utils import (
+    ExtractorError,
     int_or_none,
-    orderedSet,
-    parse_duration,
-    str_or_none,
-    unified_strdate,
-    url_or_none,
-    xpath_element,
-    xpath_text,
 )
 
 
+class VideomoreBaseIE(InfoExtractor):
+    _API_BASE_URL = 'https://more.tv/api/v3/web/'
+    _VALID_URL_BASE = r'https?://(?:videomore\.ru|more\.tv)/'
+
+    def _download_page_data(self, display_id):
+        return self._download_json(
+            self._API_BASE_URL + 'PageData', display_id, query={
+                'url': '/' + display_id,
+            })['attributes']['response']['data']
+
+    def _track_url_result(self, track):
+        track_vod = track['trackVod']
+        video_url = track_vod.get('playerLink') or track_vod['link']
+        return self.url_result(
+            video_url, VideomoreIE.ie_key(), track_vod.get('hubId'))
+
+
 class VideomoreIE(InfoExtractor):
     IE_NAME = 'videomore'
     _VALID_URL = r'''(?x)
                     videomore:(?P<sid>\d+)$|
-                    https?://(?:player\.)?videomore\.ru/
+                    https?://
                     (?:
+                            videomore\.ru/
                             (?:
                                 embed|
                                 [^/]+/[^/]+
                             )/|
-                            [^/]*\?.*?\btrack_id=
+                            (?:
(?:player\.)?videomore\.ru|
+                                siren\.more\.tv/player
+                            )/[^/]*\?.*?\btrack_id=|
+                            odysseus\.more\.tv/player/(?P<partner_id>\d+)/
                     )
                     (?P<id>\d+)
                     (?:[/?#&]|\.(?:xml|json)|$)
                 '''
     _TESTS = [{
@@ -47,18 +67,19 @@ class VideomoreIE(InfoExtractor):
             'comment_count': int,
             'age_limit': 16,
         },
+        'skip': 'The video is not available for viewing.',
     }, {
         'url': 'http://videomore.ru/embed/259974',
         'info_dict': {
             'id': '259974',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Молодежка 2 сезон 40 серия',
             'series': 'Молодежка',
+            'season': '2 сезон',
             'episode': '40 серия',
             'thumbnail': r're:^https?://.*\.jpg',
-            'duration': 2809,
+            'duration': 2789,
             'view_count': int,
-            'comment_count': int,
             'age_limit': 16,
         },
         'params': {
@@ -79,6 +100,7 @@ class VideomoreIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+        'skip': 'The video is not available for viewing.',
     }, {
         'url': 'http://videomore.ru/elki_3?track_id=364623',
         'only_matching': True,
@@ -100,7 +122,14 @@ class VideomoreIE(InfoExtractor):
     }, {
         'url': 'https://player.videomore.ru/?partner_id=97&track_id=736234&autoplay=0&userToken=',
         'only_matching': True,
+    }, {
+        'url': 'https://odysseus.more.tv/player/1788/352317',
+        'only_matching': True,
+    }, {
+        'url': 'https://siren.more.tv/player/config?track_id=352317&partner_id=1788&user_token=',
+        'only_matching': True,
     }]
+    _GEO_BYPASS = False
 
     @staticmethod
     def _extract_url(webpage):
@@ -118,46 +147,73 @@ class VideomoreIE(InfoExtractor):
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         video_id = mobj.group('sid') or mobj.group('id')
-
-        video = self._download_xml(
-            'http://videomore.ru/video/tracks/%s.xml' % video_id,
-            video_id, 'Downloading video XML')
-
-        item = xpath_element(video, './/playlist/item', fatal=True)
-
-        title = xpath_text(
-            item, ('./title', './episode_name'), 'title', fatal=True)
-
-        video_url = xpath_text(item, './video_url', 'video url', fatal=True)
-        formats = self._extract_f4m_formats(video_url, video_id, f4m_id='hds')
+        partner_id = mobj.group('partner_id') or compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('partner_id', [None])[0] or '97'
+
+        item = self._download_json(
+            'https://siren.more.tv/player/config', video_id, query={
+                'partner_id': partner_id,
+                'track_id': video_id,
+            })['data']['playlist']['items'][0]
+
+        title = item.get('title')
+        series = item.get('project_name')
+        season = item.get('season_name')
+        episode = item.get('episode_name')
+        if not title:
+            title = []
+            for v in (series, season, episode):
+                if v:
+                    title.append(v)
+            title = ' '.join(title)
+
+        streams = item.get('streams') or []
+        for protocol in ('DASH', 'HLS'):
+            stream_url = item.get(protocol.lower() + '_url')
+            if stream_url:
+                streams.append({'protocol': protocol, 'url': stream_url})
+
+        formats = []
+        for stream in streams:
+            stream_url = stream.get('url')
+            if not stream_url:
+                continue
+            protocol = stream.get('protocol')
+            if protocol == 'DASH':
+                formats.extend(self._extract_mpd_formats(
+                    stream_url, video_id, mpd_id='dash', fatal=False))
+            elif protocol == 'HLS':
+                formats.extend(self._extract_m3u8_formats(
+                    stream_url, video_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif protocol == 'MSS':
+                formats.extend(self._extract_ism_formats(
+                    stream_url, video_id, ism_id='mss', fatal=False))
+
+        if not formats:
+            error = item.get('error')
+            if error:
+                if error in ('Данное видео недоступно для просмотра на территории этой страны', 'Данное видео доступно для просмотра только на территории России'):
+                    self.raise_geo_restricted(countries=['RU'])
+                raise
ExtractorError(error, expected=True) self._sort_formats(formats) - thumbnail = xpath_text(item, './thumbnail_url') - duration = int_or_none(xpath_text(item, './duration')) - view_count = int_or_none(xpath_text(item, './views')) - comment_count = int_or_none(xpath_text(item, './count_comments')) - age_limit = int_or_none(xpath_text(item, './min_age')) - - series = xpath_text(item, './project_name') - episode = xpath_text(item, './episode_name') - return { 'id': video_id, 'title': title, 'series': series, + 'season': season, 'episode': episode, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, - 'age_limit': age_limit, + 'thumbnail': item.get('thumbnail_url'), + 'duration': int_or_none(item.get('duration')), + 'view_count': int_or_none(item.get('views')), + 'age_limit': int_or_none(item.get('min_age')), 'formats': formats, } -class VideomoreVideoIE(InfoExtractor): +class VideomoreVideoIE(VideomoreBaseIE): IE_NAME = 'videomore:video' - _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P<id>[^/?#&]+)(?:/*|[?#&].*?)$' + _VALID_URL = VideomoreBaseIE._VALID_URL_BASE + r'(?P<id>(?:(?:[^/]+/){2})?[^/?#&]+)(?:/*|[?#&].*?)$' _TESTS = [{ # single video with og:video:iframe 'url': 'http://videomore.ru/elki_3', @@ -174,10 +230,25 @@ class VideomoreVideoIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Requires logging in', }, { # season single series with og:video:iframe 'url': 'http://videomore.ru/poslednii_ment/1_sezon/14_seriya', - 'only_matching': True, + 'info_dict': { + 'id': '352317', + 'ext': 'mp4', + 'title': 'Последний мент 1 сезон 14 серия', + 'series': 'Последний мент', + 'season': '1 сезон', + 'episode': '14 серия', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2464, + 'age_limit': 16, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://videomore.ru/sejchas_v_seti/serii_221-240/226_vypusk', 'only_matching': True, @@ -197,9 +268,13 @@ class VideomoreVideoIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'redirects to https://more.tv/' }, { 'url': 'https://videomore.ru/molodezhka/6_sezon/29_seriya?utm_so', 'only_matching': True, + }, { + 'url': 'https://more.tv/poslednii_ment/1_sezon/14_seriya', + 'only_matching': True, }] @classmethod @@ -208,38 +283,25 @@ class VideomoreVideoIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - video_url = self._og_search_property( - 'video:iframe', webpage, 'video url', default=None) - - if not video_url: - video_id = self._search_regex( - (r'config\s*:\s*["\']https?://videomore\.ru/video/tracks/(\d+)\.xml', - r'track-id=["\'](\d+)', - r'xcnt_product_id\s*=\s*(\d+)'), webpage, 'video id') - video_url = 'videomore:%s' % video_id - else: - video_id = None - - return self.url_result( - video_url, ie=VideomoreIE.ie_key(), video_id=video_id) + return self._track_url_result(self._download_page_data(display_id)) -class VideomoreSeasonIE(InfoExtractor): +class VideomoreSeasonIE(VideomoreBaseIE): IE_NAME = 'videomore:season' - _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P<id>[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$' + _VALID_URL = VideomoreBaseIE._VALID_URL_BASE + r'(?!embed)(?P<id>[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$' _TESTS = [{ - 'url': 'http://videomore.ru/molodezhka/sezon_promo', + 'url': 'http://videomore.ru/molodezhka/film_o_filme', 'info_dict': { - 'id': 'molodezhka/sezon_promo', - 'title': 'Молодежка Промо', + 'id': 
'molodezhka/film_o_filme', + 'title': 'Фильм о фильме', }, - 'playlist_mincount': 12, + 'playlist_mincount': 3, }, { 'url': 'http://videomore.ru/molodezhka/sezon_promo?utm_so', 'only_matching': True, + }, { + 'url': 'https://more.tv/molodezhka/film_o_filme', + 'only_matching': True, }] @classmethod @@ -249,59 +311,12 @@ class VideomoreSeasonIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - title = self._og_search_title(webpage) - - data = self._parse_json( - self._html_search_regex( - r'\bclass=["\']seasons-tracks["\'][^>]+\bdata-custom-data=(["\'])(?P<value>{.+?})\1', - webpage, 'data', default='{}', group='value'), - display_id, fatal=False) - + season = self._download_page_data(display_id) + season_id = compat_str(season['id']) + tracks = self._download_json( + self._API_BASE_URL + 'seasons/%s/tracks' % season_id, + season_id)['data'] entries = [] - - if data: - episodes = data.get('episodes') - if isinstance(episodes, list): - for ep in episodes: - if not isinstance(ep, dict): - continue - ep_id = int_or_none(ep.get('id')) - ep_url = url_or_none(ep.get('url')) - if ep_id: - e = { - 'url': 'videomore:%s' % ep_id, - 'id': compat_str(ep_id), - } - elif ep_url: - e = {'url': ep_url} - else: - continue - e.update({ - '_type': 'url', - 'ie_key': VideomoreIE.ie_key(), - 'title': str_or_none(ep.get('title')), - 'thumbnail': url_or_none(ep.get('image')), - 'duration': parse_duration(ep.get('duration')), - 'episode_number': int_or_none(ep.get('number')), - 'upload_date': unified_strdate(ep.get('date')), - }) - entries.append(e) - - if not entries: - entries = [ - self.url_result( - 'videomore:%s' % video_id, ie=VideomoreIE.ie_key(), - video_id=video_id) - for video_id in orderedSet(re.findall( - r':(?:id|key)=["\'](\d+)["\']', webpage))] - - if not entries: - entries = [ - self.url_result(item) for item in re.findall( - r'<a[^>]+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"' - % display_id, webpage)] - - return self.playlist_result(entries, display_id, title) + for track in tracks: + entries.append(self._track_url_result(track)) + return self.playlist_result(entries, display_id, season.get('title')) -- GitLab From dfb69009b9d66a893ba8f56c0afef0e101aa0ea2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:16:13 +0100 Subject: [PATCH 153/384] [tv5unis] Add new extractor(closes #22399)(closes #24890) --- haruhi_dl/extractor/extractors.py | 4 + haruhi_dl/extractor/tv5unis.py | 121 ++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 haruhi_dl/extractor/tv5unis.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 1b5f9b65e..825a28907 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1273,6 +1273,10 @@ from .tv2dk import ( from .tv2hu import TV2HuIE from .tv4 import TV4IE from .tv5mondeplus import TV5MondePlusIE +from .tv5unis import ( + TV5UnisVideoIE, + TV5UnisIE, +) from .tva import ( TVAIE, QubIE, diff --git a/haruhi_dl/extractor/tv5unis.py b/haruhi_dl/extractor/tv5unis.py new file mode 100644 index 000000000..eabdc2271 --- /dev/null +++ b/haruhi_dl/extractor/tv5unis.py @@ -0,0 +1,121 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_age_limit, + smuggle_url, + try_get, +) + + +class TV5UnisBaseIE(InfoExtractor): + 
_GEO_COUNTRIES = ['CA'] + + def _real_extract(self, url): + groups = re.match(self._VALID_URL, url).groups() + product = self._download_json( + 'https://api.tv5unis.ca/graphql', groups[0], query={ + 'query': '''{ + %s(%s) { + collection { + title + } + episodeNumber + rating { + name + } + seasonNumber + tags + title + videoElement { + ... on Video { + mediaId + } + } + } +}''' % (self._GQL_QUERY_NAME, self._gql_args(groups)), + })['data'][self._GQL_QUERY_NAME] + media_id = product['videoElement']['mediaId'] + + return { + '_type': 'url_transparent', + 'id': media_id, + 'title': product.get('title'), + 'url': smuggle_url('limelight:media:' + media_id, {'geo_countries': self._GEO_COUNTRIES}), + 'age_limit': parse_age_limit(try_get(product, lambda x: x['rating']['name'])), + 'tags': product.get('tags'), + 'series': try_get(product, lambda x: x['collection']['title']), + 'season_number': int_or_none(product.get('seasonNumber')), + 'episode_number': int_or_none(product.get('episodeNumber')), + 'ie_key': 'LimelightMedia', + } + + +class TV5UnisVideoIE(TV5UnisBaseIE): + IE_NAME = 'tv5unis:video' + _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/[^/]+/(?P<id>\d+)' + _TEST = { + 'url': 'https://www.tv5unis.ca/videos/bande-annonces/71843', + 'md5': '3d794164928bda97fb87a17e89923d9b', + 'info_dict': { + 'id': 'a883684aecb2486cad9bdc7bbe17f861', + 'ext': 'mp4', + 'title': 'Watatatow', + 'duration': 10.01, + } + } + _GQL_QUERY_NAME = 'productById' + + @staticmethod + def _gql_args(groups): + return 'id: %s' % groups + + +class TV5UnisIE(TV5UnisBaseIE): + IE_NAME = 'tv5unis' + _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/(?P<id>[^/]+)(?:/saisons/(?P<season_number>\d+)/episodes/(?P<episode_number>\d+))?/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://www.tv5unis.ca/videos/watatatow/saisons/6/episodes/1', + 'md5': 'a479907d2e531a73e1f8dc48d6388d02', + 'info_dict': { + 'id': 'e5ee23a586c44612a56aad61accf16ef', + 'ext': 'mp4', + 'title': 'Je ne peux pas lui résister', + 'description': "Atys, le nouveau concierge de l'école, a réussi à ébranler la confiance de Mado en affirmant qu\'une médaille, ce n'est que du métal. Comme Mado essaie de lui prouver que ses valeurs sont solides, il veut la mettre à l'épreuve...", + 'subtitles': { + 'fr': 'count:1', + }, + 'duration': 1370, + 'age_limit': 8, + 'tags': 'count:3', + 'series': 'Watatatow', + 'season_number': 6, + 'episode_number': 1, + }, + }, { + 'url': 'https://www.tv5unis.ca/videos/le-voyage-de-fanny', + 'md5': '9ca80ebb575c681d10cae1adff3d4774', + 'info_dict': { + 'id': '726188eefe094d8faefb13381d42bc06', + 'ext': 'mp4', + 'title': 'Le voyage de Fanny', + 'description': "Fanny, 12 ans, cachée dans un foyer loin de ses parents, s'occupe de ses deux soeurs. 
Devant fuir, Fanny prend la tête d'un groupe de huit enfants et s'engage dans un dangereux périple à travers la France occupée pour rejoindre la frontière suisse.", + 'subtitles': { + 'fr': 'count:1', + }, + 'duration': 5587.034, + 'tags': 'count:4', + }, + }] + _GQL_QUERY_NAME = 'productByRootProductSlug' + + @staticmethod + def _gql_args(groups): + args = 'rootProductSlug: "%s"' % groups[0] + if groups[1]: + args += ', seasonNumber: %s, episodeNumber: %s' % groups[1:] + return args -- GitLab From 441fbc40561efb679a1354177d276fbb7c9a6c95 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:16:18 +0100 Subject: [PATCH 154/384] [sky] relax SkySports URL regex (closes #27435) --- haruhi_dl/extractor/sky.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/haruhi_dl/extractor/sky.py b/haruhi_dl/extractor/sky.py index ea30d6e62..681691004 100644 --- a/haruhi_dl/extractor/sky.py +++ b/haruhi_dl/extractor/sky.py @@ -41,8 +41,8 @@ class SkyBaseIE(InfoExtractor): class SkySportsIE(SkyBaseIE): - _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/([^/]+/)*(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', 'md5': '77d59166cddc8d3cb7b13e35eaf0f5ec', 'info_dict': { @@ -52,7 +52,13 @@ class SkySportsIE(SkyBaseIE): 'description': 'md5:e88bda94ae15f7720c5cb467e777bb6d', }, 'add_ie': ['Ooyala'], - } + }, { + 'url': 'https://www.skysports.com/watch/video/sports/f1/12160544/abu-dhabi-gp-the-notebook', + 'only_matching': True, + }, { + 'url': 'https://www.skysports.com/watch/video/tv-shows/12118508/rainford-brent-how-ace-programme-helps', + 'only_matching': True, + }] class SkyNewsIE(SkyBaseIE): -- GitLab From 3463c192f6a7594fc9f7092fa6def7c90311512d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:16:29 +0100 Subject: [PATCH 155/384] =?UTF-8?q?[anvato]=20update=20ANVACK=20table=20an?= =?UTF-8?q?d=20add=20experimental=20token=20generator=20for=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … NFL --- haruhi_dl/extractor/anvato.py | 97 ++++++++++++++++--- .../anvato_token_generator/__init__.py | 7 ++ .../anvato_token_generator/common.py | 6 ++ .../extractor/anvato_token_generator/nfl.py | 30 ++++++ 4 files changed, 129 insertions(+), 11 deletions(-) create mode 100644 haruhi_dl/extractor/anvato_token_generator/__init__.py create mode 100644 haruhi_dl/extractor/anvato_token_generator/common.py create mode 100644 haruhi_dl/extractor/anvato_token_generator/nfl.py diff --git a/haruhi_dl/extractor/anvato.py b/haruhi_dl/extractor/anvato.py index 84e841035..a6410311c 100644 --- a/haruhi_dl/extractor/anvato.py +++ b/haruhi_dl/extractor/anvato.py @@ -9,6 +9,7 @@ import re import time from .common import InfoExtractor +from .anvato_token_generator import NFLTokenGenerator from ..aes import aes_encrypt from ..compat import compat_str from ..utils import ( @@ -116,7 +117,76 @@ class AnvatoIE(InfoExtractor): 'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn', 'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W', 'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ', - 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ' + 
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', + 'X8POa4zPPaKVZHqmWjuEzfP31b1QM9VN': 'Dn5vOY9ooDw7VSl9qztjZI5o0g08mA0z', + 'M2v78QkBMpNJlSPp9diX5F2PBmBy6Bog': 'ka6K32kyo7nDZfNkjQCGWf1lpApXMd1B', + 'bvJ0dQpav07l0hG5JgfVLF2dv1vARwpP': 'BzoQW24GrJZoJfmNodiJKSPeB9B8NOxj', + 'lxQMLg2XZKuEZaWgsqubBxV9INZ6bryY': 'Vm2Mx6noKds9jB71h6urazwlTG3m9x8l', + '04EnjvXeoSmkbJ9ckPs7oY0mcxv7PlyN': 'aXERQP9LMfQVlEDsgGs6eEA1SWznAQ8P', + 'mQbO2ge6BFRWVPYCYpU06YvNt80XLvAX': 'E2BV1NGmasN5v7eujECVPJgwflnLPm2A', + 'g43oeBzJrCml7o6fa5fRL1ErCdeD8z4K': 'RX34mZ6zVH4Nr6whbxIGLv9WSbxEKo8V', + 'VQrDJoP7mtdBzkxhXbSPwGB1coeElk4x': 'j2VejQx0VFKQepAF7dI0mJLKtOVJE18z', + 'WxA5NzLRjCrmq0NUgaU5pdMDuZO7RJ4w': 'lyY5ADLKaIOLEgAsGQCveEMAcqnx3rY9', + 'M4lpMXB71ie0PjMCjdFzVXq0SeRVqz49': 'n2zVkOqaLIv3GbLfBjcwW51LcveWOZ2e', + 'dyDZGEqN8u8nkJZcJns0oxYmtP7KbGAn': 'VXOEqQW9BtEVLajfZQSLEqxgS5B7qn2D', + 'E7QNjrVY5u5mGvgu67IoDgV1CjEND8QR': 'rz8AaDmdKIkLmPNhB5ILPJnjS5PnlL8d', + 'a4zrqjoKlfzg0dwHEWtP31VqcLBpjm4g': 'LY9J16gwETdGWa3hjBu5o0RzuoQDjqXQ', + 'dQP5BZroMsMVLO1hbmT5r2Enu86GjxA6': '7XR3oOdbPF6x3PRFLDCq9RkgsRjAo48V', + 'M4lKNBO1NFe0PjMCj1tzVXq0SeRVqzA9': 'n2zoRqGLRUv3GbLfBmTwW51LcveWOZYe', + 'nAZ7MZdpGCGg1pqFEbsoJOz2C60mv143': 'dYJgdqA9aT4yojETqGi7yNgoFADxqmXP', + '3y1MERYgOuE9NzbFgwhV6Wv2F0YKvbyz': '081xpZDQgC4VadLTavhWQxrku56DAgXV', + 'bmQvmEXr5HWklBMCZOcpE2Z3HBYwqGyl': 'zxXPbVNyMiMAZldhr9FkOmA0fl4aKr2v', + 'wA7oDNYldfr6050Hwxi52lPZiVlB86Ap': 'ZYK16aA7ni0d3l3c34uwpxD7CbReMm8Q', + 'g43MbKMWmFml7o7sJoSRkXxZiXRvJ3QK': 'RX3oBJonvs4Nr6rUWBCGn3matRGqJPXV', + 'mA9VdlqpLS0raGaSDvtoqNrBTzb8XY4q': '0XN4OjBD3fnW7r7IbmtJB4AyfOmlrE2r', + 'mAajOwgkGt17oGoFmEuklMP9H0GnW54d': 'lXbBLPGyzikNGeGujAuAJGjZiwLRxyXR', + 'vy8vjJ9kbUwrRqRu59Cj5dWZfzYErlAb': 'K8l7gpwaGcBpnAnCLNCmPZRdin3eaQX0', + 'xQMWBpR8oHEZaWaSMGUb0avOHjLVYn4Y': 'm2MrN4vEaf9jB7BFy5Srb40jTrN67AYl', + 'xyKEmVO3miRr6D6UVkt7oB8jtD6aJEAv': 'g2ddDebqDfqdgKgswyUKwGjbTWwzq923', + '7Qk0wa2D9FjKapacoJF27aLvUDKkLGA0': 'b2kgBEkephJaMkMTL7s1PLe4Ua6WyP2P', + '3QLg6nqmNTJ5VvVTo7f508LPidz1xwyY': 'g2L1GgpraipmAOAUqmIbBnPxHOmw4MYa', + '3y1B7zZjXTE9NZNSzZSVNPZaTNLjo6Qz': '081b5G6wzH4VagaURmcWbN5mT4JGEe2V', + 'lAqnwvkw6SG6D8DSqmUg6DRLUp0w3G4x': 'O2pbP0xPDFNJjpjIEvcdryOJtpkVM4X5', + 'awA7xd1N0Hr6050Hw2c52lPZiVlB864p': 'GZYKpn4aoT0d3l3c3PiwpxD7CbReMmXQ', + 'jQVqPLl9YHL1WGWtR1HDgWBGT63qRNyV': '6X03ne6vrU4oWyWUN7tQVoajikxJR3Ye', + 'GQRMR8mL7uZK797t7xH3eNzPIP5dOny1': 'm2vqPWGd4U31zWzSyasDRAoMT1PKRp8o', + 'zydq9RdmRhXLkNkfNoTJlMzaF0lWekQB': '3X7LnvE7vH5nkEkSqLiey793Un7dLB8e', + 'VQrDzwkB2IdBzjzu9MHPbEYkSB50gR4x': 'j2VebLzoKUKQeEesmVh0gM1eIp9jKz8z', + 'mAa2wMamBs17oGoFmktklMP9H0GnW54d': 'lXbgP74xZTkNGeGujVUAJGjZiwLRxy8R', + '7yjB6ZLG6sW8R6RF2xcan1KGfJ5dNoyd': 'wXQkPorvPHZ45N5t4Jf6qwg5Tp4xvw29', + 'a4zPpNeWGuzg0m0iX3tPeanGSkRKWXQg': 'LY9oa3QAyHdGW9Wu3Ri5JGeEik7l1N8Q', + 'k2rneA2M38k25cXDwwSknTJlxPxQLZ6M': '61lyA2aEVDzklfdwmmh31saPxQx2VRjp', + 'bK9Zk4OvPnvxduLgxvi8VUeojnjA02eV': 'o5jANYjbeMb4nfBaQvcLAt1jzLzYx6ze', + '5VD6EydM3R9orHmNMGInGCJwbxbQvGRw': 'w3zjmX7g4vnxzCxElvUEOiewkokXprkZ', + '70X35QbVYVYNPUmP9YfbzI06YqYQk2R1': 'vG4Aj2BMjMjoztB7zeFOnCVPJpJ8lMOa', + '26qYwQVG9p1Bks2GgBckjfDJOXOAMgG1': 'r4ev9X0mv5zqJc0yk5IBDcQOwZw8mnwQ', + 'rvVKpA56MBXWlSxMw3cobT5pdkd4Dm7q': '1J7ZkY53pZ645c93owcLZuveE7E8B3rL', + 'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo': 'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo', + 'jdKqRGF16dKsBviMDae7IGDl7oTjEbVV': 'Q09l7vhlNxPFErIOK6BVCe7KnwUW5DVV', + '3QLkogW1OUJ5VvPsrDH56DY2u7lgZWyY': 'g2LRE1V9espmAOPhE4ubj4ZdUA57yDXa', + 
'wyJvWbXGBSdbkEzhv0CW8meou82aqRy8': 'M2wolPvyBIpQGkbT4juedD4ruzQGdK2y', + '7QkdZrzEkFjKap6IYDU2PB0oCNZORmA0': 'b2kN1l96qhJaMkPs9dt1lpjBfwqZoA8P', + 'pvA05113MHG1w3JTYxc6DVlRCjErVz4O': 'gQXeAbblBUnDJ7vujbHvbRd1cxlz3AXO', + 'mA9blJDZwT0raG1cvkuoeVjLC7ZWd54q': '0XN9jRPwMHnW7rvumgfJZOD9CJgVkWYr', + '5QwRN5qKJTvGKlDTmnf7xwNZcjRmvEy9': 'R2GP6LWBJU1QlnytwGt0B9pytWwAdDYy', + 'eyn5rPPbkfw2KYxH32fG1q58CbLJzM40': 'p2gyqooZnS56JWeiDgfmOy1VugOQEBXn', + '3BABn3b5RfPJGDwilbHe7l82uBoR05Am': '7OYZG7KMVhbPdKJS3xcWEN3AuDlLNmXj', + 'xA5zNGXD3HrmqMlF6OS5pdMDuZO7RJ4w': 'yY5DAm6r1IOLE3BCVMFveEMAcqnx3r29', + 'g43PgW3JZfml7o6fDEURL1ErCdeD8zyK': 'RX3aQn1zrS4Nr6whDgCGLv9WSbxEKo2V', + 'lAqp8WbGgiG6D8LTKJcg3O72CDdre1Qx': 'O2pnm6473HNJjpKuVosd3vVeh975yrX5', + 'wyJbYEDxKSdbkJ6S6RhW8meou82aqRy8': 'M2wPm7EgRSpQGlAh70CedD4ruzQGdKYy', + 'M4lgW28nLCe0PVdtaXszVXq0SeRVqzA9': 'n2zmJvg4jHv3G0ETNgiwW51LcveWOZ8e', + '5Qw3OVvp9FvGKlDTmOC7xwNZcjRmvEQ9': 'R2GzDdml9F1Qlnytw9s0B9pytWwAdD8y', + 'vy8a98X7zCwrRqbHrLUjYzwDiK2b70Qb': 'K8lVwzyjZiBpnAaSGeUmnAgxuGOBxmY0', + 'g4eGjJLLoiqRD3Pf9oT5O03LuNbLRDQp': '6XqD59zzpfN4EwQuaGt67qNpSyRBlnYy', + 'g43OPp9boIml7o6fDOIRL1ErCdeD8z4K': 'RX33alNB4s4Nr6whDPUGLv9WSbxEKoXV', + 'xA2ng9OkBcGKzDbTkKsJlx7dUK8R3dA5': 'z2aPnJvzBfObkwGC3vFaPxeBhxoMqZ8K', + 'xyKEgBajZuRr6DEC0Kt7XpD1cnNW9gAv': 'g2ddlEBvRsqdgKaI4jUK9PrgfMexGZ23', + 'BAogww51jIMa2JnH1BcYpXM5F658RNAL': 'rYWDmm0KptlkGv4FGJFMdZmjs9RDE6XR', + 'BAokpg62VtMa2JnH1mHYpXM5F658RNAL': 'rYWryDnlNslkGv4FG4HMdZmjs9RDE62R', + 'a4z1Px5e2hzg0m0iMMCPeanGSkRKWXAg': 'LY9eorNQGUdGW9WuKKf5JGeEik7l1NYQ', + 'kAx69R58kF9nY5YcdecJdl2pFXP53WyX': 'gXyRxELpbfPvLeLSaRil0mp6UEzbZJ8L', + 'BAoY13nwViMa2J2uo2cY6BlETgmdwryL': 'rYWwKzJmNFlkGvGtNoUM9bzwIJVzB1YR', } _MCP_TO_ACCESS_KEY_TABLE = { @@ -134,6 +204,10 @@ class AnvatoIE(InfoExtractor): 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' } + _TOKEN_GENERATORS = { + 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator, + } + _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' @@ -189,19 +263,20 @@ class AnvatoIE(InfoExtractor): video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii') anvrid = md5_text(time.time() * 1000 * random.random())[:30] - payload = { - 'api': { - 'anvrid': anvrid, - 'anvstk': md5_text('%s|%s|%d|%s' % ( - access_key, anvrid, server_time, - self._ANVACK_TABLE.get(access_key, self._API_KEY))), - 'anvts': server_time, - }, + api = { + 'anvrid': anvrid, + 'anvts': server_time, } + if access_key in self._TOKEN_GENERATORS: + api['anvstk2'] = self._TOKEN_GENERATORS[access_key].generate(self, access_key, video_id) + else: + api['anvstk'] = md5_text('%s|%s|%d|%s' % ( + access_key, anvrid, server_time, + self._ANVACK_TABLE.get(access_key, self._API_KEY))) return self._download_json( video_data_url, video_id, transform_source=strip_jsonp, - data=json.dumps(payload).encode('utf-8')) + data=json.dumps({'api': api}).encode('utf-8')) def _get_anvato_videos(self, access_key, video_id): video_data = self._get_video_json(access_key, video_id) @@ -259,7 +334,7 @@ class AnvatoIE(InfoExtractor): 'description': video_data.get('def_description'), 'tags': video_data.get('def_tags', '').split(','), 'categories': video_data.get('categories'), - 'thumbnail': video_data.get('thumbnail'), + 'thumbnail': video_data.get('src_image_url') or video_data.get('thumbnail'), 'timestamp': int_or_none(video_data.get( 'ts_published') or video_data.get('ts_added')), 'uploader': 
video_data.get('mcp_id'), diff --git a/haruhi_dl/extractor/anvato_token_generator/__init__.py b/haruhi_dl/extractor/anvato_token_generator/__init__.py new file mode 100644 index 000000000..6e223db9f --- /dev/null +++ b/haruhi_dl/extractor/anvato_token_generator/__init__.py @@ -0,0 +1,7 @@ +from __future__ import unicode_literals + +from .nfl import NFLTokenGenerator + +__all__ = [ + 'NFLTokenGenerator', +] diff --git a/haruhi_dl/extractor/anvato_token_generator/common.py b/haruhi_dl/extractor/anvato_token_generator/common.py new file mode 100644 index 000000000..b959a903b --- /dev/null +++ b/haruhi_dl/extractor/anvato_token_generator/common.py @@ -0,0 +1,6 @@ +from __future__ import unicode_literals + + +class TokenGenerator: + def generate(self, anvack, mcp_id): + raise NotImplementedError('This method must be implemented by subclasses') diff --git a/haruhi_dl/extractor/anvato_token_generator/nfl.py b/haruhi_dl/extractor/anvato_token_generator/nfl.py new file mode 100644 index 000000000..97a2b245f --- /dev/null +++ b/haruhi_dl/extractor/anvato_token_generator/nfl.py @@ -0,0 +1,30 @@ +from __future__ import unicode_literals + +import json + +from .common import TokenGenerator + + +class NFLTokenGenerator(TokenGenerator): + _AUTHORIZATION = None + + def generate(ie, anvack, mcp_id): + if not NFLTokenGenerator._AUTHORIZATION: + reroute = ie._download_json( + 'https://api.nfl.com/v1/reroute', mcp_id, + data=b'grant_type=client_credentials', + headers={'X-Domain-Id': 100}) + NFLTokenGenerator._AUTHORIZATION = '%s %s' % (reroute.get('token_type') or 'Bearer', reroute['access_token']) + return ie._download_json( + 'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({ + 'query': '''{ + viewer { + mediaToken(anvack: "%s", id: %s) { + token + } + } +}''' % (anvack, mcp_id), + }).encode(), headers={ + 'Authorization': NFLTokenGenerator._AUTHORIZATION, + 'Content-Type': 'application/json', + })['data']['viewer']['mediaToken']['token'] -- GitLab From 7346665442a3f485e8db21aafdbcc21266885e7a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:16:34 +0100 Subject: [PATCH 156/384] [nfl] fix extraction(closes #22245) --- haruhi_dl/extractor/extractors.py | 5 +- haruhi_dl/extractor/nfl.py | 256 +++++++++++------------------- 2 files changed, 96 insertions(+), 165 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 825a28907..6a78ccb97 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -755,7 +755,10 @@ from .nexx import ( NexxIE, NexxEmbedIE, ) -from .nfl import NFLIE +from .nfl import ( + NFLIE, + NFLArticleIE, +) from .nhk import ( NhkVodIE, NhkVodProgramIE, diff --git a/haruhi_dl/extractor/nfl.py b/haruhi_dl/extractor/nfl.py index 460deb162..e234fad38 100644 --- a/haruhi_dl/extractor/nfl.py +++ b/haruhi_dl/extractor/nfl.py @@ -4,19 +4,15 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlparse, -) from ..utils import ( - ExtractorError, - int_or_none, - remove_end, + clean_html, + determine_ext, + get_element_by_class, ) -class NFLIE(InfoExtractor): - IE_NAME = 'nfl.com' - _VALID_URL = r'''(?x) +class NFLBaseIE(InfoExtractor): + _VALID_URL_BASE = r'''(?x) https?:// (?P<host> (?:www\.)? 
@@ -34,15 +30,15 @@ class NFLIE(InfoExtractor): houstontexans| colts| jaguars| - titansonline| + (?:titansonline|tennesseetitans)| denverbroncos| - kcchiefs| + (?:kc)?chiefs| raiders| chargers| dallascowboys| giants| philadelphiaeagles| - redskins| + (?:redskins|washingtonfootball)| chicagobears| detroitlions| packers| @@ -52,180 +48,112 @@ class NFLIE(InfoExtractor): neworleanssaints| buccaneers| azcardinals| - stlouisrams| + (?:stlouis|the)rams| 49ers| seahawks )\.com| .+?\.clubs\.nfl\.com ) )/ - (?:.+?/)* - (?P<id>[^/#?&]+) ''' + _VIDEO_CONFIG_REGEX = r'<script[^>]+id="[^"]*video-config-[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}[^"]*"[^>]*>\s*({.+})' + + def _parse_video_config(self, video_config, display_id): + video_config = self._parse_json(video_config, display_id) + item = video_config['playlist'][0] + mcp_id = item.get('mcpID') + if mcp_id: + info = self.url_result( + 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:' + mcp_id, + 'Anvato', mcp_id) + else: + media_id = item.get('id') or item['entityId'] + title = item['title'] + item_url = item['url'] + info = {'id': media_id} + ext = determine_ext(item_url) + if ext == 'm3u8': + info['formats'] = self._extract_m3u8_formats(item_url, media_id, 'mp4') + self._sort_formats(info['formats']) + else: + info['url'] = item_url + if item.get('audio') is True: + info['vcodec'] = 'none' + is_live = video_config.get('live') is True + thumbnails = None + image_url = item.get(item.get('imageSrc')) or item.get(item.get('posterImage')) + if image_url: + thumbnails = [{ + 'url': image_url, + 'ext': determine_ext(image_url, 'jpg'), + }] + info.update({ + 'title': self._live_title(title) if is_live else title, + 'is_live': is_live, + 'description': clean_html(item.get('description')), + 'thumbnails': thumbnails, + }) + return info + + +class NFLIE(NFLBaseIE): + IE_NAME = 'nfl.com' + _VALID_URL = NFLBaseIE._VALID_URL_BASE + r'(?:videos?|listen|audio)/(?P<id>[^/#?&]+)' _TESTS = [{ - 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', - 'md5': '394ef771ddcd1354f665b471d78ec4c6', + 'url': 'https://www.nfl.com/videos/baker-mayfield-s-game-changing-plays-from-3-td-game-week-14', 'info_dict': { - 'id': '0ap3000000398478', + 'id': '899441', 'ext': 'mp4', - 'title': 'Week 3: Redskins vs. Eagles highlights', - 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', - 'upload_date': '20140921', - 'timestamp': 1411337580, + 'title': "Baker Mayfield's game-changing plays from 3-TD game Week 14", + 'description': 'md5:85e05a3cc163f8c344340f220521136d', + 'upload_date': '20201215', + 'timestamp': 1608009755, 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'NFL', } }, { - 'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266', - 'md5': 'cf85bdb4bc49f6e9d3816d130c78279c', + 'url': 'https://www.chiefs.com/listen/patrick-mahomes-travis-kelce-react-to-win-over-dolphins-the-breakdown', + 'md5': '6886b32c24b463038c760ceb55a34566', 'info_dict': { - 'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266', - 'ext': 'mp4', - 'title': 'LIVE: Post Game vs. 
Browns', - 'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8', - 'upload_date': '20131229', - 'timestamp': 1388354455, - 'thumbnail': r're:^https?://.*\.jpg$', + 'id': 'd87e8790-3e14-11eb-8ceb-ff05c2867f99', + 'ext': 'mp3', + 'title': 'Patrick Mahomes, Travis Kelce React to Win Over Dolphins | The Breakdown', + 'description': 'md5:12ada8ee70e6762658c30e223e095075', } }, { - 'url': 'http://www.nfl.com/news/story/0ap3000000467586/article/patriots-seahawks-involved-in-lategame-skirmish', - 'info_dict': { - 'id': '0ap3000000467607', - 'ext': 'mp4', - 'title': 'Frustrations flare on the field', - 'description': 'Emotions ran high at the end of the Super Bowl on both sides of the ball after a dramatic finish.', - 'timestamp': 1422850320, - 'upload_date': '20150202', - }, - }, { - 'url': 'http://www.patriots.com/video/2015/09/18/10-days-gillette', - 'md5': '4c319e2f625ffd0b481b4382c6fc124c', - 'info_dict': { - 'id': 'n-238346', - 'ext': 'mp4', - 'title': '10 Days at Gillette', - 'description': 'md5:8cd9cd48fac16de596eadc0b24add951', - 'timestamp': 1442618809, - 'upload_date': '20150918', - }, - }, { - # lowercase data-contentid - 'url': 'http://www.steelers.com/news/article-1/Tomlin-on-Ben-getting-Vick-ready/56399c96-4160-48cf-a7ad-1d17d4a3aef7', - 'info_dict': { - 'id': '12693586-6ea9-4743-9c1c-02c59e4a5ef2', - 'ext': 'mp4', - 'title': 'Tomlin looks ahead to Ravens on a short week', - 'description': 'md5:32f3f7b139f43913181d5cbb24ecad75', - 'timestamp': 1443459651, - 'upload_date': '20150928', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood', + 'url': 'https://www.buffalobills.com/video/buffalo-bills-military-recognition-week-14', 'only_matching': True, }, { - 'url': 'http://www.buffalobills.com/video/videos/Rex_Ryan_Show_World_Wide_Rex/b1dcfab2-3190-4bb1-bfc0-d6e603d6601a', + 'url': 'https://www.raiders.com/audio/instant-reactions-raiders-week-14-loss-to-indianapolis-colts-espn-jason-fitz', 'only_matching': True, }] - @staticmethod - def prepend_host(host, url): - if not url.startswith('http'): - if not url.startswith('/'): - url = '/%s' % url - url = 'http://{0:}{1:}'.format(host, url) - return url - - @staticmethod - def format_from_stream(stream, protocol, host, path_prefix='', - preference=0, note=None): - url = '{protocol:}://{host:}/{prefix:}{path:}'.format( - protocol=protocol, - host=host, - prefix=path_prefix, - path=stream.get('path'), - ) - return { - 'url': url, - 'vbr': int_or_none(stream.get('rate', 0), 1000), - 'preference': preference, - 'format_note': note, - } - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, host = mobj.group('id'), mobj.group('host') - - webpage = self._download_webpage(url, video_id) - - config_url = NFLIE.prepend_host(host, self._search_regex( - r'(?:(?:config|configURL)\s*:\s*|<nflcs:avplayer[^>]+data-config\s*=\s*)(["\'])(?P<config>.+?)\1', - webpage, 'config URL', default='static/content/static/config/video/config.json', - group='config')) - # For articles, the id in the url is not the video id - video_id = self._search_regex( - r'(?:<nflcs:avplayer[^>]+data-content[Ii]d\s*=\s*|content[Ii]d\s*:\s*)(["\'])(?P<id>(?:(?!\1).)+)\1', - webpage, 'video id', default=video_id, group='id') - config = self._download_json(config_url, video_id, 'Downloading player config') - url_template = NFLIE.prepend_host( - host, '{contentURLTemplate:}'.format(**config)) - video_data = self._download_json( - 
url_template.format(id=video_id), video_id) - - formats = [] - cdn_data = video_data.get('cdnData', {}) - streams = cdn_data.get('bitrateInfo', []) - if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM': - parts = compat_urllib_parse_urlparse(cdn_data.get('uri')) - protocol, host = parts.scheme, parts.netloc - for stream in streams: - formats.append( - NFLIE.format_from_stream(stream, protocol, host)) - else: - cdns = config.get('cdns') - if not cdns: - raise ExtractorError('Failed to get CDN data', expected=True) - - for name, cdn in cdns.items(): - # LimeLight streams don't seem to work - if cdn.get('name') == 'LIMELIGHT': - continue + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + return self._parse_video_config(self._search_regex( + self._VIDEO_CONFIG_REGEX, webpage, 'video config'), display_id) - protocol = cdn.get('protocol') - host = remove_end(cdn.get('host', ''), '/') - if not (protocol and host): - continue - prefix = cdn.get('pathprefix', '') - if prefix and not prefix.endswith('/'): - prefix = '%s/' % prefix - - preference = 0 - if protocol == 'rtmp': - preference = -2 - elif 'prog' in name.lower(): - preference = 1 - - for stream in streams: - formats.append( - NFLIE.format_from_stream(stream, protocol, host, - prefix, preference, name)) - - self._sort_formats(formats) - - thumbnail = None - for q in ('xl', 'l', 'm', 's', 'xs'): - thumbnail = video_data.get('imagePaths', {}).get(q) - if thumbnail: - break +class NFLArticleIE(NFLBaseIE): + IE_NAME = 'nfl.com:article' + _VALID_URL = NFLBaseIE._VALID_URL_BASE + r'news/(?P<id>[^/#?&]+)' + _TEST = { + 'url': 'https://www.buffalobills.com/news/the-only-thing-we-ve-earned-is-the-noise-bills-coaches-discuss-handling-rising-e', + 'info_dict': { + 'id': 'the-only-thing-we-ve-earned-is-the-noise-bills-coaches-discuss-handling-rising-e', + 'title': "'The only thing we've earned is the noise' | Bills coaches discuss handling rising expectations", + }, + 'playlist_count': 4, + } - return { - 'id': video_id, - 'title': video_data.get('headline'), - 'formats': formats, - 'description': video_data.get('caption'), - 'duration': video_data.get('duration'), - 'thumbnail': thumbnail, - 'timestamp': int_or_none(video_data.get('posted'), 1000), - } + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + entries = [] + for video_config in re.findall(self._VIDEO_CONFIG_REGEX, webpage): + entries.append(self._parse_video_config(video_config, display_id)) + title = clean_html(get_element_by_class( + 'nfl-c-article__title', webpage)) or self._html_search_meta( + ['og:title', 'twitter:title'], webpage) + return self.playlist_result(entries, display_id, title) -- GitLab From 794a3becfbbc08a4c6268c38ac884b040a2cccc0 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:17:04 +0100 Subject: [PATCH 157/384] [asiancrush] fix extraction and add support for retrocrush.tv closes #25577 closes #25829 --- haruhi_dl/extractor/asiancrush.py | 221 +++++++++++++++++++----------- 1 file changed, 138 insertions(+), 83 deletions(-) diff --git a/haruhi_dl/extractor/asiancrush.py b/haruhi_dl/extractor/asiancrush.py index 0348e680c..66ce7c686 100644 --- a/haruhi_dl/extractor/asiancrush.py +++ b/haruhi_dl/extractor/asiancrush.py @@ -1,27 +1,91 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import re from .common import InfoExtractor from .kaltura import KalturaIE -from ..utils import extract_attributes 
+from ..utils import ( + extract_attributes, + int_or_none, + OnDemandPagedList, + parse_age_limit, + strip_or_none, + try_get, +) + + +class AsianCrushBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?(?P<host>(?:(?:asiancrush|yuyutv|midnightpulp)\.com|(?:cocoro|retrocrush)\.tv))' + _KALTURA_KEYS = [ + 'video_url', 'progressive_url', 'download_url', 'thumbnail_url', + 'widescreen_thumbnail_url', 'screencap_widescreen', + ] + _API_SUFFIX = {'retrocrush.tv': '-ott'} + + def _call_api(self, host, endpoint, video_id, query, resource): + return self._download_json( + 'https://api%s.%s/%s' % (self._API_SUFFIX.get(host, ''), host, endpoint), video_id, + 'Downloading %s JSON metadata' % resource, query=query, + headers=self.geo_verification_headers())['objects'] + + def _download_object_data(self, host, object_id, resource): + return self._call_api( + host, 'search', object_id, {'id': object_id}, resource)[0] + + def _get_object_description(self, obj): + return strip_or_none(obj.get('long_description') or obj.get('short_description')) + + def _parse_video_data(self, video): + title = video['name'] + + entry_id, partner_id = [None] * 2 + for k in self._KALTURA_KEYS: + k_url = video.get(k) + if k_url: + mobj = re.search(r'/p/(\d+)/.+?/entryId/([^/]+)/', k_url) + if mobj: + partner_id, entry_id = mobj.groups() + break + + meta_categories = try_get(video, lambda x: x['meta']['categories'], list) or [] + categories = list(filter(None, [c.get('name') for c in meta_categories])) + + show_info = video.get('show_info') or {} + + return { + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'ie_key': KalturaIE.ie_key(), + 'id': entry_id, + 'title': title, + 'description': self._get_object_description(video), + 'age_limit': parse_age_limit(video.get('mpaa_rating') or video.get('tv_rating')), + 'categories': categories, + 'series': show_info.get('show_name'), + 'season_number': int_or_none(show_info.get('season_num')), + 'season_id': show_info.get('season_id'), + 'episode_number': int_or_none(show_info.get('episode_num')), + } -class AsianCrushIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?(?P<host>(?:(?:asiancrush|yuyutv|midnightpulp)\.com|cocoro\.tv))' - _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' % _VALID_URL_BASE +class AsianCrushIE(AsianCrushBaseIE): + _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' % AsianCrushBaseIE._VALID_URL_BASE _TESTS = [{ - 'url': 'https://www.asiancrush.com/video/012869v/women-who-flirt/', + 'url': 'https://www.asiancrush.com/video/004289v/women-who-flirt', 'md5': 'c3b740e48d0ba002a42c0b72857beae6', 'info_dict': { 'id': '1_y4tmjm5r', 'ext': 'mp4', 'title': 'Women Who Flirt', - 'description': 'md5:7e986615808bcfb11756eb503a751487', + 'description': 'md5:b65c7e0ae03a85585476a62a186f924c', 'timestamp': 1496936429, 'upload_date': '20170608', 'uploader_id': 'craig@crifkin.com', + 'age_limit': 13, + 'categories': 'count:5', + 'duration': 5812, }, }, { 'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/', @@ -41,67 +105,35 @@ class AsianCrushIE(InfoExtractor): }, { 'url': 'https://www.cocoro.tv/video/the-wonderful-wizard-of-oz/008878v-the-wonderful-wizard-of-oz-ep01/', 'only_matching': True, + }, { + 'url': 'https://www.retrocrush.tv/video/true-tears/012328v-i...gave-away-my-tears', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - video_id = mobj.group('id') - - webpage = self._download_webpage(url, video_id) 
+ host, video_id = re.match(self._VALID_URL, url).groups() - entry_id, partner_id, title = [None] * 3 - - vars = self._parse_json( - self._search_regex( + if host == 'cocoro.tv': + webpage = self._download_webpage(url, video_id) + embed_vars = self._parse_json(self._search_regex( r'iEmbedVars\s*=\s*({.+?})', webpage, 'embed vars', - default='{}'), video_id, fatal=False) - if vars: - entry_id = vars.get('entry_id') - partner_id = vars.get('partner_id') - title = vars.get('vid_label') - - if not entry_id: - entry_id = self._search_regex( - r'\bentry_id["\']\s*:\s*["\'](\d+)', webpage, 'entry id') + default='{}'), video_id, fatal=False) or {} + video_id = embed_vars.get('entry_id') or video_id - player = self._download_webpage( - 'https://api.%s/embeddedVideoPlayer' % host, video_id, - query={'id': entry_id}) + video = self._download_object_data(host, video_id, 'video') + return self._parse_video_data(video) - kaltura_id = self._search_regex( - r'entry_id["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', player, - 'kaltura id', group='id') - if not partner_id: - partner_id = self._search_regex( - r'/p(?:artner_id)?/(\d+)', player, 'partner id', - default='513551') - - description = self._html_search_regex( - r'(?s)<div[^>]+\bclass=["\']description["\'][^>]*>(.+?)</div>', - webpage, 'description', fatal=False) - - return { - '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (partner_id, kaltura_id), - 'ie_key': KalturaIE.ie_key(), - 'id': video_id, - 'title': title, - 'description': description, - } - - -class AsianCrushPlaylistIE(InfoExtractor): - _VALID_URL = r'%s/series/0+(?P<id>\d+)s\b' % AsianCrushIE._VALID_URL_BASE +class AsianCrushPlaylistIE(AsianCrushBaseIE): + _VALID_URL = r'%s/series/0+(?P<id>\d+)s\b' % AsianCrushBaseIE._VALID_URL_BASE _TESTS = [{ - 'url': 'https://www.asiancrush.com/series/012481s/scholar-walks-night/', + 'url': 'https://www.asiancrush.com/series/006447s/fruity-samurai', 'info_dict': { - 'id': '12481', - 'title': 'Scholar Who Walks the Night', - 'description': 'md5:7addd7c5132a09fd4741152d96cce886', + 'id': '6447', + 'title': 'Fruity Samurai', + 'description': 'md5:7535174487e4a202d3872a7fc8f2f154', }, - 'playlist_count': 20, + 'playlist_count': 13, }, { 'url': 'https://www.yuyutv.com/series/013920s/peep-show/', 'only_matching': True, @@ -111,35 +143,58 @@ class AsianCrushPlaylistIE(InfoExtractor): }, { 'url': 'https://www.cocoro.tv/series/008549s/the-wonderful-wizard-of-oz/', 'only_matching': True, + }, { + 'url': 'https://www.retrocrush.tv/series/012355s/true-tears', + 'only_matching': True, }] + _PAGE_SIZE = 1000000000 + + def _fetch_page(self, domain, parent_id, page): + videos = self._call_api( + domain, 'getreferencedobjects', parent_id, { + 'max': self._PAGE_SIZE, + 'object_type': 'video', + 'parent_id': parent_id, + 'start': page * self._PAGE_SIZE, + }, 'page %d' % (page + 1)) + for video in videos: + yield self._parse_video_data(video) def _real_extract(self, url): - playlist_id = self._match_id(url) - - webpage = self._download_webpage(url, playlist_id) - - entries = [] - - for mobj in re.finditer( - r'<a[^>]+href=(["\'])(?P<url>%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL, - webpage): - attrs = extract_attributes(mobj.group(0)) - if attrs.get('class') == 'clearfix': - entries.append(self.url_result( - mobj.group('url'), ie=AsianCrushIE.ie_key())) - - title = self._html_search_regex( - r'(?s)<h1\b[^>]\bid=["\']movieTitle[^>]+>(.+?)</h1>', webpage, - 'title', default=None) or self._og_search_title( - webpage, default=None) or self._html_search_meta( - 
'twitter:title', webpage, 'title', - default=None) or self._search_regex( - r'<title>([^<]+)', webpage, 'title', fatal=False) - if title: - title = re.sub(r'\s*\|\s*.+?$', '', title) - - description = self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'twitter:description', webpage, 'description', fatal=False) + host, playlist_id = re.match(self._VALID_URL, url).groups() + + if host == 'cocoro.tv': + webpage = self._download_webpage(url, playlist_id) + + entries = [] + + for mobj in re.finditer( + r']+href=(["\'])(?P%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL, + webpage): + attrs = extract_attributes(mobj.group(0)) + if attrs.get('class') == 'clearfix': + entries.append(self.url_result( + mobj.group('url'), ie=AsianCrushIE.ie_key())) + + title = self._html_search_regex( + r'(?s)]\bid=["\']movieTitle[^>]+>(.+?)', webpage, + 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', + default=None) or self._search_regex( + r'([^<]+)', webpage, 'title', fatal=False) + if title: + title = re.sub(r'\s*\|\s*.+?$', '', title) + + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage, 'description', fatal=False) + else: + show = self._download_object_data(host, playlist_id, 'show') + title = show.get('name') + description = self._get_object_description(show) + entries = OnDemandPagedList( + functools.partial(self._fetch_page, host, playlist_id), + self._PAGE_SIZE) return self.playlist_result(entries, playlist_id, title, description) -- GitLab From 597505ed41af92c8c35b954d1cfc37f65dffdaf2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:19:00 +0100 Subject: [PATCH 158/384] [zaq1] Remove extractor --- haruhi_dl/extractor/extractors.py | 1 - haruhi_dl/extractor/zaq1.py | 101 ------------------------------ 2 files changed, 102 deletions(-) delete mode 100644 haruhi_dl/extractor/zaq1.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 6a78ccb97..39fa151e6 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1586,7 +1586,6 @@ from .youtube import ( YoutubeTruncatedURLIE, ) from .zapiks import ZapiksIE -from .zaq1 import Zaq1IE from .zattoo import ( BBVTVIE, EinsUndEinsTVIE, diff --git a/haruhi_dl/extractor/zaq1.py b/haruhi_dl/extractor/zaq1.py deleted file mode 100644 index 889aff5d8..000000000 --- a/haruhi_dl/extractor/zaq1.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_timestamp, -) - - -class Zaq1IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'http://zaq1.pl/video/xev0e', - 'md5': '24a5eb3f052e604ae597c4d0d19b351e', - 'info_dict': { - 'id': 'xev0e', - 'title': 'DJ NA WESELE. 
TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa', - 'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147', - 'ext': 'mp4', - 'duration': 511, - 'timestamp': 1490896361, - 'uploader': 'Anonim', - 'upload_date': '20170330', - 'view_count': int, - } - }, { - # malformed JSON-LD - 'url': 'http://zaq1.pl/video/x81vn', - 'info_dict': { - 'id': 'x81vn', - 'title': 'SEKRETNE ŻYCIE WALTERA MITTY', - 'ext': 'mp4', - 'duration': 6234, - 'timestamp': 1493494860, - 'uploader': 'Anonim', - 'upload_date': '20170429', - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Failed to parse JSON'], - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video_url = self._search_regex( - r'data-video-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'video url', group='url') - - info = self._search_json_ld(webpage, video_id, fatal=False) - - def extract_data(field, name, fatal=False): - return self._search_regex( - r'data-%s=(["\'])(?P(?:(?!\1).)+)\1' % field, - webpage, field, fatal=fatal, group='field') - - if not info.get('title'): - info['title'] = extract_data('file-name', 'title', fatal=True) - - if not info.get('duration'): - info['duration'] = int_or_none(extract_data('duration', 'duration')) - - if not info.get('thumbnail'): - info['thumbnail'] = extract_data('photo-url', 'thumbnail') - - if not info.get('timestamp'): - info['timestamp'] = unified_timestamp(self._html_search_meta( - 'uploadDate', webpage, 'timestamp')) - - if not info.get('interactionCount'): - info['view_count'] = int_or_none(self._html_search_meta( - 'interactionCount', webpage, 'view count')) - - uploader = self._html_search_regex( - r'Wideo dodał:\s*]*>([^<]+)', webpage, 'uploader', - fatal=False) - - width = int_or_none(self._html_search_meta( - 'width', webpage, fatal=False)) - height = int_or_none(self._html_search_meta( - 'height', webpage, fatal=False)) - - info.update({ - 'id': video_id, - 'formats': [{ - 'url': video_url, - 'width': width, - 'height': height, - 'http_headers': { - 'Referer': url, - }, - }], - 'uploader': uploader, - }) - - return info -- GitLab From 76c441edf0ddc26d165b679f4ceb52a96237c7d5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:20:36 +0100 Subject: [PATCH 159/384] [anvato] Disable NFLTokenGenerator(closes #27449) --- haruhi_dl/extractor/anvato.py | 4 ++-- haruhi_dl/extractor/nfl.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/anvato.py b/haruhi_dl/extractor/anvato.py index a6410311c..98c5e6d38 100644 --- a/haruhi_dl/extractor/anvato.py +++ b/haruhi_dl/extractor/anvato.py @@ -9,7 +9,7 @@ import re import time from .common import InfoExtractor -from .anvato_token_generator import NFLTokenGenerator +# from .anvato_token_generator import NFLTokenGenerator from ..aes import aes_encrypt from ..compat import compat_str from ..utils import ( @@ -205,7 +205,7 @@ class AnvatoIE(InfoExtractor): } _TOKEN_GENERATORS = { - 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator, + # 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator, } _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' diff --git a/haruhi_dl/extractor/nfl.py b/haruhi_dl/extractor/nfl.py index e234fad38..871923e4c 100644 --- a/haruhi_dl/extractor/nfl.py +++ b/haruhi_dl/extractor/nfl.py @@ -57,6 +57,7 @@ class NFLBaseIE(InfoExtractor): )/ ''' _VIDEO_CONFIG_REGEX = 
r']+id="[^"]*video-config-[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}[^"]*"[^>]*>\s*({.+})' + _WORKING = False def _parse_video_config(self, video_config, display_id): video_config = self._parse_json(video_config, display_id) -- GitLab From 4f7380c8f5f52840d6210c62bff276c94f463bf3 Mon Sep 17 00:00:00 2001 From: Trevor Nelson <25140503+trevnels@users.noreply.github.com> Date: Fri, 26 Feb 2021 15:20:42 +0100 Subject: [PATCH 160/384] [redditr] Extract duration (#27426) --- haruhi_dl/extractor/reddit.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/haruhi_dl/extractor/reddit.py b/haruhi_dl/extractor/reddit.py index 663f622b3..3b2abb262 100644 --- a/haruhi_dl/extractor/reddit.py +++ b/haruhi_dl/extractor/reddit.py @@ -7,6 +7,7 @@ from ..utils import ( ExtractorError, int_or_none, float_or_none, + try_get, url_or_none, ) @@ -59,6 +60,7 @@ class RedditRIE(InfoExtractor): 'timestamp': 1501941939, 'upload_date': '20170805', 'uploader': 'Antw87', + 'duration': 12, 'like_count': int, 'dislike_count': int, 'comment_count': int, @@ -123,6 +125,10 @@ class RedditRIE(InfoExtractor): 'thumbnail': url_or_none(data.get('thumbnail')), 'timestamp': float_or_none(data.get('created_utc')), 'uploader': data.get('author'), + 'duration': int_or_none(try_get( + data, + (lambda x: x['media']['reddit_video']['duration'], + lambda x: x['secure_media']['reddit_video']['duration']))), 'like_count': int_or_none(data.get('ups')), 'dislike_count': int_or_none(data.get('downs')), 'comment_count': int_or_none(data.get('num_comments')), -- GitLab From c359b1903460fa751a39ccf6fd079a50b84c8af4 Mon Sep 17 00:00:00 2001 From: renalid Date: Fri, 26 Feb 2021 15:20:47 +0100 Subject: [PATCH 161/384] [generic] Fix RSS itunes thumbnail extraction (#27405) --- haruhi_dl/extractor/generic.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index a321bcd6d..181ccd491 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -35,6 +35,7 @@ from ..utils import ( unsmuggle_url, UnsupportedError, url_or_none, + xpath_attr, xpath_text, xpath_with_ns, ) @@ -223,6 +224,30 @@ class GenericIE(InfoExtractor): }, }], }, + # RSS feed with item with description and thumbnails + { + 'url': 'https://anchor.fm/s/dd00e14/podcast/rss', + 'info_dict': { + 'id': 'https://anchor.fm/s/dd00e14/podcast/rss', + 'title': 're:.*100% Hydrogen.*', + 'description': 're:.*In this episode.*', + }, + 'playlist': [{ + 'info_dict': { + 'ext': 'm4a', + 'id': 'c1c879525ce2cb640b344507e682c36d', + 'title': 're:Hydrogen!', + 'description': 're:.*In this episode we are going.*', + 'timestamp': int, + 'upload_date': '20190908', + 'duration': int, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }], + 'params': { + 'skip_download': True, + }, + }, # RSS feed with enclosures and unsupported link URLs { 'url': 'http://www.hellointernet.fm/podcast?format=rss', @@ -2271,7 +2296,7 @@ class GenericIE(InfoExtractor): 'timestamp': unified_timestamp( xpath_text(it, 'pubDate', default=None)), 'duration': int_or_none(duration) or parse_duration(duration), - 'thumbnail': url_or_none(itunes('image')), + 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')), 'episode': itunes('title'), 'episode_number': int_or_none(itunes('episode')), 'season_number': int_or_none(itunes('season')), -- GitLab From 457ef9b4b50b1bc538a2c98f27c7274d768f45a9 Mon Sep 17 00:00:00 2001 From: 
Sergey M.
Date: Fri, 26 Feb 2021 15:20:52 +0100
Subject: [PATCH 162/384] [generic] Improve RSS age limit extraction

---
 haruhi_dl/extractor/generic.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py
index 181ccd491..240de66da 100644
--- a/haruhi_dl/extractor/generic.py
+++ b/haruhi_dl/extractor/generic.py
@@ -238,10 +238,13 @@ class GenericIE(InfoExtractor):
                 'id': 'c1c879525ce2cb640b344507e682c36d',
                 'title': 're:Hydrogen!',
                 'description': 're:.*In this episode we are going.*',
-                'timestamp': int,
+                'timestamp': 1567977776,
                 'upload_date': '20190908',
-                'duration': int,
+                'duration': 459,
                 'thumbnail': r're:^https?://.*\.jpg$',
+                'episode_number': 1,
+                'season_number': 1,
+                'age_limit': 0,
             },
         }],
         'params': {
@@ -2280,10 +2283,10 @@ class GenericIE(InfoExtractor):
             default=None)
 
         duration = itunes('duration')
-        explicit = itunes('explicit')
-        if explicit == 'true':
+        explicit = (itunes('explicit') or '').lower()
+        if explicit in ('true', 'yes'):
             age_limit = 18
-        elif explicit == 'false':
+        elif explicit in ('false', 'no'):
             age_limit = 0
         else:
             age_limit = None
-- 
GitLab

From 9d2fabe5d43845c5849da50ad11e0d33fefdf6c4 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 15:21:13 +0100
Subject: [PATCH 163/384] [common] remove unwanted query params from unsigned
 akamai manifest URLs

---
 haruhi_dl/extractor/common.py | 9 ++++++++-
 haruhi_dl/extractor/nrk.py    | 3 +--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py
index f845688f5..6b8ebbdef 100644
--- a/haruhi_dl/extractor/common.py
+++ b/haruhi_dl/extractor/common.py
@@ -2621,6 +2621,13 @@ class InfoExtractor(object):
         return entries
 
     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
+        signed = 'hdnea=' in manifest_url
+        if not signed:
+            # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
+            manifest_url = re.sub(
+                r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
+                '', manifest_url).strip('?')
+
         formats = []
 
         hdcore_sign = 'hdcore=3.7.0'
@@ -2646,7 +2653,7 @@ class InfoExtractor(object):
             formats.extend(m3u8_formats)
 
         http_host = hosts.get('http')
-        if http_host and m3u8_formats and 'hdnea=' not in m3u8_url:
+        if http_host and m3u8_formats and not signed:
             REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
             qualities_length = len(qualities)
diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py
index fdf2d7407..b545f291b 100644
--- a/haruhi_dl/extractor/nrk.py
+++ b/haruhi_dl/extractor/nrk.py
@@ -33,8 +33,7 @@ class NRKBaseIE(InfoExtractor):
 
     def _extract_nrk_formats(self, asset_url, video_id):
         if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url):
-            return self._extract_akamai_formats(
-                re.sub(r'(?:b=\d+-\d+|__a__=off)&?', '', asset_url), video_id)
+            return self._extract_akamai_formats(asset_url, video_id)
         asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url)
         formats = self._extract_m3u8_formats(
            asset_url, video_id, 'mp4', 'm3u8_native', fatal=False)
-- 
GitLab
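For reference, a minimal standalone sketch of the clean-up that patch 163
applies to unsigned Akamai manifest URLs; the URL below is made up for
illustration, real query parameters vary per stream:

    import re

    manifest_url = 'https://example-a.akamaihd.net/i/video/master.m3u8?b=100,200&__a__=off&__b__=500'
    if 'hdnea=' not in manifest_url:  # 'hdnea=' marks a signed URL, which is left untouched
        manifest_url = re.sub(
            r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
            '', manifest_url).strip('?')
    print(manifest_url)  # https://example-a.akamaized.net is not touched; prints the URL without b=/__a__/__b__ params

From cb5a16067be0caa65b685014ed5ecb03a902de7f Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 15:21:20 +0100
Subject: [PATCH 164/384] [turner] improve info extraction

---
 haruhi_dl/extractor/cnn.py | 5 +++-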
haruhi_dl/extractor/turner.py | 44 ++++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 10 deletions(-) diff --git a/haruhi_dl/extractor/cnn.py b/haruhi_dl/extractor/cnn.py index 774b71055..2d950fa05 100644 --- a/haruhi_dl/extractor/cnn.py +++ b/haruhi_dl/extractor/cnn.py @@ -96,7 +96,10 @@ class CNNIE(TurnerBaseIE): config['data_src'] % path, page_title, { 'default': { 'media_src': config['media_src'], - } + }, + 'f4m': { + 'host': 'cnn-vh.akamaihd.net', + }, }) diff --git a/haruhi_dl/extractor/turner.py b/haruhi_dl/extractor/turner.py index 4a6cbfbb8..820e3cbe1 100644 --- a/haruhi_dl/extractor/turner.py +++ b/haruhi_dl/extractor/turner.py @@ -6,6 +6,7 @@ import re from .adobepass import AdobePassIE from ..compat import compat_str from ..utils import ( + fix_xml_ampersands, xpath_text, int_or_none, determine_ext, @@ -49,8 +50,13 @@ class TurnerBaseIE(AdobePassIE): self._AKAMAI_SPE_TOKEN_CACHE[secure_path] = token return video_url + '?hdnea=' + token - def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}): - video_data = self._download_xml(data_src, video_id) + def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}, fatal=False): + video_data = self._download_xml( + data_src, video_id, + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=fatal) + if not video_data: + return {} video_id = video_data.attrib['id'] title = xpath_text(video_data, 'headline', fatal=True) content_id = xpath_text(video_data, 'contentId') or video_id @@ -63,12 +69,14 @@ class TurnerBaseIE(AdobePassIE): urls = [] formats = [] + thumbnails = [] + subtitles = {} rex = re.compile( r'(?P[0-9]+)x(?P[0-9]+)(?:_(?P[0-9]+))?') # Possible formats locations: files/file, files/groupFiles/files # and maybe others for video_file in video_data.findall('.//file'): - video_url = video_file.text.strip() + video_url = url_or_none(video_file.text.strip()) if not video_url: continue ext = determine_ext(video_url) @@ -108,9 +116,28 @@ class TurnerBaseIE(AdobePassIE): continue urls.append(video_url) format_id = video_file.get('bitrate') - if ext == 'smil': + if ext in ('scc', 'srt', 'vtt'): + subtitles.setdefault('en', []).append({ + 'ext': ext, + 'url': video_url, + }) + elif ext == 'png': + thumbnails.append({ + 'id': format_id, + 'url': video_url, + }) + elif ext == 'smil': formats.extend(self._extract_smil_formats( video_url, video_id, fatal=False)) + elif re.match(r'https?://[^/]+\.akamaihd\.net/[iz]/', video_url): + formats.extend(self._extract_akamai_formats( + video_url, video_id, { + 'hds': path_data.get('f4m', {}).get('host'), + # nba.cdn.turner.com, ht.cdn.turner.com, ht2.cdn.turner.com + # ht3.cdn.turner.com, i.cdn.turner.com, s.cdn.turner.com + # ssl.cdn.turner.com + 'http': 'pmd.cdn.turner.com', + })) elif ext == 'm3u8': m3u8_formats = self._extract_m3u8_formats( video_url, video_id, 'mp4', @@ -129,7 +156,7 @@ class TurnerBaseIE(AdobePassIE): 'url': video_url, 'ext': ext, } - mobj = rex.search(format_id + video_url) + mobj = rex.search(video_url) if mobj: f.update({ 'width': int(mobj.group('width')), @@ -152,7 +179,6 @@ class TurnerBaseIE(AdobePassIE): formats.append(f) self._sort_formats(formats) - subtitles = {} for source in video_data.findall('closedCaptions/source'): for track in source.findall('track'): track_url = url_or_none(track.get('url')) @@ -168,12 +194,12 @@ class TurnerBaseIE(AdobePassIE): }.get(source.get('format')) }) - thumbnails = [{ - 'id': image.get('cut'), + thumbnails.extend({ + 'id': image.get('cut') or image.get('name'), 'url': 
image.text, 'width': int_or_none(image.get('width')), 'height': int_or_none(image.get('height')), - } for image in video_data.findall('images/image')] + } for image in video_data.findall('images/image')) is_live = xpath_text(video_data, 'isLive') == 'true' -- GitLab From 027f07edd3d84b66286769db847c8e8c15dd665f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:21:32 +0100 Subject: [PATCH 165/384] [nba] rewrite extractor --- haruhi_dl/extractor/extractors.py | 9 +- haruhi_dl/extractor/nba.py | 480 +++++++++++++++++++++++------- 2 files changed, 385 insertions(+), 104 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 39fa151e6..f639ade6c 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -710,7 +710,14 @@ from .nationalgeographic import ( NationalGeographicTVIE, ) from .naver import NaverIE -from .nba import NBAIE +from .nba import ( + NBAWatchEmbedIE, + NBAWatchIE, + NBAWatchCollectionIE, + NBAEmbedIE, + NBAIE, + NBAChannelIE, +) from .nbc import ( CSNNEIE, NBCIE, diff --git a/haruhi_dl/extractor/nba.py b/haruhi_dl/extractor/nba.py index be295a7a3..fbc7adaf4 100644 --- a/haruhi_dl/extractor/nba.py +++ b/haruhi_dl/extractor/nba.py @@ -5,33 +5,137 @@ import re from .turner import TurnerBaseIE from ..compat import ( - compat_urllib_parse_urlencode, - compat_urlparse, + compat_parse_qs, + compat_str, + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, ) from ..utils import ( + int_or_none, + merge_dicts, OnDemandPagedList, - remove_start, + parse_duration, + parse_iso8601, + try_get, + update_url_query, + urljoin, ) -class NBAIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P(?:[^/]+/)+(?P[^?]*?))/?(?:/index\.html)?(?:\?.*)?$' +class NBACVPBaseIE(TurnerBaseIE): + def _extract_nba_cvp_info(self, path, video_id, fatal=False): + return self._extract_cvp_info( + 'http://secure.nba.com/%s' % path, video_id, { + 'default': { + 'media_src': 'http://nba.cdn.turner.com/nba/big', + }, + 'm3u8': { + 'media_src': 'http://nbavod-f.akamaihd.net', + }, + }, fatal=fatal) + + +class NBAWatchBaseIE(NBACVPBaseIE): + _VALID_URL_BASE = r'https?://(?:(?:www\.)?nba\.com(?:/watch)?|watch\.nba\.com)/' + + def _extract_video(self, filter_key, filter_value): + video = self._download_json( + 'https://neulionscnbav2-a.akamaihd.net/solr/nbad_program/usersearch', + filter_value, query={ + 'fl': 'description,image,name,pid,releaseDate,runtime,tags,seoName', + 'q': filter_key + ':' + filter_value, + 'wt': 'json', + })['response']['docs'][0] + + video_id = str(video['pid']) + title = video['name'] + + formats = [] + m3u8_url = (self._download_json( + 'https://watch.nba.com/service/publishpoint', video_id, query={ + 'type': 'video', + 'format': 'json', + 'id': video_id, + }, headers={ + 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1', + }, fatal=False) or {}).get('path') + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats( + re.sub(r'_(?:pc|iphone)\.', '.', m3u8_url), video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + for f in m3u8_formats: + http_f = f.copy() + http_f.update({ + 'format_id': http_f['format_id'].replace('hls-', 'http-'), + 'protocol': 'http', + 'url': http_f['url'].replace('.m3u8', ''), + }) + formats.append(http_f) + + info = { + 'id': video_id, + 'title': title, + 'thumbnail': 
urljoin('https://nbadsdmt.akamaized.net/media/nba/nba/thumbs/', video.get('image')),
+            'description': video.get('description'),
+            'duration': int_or_none(video.get('runtime')),
+            'timestamp': parse_iso8601(video.get('releaseDate')),
+            'tags': video.get('tags'),
+        }
+
+        seo_name = video.get('seoName')
+        if seo_name and re.search(r'\d{4}/\d{2}/\d{2}/', seo_name):
+            base_path = ''
+            if seo_name.startswith('teams/'):
+                base_path += seo_name.split('/')[1] + '/'
+            base_path += 'video/'
+            cvp_info = self._extract_nba_cvp_info(
+                base_path + seo_name + '.xml', video_id, False)
+            if cvp_info:
+                formats.extend(cvp_info['formats'])
+                info = merge_dicts(info, cvp_info)
+
+        self._sort_formats(formats)
+        info['formats'] = formats
+        return info
+
+
+class NBAWatchEmbedIE(NBAWatchBaseIE):
+    IE_NAME = 'nba:watch:embed'
+    _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'embed\?.*?\bid=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'http://watch.nba.com/embed?id=659395',
+        'md5': 'b7e3f9946595f4ca0a13903ce5edd120',
+        'info_dict': {
+            'id': '659395',
+            'ext': 'mp4',
+            'title': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017',
+            'description': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017',
+            'timestamp': 1492228800,
+            'upload_date': '20170415',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self._extract_video('pid', video_id)
+
+
+class NBAWatchIE(NBAWatchBaseIE):
+    IE_NAME = 'nba:watch'
+    _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'(?:nba/)?video/(?P<id>.+?(?=/index\.html)|(?:[^/]+/)*[^/?#&]+)'
     _TESTS = [{
         'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html',
-        'md5': '9e7729d3010a9c71506fd1248f74e4f4',
+        'md5': '9d902940d2a127af3f7f9d2f3dc79c96',
         'info_dict': {
-            'id': '0021200253-okc-bkn-recap',
+            'id': '70946',
             'ext': 'mp4',
             'title': 'Thunder vs. Nets',
             'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.',
             'duration': 181,
-            'timestamp': 1354638466,
+            'timestamp': 1354597200,
             'upload_date': '20121204',
         },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
     }, {
         'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/',
         'only_matching': True,
@@ -39,116 +143,286 @@ class NBAIE(TurnerBaseIE):
         'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
         'md5': 'b2b39b81cf28615ae0c3360a3f9668c4',
         'info_dict': {
-            'id': 'channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba',
+            'id': '330865',
             'ext': 'mp4',
             'title': 'Hawks vs.
Cavaliers Game 1', 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', 'duration': 228, - 'timestamp': 1432134543, - 'upload_date': '20150520', + 'timestamp': 1432094400, + 'upload_date': '20150521', }, - 'expected_warnings': ['Unable to download f4m manifest'], }, { - 'url': 'http://www.nba.com/clippers/news/doc-rivers-were-not-trading-blake', - 'info_dict': { - 'id': 'teams/clippers/2016/02/17/1455672027478-Doc_Feb16_720.mov-297324', - 'ext': 'mp4', - 'title': 'Practice: Doc Rivers - 2/16/16', - 'description': 'Head Coach Doc Rivers addresses the media following practice.', - 'upload_date': '20160216', - 'timestamp': 1455672000, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest'], + 'url': 'http://watch.nba.com/nba/video/channels/nba_tv/2015/06/11/YT_go_big_go_home_Game4_061115', + 'only_matching': True, }, { - 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', - 'info_dict': { - 'id': 'timberwolves', - 'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins', - }, - 'playlist_count': 30, - 'params': { - # Download the whole playlist takes too long time - 'playlist_items': '1-30', - }, + # only CVP mp4 format available + 'url': 'https://watch.nba.com/video/teams/cavaliers/2012/10/15/sloan121015mov-2249106', + 'only_matching': True, }, { - 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', + 'url': 'https://watch.nba.com/video/top-100-dunks-from-the-2019-20-season?plsrc=nba&collection=2019-20-season-highlights', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + collection_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('collection', [None])[0] + if collection_id: + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % display_id) + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % collection_id) + return self.url_result( + 'https://www.nba.com/watch/list/collection/' + collection_id, + NBAWatchCollectionIE.ie_key(), collection_id) + return self._extract_video('seoName', display_id) + + +class NBAWatchCollectionIE(NBAWatchBaseIE): + IE_NAME = 'nba:watch:collection' + _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'list/collection/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://watch.nba.com/list/collection/season-preview-2020', 'info_dict': { - 'id': 'teams/timberwolves/2014/12/12/Wigginsmp4-3462601', - 'ext': 'mp4', - 'title': 'Shootaround Access - Dec. 
12 | Andrew Wiggins', - 'description': 'Wolves rookie Andrew Wiggins addresses the media after Friday\'s shootaround.', - 'upload_date': '20141212', - 'timestamp': 1418418600, + 'id': 'season-preview-2020', }, - 'params': { - 'noplaylist': True, - # m3u8 download - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest'], + 'playlist_mincount': 43, }] + _PAGE_SIZE = 100 - _PAGE_SIZE = 30 + def _fetch_page(self, collection_id, page): + page += 1 + videos = self._download_json( + 'https://content-api-prod.nba.com/public/1/endeavor/video-list/collection/' + collection_id, + collection_id, 'Downloading page %d JSON metadata' % page, query={ + 'count': self._PAGE_SIZE, + 'page': page, + })['results']['videos'] + for video in videos: + program = video.get('program') or {} + seo_name = program.get('seoName') or program.get('slug') + if not seo_name: + continue + yield { + '_type': 'url', + 'id': program.get('id'), + 'title': program.get('title') or video.get('title'), + 'url': 'https://www.nba.com/watch/video/' + seo_name, + 'thumbnail': video.get('image'), + 'description': program.get('description') or video.get('description'), + 'duration': parse_duration(program.get('runtimeHours')), + 'timestamp': parse_iso8601(video.get('releaseDate')), + } - def _fetch_page(self, team, video_id, page): - search_url = 'http://searchapp2.nba.com/nba-search/query.jsp?' + compat_urllib_parse_urlencode({ - 'type': 'teamvideo', - 'start': page * self._PAGE_SIZE + 1, - 'npp': (page + 1) * self._PAGE_SIZE + 1, - 'sort': 'recent', - 'output': 'json', - 'site': team, - }) - results = self._download_json( - search_url, video_id, note='Download page %d of playlist data' % page)['results'][0] - for item in results: - yield self.url_result(compat_urlparse.urljoin('http://www.nba.com/', item['url'])) - - def _extract_playlist(self, orig_path, video_id, webpage): - team = orig_path.split('/')[0] - - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video because of --no-playlist') - video_path = self._search_regex( - r'nbaVideoCore\.firstVideo\s*=\s*\'([^\']+)\';', webpage, 'video path') - video_url = 'http://www.nba.com/%s/video/%s' % (team, video_path) - return self.url_result(video_url) - - self.to_screen('Downloading playlist - add --no-playlist to just download video') - playlist_title = self._og_search_title(webpage, fatal=False) + def _real_extract(self, url): + collection_id = self._match_id(url) entries = OnDemandPagedList( - functools.partial(self._fetch_page, team, video_id), + functools.partial(self._fetch_page, collection_id), self._PAGE_SIZE) + return self.playlist_result(entries, collection_id) - return self.playlist_result(entries, team, playlist_title) - def _real_extract(self, url): - path, video_id = re.match(self._VALID_URL, url).groups() - orig_path = path - if path.startswith('nba/'): - path = path[3:] +class NBABaseIE(NBACVPBaseIE): + _VALID_URL_BASE = r'''(?x) + https?://(?:www\.)?nba\.com/ + (?P + blazers| + bucks| + bulls| + cavaliers| + celtics| + clippers| + grizzlies| + hawks| + heat| + hornets| + jazz| + kings| + knicks| + lakers| + magic| + mavericks| + nets| + nuggets| + pacers| + pelicans| + pistons| + raptors| + rockets| + sixers| + spurs| + suns| + thunder| + timberwolves| + warriors| + wizards + ) + (?:/play\#)?/''' + _CHANNEL_PATH_REGEX = r'video/channel|series' - if 'video/' not in path: - webpage = self._download_webpage(url, video_id) - path = remove_start(self._search_regex(r'data-videoid="([^"]+)"', webpage, 'video id'), 
'/')
+    def _call_api(self, team, content_id, query, resource):
+        return self._download_json(
+            'https://api.nba.net/2/%s/video,imported_video,wsc/' % team,
+            content_id, 'Download %s JSON metadata' % resource,
+            query=query, headers={
+                'accessToken': 'internal|bb88df6b4c2244e78822812cecf1ee1b',
+            })['response']['result']
 
-        if path == '{{id}}':
-            return self._extract_playlist(orig_path, video_id, webpage)
+    def _extract_video(self, video, team, extract_all=True):
+        video_id = compat_str(video['nid'])
+        team = video['brand']
 
-        # See prepareContentId() of pkgCvp.js
-        if path.startswith('video/teams'):
-            path = 'video/channels/proxy/' + path[6:]
+        info = {
+            'id': video_id,
+            'title': video.get('title') or video.get('headline') or video['shortHeadline'],
+            'description': video.get('description'),
+            'timestamp': parse_iso8601(video.get('published')),
+        }
 
-        return self._extract_cvp_info(
-            'http://www.nba.com/%s.xml' % path, video_id, {
-                'default': {
-                    'media_src': 'http://nba.cdn.turner.com/nba/big',
-                },
-                'm3u8': {
-                    'media_src': 'http://nbavod-f.akamaihd.net',
-                },
+        subtitles = {}
+        captions = try_get(video, lambda x: x['videoCaptions']['sidecars'], dict) or {}
+        for caption_url in captions.values():
+            subtitles.setdefault('en', []).append({'url': caption_url})
+
+        formats = []
+        mp4_url = video.get('mp4')
+        if mp4_url:
+            formats.append({
+                'url': mp4_url,
             })
+
+        if extract_all:
+            source_url = video.get('videoSource')
+            if source_url and not source_url.startswith('s3://') and self._is_valid_url(source_url, video_id, 'source'):
+                formats.append({
+                    'format_id': 'source',
+                    'url': source_url,
+                    'preference': 1,
+                })
+
+            m3u8_url = video.get('m3u8')
+            if m3u8_url:
+                if '.akamaihd.net/i/' in m3u8_url:
+                    formats.extend(self._extract_akamai_formats(
+                        m3u8_url, video_id, {'http': 'pmd.cdn.turner.com'}))
+                else:
+                    formats.extend(self._extract_m3u8_formats(
+                        m3u8_url, video_id, 'mp4',
+                        'm3u8_native', m3u8_id='hls', fatal=False))
+
+            content_xml = video.get('contentXml')
+            if team and content_xml:
+                cvp_info = self._extract_nba_cvp_info(
+                    team + content_xml, video_id, fatal=False)
+                if cvp_info:
+                    formats.extend(cvp_info['formats'])
+                    subtitles = self._merge_subtitles(subtitles, cvp_info['subtitles'])
+                    info = merge_dicts(info, cvp_info)
+
+            self._sort_formats(formats)
+        else:
+            info.update(self._embed_url_result(team, video['videoId']))
+
+        info.update({
+            'formats': formats,
+            'subtitles': subtitles,
+        })
+
+        return info
+
+    def _real_extract(self, url):
+        team, display_id = re.match(self._VALID_URL, url).groups()
+        if '/play#/' in url:
+            display_id = compat_urllib_parse_unquote(display_id)
+        else:
+            webpage = self._download_webpage(url, display_id)
+            display_id = self._search_regex(
+                self._CONTENT_ID_REGEX + r'\s*:\s*"([^"]+)"', webpage, 'video id')
+        return self._extract_url_results(team, display_id)
+
+
+class NBAEmbedIE(NBABaseIE):
+    IE_NAME = 'nba:embed'
+    _VALID_URL = r'https?://secure\.nba\.com/assets/amp/include/video/(?:topI|i)frame\.html\?.*?\bcontentId=(?P<id>[^?#&]+)'
+    _TESTS = [{
+        'url': 
'https://secure.nba.com/assets/amp/include/video/topIframe.html?contentId=teams/bulls/2020/12/04/3478774/1607105587854-20201204_SCHEDULE_RELEASE_FINAL_DRUPAL-3478774&team=bulls&adFree=false&profile=71&videoPlayerName=TAMPCVP&baseUrl=&videoAdsection=nba.com_mobile_web_teamsites_chicagobulls&Env=',
+        'only_matching': True,
+    }, {
+        'url': 'https://secure.nba.com/assets/amp/include/video/iframe.html?contentId=2016/10/29/0021600027boschaplay7&adFree=false&profile=71&team=&videoPlayerName=LAMPCVP',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+        content_id = qs['contentId'][0]
+        team = qs.get('team', [None])[0]
+        if not team:
+            return self.url_result(
+                'https://watch.nba.com/video/' + content_id, NBAWatchIE.ie_key())
+        video = self._call_api(team, content_id, {'videoid': content_id}, 'video')[0]
+        return self._extract_video(video, team)
+
+
+class NBAIE(NBABaseIE):
+    IE_NAME = 'nba'
+    _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?!%s)video/(?P<id>(?:[^/]+/)*[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX
+    _TESTS = [{
+        'url': 'https://www.nba.com/bulls/video/teams/bulls/2020/12/04/3478774/1607105587854-20201204schedulereleasefinaldrupal-3478774',
+        'info_dict': {
+            'id': '45039',
+            'ext': 'mp4',
+            'title': 'AND WE BACK.',
+            'description': 'Part 1 of our 2020-21 schedule is here! Watch our games on NBC Sports Chicago.',
+            'duration': 94,
+            'timestamp': 1607112000,
+            'upload_date': '20201218',
+        },
+    }, {
+        'url': 'https://www.nba.com/bucks/play#/video/teams%2Fbucks%2F2020%2F12%2F17%2F64860%2F1608252863446-Op_Dream_16x9-64860',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.nba.com/bucks/play#/video/wsc%2Fteams%2F2787C911AA1ACD154B5377F7577CCC7134B2A4B0',
+        'only_matching': True,
+    }]
+    _CONTENT_ID_REGEX = r'videoID'
+
+    def _extract_url_results(self, team, content_id):
+        return self._embed_url_result(team, content_id)
+
+
+class NBAChannelIE(NBABaseIE):
+    IE_NAME = 'nba:channel'
+    _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?:%s)/(?P<id>[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX
+    _TESTS = [{
+        'url': 'https://www.nba.com/blazers/video/channel/summer_league',
+        'info_dict': {
+            'title': 'Summer League',
+        },
+        'playlist_mincount': 138,
+    }, {
+        'url': 'https://www.nba.com/bucks/play#/series/On%20This%20Date',
+        'only_matching': True,
+    }]
+    _CONTENT_ID_REGEX = r'videoSubCategory'
+    _PAGE_SIZE = 100
+
+    def _fetch_page(self, team, channel, page):
+        results = self._call_api(team, channel, {
+            'channels': channel,
+            'count': self._PAGE_SIZE,
+            'offset': page * self._PAGE_SIZE,
+        }, 'page %d' % (page + 1))
+        for video in results:
+            yield self._extract_video(video, team, False)
+
+    def _extract_url_results(self, team, content_id):
+        entries = OnDemandPagedList(
+            functools.partial(self._fetch_page, team, content_id),
+            self._PAGE_SIZE)
+        return self.playlist_result(entries, playlist_title=content_id)
-- 
GitLab
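For context, a rough standalone sketch of the OnDemandPagedList pattern the
new NBA extractors rely on: pages are fetched lazily, only when a slice needs
them. The toy page function stands in for the real _fetch_page methods (which
query the NBA APIs), and this assumes the utils API matches upstream:

    from haruhi_dl.utils import OnDemandPagedList

    PAGE_SIZE = 2

    def fetch_page(page):
        # stand-in for NBAChannelIE._fetch_page; yields fake entries
        for i in range(PAGE_SIZE):
            yield {'id': page * PAGE_SIZE + i}

    entries = OnDemandPagedList(fetch_page, PAGE_SIZE)
    # requesting three entries only triggers pages 0 and 1
    print(entries.getslice(0, 3))  # [{'id': 0}, {'id': 1}, {'id': 2}]

From ef03683547c9331d62333260d9137207f2bb5ae6 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 15:22:09 +0100
Subject: [PATCH 166/384] [kanalplay] Remove Extractor

---
 haruhi_dl/extractor/extractors.py |  1 -
 haruhi_dl/extractor/kanalplay.py  | 97 ------------------------------
 2 files changed, 98 deletions(-)
 delete mode 100644 haruhi_dl/extractor/kanalplay.py

diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index f639ade6c..b7a2d5eba 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -532,7 +532,6 @@ from .joj import JojIE
 from 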
.jwplatform import JWPlatformIE from .kakao import KakaoIE from .kaltura import KalturaIE -from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE diff --git a/haruhi_dl/extractor/kanalplay.py b/haruhi_dl/extractor/kanalplay.py deleted file mode 100644 index 6c3498c67..000000000 --- a/haruhi_dl/extractor/kanalplay.py +++ /dev/null @@ -1,97 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - srt_subtitles_timecode, -) - - -class KanalPlayIE(InfoExtractor): - IE_DESC = 'Kanal 5/9/11 Play' - _VALID_URL = r'https?://(?:www\.)?kanal(?P5|9|11)play\.se/(?:#!/)?(?:play/)?program/\d+/video/(?P\d+)' - _TESTS = [{ - 'url': 'http://www.kanal5play.se/#!/play/program/3060212363/video/3270012277', - 'info_dict': { - 'id': '3270012277', - 'ext': 'flv', - 'title': 'Saknar både dusch och avlopp', - 'description': 'md5:6023a95832a06059832ae93bc3c7efb7', - 'duration': 2636.36, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.kanal9play.se/#!/play/program/335032/video/246042', - 'only_matching': True, - }, { - 'url': 'http://www.kanal11play.se/#!/play/program/232835958/video/367135199', - 'only_matching': True, - }] - - def _fix_subtitles(self, subs): - return '\r\n\r\n'.join( - '%s\r\n%s --> %s\r\n%s' - % ( - num, - srt_subtitles_timecode(item['startMillis'] / 1000.0), - srt_subtitles_timecode(item['endMillis'] / 1000.0), - item['text'], - ) for num, item in enumerate(subs, 1)) - - def _get_subtitles(self, channel_id, video_id): - subs = self._download_json( - 'http://www.kanal%splay.se/api/subtitles/%s' % (channel_id, video_id), - video_id, 'Downloading subtitles JSON', fatal=False) - return {'sv': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} if subs else {} - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - channel_id = mobj.group('channel_id') - - video = self._download_json( - 'http://www.kanal%splay.se/api/getVideo?format=FLASH&videoId=%s' % (channel_id, video_id), - video_id) - - reasons_for_no_streams = video.get('reasonsForNoStreams') - if reasons_for_no_streams: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, '\n'.join(reasons_for_no_streams)), - expected=True) - - title = video['title'] - description = video.get('description') - duration = float_or_none(video.get('length'), 1000) - thumbnail = video.get('posterUrl') - - stream_base_url = video['streamBaseUrl'] - - formats = [{ - 'url': stream_base_url, - 'play_path': stream['source'], - 'ext': 'flv', - 'tbr': float_or_none(stream.get('bitrate'), 1000), - 'rtmp_real_time': True, - } for stream in video['streams']] - self._sort_formats(formats) - - subtitles = {} - if video.get('hasSubtitle'): - subtitles = self.extract_subtitles(channel_id, video_id) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } -- GitLab From 90988f47724514a8029b7e4bb1426159bb8bfe7d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:22:49 +0100 Subject: [PATCH 167/384] [everyonesmixtape] Remove Extractor --- haruhi_dl/extractor/everyonesmixtape.py | 77 ------------------------- haruhi_dl/extractor/extractors.py | 1 - 2 files changed, 78 deletions(-) delete mode 100644 
haruhi_dl/extractor/everyonesmixtape.py diff --git a/haruhi_dl/extractor/everyonesmixtape.py b/haruhi_dl/extractor/everyonesmixtape.py deleted file mode 100644 index 84a9b750e..000000000 --- a/haruhi_dl/extractor/everyonesmixtape.py +++ /dev/null @@ -1,77 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - sanitized_Request, -) - - -class EveryonesMixtapeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?everyonesmixtape\.com/#/mix/(?P[0-9a-zA-Z]+)(?:/(?P[0-9]))?$' - - _TESTS = [{ - 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi/5', - 'info_dict': { - 'id': '5bfseWNmlds', - 'ext': 'mp4', - 'title': "Passion Pit - \"Sleepyhead\" (Official Music Video)", - 'uploader': 'FKR.TV', - 'uploader_id': 'frenchkissrecords', - 'description': "Music video for \"Sleepyhead\" from Passion Pit's debut EP Chunk Of Change.\nBuy on iTunes: https://itunes.apple.com/us/album/chunk-of-change-ep/id300087641\n\nDirected by The Wilderness.\n\nhttp://www.passionpitmusic.com\nhttp://www.frenchkissrecords.com", - 'upload_date': '20081015' - }, - 'params': { - 'skip_download': True, # This is simply YouTube - } - }, { - 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi', - 'info_dict': { - 'id': 'm7m0jJAbMQi', - 'title': 'Driving', - }, - 'playlist_count': 24 - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') - - pllist_url = 'http://everyonesmixtape.com/mixtape.php?a=getMixes&u=-1&linked=%s&explore=' % playlist_id - pllist_req = sanitized_Request(pllist_url) - pllist_req.add_header('X-Requested-With', 'XMLHttpRequest') - - playlist_list = self._download_json( - pllist_req, playlist_id, note='Downloading playlist metadata') - try: - playlist_no = next(playlist['id'] - for playlist in playlist_list - if playlist['code'] == playlist_id) - except StopIteration: - raise ExtractorError('Playlist id not found') - - pl_url = 'http://everyonesmixtape.com/mixtape.php?a=getMix&id=%s&userId=null&code=' % playlist_no - pl_req = sanitized_Request(pl_url) - pl_req.add_header('X-Requested-With', 'XMLHttpRequest') - playlist = self._download_json( - pl_req, playlist_id, note='Downloading playlist info') - - entries = [{ - '_type': 'url', - 'url': t['url'], - 'title': t['title'], - } for t in playlist['tracks']] - - if mobj.group('songnr'): - songnr = int(mobj.group('songnr')) - 1 - return entries[songnr] - - playlist_title = playlist['mixData']['name'] - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': playlist_title, - 'entries': entries, - } diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index b7a2d5eba..4d3b7bb50 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -344,7 +344,6 @@ from .eurozet import ( EurozetPlayerPodcastIE, EurozetPlayerMusicStreamIE, ) -from .everyonesmixtape import EveryonesMixtapeIE from .expotv import ExpoTVIE from .expressen import ExpressenIE from .extremetube import ExtremeTubeIE -- GitLab From b3acd855b8bf48ef1c70995c1c342cc4813fb244 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:24:05 +0100 Subject: [PATCH 168/384] [niconico] fix playlist extraction(closes #27428) --- haruhi_dl/extractor/niconico.py | 97 ++++++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 26 deletions(-) diff --git a/haruhi_dl/extractor/niconico.py b/haruhi_dl/extractor/niconico.py index eb07ca776..a85fc3d5c 100644 --- 
a/haruhi_dl/extractor/niconico.py
+++ b/haruhi_dl/extractor/niconico.py
@@ -1,20 +1,23 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import json
 import datetime
+import functools
+import json
+import math
 
 from .common import InfoExtractor
 from ..compat import (
     compat_parse_qs,
-    compat_urlparse,
+    compat_urllib_parse_urlparse,
 )
 from ..utils import (
     determine_ext,
     dict_get,
     ExtractorError,
-    int_or_none,
     float_or_none,
+    InAdvancePagedList,
+    int_or_none,
     parse_duration,
     parse_iso8601,
     remove_start,
@@ -181,7 +184,7 @@ class NiconicoIE(InfoExtractor):
         if urlh is False:
             login_ok = False
         else:
-            parts = compat_urlparse.urlparse(urlh.geturl())
+            parts = compat_urllib_parse_urlparse(urlh.geturl())
             if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login':
                 login_ok = False
         if not login_ok:
@@ -292,7 +295,7 @@ class NiconicoIE(InfoExtractor):
                 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
                 video_id, 'Downloading flv info')
 
-            flv_info = compat_urlparse.parse_qs(flv_info_webpage)
+            flv_info = compat_parse_qs(flv_info_webpage)
             if 'url' not in flv_info:
                 if 'deleted' in flv_info:
                     raise ExtractorError('The video has been deleted.',
@@ -437,34 +440,76 @@ class NiconicoIE(InfoExtractor):
 
 
 class NiconicoPlaylistIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/mylist/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/)?mylist/(?P<id>\d+)'
 
-    _TEST = {
+    _TESTS = [{
        'url': 'http://www.nicovideo.jp/mylist/27411728',
        'info_dict': {
            'id': '27411728',
            'title': 'AKB48のオールナイトニッポン',
+            'description': 'md5:d89694c5ded4b6c693dea2db6e41aa08',
+            'uploader': 'のっく',
+            'uploader_id': '805442',
        },
        'playlist_mincount': 225,
-    }
+    }, {
+        'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728',
+        'only_matching': True,
+    }]
+    _PAGE_SIZE = 100
+
+    def _call_api(self, list_id, resource, query):
+        return self._download_json(
+            'https://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id,
+            'Downloading %s JSON metadata' % resource, query=query,
+            headers={'X-Frontend-Id': 6})['data']['mylist']
+
+    def _parse_owner(self, item):
+        owner = item.get('owner') or {}
+        if owner:
+            return {
+                'uploader': owner.get('name'),
+                'uploader_id': owner.get('id'),
+            }
+        return {}
+
+    def _fetch_page(self, list_id, page):
+        page += 1
+        items = self._call_api(list_id, 'page %d' % page, {
+            'page': page,
+            'pageSize': self._PAGE_SIZE,
+        })['items']
+        for item in items:
+            video = item.get('video') or {}
+            video_id = video.get('id')
+            if not video_id:
+                continue
+            count = video.get('count') or {}
+            get_count = lambda x: int_or_none(count.get(x))
+            info = {
+                '_type': 'url',
+                'id': video_id,
+                'title': video.get('title'),
+                'url': 'https://www.nicovideo.jp/watch/' + video_id,
+                'description': video.get('shortDescription'),
+                'duration': int_or_none(video.get('duration')),
+                'view_count': get_count('view'),
+                'comment_count': get_count('comment'),
+                'ie_key': NiconicoIE.ie_key(),
+            }
+            info.update(self._parse_owner(video))
+            yield info
 
     def _real_extract(self, url):
         list_id = self._match_id(url)
-        webpage = self._download_webpage(url, list_id)
-
-        entries_json = self._search_regex(r'Mylist\.preload\(\d+, (\[.*\])\);',
-            webpage, 'entries')
-        entries = json.loads(entries_json)
-        entries = [{
-            '_type': 'url',
-            'ie_key': NiconicoIE.ie_key(),
-            'url': ('http://www.nicovideo.jp/watch/%s' %
-                entry['item_data']['video_id']),
-        } for entry in entries]
-
-        return {
-            '_type': 'playlist',
-            'title': self._search_regex(r'\s+name: "(.*?)"', webpage, 'title'),
-            'id': 
list_id, - 'entries': entries, - } + mylist = self._call_api(list_id, 'list', { + 'pageSize': 1, + }) + entries = InAdvancePagedList( + functools.partial(self._fetch_page, list_id), + math.ceil(mylist['totalItemCount'] / self._PAGE_SIZE), + self._PAGE_SIZE) + result = self.playlist_result( + entries, list_id, mylist.get('name'), mylist.get('description')) + result.update(self._parse_owner(mylist)) + return result -- GitLab From 3c6c586e4b0373c7877bb81a1a97d7f736ef175b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:24:11 +0100 Subject: [PATCH 169/384] [tastytrade] Remove Extractor(closes #25716) covered by GenericIE via BrighcoveNewIE --- haruhi_dl/extractor/extractors.py | 1 - haruhi_dl/extractor/tastytrade.py | 43 ------------------------------- 2 files changed, 44 deletions(-) delete mode 100644 haruhi_dl/extractor/tastytrade.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 4d3b7bb50..b1aed7e05 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1173,7 +1173,6 @@ from .tagesschau import ( TagesschauIE, ) from .tass import TassIE -from .tastytrade import TastyTradeIE from .tbs import TBSIE from .tdslifeway import TDSLifewayIE from .teachable import ( diff --git a/haruhi_dl/extractor/tastytrade.py b/haruhi_dl/extractor/tastytrade.py deleted file mode 100644 index 7fe96bd5f..000000000 --- a/haruhi_dl/extractor/tastytrade.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from .ooyala import OoyalaIE - - -class TastyTradeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/shows/[^/]+/episodes/(?P[^/?#&]+)' - - _TESTS = [{ - 'url': 'https://www.tastytrade.com/tt/shows/market-measures/episodes/correlation-in-short-volatility-06-28-2017', - 'info_dict': { - 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', - 'ext': 'mp4', - 'title': 'A History of Teaming', - 'description': 'md5:2a9033db8da81f2edffa4c99888140b3', - 'duration': 422.255, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - }, { - 'url': 'https://www.tastytrade.com/tt/shows/daily-dose/episodes/daily-dose-06-30-2017', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - ooyala_code = self._search_regex( - r'data-media-id=(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'ooyala code', group='code') - - info = self._search_json_ld(webpage, display_id, fatal=False) - info.update({ - '_type': 'url_transparent', - 'ie_key': OoyalaIE.ie_key(), - 'url': 'ooyala:%s' % ooyala_code, - 'display_id': display_id, - }) - return info -- GitLab From ed14efaed272789c6f5dde6fb633af4fcb1c11f8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:24:16 +0100 Subject: [PATCH 170/384] [anvato] remove NFLTokenGenerator until a better solution is introduced that: - works with lazy_extractors - allows for 3rd party token generators --- haruhi_dl/extractor/anvato.py | 14 ++------- .../anvato_token_generator/__init__.py | 7 ----- .../anvato_token_generator/common.py | 6 ---- .../extractor/anvato_token_generator/nfl.py | 30 ------------------- 4 files changed, 3 insertions(+), 54 deletions(-) delete mode 100644 haruhi_dl/extractor/anvato_token_generator/__init__.py delete mode 100644 haruhi_dl/extractor/anvato_token_generator/common.py delete mode 100644 haruhi_dl/extractor/anvato_token_generator/nfl.py diff --git a/haruhi_dl/extractor/anvato.py 
b/haruhi_dl/extractor/anvato.py
index 98c5e6d38..b7398563b 100644
--- a/haruhi_dl/extractor/anvato.py
+++ b/haruhi_dl/extractor/anvato.py
@@ -9,7 +9,6 @@ import re
 import time
 
 from .common import InfoExtractor
-# from .anvato_token_generator import NFLTokenGenerator
 from ..aes import aes_encrypt
 from ..compat import compat_str
 from ..utils import (
@@ -204,10 +203,6 @@ class AnvatoIE(InfoExtractor):
         'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
     }
 
-    _TOKEN_GENERATORS = {
-        # 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator,
-    }
-
     _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA'
 
     _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
@@ -267,12 +262,9 @@ class AnvatoIE(InfoExtractor):
             'anvrid': anvrid,
             'anvts': server_time,
         }
-        if access_key in self._TOKEN_GENERATORS:
-            api['anvstk2'] = self._TOKEN_GENERATORS[access_key].generate(self, access_key, video_id)
-        else:
-            api['anvstk'] = md5_text('%s|%s|%d|%s' % (
-                access_key, anvrid, server_time,
-                self._ANVACK_TABLE.get(access_key, self._API_KEY)))
+        api['anvstk'] = md5_text('%s|%s|%d|%s' % (
+            access_key, anvrid, server_time,
+            self._ANVACK_TABLE.get(access_key, self._API_KEY)))
 
         return self._download_json(
             video_data_url, video_id, transform_source=strip_jsonp,
diff --git a/haruhi_dl/extractor/anvato_token_generator/__init__.py b/haruhi_dl/extractor/anvato_token_generator/__init__.py
deleted file mode 100644
index 6e223db9f..000000000
--- a/haruhi_dl/extractor/anvato_token_generator/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from __future__ import unicode_literals
-
-from .nfl import NFLTokenGenerator
-
-__all__ = [
-    'NFLTokenGenerator',
-]
diff --git a/haruhi_dl/extractor/anvato_token_generator/common.py b/haruhi_dl/extractor/anvato_token_generator/common.py
deleted file mode 100644
index b959a903b..000000000
--- a/haruhi_dl/extractor/anvato_token_generator/common.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from __future__ import unicode_literals
-
-
-class TokenGenerator:
-    def generate(self, anvack, mcp_id):
-        raise NotImplementedError('This method must be implemented by subclasses')
diff --git a/haruhi_dl/extractor/anvato_token_generator/nfl.py b/haruhi_dl/extractor/anvato_token_generator/nfl.py
deleted file mode 100644
index 97a2b245f..000000000
--- a/haruhi_dl/extractor/anvato_token_generator/nfl.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from __future__ import unicode_literals
-
-import json
-
-from .common import TokenGenerator
-
-
-class NFLTokenGenerator(TokenGenerator):
-    _AUTHORIZATION = None
-
-    def generate(ie, anvack, mcp_id):
-        if not NFLTokenGenerator._AUTHORIZATION:
-            reroute = ie._download_json(
-                'https://api.nfl.com/v1/reroute', mcp_id,
-                data=b'grant_type=client_credentials',
-                headers={'X-Domain-Id': 100})
-            NFLTokenGenerator._AUTHORIZATION = '%s %s' % (reroute.get('token_type') or 'Bearer', reroute['access_token'])
-        return ie._download_json(
-            'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({
-                'query': '''{
-    viewer {
-        mediaToken(anvack: "%s", id: %s) {
-            token
-        }
-    }
-}''' % (anvack, mcp_id),
-            }).encode(), headers={
-                'Authorization': NFLTokenGenerator._AUTHORIZATION,
-                'Content-Type': 'application/json',
-            })['data']['viewer']['mediaToken']['token']
-- 
GitLab
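As background, the anvstk request token this patch keeps is just an MD5 over
pipe-joined values, equivalent to the md5_text() call in the hunk above. A
hedged standalone sketch; every value below is a placeholder, not a real
Anvato credential:

    import hashlib
    import time

    def anvstk(access_key, anvrid, server_time, secret):
        # mirrors md5_text('%s|%s|%d|%s' % (access_key, anvrid, server_time, secret))
        return hashlib.md5(
            ('%s|%s|%d|%s' % (access_key, anvrid, server_time, secret)).encode('utf-8')).hexdigest()

    print(anvstk('example_access_key', 'abcdef12', int(time.time()), 'example_secret'))

From 4317f7c6fafa171ec1b518a348084206b0da2ade Mon Sep 17 00:00:00 2001
From: Sergey M.
Date: Fri, 26 Feb 2021 15:26:14 +0100
Subject: [PATCH 171/384] [mewatch] Relax _VALID_URL (closes #27506)

---
 haruhi_dl/extractor/toggle.py | 8 +++++++-
 1 file changed, 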
7 insertions(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/toggle.py b/haruhi_dl/extractor/toggle.py
index 91b8023b8..3b9b54759 100644
--- a/haruhi_dl/extractor/toggle.py
+++ b/haruhi_dl/extractor/toggle.py
@@ -200,7 +200,7 @@ class ToggleIE(InfoExtractor):
 
 class MeWatchIE(InfoExtractor):
     IE_NAME = 'mewatch'
-    _VALID_URL = r'https?://(?:www\.)?mewatch\.sg/watch/[0-9a-zA-Z-]+-(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?mewatch\.sg/watch/[^/?#&]+-(?P<id>[0-9]+)'
     _TESTS = [{
         'url': 'https://www.mewatch.sg/watch/Recipe-Of-Life-E1-179371',
         'info_dict': {
@@ -214,6 +214,12 @@ class MeWatchIE(InfoExtractor):
         'params': {
             'skip_download': 'm3u8 download',
         },
+    }, {
+        'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-搜密。打卡。小红点-S2-E1-176232',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-%E6%90%9C%E5%AF%86%E3%80%82%E6%89%93%E5%8D%A1%E3%80%82%E5%B0%8F%E7%BA%A2%E7%82%B9-S2-E1-176232',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-- 
GitLab

From fc441623a8bcbb103e68b7389a90e94d9d97ae49 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 15:26:18 +0100
Subject: [PATCH 172/384] [brightcove] add another method to extract policyKey

---
 haruhi_dl/extractor/brightcove.py | 36 +++++++++++++++++--------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/haruhi_dl/extractor/brightcove.py b/haruhi_dl/extractor/brightcove.py
index ee2867ecc..2845f4df2 100644
--- a/haruhi_dl/extractor/brightcove.py
+++ b/haruhi_dl/extractor/brightcove.py
@@ -28,6 +28,7 @@ from ..utils import (
     parse_iso8601,
     smuggle_url,
     str_or_none,
+    try_get,
     unescapeHTML,
     unsmuggle_url,
     UnsupportedError,
@@ -600,24 +601,27 @@ class BrightcoveNewIE(AdobePassIE):
         store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x)
 
         def extract_policy_key():
-            webpage = self._download_webpage(
-                'http://players.brightcove.net/%s/%s_%s/index.min.js'
-                % (account_id, player_id, embed), video_id)
-
-            policy_key = None
+            base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed)
+            config = self._download_json(
+                base_url + 'config.json', video_id, fatal=False) or {}
+            policy_key = try_get(
+                config, lambda x: x['video_cloud']['policy_key'])
+            if not policy_key:
+                webpage = self._download_webpage(
+                    base_url + 'index.min.js', video_id)
 
-            catalog = self._search_regex(
-                r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
-            if catalog:
-                catalog = self._parse_json(
-                    js_to_json(catalog), video_id, fatal=False)
+                catalog = self._search_regex(
+                    r'catalog\(({.+?})\);', webpage, 'catalog', default=None)
                 if catalog:
-                    policy_key = catalog.get('policyKey')
-
-            if not policy_key:
-                policy_key = self._search_regex(
-                    r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
-                    webpage, 'policy key', group='pk')
+                    catalog = self._parse_json(
+                        js_to_json(catalog), video_id, fatal=False)
+                    if catalog:
+                        policy_key = catalog.get('policyKey')
+
+                if not policy_key:
+                    policy_key = self._search_regex(
+                        r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1',
+                        webpage, 'policy key', group='pk')
 
             store_pk(policy_key)
             return policy_key
-- 
GitLab

From 437ab525e92c1c44dc23f4bae8a73a5ae0fa3079 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 15:26:26 +0100
Subject: [PATCH 173/384] [cbslocal] fix video extraction

---
 haruhi_dl/extractor/cbslocal.py   | 67 +++++++++++++++++++------------
 haruhi_dl/extractor/extractors.py |  5 ++-
 2 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/haruhi_dl/extractor/cbslocal.py 
b/haruhi_dl/extractor/cbslocal.py index 90852a9ef..3b7e1a8b9 100644 --- a/haruhi_dl/extractor/cbslocal.py +++ b/haruhi_dl/extractor/cbslocal.py @@ -11,7 +11,47 @@ from ..utils import ( class CBSLocalIE(AnvatoIE): - _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/(?:\d+/\d+/\d+|video)/(?P[0-9a-z-]+)' + _VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/' + _VALID_URL = _VALID_URL_BASE + r'video/(?P\d+)' + + _TESTS = [{ + 'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/', + 'info_dict': { + 'id': '3580809', + 'ext': 'mp4', + 'title': 'A Very Blue Anniversary', + 'description': 'CBS2’s Cindy Hsu has more.', + 'thumbnail': 're:^https?://.*', + 'timestamp': int, + 'upload_date': r're:^\d{8}$', + 'uploader': 'CBS', + 'subtitles': { + 'en': 'mincount:5', + }, + 'categories': [ + 'Stations\\Spoken Word\\WCBSTV', + 'Syndication\\AOL', + 'Syndication\\MSN', + 'Syndication\\NDN', + 'Syndication\\Yahoo', + 'Content\\News', + 'Content\\News\\Local News', + ], + 'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'], + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + mcp_id = self._match_id(url) + return self.url_result( + 'anvato:anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67:' + mcp_id, 'Anvato', mcp_id) + + +class CBSLocalArticleIE(AnvatoIE): + _VALID_URL = CBSLocalIE._VALID_URL_BASE + r'\d+/\d+/\d+/(?P[0-9a-z-]+)' _TESTS = [{ # Anvato backend @@ -52,31 +92,6 @@ class CBSLocalIE(AnvatoIE): # m3u8 download 'skip_download': True, }, - }, { - 'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/', - 'info_dict': { - 'id': '3580809', - 'ext': 'mp4', - 'title': 'A Very Blue Anniversary', - 'description': 'CBS2’s Cindy Hsu has more.', - 'thumbnail': 're:^https?://.*', - 'timestamp': int, - 'upload_date': r're:^\d{8}$', - 'uploader': 'CBS', - 'subtitles': { - 'en': 'mincount:5', - }, - 'categories': [ - 'Stations\\Spoken Word\\WCBSTV', - 'Syndication\\AOL', - 'Syndication\\MSN', - 'Syndication\\NDN', - 'Syndication\\Yahoo', - 'Content\\News', - 'Content\\News\\Local News', - ], - 'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'], - }, }] def _real_extract(self, url): diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index b1aed7e05..da6767164 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -173,7 +173,10 @@ from .cbc import ( CBCOlympicsIE, ) from .cbs import CBSIE -from .cbslocal import CBSLocalIE +from .cbslocal import ( + CBSLocalIE, + CBSLocalArticleIE, +) from .cbsinteractive import CBSInteractiveIE from .cbsnews import ( CBSNewsEmbedIE, -- GitLab From 00e2c2ddea0df6d88aa6122ff19a3e0d8e14d381 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:33:41 +0100 Subject: [PATCH 174/384] [facebook] add support for watchparty pages(closes #27507) --- haruhi_dl/extractor/facebook.py | 221 ++++++++++++++++++++------------ 1 file changed, 139 insertions(+), 82 deletions(-) diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py index d5afd0051..5dc931b86 100644 --- a/haruhi_dl/extractor/facebook.py +++ b/haruhi_dl/extractor/facebook.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re import socket @@ -8,6 +9,7 @@ from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, compat_http_client, + compat_str, compat_urllib_error, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, @@ -47,7 +49,8 @@ 
class FacebookIE(InfoExtractor): )\?(?:.*?)(?:v|video_id|story_fbid)=| [^/]+/videos/(?:[^/]+/)?| [^/]+/posts/| - groups/[^/]+/permalink/ + groups/[^/]+/permalink/| + watchparty/ )| facebook: ) @@ -280,8 +283,18 @@ class FacebookIE(InfoExtractor): # data.video.creation_story.attachments[].media 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275', 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/watchparty/211641140192478', + 'info_dict': { + 'id': '211641140192478', + }, + 'playlist_count': 1, + 'skip': 'Requires logging in', }] _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)' + _api_config = { + 'graphURI': '/api/graphql/' + } @staticmethod def _extract_urls(webpage, **kwargs): @@ -405,6 +418,17 @@ class FacebookIE(InfoExtractor): self._sort_formats(formats) + def extract_relay_data(_filter): + return self._parse_json(self._search_regex( + r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter, + webpage, 'replay data', default='{}'), video_id, fatal=False) or {} + + def extract_relay_prefetched_data(_filter): + replay_data = extract_relay_data(_filter) + for require in (replay_data.get('require') or []): + if require[0] == 'RelayPrefetchedStreamCache': + return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} + if not video_data: server_js_data = self._parse_json(self._search_regex([ r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX, @@ -413,87 +437,83 @@ class FacebookIE(InfoExtractor): video_data = extract_from_jsmods_instances(server_js_data) if not video_data: - graphql_data = self._parse_json(self._search_regex( - r'handleWithCustomApplyEach\([^,]+,\s*({.*?"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+".*?})\);', - webpage, 'graphql data', default='{}'), video_id, fatal=False) or {} - for require in (graphql_data.get('require') or []): - if require[0] == 'RelayPrefetchedStreamCache': - entries = [] - - def parse_graphql_video(video): - formats = [] - q = qualities(['sd', 'hd']) - for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]: - playable_url = video.get('playable_url' + suffix) - if not playable_url: - continue - formats.append({ - 'format_id': format_id, - 'quality': q(format_id), - 'url': playable_url, - }) - extract_dash_manifest(video, formats) - process_formats(formats) - v_id = video.get('videoId') or video.get('id') or video_id - info = { - 'id': v_id, - 'formats': formats, - 'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']), - 'uploader_id': try_get(video, lambda x: x['owner']['id']), - 'timestamp': int_or_none(video.get('publish_time')), - 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), - } - description = try_get(video, lambda x: x['savable_description']['text']) - title = video.get('name') - if title: - info.update({ - 'title': title, - 'description': description, - }) - else: - info['title'] = description or 'Facebook video #%s' % v_id - entries.append(info) - - def parse_attachment(attachment, key='media'): - media = attachment.get(key) or {} - if media.get('__typename') == 'Video': - return parse_graphql_video(media) - - data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} - - nodes = data.get('nodes') or [] - node = data.get('node') or {} - if not nodes and node: - nodes.append(node) - for node in nodes: - story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or 
{} - attachments = try_get(story, [ - lambda x: x['attached_story']['attachments'], - lambda x: x['attachments'] - ], list) or [] - for attachment in attachments: - attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) - ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] - for n in ns: - parse_attachment(n) - parse_attachment(attachment) - - edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] - for edge in edges: - parse_attachment(edge, key='node') - - video = data.get('video') or {} - if video: - attachments = try_get(video, [ - lambda x: x['story']['attachments'], - lambda x: x['creation_story']['attachments'] - ], list) or [] - for attachment in attachments: - parse_attachment(attachment) - if not entries: - parse_graphql_video(video) - - return self.playlist_result(entries, video_id) + data = extract_relay_prefetched_data( + r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"') + if data: + entries = [] + + def parse_graphql_video(video): + formats = [] + q = qualities(['sd', 'hd']) + for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]: + playable_url = video.get('playable_url' + suffix) + if not playable_url: + continue + formats.append({ + 'format_id': format_id, + 'quality': q(format_id), + 'url': playable_url, + }) + extract_dash_manifest(video, formats) + process_formats(formats) + v_id = video.get('videoId') or video.get('id') or video_id + info = { + 'id': v_id, + 'formats': formats, + 'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']), + 'uploader_id': try_get(video, lambda x: x['owner']['id']), + 'timestamp': int_or_none(video.get('publish_time')), + 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), + } + description = try_get(video, lambda x: x['savable_description']['text']) + title = video.get('name') + if title: + info.update({ + 'title': title, + 'description': description, + }) + else: + info['title'] = description or 'Facebook video #%s' % v_id + entries.append(info) + + def parse_attachment(attachment, key='media'): + media = attachment.get(key) or {} + if media.get('__typename') == 'Video': + return parse_graphql_video(media) + + nodes = data.get('nodes') or [] + node = data.get('node') or {} + if not nodes and node: + nodes.append(node) + for node in nodes: + story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {} + attachments = try_get(story, [ + lambda x: x['attached_story']['attachments'], + lambda x: x['attachments'] + ], list) or [] + for attachment in attachments: + attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) + ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] + for n in ns: + parse_attachment(n) + parse_attachment(attachment) + + edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] + for edge in edges: + parse_attachment(edge, key='node') + + video = data.get('video') or {} + if video: + attachments = try_get(video, [ + lambda x: x['story']['attachments'], + lambda x: x['creation_story']['attachments'] + ], list) or [] + for attachment in attachments: + parse_attachment(attachment) + if not entries: + parse_graphql_video(video) + + return self.playlist_result(entries, video_id) if not video_data: m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
<div>(.*?)</div>
', webpage) @@ -504,6 +524,43 @@ class FacebookIE(InfoExtractor): elif '>You must log in to continue' in webpage: self.raise_login_required() + if not video_data and '/watchparty/' in url: + post_data = { + 'doc_id': 3731964053542869, + 'variables': json.dumps({ + 'livingRoomID': video_id, + }), + } + + prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{') + if prefetched_data: + lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict) + if lsd: + post_data[lsd['name']] = lsd['value'] + + relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,') + for define in (relay_data.get('define') or []): + if define[0] == 'RelayAPIConfigDefaults': + self._api_config = define[2] + + living_room = self._download_json( + urljoin(url, self._api_config['graphURI']), video_id, + data=urlencode_postdata(post_data))['data']['living_room'] + + entries = [] + for edge in (try_get(living_room, lambda x: x['recap']['watched_content']['edges']) or []): + video = try_get(edge, lambda x: x['node']['video']) or {} + v_id = video.get('id') + if not v_id: + continue + v_id = compat_str(v_id) + entries.append(self.url_result( + self._VIDEO_PAGE_TEMPLATE % v_id, + self.ie_key(), v_id, video.get('name'))) + + return self.playlist_result(entries, video_id) + + if not video_data: # Video info not in first request, do a secondary request using # tahoe player specific URL tahoe_data = self._download_webpage( -- GitLab From 6e80cb939be1b62653899d9cc34eccf38064cbf9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:34:30 +0100 Subject: [PATCH 175/384] [streetvoice] fix extraction(closes #27455)(closes #27492) --- haruhi_dl/extractor/streetvoice.py | 93 +++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 21 deletions(-) diff --git a/haruhi_dl/extractor/streetvoice.py b/haruhi_dl/extractor/streetvoice.py index 91612c7f2..f21681ae7 100644 --- a/haruhi_dl/extractor/streetvoice.py +++ b/haruhi_dl/extractor/streetvoice.py @@ -2,25 +2,40 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str -from ..utils import unified_strdate +from ..utils import ( + int_or_none, + parse_iso8601, + str_or_none, + strip_or_none, + try_get, + urljoin, +) class StreetVoiceIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P[0-9]+)' _TESTS = [{ - 'url': 'http://streetvoice.com/skippylu/songs/94440/', - 'md5': '15974627fc01a29e492c98593c2fd472', + 'url': 'https://streetvoice.com/skippylu/songs/123688/', + 'md5': '0eb535970629a5195685355f3ed60bfd', 'info_dict': { - 'id': '94440', + 'id': '123688', 'ext': 'mp3', - 'title': '輸', - 'description': 'Crispy脆樂團 - 輸', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 260, - 'upload_date': '20091018', + 'title': '流浪', + 'description': 'md5:8eb0bfcc9dcd8aa82bd6efca66e3fea6', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 270, + 'upload_date': '20100923', 'uploader': 'Crispy脆樂團', 'uploader_id': '627810', + 'uploader_url': 're:^https?://streetvoice.com/skippylu/', + 'timestamp': 1285261661, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + 'track': '流浪', + 'track_id': '123688', + 'album': '2010', } }, { 'url': 'http://tw.streetvoice.com/skippylu/songs/94440/', @@ -29,21 +44,57 @@ class StreetVoiceIE(InfoExtractor): def _real_extract(self, url): song_id = self._match_id(url) + base_url = 'https://streetvoice.com/api/v4/song/%s/' % song_id + song = self._download_json(base_url, song_id, query={ + 
'fields': 'album,comments_count,created_at,id,image,length,likes_count,name,nickname,plays_count,profile,share_count,synopsis,user,username', + }) + title = song['name'] - song = self._download_json( - 'https://streetvoice.com/api/v1/public/song/%s/' % song_id, song_id, data=b'') + formats = [] + for suffix, format_id in [('hls/file', 'hls'), ('file', 'http'), ('file/original', 'original')]: + f_url = (self._download_json( + base_url + suffix + '/', song_id, + 'Downloading %s format URL' % format_id, + data=b'', fatal=False) or {}).get('file') + if not f_url: + continue + f = { + 'ext': 'mp3', + 'format_id': format_id, + 'url': f_url, + 'vcodec': 'none', + } + if format_id == 'hls': + f['protocol'] = 'm3u8_native' + abr = self._search_regex(r'\.mp3\.(\d+)k', f_url, 'bitrate', default=None) + if abr: + abr = int(abr) + f.update({ + 'abr': abr, + 'tbr': abr, + }) + formats.append(f) - title = song['name'] - author = song['user']['nickname'] + user = song.get('user') or {} + username = user.get('username') + get_count = lambda x: int_or_none(song.get(x + '_count')) return { 'id': song_id, - 'url': song['file'], + 'formats': formats, 'title': title, - 'description': '%s - %s' % (author, title), - 'thumbnail': self._proto_relative_url(song.get('image'), 'http:'), - 'duration': song.get('length'), - 'upload_date': unified_strdate(song.get('created_at')), - 'uploader': author, - 'uploader_id': compat_str(song['user']['id']), + 'description': strip_or_none(song.get('synopsis')), + 'thumbnail': song.get('image'), + 'duration': int_or_none(song.get('length')), + 'timestamp': parse_iso8601(song.get('created_at')), + 'uploader': try_get(user, lambda x: x['profile']['nickname']), + 'uploader_id': str_or_none(user.get('id')), + 'uploader_url': urljoin(url, '/%s/' % username) if username else None, + 'view_count': get_count('plays'), + 'like_count': get_count('likes'), + 'comment_count': get_count('comments'), + 'repost_count': get_count('share'), + 'track': title, + 'track_id': song_id, + 'album': try_get(song, lambda x: x['album']['name']), } -- GitLab From 08d63a28df92441b03e9383b31e0d7a2817d86d2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:34:41 +0100 Subject: [PATCH 176/384] [sonyliv] fix extraction(closes #25667) --- haruhi_dl/extractor/sonyliv.py | 112 +++++++++++++++++++++++++++------ 1 file changed, 92 insertions(+), 20 deletions(-) diff --git a/haruhi_dl/extractor/sonyliv.py b/haruhi_dl/extractor/sonyliv.py index 58a8c0d4d..b460b343a 100644 --- a/haruhi_dl/extractor/sonyliv.py +++ b/haruhi_dl/extractor/sonyliv.py @@ -1,40 +1,112 @@ # coding: utf-8 from __future__ import unicode_literals +import time +import uuid + from .common import InfoExtractor -from ..utils import smuggle_url +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + int_or_none, +) class SonyLIVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/(?:s(?:how|port)s/[^/]+|movies|clip|trailer|music-videos)/[^/?#&]+-(?P\d+)' _TESTS = [{ - 'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight", + 'url': 'https://www.sonyliv.com/shows/bachelors-delight-1700000113/achaari-cheese-toast-1000022678?watch=true', 'info_dict': { - 'title': "Ep. 
1 - Achaari Cheese Toast - Bachelor's Delight", - 'id': 'ref:5024612095001', + 'title': 'Bachelors Delight - Achaari Cheese Toast', + 'id': '1000022678', 'ext': 'mp4', - 'upload_date': '20170923', - 'description': 'md5:7f28509a148d5be9d0782b4d5106410d', - 'uploader_id': '5182475815001', - 'timestamp': 1506200547, + 'upload_date': '20200411', + 'description': 'md5:3957fa31d9309bf336ceb3f37ad5b7cb', + 'timestamp': 1586632091, + 'duration': 185, + 'season_number': 1, + 'episode': 'Achaari Cheese Toast', + 'episode_number': 1, + 'release_year': 2016, }, 'params': { 'skip_download': True, }, - 'add_ie': ['BrightcoveNew'], }, { - 'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)', + 'url': 'https://www.sonyliv.com/movies/tahalka-1000050121?watch=true', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/clip/jigarbaaz-1000098925', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/trailer/sandwiched-forever-1000100286?watch=true', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/sports/india-tour-of-australia-2020-21-1700000286/cricket-hls-day-3-1st-test-aus-vs-ind-19-dec-2020-1000100959?watch=true', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/music-videos/yeh-un-dinon-ki-baat-hai-1000018779', 'only_matching': True, }] + _GEO_COUNTRIES = ['IN'] + _TOKEN = None - # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5182475815001/default_default/index.html?videoId=ref:%s' + def _call_api(self, version, path, video_id): + headers = {} + if self._TOKEN: + headers['security_token'] = self._TOKEN + try: + return self._download_json( + 'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s' % (version, path), + video_id, headers=headers)['resultObj'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + message = self._parse_json( + e.cause.read().decode(), video_id)['message'] + if message == 'Geoblocked Country': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + raise ExtractorError(message) + raise + + def _real_initialize(self): + self._TOKEN = self._call_api('1.4', 'ALL/GETTOKEN', None) def _real_extract(self, url): - brightcove_id = self._match_id(url) - return self.url_result( - smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, { - 'geo_countries': ['IN'], - 'referrer': url, - }), - 'BrightcoveNew', brightcove_id) + video_id = self._match_id(url) + content = self._call_api( + '1.5', 'IN/CONTENT/VIDEOURL/VOD/' + video_id, video_id) + if content.get('isEncrypted'): + raise ExtractorError('This video is DRM protected.', expected=True) + dash_url = content['videoURL'] + headers = { + 'x-playback-session-id': '%s-%d' % (uuid.uuid4().hex, time.time() * 1000) + } + formats = self._extract_mpd_formats( + dash_url, video_id, mpd_id='dash', headers=headers, fatal=False) + formats.extend(self._extract_m3u8_formats( + dash_url.replace('.mpd', '.m3u8').replace('/DASH/', '/HLS/'), + video_id, 'mp4', m3u8_id='hls', headers=headers, fatal=False)) + for f in formats: + f.setdefault('http_headers', {}).update(headers) + self._sort_formats(formats) + + metadata = self._call_api( + '1.6', 'IN/DETAIL/' + video_id, video_id)['containers'][0]['metadata'] + title = metadata['title'] + episode = metadata.get('episodeTitle') + if episode: + title += ' - ' + episode + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': 
content.get('posterURL'), + 'description': metadata.get('longDescription') or metadata.get('shortDescription'), + 'timestamp': int_or_none(metadata.get('creationDate'), 1000), + 'duration': int_or_none(metadata.get('duration')), + 'season_number': int_or_none(metadata.get('season')), + 'episode': episode, + 'episode_number': int_or_none(metadata.get('episodeNumber')), + 'release_year': int_or_none(metadata.get('year')), + } -- GitLab From 0445f9de8df3c7f4c5b36e3e5901c804260cba4a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:34:50 +0100 Subject: [PATCH 177/384] [sonyliv] fix title for movies --- haruhi_dl/extractor/sonyliv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/sonyliv.py b/haruhi_dl/extractor/sonyliv.py index b460b343a..fedfceb62 100644 --- a/haruhi_dl/extractor/sonyliv.py +++ b/haruhi_dl/extractor/sonyliv.py @@ -94,7 +94,7 @@ class SonyLIVIE(InfoExtractor): '1.6', 'IN/DETAIL/' + video_id, video_id)['containers'][0]['metadata'] title = metadata['title'] episode = metadata.get('episodeTitle') - if episode: + if episode and title != episode: title += ' - ' + episode return { -- GitLab From f350e326ac3da25ca2a2cd5dfbc1d67ace47bc82 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:35:11 +0100 Subject: [PATCH 178/384] [9c9media] improve info extraction --- haruhi_dl/extractor/ninecninemedia.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/haruhi_dl/extractor/ninecninemedia.py b/haruhi_dl/extractor/ninecninemedia.py index 65754c5e7..a569c889e 100644 --- a/haruhi_dl/extractor/ninecninemedia.py +++ b/haruhi_dl/extractor/ninecninemedia.py @@ -5,10 +5,11 @@ import re from .common import InfoExtractor from ..utils import ( - parse_iso8601, - float_or_none, ExtractorError, + float_or_none, int_or_none, + parse_iso8601, + try_get, ) @@ -35,7 +36,7 @@ class NineCNineMediaIE(InfoExtractor): '$include': '[HasClosedCaptions]', }) - if content_package.get('Constraints', {}).get('Security', {}).get('Type'): + if try_get(content_package, lambda x: x['Constraints']['Security']['Type']): raise ExtractorError('This video is DRM protected.', expected=True) manifest_base_url = content_package_url + 'manifest.' 
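(Aside: the try_get() change above replaces chained .get(..., {}) calls with a
None-safe nested lookup. A minimal sketch of the pattern -- simplified, not the
exact utils.try_get implementation, which also accepts a list of getters and an
expected_type; safe_get and the sample dict here are invented for illustration:

    def safe_get(src, getter):
        # Apply getter to src; swallow missing-key/attribute errors.
        try:
            return getter(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            return None

    content_package = {'Constraints': {'Security': {'Type': 'widevine'}}}
    # Full path present -> the DRM check fires:
    assert safe_get(content_package, lambda x: x['Constraints']['Security']['Type']) == 'widevine'
    # Any missing level -> None instead of an exception:
    assert safe_get({}, lambda x: x['Constraints']['Security']['Type']) is None

so the extractor raises 'This video is DRM protected.' only when the whole path
resolves to a truthy value.)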
@@ -52,7 +53,7 @@ class NineCNineMediaIE(InfoExtractor): self._sort_formats(formats) thumbnails = [] - for image in content.get('Images', []): + for image in (content.get('Images') or []): image_url = image.get('Url') if not image_url: continue @@ -70,7 +71,7 @@ class NineCNineMediaIE(InfoExtractor): continue container.append(e_name) - season = content.get('Season', {}) + season = content.get('Season') or {} info = { 'id': content_id, @@ -79,13 +80,14 @@ class NineCNineMediaIE(InfoExtractor): 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), 'episode_number': int_or_none(content.get('Episode')), 'season': season.get('Name'), - 'season_number': season.get('Number'), + 'season_number': int_or_none(season.get('Number')), 'season_id': season.get('Id'), - 'series': content.get('Media', {}).get('Name'), + 'series': try_get(content, lambda x: x['Media']['Name']), 'tags': tags, 'categories': categories, 'duration': float_or_none(content_package.get('Duration')), 'formats': formats, + 'thumbnails': thumbnails, } if content_package.get('HasClosedCaptions'): -- GitLab From 90a021a137d6999c53b6975bd233643ad3b9c53f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:35:17 +0100 Subject: [PATCH 179/384] [ctv] Add new extractor (closes #27525) --- haruhi_dl/extractor/ctv.py | 52 +++++++++++++++++++++++++++++++ haruhi_dl/extractor/extractors.py | 1 + 2 files changed, 53 insertions(+) create mode 100644 haruhi_dl/extractor/ctv.py diff --git a/haruhi_dl/extractor/ctv.py b/haruhi_dl/extractor/ctv.py new file mode 100644 index 000000000..756bcc2be --- /dev/null +++ b/haruhi_dl/extractor/ctv.py @@ -0,0 +1,52 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ctv\.ca/(?P(?:show|movie)s/[^/]+/[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.ctv.ca/shows/your-morning/wednesday-december-23-2020-s5e88', + 'info_dict': { + 'id': '2102249', + 'ext': 'flv', + 'title': 'Wednesday, December 23, 2020', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'Your Morning delivers original perspectives and unique insights into the headlines of the day.', + 'timestamp': 1608732000, + 'upload_date': '20201223', + 'series': 'Your Morning', + 'season': '2020-2021', + 'season_number': 5, + 'episode_number': 88, + 'tags': ['Your Morning'], + 'categories': ['Talk Show'], + 'duration': 7467.126, + }, + }, { + 'url': 'https://www.ctv.ca/movies/adam-sandlers-eight-crazy-nights/adam-sandlers-eight-crazy-nights', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + content = self._download_json( + 'https://www.ctv.ca/space-graphql/graphql', display_id, query={ + 'query': '''{ + resolvedPath(path: "/%s") { + lastSegment { + content { + ... 
on AxisContent { + axisId + videoPlayerDestCode + } + } + } + } +}''' % display_id, + })['data']['resolvedPath']['lastSegment']['content'] + video_id = content['axisId'] + return self.url_result( + '9c9media:%s:%s' % (content['videoPlayerDestCode'], video_id), + 'NineCNineMedia', video_id) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index da6767164..9d196a135 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -254,6 +254,7 @@ from .crunchyroll import ( ) from .cspan import CSpanIE from .ctsnews import CtsNewsIE +from .ctv import CTVIE from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE from .curiositystream import ( -- GitLab From 5f00c83c35b6322a9d9058573d2d2813104bcfb1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:35:57 +0100 Subject: [PATCH 180/384] [theplatform] allow passing geo bypass countries from other extractors --- haruhi_dl/extractor/theplatform.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/haruhi_dl/extractor/theplatform.py b/haruhi_dl/extractor/theplatform.py index 5b14bbf82..cdba10e40 100644 --- a/haruhi_dl/extractor/theplatform.py +++ b/haruhi_dl/extractor/theplatform.py @@ -234,6 +234,9 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) mobj = re.match(self._VALID_URL, url) provider_id = mobj.group('provider_id') -- GitLab From 4d81f8326715f23bf5e7f7ec03756b0f2a62f99e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:36:04 +0100 Subject: [PATCH 181/384] [sprout] Add support for Universal Kids (closes #22518) --- haruhi_dl/extractor/sprout.py | 88 ++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 38 deletions(-) diff --git a/haruhi_dl/extractor/sprout.py b/haruhi_dl/extractor/sprout.py index 8467bf49d..b1f8e05a2 100644 --- a/haruhi_dl/extractor/sprout.py +++ b/haruhi_dl/extractor/sprout.py @@ -3,50 +3,62 @@ from __future__ import unicode_literals from .adobepass import AdobePassIE from ..utils import ( - extract_attributes, - update_url_query, + int_or_none, smuggle_url, + update_url_query, ) class SproutIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?sproutonline\.com/watch/(?P[^/?#]+)' - _TEST = { - 'url': 'http://www.sproutonline.com/watch/cowboy-adventure', - 'md5': '74bf14128578d1e040c3ebc82088f45f', + _VALID_URL = r'https?://(?:www\.)?(?:sproutonline|universalkids)\.com/(?:watch|(?:[^/]+/)*videos)/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.universalkids.com/shows/remy-and-boo/season/1/videos/robot-bike-race', 'info_dict': { - 'id': '9dexnwtmh8_X', + 'id': 'bm0foJFaTKqb', 'ext': 'mp4', - 'title': 'A Cowboy Adventure', - 'description': 'Ruff-Ruff, Tweet and Dave get to be cowboys for the day at Six Cow Corral.', - 'timestamp': 1437758640, - 'upload_date': '20150724', - 'uploader': 'NBCU-SPROUT-NEW', - } - } + 'title': 'Robot Bike Race', + 'description': 'md5:436b1d97117cc437f54c383f4debc66d', + 'timestamp': 1606148940, + 'upload_date': '20201123', + 'uploader': 'NBCU-MPAT', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.sproutonline.com/watch/cowboy-adventure', + 'only_matching': True, + }, { + 'url': 'https://www.universalkids.com/watch/robot-bike-race', + 'only_matching': True, + }] + _GEO_COUNTRIES = ['US'] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = 
self._download_webpage(url, video_id) - video_component = self._search_regex( - r'(?s)(]+data-component="video"[^>]*?>)', - webpage, 'video component', default=None) - if video_component: - options = self._parse_json(extract_attributes( - video_component)['data-options'], video_id) - theplatform_url = options['video'] - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } - if options.get('protected'): - query['auth'] = self._extract_mvpd_auth(url, options['pid'], 'sprout', 'sprout') - theplatform_url = smuggle_url(update_url_query( - theplatform_url, query), {'force_smil_url': True}) - else: - iframe = self._search_regex( - r'(]+id="sproutVideoIframe"[^>]*?>)', - webpage, 'iframe') - theplatform_url = extract_attributes(iframe)['src'] - - return self.url_result(theplatform_url, 'ThePlatform') + display_id = self._match_id(url) + mpx_metadata = self._download_json( + # http://nbcuunikidsprod.apps.nbcuni.com/networks/universalkids/content/videos/ + 'https://www.universalkids.com/_api/videos/' + display_id, + display_id)['mpxMetadata'] + media_pid = mpx_metadata['mediaPid'] + theplatform_url = 'https://link.theplatform.com/s/HNK2IC/' + media_pid + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + if mpx_metadata.get('entitlement') == 'auth': + query['auth'] = self._extract_mvpd_auth(url, media_pid, 'sprout', 'sprout') + theplatform_url = smuggle_url( + update_url_query(theplatform_url, query), { + 'force_smil_url': True, + 'geo_countries': self._GEO_COUNTRIES, + }) + return { + '_type': 'url_transparent', + 'id': 'id', + 'url': theplatform_url, + 'series': mpx_metadata.get('seriesName'), + 'season_number': int_or_none(mpx_metadata.get('seasonNumber')), + 'episode_number': int_or_none(mpx_metadata.get('episodeNumber')), + 'ie_key': 'ThePlatform', + } -- GitLab From 8567d4488fce04c3576b48885cf294d2397cb38f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:36:11 +0100 Subject: [PATCH 182/384] [sprout] correct typo --- haruhi_dl/extractor/sprout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/sprout.py b/haruhi_dl/extractor/sprout.py index b1f8e05a2..e243732f2 100644 --- a/haruhi_dl/extractor/sprout.py +++ b/haruhi_dl/extractor/sprout.py @@ -55,7 +55,7 @@ class SproutIE(AdobePassIE): }) return { '_type': 'url_transparent', - 'id': 'id', + 'id': media_pid, 'url': theplatform_url, 'series': mpx_metadata.get('seriesName'), 'season_number': int_or_none(mpx_metadata.get('seasonNumber')), -- GitLab From b88f43a813fd75c0d5a6824088d31387848e2415 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:36:17 +0100 Subject: [PATCH 183/384] [theweatherchannel] fix extraction (closes #25930)(closes #26051) --- haruhi_dl/extractor/theweatherchannel.py | 43 ++++++++++++++++++------ 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/haruhi_dl/extractor/theweatherchannel.py b/haruhi_dl/extractor/theweatherchannel.py index c34a49d03..b2a8c3797 100644 --- a/haruhi_dl/extractor/theweatherchannel.py +++ b/haruhi_dl/extractor/theweatherchannel.py @@ -1,18 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .theplatform import ThePlatformIE from ..utils import ( determine_ext, parse_duration, + parse_iso8601, ) class TheWeatherChannelIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?weather\.com/(?:[^/]+/)*video/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?weather\.com(?P(?:/(?P[a-z]{2}-[A-Z]{2}))?/(?:[^/]+/)*video/(?P[^/?#]+))' _TESTS = [{ 'url': 
'https://weather.com/series/great-outdoors/video/ice-climber-is-in-for-a-shock', - 'md5': 'ab924ac9574e79689c24c6b95e957def', + 'md5': 'c4cbe74c9c17c5676b704b950b73dd92', 'info_dict': { 'id': 'cc82397e-cc3f-4d11-9390-a785add090e8', 'ext': 'mp4', @@ -20,18 +24,33 @@ class TheWeatherChannelIE(ThePlatformIE): 'description': 'md5:55606ce1378d4c72e6545e160c9d9695', 'uploader': 'TWC - Digital (No Distro)', 'uploader_id': '6ccd5455-16bb-46f2-9c57-ff858bb9f62c', + 'upload_date': '20160720', + 'timestamp': 1469018835, } + }, { + 'url': 'https://weather.com/en-CA/international/videos/video/unidentified-object-falls-from-sky-in-india', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - drupal_settings = self._parse_json(self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings'), display_id) - video_id = drupal_settings['twc']['contexts']['node']['uuid'] - video_data = self._download_json( - 'https://dsx.weather.com/cms/v4/asset-collection/en_US/' + video_id, video_id) + asset_name, locale, display_id = re.match(self._VALID_URL, url).groups() + if not locale: + locale = 'en-US' + video_data = list(self._download_json( + 'https://weather.com/api/v1/p/redux-dal', display_id, data=json.dumps([{ + 'name': 'getCMSAssetsUrlConfig', + 'params': { + 'language': locale.replace('-', '_'), + 'query': { + 'assetName': { + '$in': asset_name, + }, + }, + } + }]).encode(), headers={ + 'Content-Type': 'application/json', + })['dal']['getCMSAssetsUrlConfig'].values())[0]['data'][0] + video_id = video_data['id'] seo_meta = video_data.get('seometa', {}) title = video_data.get('title') or seo_meta['title'] @@ -66,6 +85,8 @@ class TheWeatherChannelIE(ThePlatformIE): }) self._sort_formats(formats) + cc_url = video_data.get('cc_url') + return { 'id': video_id, 'display_id': display_id, @@ -74,6 +95,8 @@ class TheWeatherChannelIE(ThePlatformIE): 'duration': parse_duration(video_data.get('duration')), 'uploader': video_data.get('providername'), 'uploader_id': video_data.get('providerid'), + 'timestamp': parse_iso8601(video_data.get('publishdate')), + 'subtitles': {locale[:2]: [{'url': cc_url}]} if cc_url else None, 'thumbnails': thumbnails, 'formats': formats, } -- GitLab From 5b75c620bd148af4f4d4abe52b3c41d9e75e6d45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:36:23 +0100 Subject: [PATCH 184/384] [bongacams] Add extractor (closes #27440) --- haruhi_dl/extractor/bongacams.py | 60 +++++++++++++++++++++++++++++++ haruhi_dl/extractor/extractors.py | 1 + 2 files changed, 61 insertions(+) create mode 100644 haruhi_dl/extractor/bongacams.py diff --git a/haruhi_dl/extractor/bongacams.py b/haruhi_dl/extractor/bongacams.py new file mode 100644 index 000000000..180542fbc --- /dev/null +++ b/haruhi_dl/extractor/bongacams.py @@ -0,0 +1,60 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + try_get, + urlencode_postdata, +) + + +class BongaCamsIE(InfoExtractor): + _VALID_URL = r'https?://(?P(?:[^/]+\.)?bongacams\d*\.com)/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://de.bongacams.com/azumi-8', + 'only_matching': True, + }, { + 'url': 'https://cn.bongacams.com/azumi-8', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + channel_id 
= mobj.group('id') + + amf = self._download_json( + 'https://%s/tools/amf.php' % host, channel_id, + data=urlencode_postdata(( + ('method', 'getRoomData'), + ('args[]', channel_id), + ('args[]', 'false'), + )), headers={'X-Requested-With': 'XMLHttpRequest'}) + + server_url = amf['localData']['videoServerUrl'] + + uploader_id = try_get( + amf, lambda x: x['performerData']['username'], compat_str) or channel_id + uploader = try_get( + amf, lambda x: x['performerData']['displayName'], compat_str) + like_count = int_or_none(try_get( + amf, lambda x: x['performerData']['loversCount'])) + + formats = self._extract_m3u8_formats( + '%s/hls/stream_%s/playlist.m3u8' % (server_url, uploader_id), + channel_id, 'mp4', m3u8_id='hls', live=True) + self._sort_formats(formats) + + return { + 'id': channel_id, + 'title': self._live_title(uploader or uploader_id), + 'uploader': uploader, + 'uploader_id': uploader_id, + 'like_count': like_count, + 'age_limit': 18, + 'is_live': True, + 'formats': formats, + } diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 9d196a135..69c7d9f62 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -129,6 +129,7 @@ from .bleacherreport import ( from .blinkx import BlinkxIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE +from .bongacams import BongaCamsIE from .bostonglobe import BostonGlobeIE from .box import BoxIE from .bpb import BpbIE -- GitLab From 226faa552189bdb7267367fcfa692d63c589c381 Mon Sep 17 00:00:00 2001 From: JChris246 Date: Fri, 26 Feb 2021 15:36:31 +0100 Subject: [PATCH 185/384] [pornhub] Fix lq formats extraction (closes #27386) --- haruhi_dl/extractor/pornhub.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/pornhub.py b/haruhi_dl/extractor/pornhub.py index 20af84955..c9be511ea 100644 --- a/haruhi_dl/extractor/pornhub.py +++ b/haruhi_dl/extractor/pornhub.py @@ -291,15 +291,25 @@ class PornHubIE(PornHubBaseIE): video_urls.append((v_url, None)) video_urls_set.add(v_url) + def parse_quality_items(js_str): + if (url_or_none(js_str)): + return js_str + media_definitions = self._parse_json(js_str, video_id, fatal=False) + if isinstance(media_definitions, list): + for definition in media_definitions: + if not isinstance(definition, dict): + continue + add_video_url(definition.get('url')) + if not video_urls: - FORMAT_PREFIXES = ('media', 'quality') + FORMAT_PREFIXES = ('media', 'quality', 'qualityItems') js_vars = extract_js_vars( webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), default=None) if js_vars: for key, format_url in js_vars.items(): if any(key.startswith(p) for p in FORMAT_PREFIXES): - add_video_url(format_url) + add_video_url(parse_quality_items(format_url)) if not video_urls and re.search( r'<[^>]+\bid=["\']lockedPlayer', webpage): raise ExtractorError( -- GitLab From dc69c587bfcd6594c7a51127addd1623e4fb867a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:36:38 +0100 Subject: [PATCH 186/384] [pornhub] Fix review issues (closes #27393) --- haruhi_dl/extractor/pornhub.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/haruhi_dl/extractor/pornhub.py b/haruhi_dl/extractor/pornhub.py index c9be511ea..a66152e98 100644 --- a/haruhi_dl/extractor/pornhub.py +++ b/haruhi_dl/extractor/pornhub.py @@ -291,15 +291,13 @@ class PornHubIE(PornHubBaseIE): video_urls.append((v_url, None)) 
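# Aside: the parse_quality_items() helper added just below boils down to
# JSON-decoding a list of {"url": ...} objects recovered from the deobfuscated
# player JS and feeding each URL into add_video_url(). A rough standalone
# sketch of that decode step (the sample string is invented; the real input
# comes from extract_js_vars()):
#
#     import json
#     quality_items = '[{"url": "https://cdn.example.com/v480.mp4"}, {"url": "https://cdn.example.com/v720.mp4"}]'
#     for item in json.loads(quality_items):
#         if isinstance(item, dict) and item.get('url'):
#             print(item['url'])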
video_urls_set.add(v_url) - def parse_quality_items(js_str): - if (url_or_none(js_str)): - return js_str - media_definitions = self._parse_json(js_str, video_id, fatal=False) - if isinstance(media_definitions, list): - for definition in media_definitions: - if not isinstance(definition, dict): - continue - add_video_url(definition.get('url')) + def parse_quality_items(quality_items): + q_items = self._parse_json(quality_items, video_id, fatal=False) + if not isinstance(q_items, list): + return + for item in q_items: + if isinstance(item, dict): + add_video_url(item.get('url')) if not video_urls: FORMAT_PREFIXES = ('media', 'quality', 'qualityItems') @@ -308,8 +306,10 @@ class PornHubIE(PornHubBaseIE): default=None) if js_vars: for key, format_url in js_vars.items(): - if any(key.startswith(p) for p in FORMAT_PREFIXES): - add_video_url(parse_quality_items(format_url)) + if key.startswith(FORMAT_PREFIXES[-1]): + parse_quality_items(format_url) + elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]): + add_video_url(format_url) if not video_urls and re.search( r'<[^>]+\bid=["\']lockedPlayer', webpage): raise ExtractorError( -- GitLab From 84b7f91b289cac49d13d08c118de5bc564ac9bbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:38:12 +0100 Subject: [PATCH 187/384] [spangbang] Add support for playlist videos --- haruhi_dl/extractor/spankbang.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/spankbang.py b/haruhi_dl/extractor/spankbang.py index 61ca902ce..e3ec8602d 100644 --- a/haruhi_dl/extractor/spankbang.py +++ b/haruhi_dl/extractor/spankbang.py @@ -17,7 +17,14 @@ from ..utils import ( class SpankBangIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P[\da-z]+)/(?:video|play|embed)\b' + _VALID_URL = r'''(?x) + https?:// + (?:[^/]+\.)?spankbang\.com/ + (?: + (?P[\da-z]+)/(?:video|play|embed)\b| + [\da-z]+-(?P[\da-z]+)/playlist/[^/?#&]+ + ) + ''' _TESTS = [{ 'url': 'http://spankbang.com/3vvn/video/fantasy+solo', 'md5': '1cc433e1d6aa14bc376535b8679302f7', @@ -57,10 +64,14 @@ class SpankBangIE(InfoExtractor): }, { 'url': 'https://spankbang.com/2y3td/embed/', 'only_matching': True, + }, { + 'url': 'https://spankbang.com/2v7ik-7ecbgu/playlist/latina+booty', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('id_2') webpage = self._download_webpage( url.replace('/%s/embed' % video_id, '/%s/video' % video_id), video_id, headers={'Cookie': 'country=US'}) -- GitLab From 50dfe7adb8a4d1f1f85e30568da944b2db179257 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:38:20 +0100 Subject: [PATCH 188/384] [spangbang:playlist] Fix extraction (closes #24087) --- haruhi_dl/extractor/spankbang.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/haruhi_dl/extractor/spankbang.py b/haruhi_dl/extractor/spankbang.py index e3ec8602d..8a7102d0c 100644 --- a/haruhi_dl/extractor/spankbang.py +++ b/haruhi_dl/extractor/spankbang.py @@ -13,6 +13,7 @@ from ..utils import ( str_to_int, url_or_none, urlencode_postdata, + urljoin, ) @@ -166,30 +167,33 @@ class SpankBangIE(InfoExtractor): class SpankBangPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P[\da-z]+)/playlist/[^/]+' + _VALID_URL = 
r'https?://(?:[^/]+\.)?spankbang\.com/(?P[\da-z]+)/playlist/(?P[^/]+)' _TEST = { 'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties', 'info_dict': { 'id': 'ug0k', 'title': 'Big Ass Titties', }, - 'playlist_mincount': 50, + 'playlist_mincount': 40, } def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + display_id = mobj.group('display_id') webpage = self._download_webpage( url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) entries = [self.url_result( - 'https://spankbang.com/%s/video' % video_id, - ie=SpankBangIE.ie_key(), video_id=video_id) - for video_id in orderedSet(re.findall( - r']+\bhref=["\']/?([\da-z]+)/play/', webpage))] + urljoin(url, mobj.group('path')), + ie=SpankBangIE.ie_key(), video_id=mobj.group('id')) + for mobj in re.finditer( + r']+\bhref=(["\'])(?P/?[\da-z]+-(?P[\da-z]+)/playlist/%s(?:(?!\1).)*)\1' + % re.escape(display_id), webpage)] title = self._html_search_regex( - r'
<h1>([^<]+)\s+playlist</h1>', webpage, 'playlist title', + r'<h1>
([^<]+)\s+playlist\s*<', webpage, 'playlist title', fatal=False) return self.playlist_result(entries, playlist_id, title) -- GitLab From 10a6f841a7aefeb182dd5e3a5ee491cb346517e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:38:28 +0100 Subject: [PATCH 189/384] [spankbang] Remove unused import --- haruhi_dl/extractor/spankbang.py | 1 - 1 file changed, 1 deletion(-) diff --git a/haruhi_dl/extractor/spankbang.py b/haruhi_dl/extractor/spankbang.py index 8a7102d0c..37cb8c839 100644 --- a/haruhi_dl/extractor/spankbang.py +++ b/haruhi_dl/extractor/spankbang.py @@ -7,7 +7,6 @@ from ..utils import ( determine_ext, ExtractorError, merge_dicts, - orderedSet, parse_duration, parse_resolution, str_to_int, -- GitLab From d7c028a33ef2fa24b6dc0868b9b15c1ea6a22f84 Mon Sep 17 00:00:00 2001 From: Andrew Udvare Date: Fri, 26 Feb 2021 15:38:34 +0100 Subject: [PATCH 190/384] [instagram] Fix extraction when authenticated (closes #27422) --- haruhi_dl/extractor/instagram.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/haruhi_dl/extractor/instagram.py b/haruhi_dl/extractor/instagram.py index b061850a1..0755896de 100644 --- a/haruhi_dl/extractor/instagram.py +++ b/haruhi_dl/extractor/instagram.py @@ -137,6 +137,16 @@ class InstagramIE(InfoExtractor): (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], lambda x: x['entry_data']['PostPage'][0]['media']), dict) + if not media: + additional_data = self._parse_json( + self._search_regex(r'window\.__additionalDataLoaded\(\'[^\']+\',\s*({.+?})\);', + webpage, 'additional data', default='{}'), + video_id, fatal=False) + if additional_data: + media = try_get( + additional_data, + lambda x: x['graphql']['shortcode_media'], + dict) if media: video_url = media.get('video_url') height = int_or_none(media.get('dimensions', {}).get('height')) -- GitLab From 73c5dc4104d6eb6914d008aeb62acfe58ce052aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:38:42 +0100 Subject: [PATCH 191/384] [instagram] Improve extraction (closes #22880) --- haruhi_dl/extractor/instagram.py | 132 ++++++++++++++++--------------- 1 file changed, 67 insertions(+), 65 deletions(-) diff --git a/haruhi_dl/extractor/instagram.py b/haruhi_dl/extractor/instagram.py index 0755896de..82f59c349 100644 --- a/haruhi_dl/extractor/instagram.py +++ b/haruhi_dl/extractor/instagram.py @@ -122,9 +122,9 @@ class InstagramIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - (video_url, description, thumbnail, timestamp, uploader, + (media, video_url, description, thumbnail, timestamp, uploader, uploader_id, like_count, comment_count, comments, height, - width) = [None] * 11 + width) = [None] * 12 shared_data = self._parse_json( self._search_regex( @@ -137,69 +137,71 @@ class InstagramIE(InfoExtractor): (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], lambda x: x['entry_data']['PostPage'][0]['media']), dict) - if not media: - additional_data = self._parse_json( - self._search_regex(r'window\.__additionalDataLoaded\(\'[^\']+\',\s*({.+?})\);', - webpage, 'additional data', default='{}'), - video_id, fatal=False) - if additional_data: - media = try_get( - additional_data, - lambda x: x['graphql']['shortcode_media'], - dict) - if media: - video_url = media.get('video_url') - height = int_or_none(media.get('dimensions', {}).get('height')) - width = int_or_none(media.get('dimensions', {}).get('width')) - 
description = try_get( - media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) or media.get('caption') - thumbnail = media.get('display_src') - timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) - uploader = media.get('owner', {}).get('full_name') - uploader_id = media.get('owner', {}).get('username') - - def get_count(key, kind): - return int_or_none(try_get( - media, (lambda x: x['edge_media_%s' % key]['count'], - lambda x: x['%ss' % kind]['count']))) - like_count = get_count('preview_like', 'like') - comment_count = get_count('to_comment', 'comment') - - comments = [{ - 'author': comment.get('user', {}).get('username'), - 'author_id': comment.get('user', {}).get('id'), - 'id': comment.get('id'), - 'text': comment.get('text'), - 'timestamp': int_or_none(comment.get('created_at')), - } for comment in media.get( - 'comments', {}).get('nodes', []) if comment.get('text')] - if not video_url: - edges = try_get( - media, lambda x: x['edge_sidecar_to_children']['edges'], - list) or [] - if edges: - entries = [] - for edge_num, edge in enumerate(edges, start=1): - node = try_get(edge, lambda x: x['node'], dict) - if not node: - continue - node_video_url = url_or_none(node.get('video_url')) - if not node_video_url: - continue - entries.append({ - 'id': node.get('shortcode') or node['id'], - 'title': 'Video %d' % edge_num, - 'url': node_video_url, - 'thumbnail': node.get('display_url'), - 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), - 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), - 'view_count': int_or_none(node.get('video_view_count')), - }) - return self.playlist_result( - entries, video_id, - 'Post by %s' % uploader_id if uploader_id else None, - description) + # _sharedData.entry_data.PostPage is empty when authenticated (see + # https://github.com/hdl-org/haruhi-dl/pull/22880) + if not media: + additional_data = self._parse_json( + self._search_regex( + r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;', + webpage, 'additional data', default='{}'), + video_id, fatal=False) + if additional_data: + media = try_get( + additional_data, lambda x: x['graphql']['shortcode_media'], + dict) + if media: + video_url = media.get('video_url') + height = int_or_none(media.get('dimensions', {}).get('height')) + width = int_or_none(media.get('dimensions', {}).get('width')) + description = try_get( + media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], + compat_str) or media.get('caption') + thumbnail = media.get('display_src') + timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) + uploader = media.get('owner', {}).get('full_name') + uploader_id = media.get('owner', {}).get('username') + + def get_count(key, kind): + return int_or_none(try_get( + media, (lambda x: x['edge_media_%s' % key]['count'], + lambda x: x['%ss' % kind]['count']))) + like_count = get_count('preview_like', 'like') + comment_count = get_count('to_comment', 'comment') + + comments = [{ + 'author': comment.get('user', {}).get('username'), + 'author_id': comment.get('user', {}).get('id'), + 'id': comment.get('id'), + 'text': comment.get('text'), + 'timestamp': int_or_none(comment.get('created_at')), + } for comment in media.get( + 'comments', {}).get('nodes', []) if comment.get('text')] + if not video_url: + edges = try_get( + media, lambda x: x['edge_sidecar_to_children']['edges'], + list) or [] + if edges: + entries = [] + for edge_num, edge in enumerate(edges, 
start=1): + node = try_get(edge, lambda x: x['node'], dict) + if not node: + continue + node_video_url = url_or_none(node.get('video_url')) + if not node_video_url: + continue + entries.append({ + 'id': node.get('shortcode') or node['id'], + 'title': 'Video %d' % edge_num, + 'url': node_video_url, + 'thumbnail': node.get('display_url'), + 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), + 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), + 'view_count': int_or_none(node.get('video_view_count')), + }) + return self.playlist_result( + entries, video_id, + 'Post by %s' % uploader_id if uploader_id else None, + description) if not video_url: video_url = self._og_search_video_url(webpage, secure=False) -- GitLab From ff12ad0ee42a4004c40320dd5bede28966b26a97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:38:48 +0100 Subject: [PATCH 192/384] [instagram] Improve thumbnail extraction --- haruhi_dl/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/instagram.py b/haruhi_dl/extractor/instagram.py index 82f59c349..0e70d9ea0 100644 --- a/haruhi_dl/extractor/instagram.py +++ b/haruhi_dl/extractor/instagram.py @@ -156,7 +156,7 @@ class InstagramIE(InfoExtractor): description = try_get( media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], compat_str) or media.get('caption') - thumbnail = media.get('display_src') + thumbnail = media.get('display_src') or media.get('display_url') timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) uploader = media.get('owner', {}).get('full_name') uploader_id = media.get('owner', {}).get('username') -- GitLab From c298be2ebd5d03cf1c54a785e1d59e939bd7aa73 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:38:53 +0100 Subject: [PATCH 193/384] [bbc] switch to media selector v6 closes #23232 closes #23933 closes #26303 closes #26432 closes #26821 closes #27538 --- haruhi_dl/extractor/bbc.py | 74 +++++++++++--------------------------- 1 file changed, 20 insertions(+), 54 deletions(-) diff --git a/haruhi_dl/extractor/bbc.py b/haruhi_dl/extractor/bbc.py index 7aa3a11b5..b73521043 100644 --- a/haruhi_dl/extractor/bbc.py +++ b/haruhi_dl/extractor/bbc.py @@ -49,22 +49,17 @@ class BBCCoUkIE(InfoExtractor): _LOGIN_URL = 'https://account.bbc.com/signin' _NETRC_MACHINE = 'bbc' - _MEDIASELECTOR_URLS = [ + _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s' + _MEDIA_SETS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails # with geolocation in some cases when it's even not geo restricted at all (e.g. # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable. 
- 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', + 'iptv-all', + 'pc', ] - _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection' _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist' - _NAMESPACES = ( - _MEDIASELECTION_NS, - _EMP_PLAYLIST_NS, - ) - _TESTS = [ { 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', @@ -261,8 +256,6 @@ class BBCCoUkIE(InfoExtractor): 'only_matching': True, }] - _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' - def _login(self): username, password = self._get_login_info() if username is None: @@ -307,22 +300,14 @@ class BBCCoUkIE(InfoExtractor): def _extract_items(self, playlist): return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) - def _findall_ns(self, element, xpath): - elements = [] - for ns in self._NAMESPACES: - elements.extend(element.findall(xpath % ns)) - return elements - def _extract_medias(self, media_selection): - error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS) - if error is None: - media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS) - if error is not None: - raise BBCCoUkIE.MediaSelectionError(error.get('id')) - return self._findall_ns(media_selection, './{%s}media') + error = media_selection.get('result') + if error: + raise BBCCoUkIE.MediaSelectionError(error) + return media_selection.get('media') or [] def _extract_connections(self, media): - return self._findall_ns(media, './{%s}connection') + return media.get('connection') or [] def _get_subtitles(self, media, programme_id): subtitles = {} @@ -334,13 +319,13 @@ class BBCCoUkIE(InfoExtractor): cc_url, programme_id, 'Downloading captions', fatal=False) if not isinstance(captions, compat_etree_Element): continue - lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') - subtitles[lang] = [ + subtitles['en'] = [ { 'url': connection.get('href'), 'ext': 'ttml', }, ] + break return subtitles def _raise_extractor_error(self, media_selection_error): @@ -350,10 +335,10 @@ class BBCCoUkIE(InfoExtractor): def _download_media_selector(self, programme_id): last_exception = None - for mediaselector_url in self._MEDIASELECTOR_URLS: + for media_set in self._MEDIA_SETS: try: return self._download_media_selector_url( - mediaselector_url % programme_id, programme_id) + self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id) except BBCCoUkIE.MediaSelectionError as e: if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): last_exception = e @@ -362,8 +347,8 @@ class BBCCoUkIE(InfoExtractor): self._raise_extractor_error(last_exception) def _download_media_selector_url(self, url, programme_id=None): - media_selection = self._download_xml( - url, programme_id, 'Downloading media selection XML', + media_selection = self._download_json( + url, programme_id, 'Downloading media selection JSON', expected_status=(403, 404)) return self._process_media_selector(media_selection, programme_id) @@ -377,7 +362,6 @@ class BBCCoUkIE(InfoExtractor): if kind in ('video', 'audio'): bitrate = int_or_none(media.get('bitrate')) encoding = media.get('encoding') - service = media.get('service') width = int_or_none(media.get('width')) height = int_or_none(media.get('height')) file_size = int_or_none(media.get('media_file_size')) @@ -392,8 +376,6 @@ class BBCCoUkIE(InfoExtractor): supplier = connection.get('supplier') transfer_format = connection.get('transferFormat') format_id = supplier or conn_kind or 
protocol - if service: - format_id = '%s_%s' % (service, format_id) # ASX playlist if supplier == 'asx': for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): @@ -408,20 +390,11 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( href, programme_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)) - if re.search(self._USP_RE, href): - usp_formats = self._extract_m3u8_formats( - re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href), - programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False) - for f in usp_formats: - if f.get('height') and f['height'] > 720: - continue - formats.append(f) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) else: - if not service and not supplier and bitrate: + if not supplier and bitrate: format_id += '-%d' % bitrate fmt = { 'format_id': format_id, @@ -554,7 +527,7 @@ class BBCCoUkIE(InfoExtractor): webpage = self._download_webpage(url, group_id, 'Downloading video page') error = self._search_regex( - r']+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<', + r']+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<', webpage, 'error', default=None) if error: raise ExtractorError(error, expected=True) @@ -607,16 +580,9 @@ class BBCIE(BBCCoUkIE): IE_DESC = 'BBC' _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' - _MEDIASELECTOR_URLS = [ - # Provides HQ HLS streams but fails with geolocation in some cases when it's - # even not geo restricted at all - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', - # Provides more formats, namely direct mp4 links, but fails on some videos with - # notukerror for non UK (?) users (e.g. 
- # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) - 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s', - # Provides fewer formats, but works everywhere for everybody (hopefully) - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s', + _MEDIA_SETS = [ + 'mobile-tablet-main', + 'pc', ] _TESTS = [{ -- GitLab From 477e444c3bd59e43ce5cf3de7d3aae7644b9bd5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:38:59 +0100 Subject: [PATCH 194/384] [instagram] Add support for reel URLs (closes #26234, closes #26250) --- haruhi_dl/extractor/instagram.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/instagram.py b/haruhi_dl/extractor/instagram.py index 0e70d9ea0..2d24d62c8 100644 --- a/haruhi_dl/extractor/instagram.py +++ b/haruhi_dl/extractor/instagram.py @@ -22,7 +22,7 @@ from ..utils import ( class InstagramIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/(?:p|tv)/(?P[^/?#&]+))' + _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P[^/?#&]+))' _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -95,6 +95,9 @@ class InstagramIE(InfoExtractor): }, { 'url': 'https://www.instagram.com/tv/aye83DjauH/', 'only_matching': True, + }, { + 'url': 'https://www.instagram.com/reel/CDUMkliABpa/', + 'only_matching': True, }] @staticmethod -- GitLab From 97c34326598bbfa9d3abe02c705d8333101e57bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:39:04 +0100 Subject: [PATCH 195/384] [instagram] Fix comment count extraction --- haruhi_dl/extractor/instagram.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/instagram.py b/haruhi_dl/extractor/instagram.py index 2d24d62c8..5f917a603 100644 --- a/haruhi_dl/extractor/instagram.py +++ b/haruhi_dl/extractor/instagram.py @@ -164,12 +164,18 @@ class InstagramIE(InfoExtractor): uploader = media.get('owner', {}).get('full_name') uploader_id = media.get('owner', {}).get('username') - def get_count(key, kind): - return int_or_none(try_get( - media, (lambda x: x['edge_media_%s' % key]['count'], - lambda x: x['%ss' % kind]['count']))) + def get_count(keys, kind): + if not isinstance(keys, (list, tuple)): + keys = [keys] + for key in keys: + count = int_or_none(try_get( + media, (lambda x: x['edge_media_%s' % key]['count'], + lambda x: x['%ss' % kind]['count']))) + if count is not None: + return count like_count = get_count('preview_like', 'like') - comment_count = get_count('to_comment', 'comment') + comment_count = get_count( + ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment') comments = [{ 'author': comment.get('user', {}).get('username'), -- GitLab From 1d9552c2367b9a22c03b8ab3eb98269a6ea0fbc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:39:10 +0100 Subject: [PATCH 196/384] [instagram] Fix test --- haruhi_dl/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/instagram.py b/haruhi_dl/extractor/instagram.py index 5f917a603..27ea97f56 100644 --- a/haruhi_dl/extractor/instagram.py +++ b/haruhi_dl/extractor/instagram.py @@ -35,7 +35,7 @@ class InstagramIE(InfoExtractor): 'timestamp': 1371748545, 'upload_date': '20130620', 
'uploader_id': 'naomipq', - 'uploader': 'Naomi Leonor Phan-Quang', + 'uploader': 'B E A U T Y F O R A S H E S', 'like_count': int, 'comment_count': int, 'comments': list, -- GitLab From 92bd8a446ef226acf58be120ad9dce95e71e96e1 Mon Sep 17 00:00:00 2001 From: Laura Liberda Date: Fri, 26 Feb 2021 15:45:57 +0100 Subject: [PATCH 197/384] VHX embeds https://github.com/ytdl-org/youtube-dl/issues/27546 --- haruhi_dl/extractor/generic.py | 17 ++++++++++++++++- haruhi_dl/extractor/vimeo.py | 7 +++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index 240de66da..b67f066eb 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -67,7 +67,10 @@ from .tube8 import Tube8IE from .mofosex import MofosexEmbedIE from .spankwire import SpankwireIE from .youporn import YouPornIE -from .vimeo import VimeoIE +from .vimeo import ( + VimeoIE, + VHXEmbedIE, +) from .dailymotion import DailymotionIE from .dailymail import DailyMailIE from .onionstudios import OnionStudiosIE @@ -2247,6 +2250,17 @@ class GenericIE(InfoExtractor): # 'force_generic_extractor': True, # }, # } + { + # VHX Embed + 'url': 'https://demo.vhx.tv/category-c/videos/file-example-mp4-480-1-5mg-copy', + 'info_dict': { + 'id': '858208', + 'ext': 'mp4', + 'title': 'Untitled', + 'uploader_id': 'user80538407', + 'uploader': 'OTT Videos', + }, + }, ] def report_following_redirect(self, new_url): @@ -2661,6 +2675,7 @@ class GenericIE(InfoExtractor): SVTIE, XLinkIE, LibsynIE, + VHXEmbedIE, ): try: ie_key = embie.ie_key() diff --git a/haruhi_dl/extractor/vimeo.py b/haruhi_dl/extractor/vimeo.py index e8a4547cd..773296173 100644 --- a/haruhi_dl/extractor/vimeo.py +++ b/haruhi_dl/extractor/vimeo.py @@ -1125,6 +1125,12 @@ class VHXEmbedIE(VimeoBaseInfoExtractor): IE_NAME = 'vhx:embed' _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P\d+)' + @staticmethod + def _extract_urls(webpage, **kw): + mobjs = re.finditer( + r']+src="(https?://embed\.vhx\.tv/videos/\d+[^"]*)"', webpage) + return [unescapeHTML(mobj.group(1)) for mobj in mobjs] + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -1133,5 +1139,6 @@ class VHXEmbedIE(VimeoBaseInfoExtractor): 'ott data'), video_id, js_to_json)['config_url'] config = self._download_json(config_url, video_id) info = self._parse_config(config, video_id) + info['id'] = video_id self._vimeo_sort_formats(info['formats']) return info -- GitLab From 2d3b82a754df55ca35d12c20a2b631630588c421 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:46:29 +0100 Subject: [PATCH 198/384] [amcnetworks] improve auth only video detection(closes #27548) --- haruhi_dl/extractor/amcnetworks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/amcnetworks.py b/haruhi_dl/extractor/amcnetworks.py index 12b6de0bf..b8027bbca 100644 --- a/haruhi_dl/extractor/amcnetworks.py +++ b/haruhi_dl/extractor/amcnetworks.py @@ -80,7 +80,8 @@ class AMCNetworksIE(ThePlatformIE): title = theplatform_metadata['title'] rating = try_get( theplatform_metadata, lambda x: x['ratings'][0]['rating']) - if properties.get('videoCategory') == 'TVE-Auth': + video_category = properties.get('videoCategory') + if video_category and video_category.endswith('-Auth'): resource = self._get_mvpd_resource( requestor_id, title, video_id, rating) query['auth'] = self._extract_mvpd_auth( -- GitLab From c4445c3311a3bfa1c8ba61fa7679cadc23a77f5f Mon Sep 17 00:00:00 2001 
From: Sergey M Date: Fri, 26 Feb 2021 15:46:39 +0100 Subject: [PATCH 199/384] [youtube] Update invidious.snopyta.org (#22667) Co-authored-by: sofutru <54445344+sofutru@users.noreply.github.com> --- haruhi_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/youtube.py b/haruhi_dl/extractor/youtube.py index dd58b2407..f80c82f85 100644 --- a/haruhi_dl/extractor/youtube.py +++ b/haruhi_dl/extractor/youtube.py @@ -289,7 +289,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances (?:(?:www|dev)\.)?invidio\.us/| (?:(?:www|no)\.)?invidiou\.sh/| - (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/| + (?:(?:www|fi)\.)?invidious\.snopyta\.org/| (?:www\.)?invidious\.kabi\.tk/| (?:www\.)?invidious\.13ad\.de/| (?:www\.)?invidious\.mastodon\.host/| -- GitLab From 7490ed64b4dda6740a5d14872f53b5ea905b67ef Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:46:46 +0100 Subject: [PATCH 200/384] [telecinco] fix extraction --- haruhi_dl/extractor/telecinco.py | 77 +++++++++----------------------- 1 file changed, 20 insertions(+), 57 deletions(-) diff --git a/haruhi_dl/extractor/telecinco.py b/haruhi_dl/extractor/telecinco.py index 9ba3da341..eecd6a5c9 100644 --- a/haruhi_dl/extractor/telecinco.py +++ b/haruhi_dl/extractor/telecinco.py @@ -5,14 +5,11 @@ import json import re from .common import InfoExtractor -from .ooyala import OoyalaIE from ..utils import ( clean_html, - determine_ext, int_or_none, str_or_none, try_get, - urljoin, ) @@ -28,7 +25,7 @@ class TelecincoIE(InfoExtractor): 'description': 'md5:716caf5601e25c3c5ab6605b1ae71529', }, 'playlist': [{ - 'md5': 'adb28c37238b675dad0f042292f209a7', + 'md5': '7ee56d665cfd241c0e6d80fd175068b0', 'info_dict': { 'id': 'JEA5ijCnF6p5W08A1rNKn7', 'ext': 'mp4', @@ -38,7 +35,7 @@ class TelecincoIE(InfoExtractor): }] }, { 'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html', - 'md5': '9468140ebc300fbb8b9d65dc6e5c4b43', + 'md5': 'c86fe0d99e3bdb46b7950d38bf6ef12a', 'info_dict': { 'id': 'jn24Od1zGLG4XUZcnUnZB6', 'ext': 'mp4', @@ -48,7 +45,7 @@ class TelecincoIE(InfoExtractor): }, }, { 'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html', - 'md5': 'ae2dc6b7b50b2392076a51c0f70e01f6', + 'md5': 'eddb50291df704ce23c74821b995bcac', 'info_dict': { 'id': 'aywerkD2Sv1vGNqq9b85Q2', 'ext': 'mp4', @@ -90,58 +87,24 @@ class TelecincoIE(InfoExtractor): def _parse_content(self, content, url): video_id = content['dataMediaId'] - if content.get('dataCmsId') == 'ooyala': - return self.url_result( - 'ooyala:%s' % video_id, OoyalaIE.ie_key(), video_id) - config_url = urljoin(url, content['dataConfig']) config = self._download_json( - config_url, video_id, 'Downloading config JSON') + content['dataConfig'], video_id, 'Downloading config JSON') title = config['info']['title'] - - def mmc_url(mmc_type): - return re.sub( - r'/(?:flash|html5)\.json', '/%s.json' % mmc_type, - config['services']['mmc']) - - duration = None - formats = [] - for mmc_type in ('flash', 'html5'): - mmc = self._download_json( - mmc_url(mmc_type), video_id, - 'Downloading %s mmc JSON' % mmc_type, fatal=False) - if not mmc: - continue - if not duration: - duration = int_or_none(mmc.get('duration')) - for location in mmc['locations']: - gat = self._proto_relative_url(location.get('gat'), 'http:') - gcp = location.get('gcp') - ogn = 
location.get('ogn') - if None in (gat, gcp, ogn): - continue - token_data = { - 'gcp': gcp, - 'ogn': ogn, - 'sta': 0, - } - media = self._download_json( - gat, video_id, data=json.dumps(token_data).encode('utf-8'), - headers={ - 'Content-Type': 'application/json;charset=utf-8', - 'Referer': url, - }, fatal=False) or {} - stream = media.get('stream') or media.get('file') - if not stream: - continue - ext = determine_ext(stream) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', - video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) + services = config['services'] + caronte = self._download_json(services['caronte'], video_id) + stream = caronte['dls'][0]['stream'] + headers = self.geo_verification_headers() + headers.update({ + 'Content-Type': 'application/json;charset=UTF-8', + 'Origin': re.match(r'https?://[^/]+', url).group(0), + }) + cdn = self._download_json( + caronte['cerbero'], video_id, data=json.dumps({ + 'bbx': caronte['bbx'], + 'gbx': self._download_json(services['gbx'], video_id)['gbx'], + }).encode(), headers=headers)['tokens']['1']['cdn'] + formats = self._extract_m3u8_formats( + stream + '?' + cdn, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) return { @@ -149,7 +112,7 @@ class TelecincoIE(InfoExtractor): 'title': title, 'formats': formats, 'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'), - 'duration': duration, + 'duration': int_or_none(content.get('dataDuration')), } def _real_extract(self, url): -- GitLab From 217918987a1687071abfd5f3d4fc2b917fcd5a33 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:46:52 +0100 Subject: [PATCH 201/384] [mitele] fix free video extraction(#24624)(closes #25827)(closes #26757) --- haruhi_dl/extractor/mitele.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/haruhi_dl/extractor/mitele.py b/haruhi_dl/extractor/mitele.py index ad9da9612..b5937233b 100644 --- a/haruhi_dl/extractor/mitele.py +++ b/haruhi_dl/extractor/mitele.py @@ -1,15 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor +from .telecinco import TelecincoIE from ..utils import ( int_or_none, parse_iso8601, - smuggle_url, ) -class MiTeleIE(InfoExtractor): +class MiTeleIE(TelecincoIE): IE_DESC = 'mitele.es' _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P[^/]+)/player' @@ -31,7 +30,6 @@ class MiTeleIE(InfoExtractor): 'timestamp': 1471209401, 'upload_date': '20160814', }, - 'add_ie': ['Ooyala'], }, { # no explicit title 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player', @@ -54,7 +52,6 @@ class MiTeleIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': ['Ooyala'], }, { 'url': 'http://www.mitele.es/series-online/la-que-se-avecina/57aac5c1c915da951a8b45ed/player', 'only_matching': True, @@ -70,16 +67,11 @@ class MiTeleIE(InfoExtractor): r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=\s*({.+})', webpage, 'Pre Player'), display_id)['prePlayer'] title = pre_player['title'] - video = pre_player['video'] - video_id = video['dataMediaId'] + video_info = self._parse_content(pre_player['video'], url) content = pre_player.get('content') or {} info = content.get('info') or {} - return { - '_type': 'url_transparent', - # for some reason only HLS is supported - 'url': 
smuggle_url('ooyala:' + video_id, {'supportedformats': 'm3u8,dash'}), - 'id': video_id, + video_info.update({ 'title': title, 'description': info.get('synopsis'), 'series': content.get('title'), @@ -87,7 +79,7 @@ class MiTeleIE(InfoExtractor): 'episode': content.get('subtitle'), 'episode_number': int_or_none(info.get('episode_number')), 'duration': int_or_none(info.get('duration')), - 'thumbnail': video.get('dataPoster'), 'age_limit': int_or_none(info.get('rating')), 'timestamp': parse_iso8601(pre_player.get('publishedTime')), - } + }) + return video_info -- GitLab From 2a0a9bac02fa495bbcd064c75699742f0318fda4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:46:56 +0100 Subject: [PATCH 202/384] [teachable] Improve embed detection (closes #26923) --- haruhi_dl/extractor/teachable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/teachable.py b/haruhi_dl/extractor/teachable.py index df305e38a..3a337afd8 100644 --- a/haruhi_dl/extractor/teachable.py +++ b/haruhi_dl/extractor/teachable.py @@ -140,7 +140,7 @@ class TeachableIE(TeachableBaseIE): @staticmethod def _is_teachable(webpage): return 'teachableTracker.linker:autoLink' in webpage and re.search( - r']+href=["\']https?://process\.fs\.teachablecdn\.com', + r']+href=["\']https?://(?:process\.fs|assets)\.teachablecdn\.com', webpage) @staticmethod -- GitLab From 7b1f0173c12ba2d420f99f310cc05962cd62a644 Mon Sep 17 00:00:00 2001 From: JamKage Date: Fri, 26 Feb 2021 15:47:34 +0100 Subject: [PATCH 203/384] [go] Added support for FXNetworks (#26826) Co-authored-by: James Kirrage closes #13972 closes #22467 closes #23754 --- haruhi_dl/extractor/extractors.py | 1 - haruhi_dl/extractor/fxnetworks.py | 77 ------------------------------- haruhi_dl/extractor/go.py | 21 ++++++++- 3 files changed, 19 insertions(+), 80 deletions(-) delete mode 100644 haruhi_dl/extractor/fxnetworks.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 69c7d9f62..1a5cee636 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -424,7 +424,6 @@ from .funkwhale import ( FunkwhaleRadioSHIE, ) from .fusion import FusionIE -from .fxnetworks import FXNetworksIE from .gaia import GaiaIE from .gameinformer import GameInformerIE from .gamespot import GameSpotIE diff --git a/haruhi_dl/extractor/fxnetworks.py b/haruhi_dl/extractor/fxnetworks.py deleted file mode 100644 index 00e67426b..000000000 --- a/haruhi_dl/extractor/fxnetworks.py +++ /dev/null @@ -1,77 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .adobepass import AdobePassIE -from ..utils import ( - extract_attributes, - int_or_none, - parse_age_limit, - smuggle_url, - update_url_query, -) - - -class FXNetworksIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P\d+)' - _TESTS = [{ - 'url': 'http://www.fxnetworks.com/video/1032565827847', - 'md5': '8d99b97b4aa7a202f55b6ed47ea7e703', - 'info_dict': { - 'id': 'dRzwHC_MMqIv', - 'ext': 'mp4', - 'title': 'First Look: Better Things - Season 2', - 'description': 'Because real life is like a fart. 
Watch this FIRST LOOK to see what inspired the new season of Better Things.', - 'age_limit': 14, - 'uploader': 'NEWA-FNG-FX', - 'upload_date': '20170825', - 'timestamp': 1503686274, - 'episode_number': 0, - 'season_number': 2, - 'series': 'Better Things', - }, - 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.simpsonsworld.com/video/716094019682', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - if 'The content you are trying to access is not available in your region.' in webpage: - self.raise_geo_restricted() - video_data = extract_attributes(self._search_regex( - r'()', webpage, 'video data')) - player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None) - release_url = video_data['rel'] - title = video_data['data-title'] - rating = video_data.get('data-rating') - query = { - 'mbr': 'true', - } - if player_type == 'movies': - query.update({ - 'manifest': 'm3u', - }) - else: - query.update({ - 'switch': 'http', - }) - if video_data.get('data-req-auth') == '1': - resource = self._get_mvpd_resource( - video_data['data-channel'], title, - video_data.get('data-guid'), rating) - query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource) - - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': title, - 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), - 'series': video_data.get('data-show-title'), - 'episode_number': int_or_none(video_data.get('data-episode')), - 'season_number': int_or_none(video_data.get('data-season')), - 'thumbnail': video_data.get('data-large-thumb'), - 'age_limit': parse_age_limit(rating), - 'ie_key': 'ThePlatform', - } diff --git a/haruhi_dl/extractor/go.py b/haruhi_dl/extractor/go.py index 03cfba91f..0d731e90a 100644 --- a/haruhi_dl/extractor/go.py +++ b/haruhi_dl/extractor/go.py @@ -38,13 +38,17 @@ class GoIE(AdobePassIE): 'disneynow': { 'brand': '011', 'resource_id': 'Disney', - } + }, + 'fxnow.fxnetworks': { + 'brand': '025', + 'requestor_id': 'dtci', + }, } _VALID_URL = r'''(?x) https?:// (?: (?:(?P%s)\.)?go| - (?Pabc|freeform|disneynow) + (?Pabc|freeform|disneynow|fxnow\.fxnetworks) )\.com/ (?: (?:[^/]+/)*(?P[Vv][Dd][Kk][Aa]\w+)| @@ -99,6 +103,19 @@ class GoIE(AdobePassIE): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'https://fxnow.fxnetworks.com/shows/better-things/video/vdka12782841', + 'info_dict': { + 'id': 'VDKA12782841', + 'ext': 'mp4', + 'title': 'First Look: Better Things - Season 2', + 'description': 'md5:fa73584a95761c605d9d54904e35b407', + }, + 'params': { + 'geo_bypass_ip_block': '3.244.239.0/24', + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', 'only_matching': True, -- GitLab From d5bf4b0fea8b3e9047cc733ded9ce5da685cff31 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:49:07 +0100 Subject: [PATCH 204/384] [toggle] add support for live.mewatch.sg (closes #27555) --- haruhi_dl/extractor/toggle.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/toggle.py b/haruhi_dl/extractor/toggle.py index 3b9b54759..270c84daa 100644 --- a/haruhi_dl/extractor/toggle.py +++ b/haruhi_dl/extractor/toggle.py @@ -200,7 +200,7 @@ class ToggleIE(InfoExtractor): class MeWatchIE(InfoExtractor): IE_NAME = 'mewatch' - _VALID_URL = r'https?://(?:www\.)?mewatch\.sg/watch/[^/?#&]+-(?P[0-9]+)' + _VALID_URL = 
r'https?://(?:(?:www|live)\.)?mewatch\.sg/watch/[^/?#&]+-(?P<id>[0-9]+)'
     _TESTS = [{
         'url': 'https://www.mewatch.sg/watch/Recipe-Of-Life-E1-179371',
         'info_dict': {
@@ -220,6 +220,9 @@ class MeWatchIE(InfoExtractor):
     }, {
         'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-%E6%90%9C%E5%AF%86%E3%80%82%E6%89%93%E5%8D%A1%E3%80%82%E5%B0%8F%E7%BA%A2%E7%82%B9-S2-E1-176232',
         'only_matching': True,
+    }, {
+        'url': 'https://live.mewatch.sg/watch/Recipe-Of-Life-E41-189759',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-- 
GitLab


From 68335e76a7423943c0b8e96ba1282112a2f79672 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 15:49:14 +0100
Subject: [PATCH 205/384] [zype] Add support for uplynk videos

---
 haruhi_dl/extractor/zype.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/zype.py b/haruhi_dl/extractor/zype.py
index f336ebdb9..60dc6cb24 100644
--- a/haruhi_dl/extractor/zype.py
+++ b/haruhi_dl/extractor/zype.py
@@ -85,7 +85,13 @@ class ZypeIE(InfoExtractor):
         else:
             m3u8_url = self._search_regex(
                 r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1',
-                body, 'm3u8 url', group='url')
+                body, 'm3u8 url', group='url', default=None)
+            if not m3u8_url:
+                source = self._parse_json(self._search_regex(
+                    r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body,
+                    'source'), video_id, js_to_json)
+                if source.get('integration') == 'verizon-media':
+                    m3u8_url = 'https://content.uplynk.com/%s.m3u8' % source['id']
         formats = self._extract_m3u8_formats(
             m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
         text_tracks = self._search_regex(
-- 
GitLab


From afa77db7313f4a185c9ca213fef16f08363491e3 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 15:49:21 +0100
Subject: [PATCH 206/384] [piksel] improve format extraction

---
 haruhi_dl/extractor/nhk.py    |   2 +-
 haruhi_dl/extractor/piksel.py | 109 ++++++++++++++++++++++++----------
 2 files changed, 80 insertions(+), 31 deletions(-)

diff --git a/haruhi_dl/extractor/nhk.py b/haruhi_dl/extractor/nhk.py
index c5b406573..8a9331a79 100644
--- a/haruhi_dl/extractor/nhk.py
+++ b/haruhi_dl/extractor/nhk.py
@@ -90,7 +90,7 @@ class NhkVodIE(NhkBaseIE):
     _TESTS = [{
         # video clip
         'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/',
-        'md5': '256a1be14f48d960a7e61e2532d95ec3',
+        'md5': '7a90abcfe610ec22a6bfe15bd46b30ca',
         'info_dict': {
             'id': 'a95j5iza',
             'ext': 'mp4',
diff --git a/haruhi_dl/extractor/piksel.py b/haruhi_dl/extractor/piksel.py
index 88b6859b0..ecf56ff8f 100644
--- a/haruhi_dl/extractor/piksel.py
+++ b/haruhi_dl/extractor/piksel.py
@@ -6,16 +6,33 @@ import re
 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
-    ExtractorError,
     dict_get,
+    ExtractorError,
     int_or_none,
-    unescapeHTML,
     parse_iso8601,
+    try_get,
+    unescapeHTML,
 )
 
 
 class PikselIE(InfoExtractor):
-    _VALID_URL = r'https?://player\.piksel\.com/v/(?:refid/[^/]+/prefid/)?(?P<id>[a-z0-9_]+)'
+    _VALID_URL = r'''(?x)https?://
+        (?:
+            (?:
+                player\.
+ (?: + olympusattelecom| + vibebyvista + )| + (?:api|player)\.multicastmedia| + (?:api-ovp|player)\.piksel + )\.com| + (?: + mz-edge\.stream\.co| + movie-s\.nhk\.or + )\.jp| + vidego\.baltimorecity\.gov + )/v/(?:refid/(?P[^/]+)/prefid/)?(?P[\w-]+)''' _TESTS = [ { 'url': 'http://player.piksel.com/v/ums2867l', @@ -56,46 +73,41 @@ class PikselIE(InfoExtractor): if mobj: return mobj.group('url') + def _call_api(self, app_token, resource, display_id, query, fatal=True): + response = (self._download_json( + 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token), + display_id, query=query, fatal=fatal) or {}).get('response') + failure = try_get(response, lambda x: x['failure']['reason']) + if failure: + if fatal: + raise ExtractorError(failure, expected=True) + self.report_warning(failure) + return response + def _real_extract(self, url): - display_id = self._match_id(url) + ref_id, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'data-de-program-uuid=[\'"]([a-z0-9]+)', - webpage, 'program uuid', default=display_id) app_token = self._search_regex([ r'clientAPI\s*:\s*"([^"]+)"', r'data-de-api-key\s*=\s*"([^"]+)"' ], webpage, 'app token') - response = self._download_json( - 'http://player.piksel.com/ws/ws_program/api/%s/mode/json/apiv/5' % app_token, - video_id, query={ - 'v': video_id - })['response'] - failure = response.get('failure') - if failure: - raise ExtractorError(response['failure']['reason'], expected=True) - video_data = response['WsProgramResponse']['program']['asset'] + query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id} + program = self._call_api( + app_token, 'program', display_id, query)['WsProgramResponse']['program'] + video_id = program['uuid'] + video_data = program['asset'] title = video_data['title'] + asset_type = dict_get(video_data, ['assetType', 'asset_type']) formats = [] - m3u8_url = dict_get(video_data, [ - 'm3u8iPadURL', - 'ipadM3u8Url', - 'm3u8AndroidURL', - 'm3u8iPhoneURL', - 'iphoneM3u8Url']) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - asset_type = dict_get(video_data, ['assetType', 'asset_type']) - for asset_file in video_data.get('assetFiles', []): + def process_asset_file(asset_file): + if not asset_file: + return # TODO: extract rtmp formats http_url = asset_file.get('http_url') if not http_url: - continue + return tbr = None vbr = int_or_none(asset_file.get('videoBitrate'), 1024) abr = int_or_none(asset_file.get('audioBitrate'), 1024) @@ -118,6 +130,43 @@ class PikselIE(InfoExtractor): 'filesize': int_or_none(asset_file.get('filesize')), 'tbr': tbr, }) + + def process_asset_files(asset_files): + for asset_file in (asset_files or []): + process_asset_file(asset_file) + + process_asset_files(video_data.get('assetFiles')) + process_asset_file(video_data.get('referenceFile')) + if not formats: + asset_id = video_data.get('assetid') or program.get('assetid') + if asset_id: + process_asset_files(try_get(self._call_api( + app_token, 'asset_file', display_id, { + 'assetid': asset_id, + }, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) + + m3u8_url = dict_get(video_data, [ + 'm3u8iPadURL', + 'ipadM3u8Url', + 'm3u8AndroidURL', + 'm3u8iPhoneURL', + 'iphoneM3u8Url']) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + smil_url = 
dict_get(video_data, ['httpSmil', 'hdSmil', 'rtmpSmil']) + if smil_url: + transform_source = None + if ref_id == 'nhkworld': + # TODO: figure out if this is something to be fixed in urljoin, + # _parse_smil_formats or keep it here + transform_source = lambda x: x.replace('src="/', 'src="').replace('/media"', '/media/"') + formats.extend(self._extract_smil_formats( + re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id, + transform_source=transform_source, fatal=False)) + self._sort_formats(formats) subtitles = {} -- GitLab From f3474e105d447d5a30315bf81b17d01544b3e2b1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:49:26 +0100 Subject: [PATCH 207/384] [brightcove] remove sonyliv specific code --- haruhi_dl/extractor/brightcove.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/haruhi_dl/extractor/brightcove.py b/haruhi_dl/extractor/brightcove.py index 2845f4df2..675ede04c 100644 --- a/haruhi_dl/extractor/brightcove.py +++ b/haruhi_dl/extractor/brightcove.py @@ -534,14 +534,6 @@ class BrightcoveNewIE(AdobePassIE): 'format_id': build_format_id('rtmp'), }) formats.append(f) - if not formats: - # for sonyliv.com DRM protected videos - s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl') - if s3_source_url: - formats.append({ - 'url': s3_source_url, - 'format_id': 'source', - }) errors = json_data.get('errors') if not formats and errors: -- GitLab From 838ac10bc721c4f5597dc4bbd0b9d5bd6c4aa5d0 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:49:32 +0100 Subject: [PATCH 208/384] [aparat] Fix extraction closes #22285 closes #22611 closes #23348 closes #24354 closes #24591 closes #24904 closes #25418 closes #26070 closes #26350 closes #26738 closes #27563 --- haruhi_dl/extractor/aparat.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/haruhi_dl/extractor/aparat.py b/haruhi_dl/extractor/aparat.py index 883dcee7a..a9527e785 100644 --- a/haruhi_dl/extractor/aparat.py +++ b/haruhi_dl/extractor/aparat.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + get_element_by_id, int_or_none, merge_dicts, mimetype2ext, @@ -39,23 +40,15 @@ class AparatIE(InfoExtractor): webpage = self._download_webpage(url, video_id, fatal=False) if not webpage: - # Note: There is an easier-to-parse configuration at - # http://www.aparat.com/video/video/config/videohash/%video_id - # but the URL in there does not work webpage = self._download_webpage( 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, video_id) - options = self._parse_json( - self._search_regex( - r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P(?:(?!\1).)+)\1\s*\)', - webpage, 'options', group='value'), - video_id) - - player = options['plugins']['sabaPlayerPlugin'] + options = self._parse_json(self._search_regex( + r'options\s*=\s*({.+?})\s*;', webpage, 'options'), video_id) formats = [] - for sources in player['multiSRC']: + for sources in (options.get('multiSRC') or []): for item in sources: if not isinstance(item, dict): continue @@ -85,11 +78,12 @@ class AparatIE(InfoExtractor): info = self._search_json_ld(webpage, video_id, default={}) if not info.get('title'): - info['title'] = player['title'] + info['title'] = get_element_by_id('videoTitle', webpage) or \ + self._html_search_meta(['og:title', 'twitter:title', 'DC.Title', 'title'], webpage, fatal=True) return merge_dicts(info, { 'id': video_id, 'thumbnail': url_or_none(options.get('poster')), - 
'duration': int_or_none(player.get('duration')), + 'duration': int_or_none(options.get('duration')), 'formats': formats, }) -- GitLab From 21f2e0a12eef8a67380458d59cff405daa84387f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:49:42 +0100 Subject: [PATCH 209/384] =?UTF-8?q?[brightcove]=20raise=20ExtractorError?= =?UTF-8?q?=20for=20DRM=20protected=20videos(closes=20#23=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …467)(closes #27568) --- haruhi_dl/extractor/brightcove.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/haruhi_dl/extractor/brightcove.py b/haruhi_dl/extractor/brightcove.py index 675ede04c..0a2ec3879 100644 --- a/haruhi_dl/extractor/brightcove.py +++ b/haruhi_dl/extractor/brightcove.py @@ -471,13 +471,18 @@ class BrightcoveNewIE(AdobePassIE): def _parse_brightcove_metadata(self, json_data, video_id, headers={}): title = json_data['name'].strip() + num_drm_sources = 0 formats = [] - for source in json_data.get('sources', []): + sources = json_data.get('sources') or [] + for source in sources: container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object - if ext == 'ism' or container == 'WVM' or source.get('key_systems'): + if container == 'WVM' or source.get('key_systems'): + num_drm_sources += 1 + continue + elif ext == 'ism': continue elif ext == 'm3u8' or container == 'M2TS': if not src: @@ -535,11 +540,14 @@ class BrightcoveNewIE(AdobePassIE): }) formats.append(f) - errors = json_data.get('errors') - if not formats and errors: - error = errors[0] - raise ExtractorError( - error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + if not formats: + errors = json_data.get('errors') + if errors: + error = errors[0] + raise ExtractorError( + error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + if sources and num_drm_sources == len(sources): + raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) -- GitLab From db69be3ccc39bb023f124fd4ba9d29d9c31569dd Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:49:47 +0100 Subject: [PATCH 210/384] [tenplay] fix format extraction(closes #26653) --- haruhi_dl/extractor/tenplay.py | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/haruhi_dl/extractor/tenplay.py b/haruhi_dl/extractor/tenplay.py index af325fea8..cd30d57f4 100644 --- a/haruhi_dl/extractor/tenplay.py +++ b/haruhi_dl/extractor/tenplay.py @@ -3,9 +3,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + HEADRequest, parse_age_limit, parse_iso8601, - smuggle_url, + # smuggle_url, ) @@ -24,14 +25,16 @@ class TenPlayIE(InfoExtractor): 'uploader_id': '2199827728001', }, 'params': { - 'format': 'bestvideo', + # 'format': 'bestvideo', 'skip_download': True, } }, { 'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s' + # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s' + _GEO_BYPASS = False + _FASTLY_URL_TEMPL = 
'https://10-selector.global.ssl.fastly.net/s/kYEXFC/media/%s?mbr=true&manifest=m3u&format=redirect'
 
     def _real_extract(self, url):
         content_id = self._match_id(url)
@@ -40,19 +43,28 @@ class TenPlayIE(InfoExtractor):
         video = data.get('video') or {}
         metadata = data.get('metaData') or {}
         brightcove_id = video.get('videoId') or metadata['showContentVideoId']
-        brightcove_url = smuggle_url(
-            self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
-            {'geo_countries': ['AU']})
+        # brightcove_url = smuggle_url(
+        #     self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+        #     {'geo_countries': ['AU']})
+        m3u8_url = self._request_webpage(HEADRequest(
+            self._FASTLY_URL_TEMPL % brightcove_id), brightcove_id).geturl()
+        if '10play-not-in-oz' in m3u8_url:
+            self.raise_geo_restricted(countries=['AU'])
+        formats = self._extract_m3u8_formats(m3u8_url, brightcove_id, 'mp4')
+        self._sort_formats(formats)
 
         return {
-            '_type': 'url_transparent',
-            'url': brightcove_url,
-            'id': content_id,
-            'title': video.get('title') or metadata.get('pageContentName') or metadata.get('showContentName'),
+            # '_type': 'url_transparent',
+            # 'url': brightcove_url,
+            'formats': formats,
+            'id': brightcove_id,
+            'title': video.get('title') or metadata.get('pageContentName') or metadata['showContentName'],
             'description': video.get('description'),
             'age_limit': parse_age_limit(video.get('showRatingClassification') or metadata.get('showProgramClassification')),
             'series': metadata.get('showName'),
             'season': metadata.get('showContentSeason'),
             'timestamp': parse_iso8601(metadata.get('contentPublishDate') or metadata.get('pageContentPublishDate')),
-            'ie_key': 'BrightcoveNew',
+            'thumbnail': video.get('poster'),
+            'uploader_id': '2199827728001',
+            # 'ie_key': 'BrightcoveNew',
         }
-- 
GitLab


From 1dbf12006f535d5176539d80d7103450a63d7669 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 15:49:54 +0100
Subject: [PATCH 211/384] [sevenplus] detect API errors

---
 haruhi_dl/extractor/sevenplus.py | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/haruhi_dl/extractor/sevenplus.py b/haruhi_dl/extractor/sevenplus.py
index 84568ac69..240afc18f 100644
--- a/haruhi_dl/extractor/sevenplus.py
+++ b/haruhi_dl/extractor/sevenplus.py
@@ -4,8 +4,12 @@ from __future__ import unicode_literals
 import re
 
 from .brightcove import BrightcoveNewIE
-from ..compat import compat_str
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
 from ..utils import (
+    ExtractorError,
     try_get,
     update_url_query,
 )
@@ -41,16 +45,22 @@ class SevenPlusIE(BrightcoveNewIE):
     def _real_extract(self, url):
         path, episode_id = re.match(self._VALID_URL, url).groups()
 
-        media = self._download_json(
-            'https://videoservice.swm.digital/playback', episode_id, query={
-                'appId': '7plus',
-                'deviceType': 'web',
-                'platformType': 'web',
-                'accountId': 5303576322001,
-                'referenceId': 'ref:' + episode_id,
-                'deliveryId': 'csai',
-                'videoType': 'vod',
-            })['media']
+        try:
+            media = self._download_json(
+                'https://videoservice.swm.digital/playback', episode_id, query={
+                    'appId': '7plus',
+                    'deviceType': 'web',
+                    'platformType': 'web',
+                    'accountId': 5303576322001,
+                    'referenceId': 'ref:' + episode_id,
+                    'deliveryId': 'csai',
+                    'videoType': 'vod',
+                })['media']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                raise ExtractorError(self._parse_json(
+                    e.cause.read().decode(), episode_id)[0]['error_code'], expected=True)
+            raise
 
         for source in media.get('sources', {}):
             src = source.get('src')
-- 
GitLab
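Aside on the 7plus patch above: paid-video backends often return a machine-readable JSON body alongside an HTTP 403, and surfacing that body's error code gives the user a concrete reason (geo block, missing subscription) instead of a bare HTTP error. A minimal standalone sketch of the same idea using only the Python standard library — the endpoint and the 'error_code' field are illustrative placeholders, not a real API:

import json
import urllib.error
import urllib.request

def fetch_media(api_url):
    # Happy path: the service answers 200 with a JSON media description.
    try:
        with urllib.request.urlopen(api_url) as resp:
            return json.loads(resp.read().decode('utf-8'))
    except urllib.error.HTTPError as e:
        # On 403 the body is assumed to be JSON like [{"error_code": "..."}];
        # re-raise with the service's own message instead of the bare HTTPError.
        if e.code == 403:
            payload = json.loads(e.read().decode('utf-8'))
            raise RuntimeError(payload[0]['error_code']) from e
        raise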
From 50162a3580d87a7f90aae53fea8494711aaddd6b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:49:59 +0100 Subject: [PATCH 212/384] [uktvplay] match new video URLs(closes #17909) --- haruhi_dl/extractor/uktvplay.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/uktvplay.py b/haruhi_dl/extractor/uktvplay.py index 2137502a1..f28fd514d 100644 --- a/haruhi_dl/extractor/uktvplay.py +++ b/haruhi_dl/extractor/uktvplay.py @@ -5,10 +5,9 @@ from .common import InfoExtractor class UKTVPlayIE(InfoExtractor): - _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/.+?\?.*?\bvideo=(?P\d+)' - _TEST = { + _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P\d+)' + _TESTS = [{ 'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001', - 'md5': '', 'info_dict': { 'id': '2117008346001', 'ext': 'mp4', @@ -23,7 +22,11 @@ class UKTVPlayIE(InfoExtractor): 'skip_download': True, }, 'expected_warnings': ['Failed to download MPD manifest'] - } + }, { + 'url': 'https://uktvplay.uktv.co.uk/shows/africa/watch-online/5983349675001', + 'only_matching': True, + }] + # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/1242911124001/OrCyvJ2gyL_default/index.html?videoId=%s' BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911124001/H1xnMOqP_default/index.html?videoId=%s' def _real_extract(self, url): -- GitLab From f7bef2772cfeec3954de444aef6651f918087967 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:50:35 +0100 Subject: [PATCH 213/384] [aenetworks] add support for biography.com (closes #3863) --- haruhi_dl/extractor/aenetworks.py | 97 ++++++++++++++++++++++--------- haruhi_dl/extractor/extractors.py | 2 + 2 files changed, 73 insertions(+), 26 deletions(-) diff --git a/haruhi_dl/extractor/aenetworks.py b/haruhi_dl/extractor/aenetworks.py index 3d0cf1208..237012978 100644 --- a/haruhi_dl/extractor/aenetworks.py +++ b/haruhi_dl/extractor/aenetworks.py @@ -6,6 +6,7 @@ import re from .theplatform import ThePlatformIE from ..utils import ( ExtractorError, + GeoRestrictedError, int_or_none, update_url_query, urlencode_postdata, @@ -28,6 +29,7 @@ class AENetworksBaseIE(ThePlatformIE): 'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'), 'fyi.tv': ('FYI', 'fyi'), 'historyvault.com': (None, 'historyvault'), + 'biography.com': (None, 'biography'), } def _extract_aen_smil(self, smil_url, video_id, auth=None): @@ -54,6 +56,8 @@ class AENetworksBaseIE(ThePlatformIE): tp_formats, tp_subtitles = self._extract_theplatform_smil( m_url, video_id, 'Downloading %s SMIL data' % (q.get('switch') or q['assetTypes'])) except ExtractorError as e: + if isinstance(e, GeoRestrictedError): + raise last_e = e continue formats.extend(tp_formats) @@ -67,6 +71,34 @@ class AENetworksBaseIE(ThePlatformIE): 'subtitles': subtitles, } + def _extract_aetn_info(self, domain, filter_key, filter_value, url): + requestor_id, brand = self._DOMAIN_MAP[domain] + result = self._download_json( + 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, + filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0] + title = result['title'] + video_id = result['id'] + media_url = result['publicUrl'] + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + auth = None + if 
theplatform_metadata.get('AETN$isBehindWall'): + resource = self._get_mvpd_resource( + requestor_id, theplatform_metadata['title'], + theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), + theplatform_metadata['ratings'][0]['rating']) + auth = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) + info.update(self._extract_aen_smil(media_url, video_id, auth)) + info.update({ + 'title': title, + 'series': result.get('seriesName'), + 'season_number': int_or_none(result.get('tvSeasonNumber')), + 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), + }) + return info + class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' @@ -139,32 +171,7 @@ class AENetworksIE(AENetworksBaseIE): def _real_extract(self, url): domain, canonical = re.match(self._VALID_URL, url).groups() - requestor_id, brand = self._DOMAIN_MAP[domain] - result = self._download_json( - 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, - canonical, query={'filter[canonical]': '/' + canonical})['results'][0] - title = result['title'] - video_id = result['id'] - media_url = result['publicUrl'] - theplatform_metadata = self._download_theplatform_metadata(self._search_regex( - r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) - info = self._parse_theplatform_metadata(theplatform_metadata) - auth = None - if theplatform_metadata.get('AETN$isBehindWall'): - resource = self._get_mvpd_resource( - requestor_id, theplatform_metadata['title'], - theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), - theplatform_metadata['ratings'][0]['rating']) - auth = self._extract_mvpd_auth( - url, video_id, requestor_id, resource) - info.update(self._extract_aen_smil(media_url, video_id, auth)) - info.update({ - 'title': title, - 'series': result.get('seriesName'), - 'season_number': int_or_none(result.get('tvSeasonNumber')), - 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), - }) - return info + return self._extract_aetn_info(domain, 'canonical', '/' + canonical, url) class AENetworksListBaseIE(AENetworksBaseIE): @@ -294,3 +301,41 @@ class HistoryTopicIE(AENetworksBaseIE): return self.url_result( 'http://www.history.com/videos/' + display_id, AENetworksIE.ie_key()) + + +class HistoryPlayerIE(AENetworksBaseIE): + IE_NAME = 'history:player' + _VALID_URL = r'https?://(?:www\.)?(?P(?:history|biography)\.com)/player/(?P\d+)' + + def _real_extract(self, url): + domain, video_id = re.match(self._VALID_URL, url).groups() + return self._extract_aetn_info(domain, 'id', video_id, url) + + +class BiographyIE(AENetworksBaseIE): + _VALID_URL = r'https?://(?:www\.)?biography\.com/video/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.biography.com/video/vincent-van-gogh-full-episode-2075049808', + 'info_dict': { + 'id': '30322987', + 'ext': 'mp4', + 'title': 'Vincent Van Gogh - Full Episode', + 'description': 'A full biography about the most influential 20th century painter, Vincent Van Gogh.', + 'timestamp': 1311970571, + 'upload_date': '20110729', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + player_url = self._search_regex( + r']+src="(%s)' % HistoryPlayerIE._VALID_URL, + webpage, 'player URL') + return self.url_result(player_url, HistoryPlayerIE.ie_key()) 
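The BiographyIE._real_extract above ends with the hand-off idiom used throughout this series: the page extractor only locates the embedded player URL and delegates everything else via url_result(). A standalone sketch of that locate-and-delegate step (the tag name and sample HTML are illustrative, not taken from biography.com); the extractors.py hunk directly below simply registers the new HistoryPlayerIE and BiographyIE classes:

import re

# Match a player URL inside any custom tag's src attribute, e.g. an embedded iframe.
PLAYER_URL_RE = re.compile(
    r'<[a-z][a-z-]*[^>]+src="(https?://www\.biography\.com/player/\d+)"')

def find_player_url(webpage):
    # Return the first embedded player URL, or None if the page has no player.
    m = PLAYER_URL_RE.search(webpage)
    return m.group(1) if m else None

html = '<phoenix-iframe src="https://www.biography.com/player/30322987"></phoenix-iframe>'
assert find_player_url(html) == 'https://www.biography.com/player/30322987'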
diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 1a5cee636..1f8366076 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -33,6 +33,8 @@ from .aenetworks import ( AENetworksCollectionIE, AENetworksShowIE, HistoryTopicIE, + HistoryPlayerIE, + BiographyIE, ) from .afreecatv import AfreecaTVIE from .agora import ( -- GitLab From 3ca3074dc3a54e736fd82fcb717acdaac7a2c563 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:51:52 +0100 Subject: [PATCH 214/384] [aenetworks] fix HistoryPlayerIE tests --- haruhi_dl/extractor/aenetworks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/haruhi_dl/extractor/aenetworks.py b/haruhi_dl/extractor/aenetworks.py index 237012978..8e4963131 100644 --- a/haruhi_dl/extractor/aenetworks.py +++ b/haruhi_dl/extractor/aenetworks.py @@ -306,6 +306,7 @@ class HistoryTopicIE(AENetworksBaseIE): class HistoryPlayerIE(AENetworksBaseIE): IE_NAME = 'history:player' _VALID_URL = r'https?://(?:www\.)?(?P(?:history|biography)\.com)/player/(?P\d+)' + _TESTS = [] def _real_extract(self, url): domain, video_id = re.match(self._VALID_URL, url).groups() -- GitLab From f1931b8ba87e01a690ca236bcb59298f265b8b6d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:51:59 +0100 Subject: [PATCH 215/384] [nbc] fix NBCSport VPlayer URL extraction(closes #16640) --- haruhi_dl/extractor/nbc.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/haruhi_dl/extractor/nbc.py b/haruhi_dl/extractor/nbc.py index ea5f5a315..9695a9616 100644 --- a/haruhi_dl/extractor/nbc.py +++ b/haruhi_dl/extractor/nbc.py @@ -158,7 +158,8 @@ class NBCIE(AdobePassIE): class NBCSportsVPlayerIE(InfoExtractor): - _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' + _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/' + _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' _TESTS = [{ 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', @@ -174,12 +175,15 @@ class NBCSportsVPlayerIE(InfoExtractor): }, { 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z', 'only_matching': True, + }, { + 'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true', + 'only_matching': True, }] @staticmethod def _extract_url(webpage): iframe_m = re.search( - r']+src="(?Phttps?://vplayer\.nbcsports\.com/[^"]+)"', webpage) + r'<(?:iframe[^>]+|div[^>]+data-(?:mpx-)?)src="(?P%s[^"]+)"' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage) if iframe_m: return iframe_m.group('url') @@ -192,21 +196,29 @@ class NBCSportsVPlayerIE(InfoExtractor): class NBCSportsIE(InfoExtractor): - # Does not include https because its certificate is invalid - _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?:[^/]+/)+(?P[0-9a-z-]+)' + _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?!vplayer/)(?:[^/]+/)+(?P[0-9a-z-]+)' - _TEST = { + _TESTS = [{ + # iframe src 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', 'info_dict': { 'id': 'PHJSaFWbrTY9', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Tom Izzo, Michigan St. 
has \'so much respect\' for Duke', 'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113', 'uploader': 'NBCU-SPORTS', 'upload_date': '20150330', 'timestamp': 1427726529, } - } + }, { + # data-mpx-src + 'url': 'https://www.nbcsports.com/philadelphia/philadelphia-phillies/bruce-bochy-hector-neris-hes-idiot', + 'only_matching': True, + }, { + # data-src + 'url': 'https://www.nbcsports.com/boston/video/report-card-pats-secondary-no-match-josh-allen', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) -- GitLab From c7d0af171ff8591f022d589e7fa8222d0a0b4a2a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:52:06 +0100 Subject: [PATCH 216/384] [nbc] Remove CSNNE extractor --- haruhi_dl/extractor/extractors.py | 1 - haruhi_dl/extractor/nbc.py | 27 --------------------------- 2 files changed, 28 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 1f8366076..c8e99253e 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -723,7 +723,6 @@ from .nba import ( NBAChannelIE, ) from .nbc import ( - CSNNEIE, NBCIE, NBCNewsIE, NBCOlympicsIE, diff --git a/haruhi_dl/extractor/nbc.py b/haruhi_dl/extractor/nbc.py index 9695a9616..0d77648c2 100644 --- a/haruhi_dl/extractor/nbc.py +++ b/haruhi_dl/extractor/nbc.py @@ -286,33 +286,6 @@ class NBCSportsStreamIE(AdobePassIE): } -class CSNNEIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?csnne\.com/video/(?P[0-9a-z-]+)' - - _TEST = { - 'url': 'http://www.csnne.com/video/snc-evening-update-wright-named-red-sox-no-5-starter', - 'info_dict': { - 'id': 'yvBLLUgQ8WU0', - 'ext': 'mp4', - 'title': 'SNC evening update: Wright named Red Sox\' No. 5 starter.', - 'description': 'md5:1753cfee40d9352b19b4c9b3e589b9e3', - 'timestamp': 1459369979, - 'upload_date': '20160330', - 'uploader': 'NBCU-SPORTS', - } - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': self._html_search_meta('twitter:player:stream', webpage), - 'display_id': display_id, - } - - class NBCNewsIE(ThePlatformIE): _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P[^/?]+)' -- GitLab From 10af8572d432e3dde30b3f02310c46755ee7d213 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:53:16 +0100 Subject: [PATCH 217/384] [YoutubeDL] Allow format filtering using audio language(#16209) --- haruhi_dl/HaruhiDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/HaruhiDL.py b/haruhi_dl/HaruhiDL.py index 813d32d76..7320403d2 100755 --- a/haruhi_dl/HaruhiDL.py +++ b/haruhi_dl/HaruhiDL.py @@ -1087,7 +1087,7 @@ class HaruhiDL(object): '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) - \s*(?Pext|acodec|vcodec|container|protocol|format_id) + \s*(?Pext|acodec|vcodec|container|protocol|format_id|language) \s*(?P!\s*)?(?P%s)(?P\s*\?)? 
\s*(?P[a-zA-Z0-9._-]+) \s*$ -- GitLab From 2c4b3dd8649151e1409d5af6adb05b26eed6679a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:53:27 +0100 Subject: [PATCH 218/384] [utils] accept only supported protocols in url_or_none --- haruhi_dl/utils.py | 2 +- test/test_utils.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/utils.py b/haruhi_dl/utils.py index 2bba1b04c..62b59bcdb 100644 --- a/haruhi_dl/utils.py +++ b/haruhi_dl/utils.py @@ -3642,7 +3642,7 @@ def url_or_none(url): if not url or not isinstance(url, compat_str): return None url = url.strip() - return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None + return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None def parse_duration(s): diff --git a/test/test_utils.py b/test/test_utils.py index fcb86d92a..dc3dde0c4 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -554,6 +554,11 @@ class TestUtil(unittest.TestCase): self.assertEqual(url_or_none('http$://foo.de'), None) self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de') self.assertEqual(url_or_none('//foo.de'), '//foo.de') + self.assertEqual(url_or_none('s3://foo.de'), None) + self.assertEqual(url_or_none('rtmpte://foo.de'), 'rtmpte://foo.de') + self.assertEqual(url_or_none('mms://foo.de'), 'mms://foo.de') + self.assertEqual(url_or_none('rtspu://foo.de'), 'rtspu://foo.de') + self.assertEqual(url_or_none('ftps://foo.de'), 'ftps://foo.de') def test_parse_age_limit(self): self.assertEqual(parse_age_limit(None), None) -- GitLab From a2f4d6ec07bed6bc52ede04d51dbfc03725ab21e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:53:33 +0100 Subject: [PATCH 219/384] [yandexvideo] fix extraction(closes #25000) --- haruhi_dl/extractor/yandexvideo.py | 116 +++++++++++++++++++---------- 1 file changed, 76 insertions(+), 40 deletions(-) diff --git a/haruhi_dl/extractor/yandexvideo.py b/haruhi_dl/extractor/yandexvideo.py index 46529be05..36d01cc8e 100644 --- a/haruhi_dl/extractor/yandexvideo.py +++ b/haruhi_dl/extractor/yandexvideo.py @@ -13,26 +13,30 @@ class YandexVideoIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - yandex\.ru(?:/portal/(?:video|efir))?/?\?.*?stream_id=| + yandex\.ru(?:/(?:portal/(?:video|efir)|efir))?/?\?.*?stream_id=| frontend\.vh\.yandex\.ru/player/ ) - (?P[\da-f]+) + (?P(?:[\da-f]{32}|[\w-]{12})) ''' _TESTS = [{ - 'url': 'https://yandex.ru/portal/video?stream_id=4dbb262b4fe5cf15a215de4f34eee34d', - 'md5': '33955d7ae052f15853dc41f35f17581c', + 'url': 'https://yandex.ru/portal/video?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374', + 'md5': 'e02a05bfaf0d9615ef07ae3a10f4faf4', 'info_dict': { - 'id': '4dbb262b4fe5cf15a215de4f34eee34d', + 'id': '4dbb36ec4e0526d58f9f2dc8f0ecf374', 'ext': 'mp4', - 'title': 'В Нью-Йорке баржи и теплоход оторвались от причала и расплылись по Гудзону', - 'description': '', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 0, - 'duration': 30, + 'title': 'Русский Вудсток - главный рок-фест в истории СССР / вДудь', + 'description': 'md5:7d6b8d4bc4a3b9a56499916c1ea5b5fa', + 'thumbnail': r're:^https?://', + 'timestamp': 1549972939, + 'duration': 5575, 'age_limit': 18, + 'upload_date': '20190212', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, }, }, { - 'url': 'https://yandex.ru/portal/efir?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374&from=morda', + 'url': 'https://yandex.ru/portal/efir?stream_id=4dbb262b4fe5cf15a215de4f34eee34d&from=morda', 'only_matching': True, }, { 'url': 
'https://yandex.ru/?stream_id=4dbb262b4fe5cf15a215de4f34eee34d', @@ -52,53 +56,85 @@ class YandexVideoIE(InfoExtractor): # DASH with DRM 'url': 'https://yandex.ru/portal/video?from=morda&stream_id=485a92d94518d73a9d0ff778e13505f8', 'only_matching': True, + }, { + 'url': 'https://yandex.ru/efir?stream_active=watching&stream_id=v7a2dZ-v5mSI&from_block=efir_newtab', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) content = self._download_json( - 'https://frontend.vh.yandex.ru/v22/player/%s.json' % video_id, - video_id, query={ - 'stream_options': 'hires', - 'disable_trackings': 1, - })['content'] - - content_url = url_or_none(content.get('content_url')) or url_or_none( - content['streams'][0]['url']) - title = content.get('title') or content.get('computed_title') + # 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, + # video_id, query={ + # 'stream_options': 'hires', + # 'disable_trackings': 1, + # })['content'] + 'https://frontend.vh.yandex.ru/graphql', video_id, data=b'''{ + player(content_id: "%s") { + computed_title + content_url + description + dislikes + duration + likes + program_title + release_date + release_date_ut + release_year + restriction_age + season + start_time + streams + thumbnail + title + views_count + } +}''' % video_id.encode())['player']['content']['content'] - ext = determine_ext(content_url) + title = content.get('title') or content['computed_title'] - if ext == 'm3u8': - formats = self._extract_m3u8_formats( - content_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - elif ext == 'mpd': - formats = self._extract_mpd_formats( - content_url, video_id, mpd_id='dash') - else: - formats = [{'url': content_url}] + formats = [] + streams = content.get('streams') or [] + streams.append({'url': content.get('content_url')}) + for stream in streams: + content_url = url_or_none(stream.get('url')) + if not content_url: + continue + ext = determine_ext(content_url) + if ext == 'ismc': + continue + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + content_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + content_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({'url': content_url}) self._sort_formats(formats) - description = content.get('description') - thumbnail = content.get('thumbnail') timestamp = (int_or_none(content.get('release_date')) or int_or_none(content.get('release_date_ut')) or int_or_none(content.get('start_time'))) - duration = int_or_none(content.get('duration')) - series = content.get('program_title') - age_limit = int_or_none(content.get('restriction_age')) + season = content.get('season') or {} return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'description': content.get('description'), + 'thumbnail': content.get('thumbnail'), 'timestamp': timestamp, - 'duration': duration, - 'series': series, - 'age_limit': age_limit, + 'duration': int_or_none(content.get('duration')), + 'series': content.get('program_title'), + 'age_limit': int_or_none(content.get('restriction_age')), + 'view_count': int_or_none(content.get('views_count')), + 'like_count': int_or_none(content.get('likes')), + 'dislike_count': int_or_none(content.get('dislikes')), + 'season_number': int_or_none(season.get('season_number')), + 'season_id': season.get('id'), + 'release_year': int_or_none(content.get('release_year')), 'formats': formats, } -- GitLab From 
95b5454a31a56604423c08a59290ba4daad1f0fe Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:53:41 +0100 Subject: [PATCH 220/384] [yandexvideo] use old api call as fallback --- haruhi_dl/extractor/yandexvideo.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/haruhi_dl/extractor/yandexvideo.py b/haruhi_dl/extractor/yandexvideo.py index 36d01cc8e..ab8c84c93 100644 --- a/haruhi_dl/extractor/yandexvideo.py +++ b/haruhi_dl/extractor/yandexvideo.py @@ -5,6 +5,7 @@ from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, + try_get, url_or_none, ) @@ -64,12 +65,7 @@ class YandexVideoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - content = self._download_json( - # 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, - # video_id, query={ - # 'stream_options': 'hires', - # 'disable_trackings': 1, - # })['content'] + player = try_get((self._download_json( 'https://frontend.vh.yandex.ru/graphql', video_id, data=b'''{ player(content_id: "%s") { computed_title @@ -90,7 +86,15 @@ class YandexVideoIE(InfoExtractor): title views_count } -}''' % video_id.encode())['player']['content']['content'] +}''' % video_id.encode(), fatal=False)), lambda x: x['player']['content']) + if not player or player.get('error'): + player = self._download_json( + 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, + video_id, query={ + 'stream_options': 'hires', + 'disable_trackings': 1, + }) + content = player['content'] title = content.get('title') or content['computed_title'] -- GitLab From 355b6d9ab6ec671678fc69acd58df191796c47bc Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:53:46 +0100 Subject: [PATCH 221/384] [yandexdisk] fix extraction(closes #17861)(closes #27131) --- haruhi_dl/extractor/yandexdisk.py | 144 +++++++++++++++++------------- 1 file changed, 84 insertions(+), 60 deletions(-) diff --git a/haruhi_dl/extractor/yandexdisk.py b/haruhi_dl/extractor/yandexdisk.py index e8f6ae10f..21f37c192 100644 --- a/haruhi_dl/extractor/yandexdisk.py +++ b/haruhi_dl/extractor/yandexdisk.py @@ -1,19 +1,40 @@ # coding: utf-8 from __future__ import unicode_literals +import json + from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_HTTPError from ..utils import ( determine_ext, + ExtractorError, float_or_none, int_or_none, - try_get, - urlencode_postdata, + mimetype2ext, + parse_iso8601, + urljoin, ) class YandexDiskIE(InfoExtractor): - _VALID_URL = r'https?://yadi\.sk/[di]/(?P[^/?#&]+)' + _VALID_URL = r'''(?x)https?:// + (?: + (?:www\.)?yadi\.sk| + disk\.yandex\. 
+ (?: + az| + by| + co(?:m(?:\.(?:am|ge|tr))?|\.il)| + ee| + fr| + k[gz]| + l[tv]| + md| + t[jm]| + u[az]| + ru + ) + )/(?:[di]/|public.*?\bhash=)(?P[^/?#&]+)''' _TESTS = [{ 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', @@ -25,94 +46,97 @@ class YandexDiskIE(InfoExtractor): 'duration': 168.6, 'uploader': 'y.botova', 'uploader_id': '300043621', + 'timestamp': 1421396809, + 'upload_date': '20150116', 'view_count': int, }, }, { 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce', 'only_matching': True, + }, { + 'url': 'https://yadi.sk/public?hash=5DZ296JK9GWCLp02f6jrObjnctjRxMs8L6%2B%2FuhNqk38%3D', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - status = self._download_webpage( - 'https://disk.yandex.com/auth/status', video_id, query={ - 'urlOrigin': url, - 'source': 'public', - 'md5': 'false', - }) - - sk = self._search_regex( - r'(["\'])sk(?:External)?\1\s*:\s*(["\'])(?P(?:(?!\2).)+)\2', - status, 'sk', group='value') - - webpage = self._download_webpage(url, video_id) - - models = self._parse_json( - self._search_regex( - r']+id=["\']models-client[^>]+>\s*(\[.+?\])\s* Date: Fri, 26 Feb 2021 15:53:51 +0100 Subject: [PATCH 222/384] [yandexdisk] extract info from webpage the public API does not return metadata when download limit is reached --- haruhi_dl/extractor/yandexdisk.py | 89 ++++++++++++++++--------------- 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/haruhi_dl/extractor/yandexdisk.py b/haruhi_dl/extractor/yandexdisk.py index 21f37c192..6fcd8ee7e 100644 --- a/haruhi_dl/extractor/yandexdisk.py +++ b/haruhi_dl/extractor/yandexdisk.py @@ -2,24 +2,23 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor -from ..compat import compat_HTTPError from ..utils import ( determine_ext, - ExtractorError, float_or_none, int_or_none, mimetype2ext, - parse_iso8601, + try_get, urljoin, ) class YandexDiskIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// - (?: - (?:www\.)?yadi\.sk| + (?P + yadi\.sk| disk\.yandex\. 
(?: az| @@ -38,7 +37,7 @@ class YandexDiskIE(InfoExtractor): _TESTS = [{ 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', - 'md5': '33955d7ae052f15853dc41f35f17581c', + 'md5': 'a4a8d52958c8fddcf9845935070402ae', 'info_dict': { 'id': 'VdOeDou8eZs6Y', 'ext': 'mp4', @@ -46,10 +45,9 @@ class YandexDiskIE(InfoExtractor): 'duration': 168.6, 'uploader': 'y.botova', 'uploader_id': '300043621', - 'timestamp': 1421396809, - 'upload_date': '20150116', 'view_count': int, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce', 'only_matching': True, @@ -59,51 +57,58 @@ class YandexDiskIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) + domain, video_id = re.match(self._VALID_URL, url).groups() - try: - resource = self._download_json( - 'https://cloud-api.yandex.net/v1/disk/public/resources', - video_id, query={'public_key': url}) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - error_description = self._parse_json( - e.cause.read().decode(), video_id)['description'] - raise ExtractorError(error_description, expected=True) - raise + webpage = self._download_webpage(url, video_id) + store = self._parse_json(self._search_regex( + r']+id="store-prefetch"[^>]*>\s*({.+?})\s*', + webpage, 'store'), video_id) + resource = store['resources'][store['rootResourceId']] title = resource['name'] - public_url = resource.get('public_url') + meta = resource.get('meta') or {} + + public_url = meta.get('short_url') if public_url: video_id = self._match_id(public_url) - self._set_cookie('yadi.sk', 'yandexuid', '0') + source_url = (self._download_json( + 'https://cloud-api.yandex.net/v1/disk/public/resources/download', + video_id, query={'public_key': url}, fatal=False) or {}).get('href') + video_streams = resource.get('videoStreams') or {} + video_hash = resource.get('hash') or url + environment = store.get('environment') or {} + sk = environment.get('sk') + yandexuid = environment.get('yandexuid') + if sk and yandexuid and not (source_url and video_streams): + self._set_cookie(domain, 'yandexuid', yandexuid) - def call_api(action): - return (self._download_json( - urljoin(url, '/public/api/') + action, video_id, data=json.dumps({ - 'hash': url, - # obtain sk if needed from call_api('check-auth') while - # the yandexuid cookie is set and sending an empty JSON object - 'sk': 'ya6b52f8c6b12abe91a66d22d3a31084b' - }).encode(), headers={ - 'Content-Type': 'text/plain', - }, fatal=False) or {}).get('data') or {} + def call_api(action): + return (self._download_json( + urljoin(url, '/public/api/') + action, video_id, data=json.dumps({ + 'hash': video_hash, + 'sk': sk, + }).encode(), headers={ + 'Content-Type': 'text/plain', + }, fatal=False) or {}).get('data') or {} + if not source_url: + # TODO: figure out how to detect if download limit has + # been reached and then avoid unnecessary source format + # extraction requests + source_url = call_api('download-url').get('url') + if not video_streams: + video_streams = call_api('get-video-streams') formats = [] - source_url = resource.get('file') - if not source_url: - source_url = call_api('download-url').get('url') if source_url: formats.append({ 'url': source_url, 'format_id': 'source', - 'ext': determine_ext(title, mimetype2ext(resource.get('mime_type')) or 'mp4'), + 'ext': determine_ext(title, meta.get('ext') or mimetype2ext(meta.get('mime_type')) or 'mp4'), 'quality': 1, - 'filesize': int_or_none(resource.get('size')) + 'filesize': 
int_or_none(meta.get('size')) }) - video_streams = call_api('get-video-streams') for video in (video_streams.get('videos') or []): format_url = video.get('url') if not format_url: @@ -128,15 +133,15 @@ class YandexDiskIE(InfoExtractor): }) self._sort_formats(formats) - owner = resource.get('owner') or {} + uid = resource.get('uid') + display_name = try_get(store, lambda x: x['users'][uid]['displayName']) return { 'id': video_id, 'title': title, 'duration': float_or_none(video_streams.get('duration'), 1000), - 'uploader': owner.get('display_name'), - 'uploader_id': owner.get('uid'), - 'view_count': int_or_none(resource.get('views_count')), - 'timestamp': parse_iso8601(resource.get('created')), + 'uploader': display_name, + 'uploader_id': uid, + 'view_count': int_or_none(meta.get('views_counter')), 'formats': formats, } -- GitLab From 0165049a526dc14f4963d751d8b5ae814de4b0d5 Mon Sep 17 00:00:00 2001 From: nixxo Date: Fri, 26 Feb 2021 15:53:57 +0100 Subject: [PATCH 223/384] [vvvvid] add playlists support (#27574) closes #18130 --- haruhi_dl/extractor/extractors.py | 5 ++- haruhi_dl/extractor/vvvvid.py | 65 ++++++++++++++++++++++++++++--- 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index c8e99253e..2b81187ca 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1483,7 +1483,10 @@ from .vshare import VShareIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE -from .vvvvid import VVVVIDIE +from .vvvvid import ( + VVVVIDIE, + VVVVIDShowIE, +) from .vyborymos import VyboryMosIE from .vzaar import VzaarIE from .wakanim import WakanimIE diff --git a/haruhi_dl/extractor/vvvvid.py b/haruhi_dl/extractor/vvvvid.py index 6906cd2ab..5b8ea3665 100644 --- a/haruhi_dl/extractor/vvvvid.py +++ b/haruhi_dl/extractor/vvvvid.py @@ -12,7 +12,8 @@ from ..utils import ( class VVVVIDIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/(?P\d+)/[^/]+/(?P\d+)/(?P[0-9]+)' + _VALID_URL_BASE = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/' + _VALID_URL = r'%s(?P\d+)/[^/]+/(?P\d+)/(?P[0-9]+)' % _VALID_URL_BASE _TESTS = [{ # video_type == 'video/vvvvid' 'url': 'https://www.vvvvid.it/#!show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048/ping-pong', @@ -45,20 +46,26 @@ class VVVVIDIE(InfoExtractor): 'https://www.vvvvid.it/user/login', None, headers=self.geo_verification_headers())['data']['conn_id'] - def _real_extract(self, url): - show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() + def _download_info(self, show_id, path, video_id, fatal=True): response = self._download_json( - 'https://www.vvvvid.it/vvvvid/ondemand/%s/season/%s' % (show_id, season_id), + 'https://www.vvvvid.it/vvvvid/ondemand/%s%s' % (show_id, path), video_id, headers=self.geo_verification_headers(), query={ 'conn_id': self._conn_id, - }) + }, fatal=fatal) if response['result'] == 'error': raise ExtractorError('%s said: %s' % ( self.IE_NAME, response['message']), expected=True) + return response['data'] + + def _real_extract(self, url): + show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() + + response = self._download_info( + show_id, '/season/%s' % season_id, video_id) vid = int(video_id) video_data = list(filter( - lambda episode: episode.get('video_id') == vid, response['data']))[0] + lambda episode: episode.get('video_id') == vid, response))[0] formats = [] # vvvvid embed_info 
decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js
@@ -156,3 +163,49 @@ class VVVVIDIE(InfoExtractor):
             'view_count': int_or_none(video_data.get('views')),
             'like_count': int_or_none(video_data.get('video_likes')),
         }
+
+
+class VVVVIDShowIE(VVVVIDIE):
+    _VALID_URL = r'(?P<base_url>%s(?P<id>\d+)/(?P<show_title>[^/]+))/?(?:$|[\?&].*$)?$' % VVVVIDIE._VALID_URL_BASE
+    _TESTS = [{
+        'url': 'https://www.vvvvid.it/show/156/psyco-pass',
+        'info_dict': {
+            'id': '156',
+            'title': 'Psycho-Pass',
+            'description': 'md5:94d572c0bd85894b193b8aebc9a3a806',
+        },
+        'playlist_count': 46,
+    }]
+
+    def _real_extract(self, url):
+        base_url, show_id, show_title = re.match(self._VALID_URL, url).groups()
+
+        response = self._download_info(
+            show_id, '/seasons/', show_title)
+
+        show_infos = self._download_info(
+            show_id, '/info/', show_title, fatal=False)
+
+        entries = []
+        for season in response:
+            episodes = season.get('episodes') or []
+            for episode in episodes:
+                season_id = str_or_none(episode.get('season_id'))
+                video_id = str_or_none(episode.get('video_id'))
+                if not (season_id and video_id):
+                    continue
+
+                video_url = '/'.join([base_url, season_id, video_id])
+
+                entries.append({
+                    '_type': 'url_transparent',
+                    'ie_key': VVVVIDIE.ie_key(),
+                    'url': video_url,
+                    'title': episode.get('title'),
+                    'thumbnail': episode.get('thumbnail'),
+                    'description': episode.get('description'),
+                    'season_number': int_or_none(episode.get('season_number')),
+                    'episode_number': int_or_none(episode.get('number')),
+                })
+        return self.playlist_result(
+            entries, show_id, show_infos.get('title'), show_infos.get('description'))
-- GitLab
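The show extractor above never fetches episode pages itself: it reassembles the
canonical episode URL and hands each entry back to VVVVIDIE. A minimal sketch of
that assembly, with IDs borrowed from the tests in this series and purely
illustrative:

    # How VVVVIDShowIE builds the per-episode URLs it returns as
    # url_transparent entries (values are only illustrative).
    base_url = 'https://www.vvvvid.it/show/156/psyco-pass'
    season_id, video_id = '437', '489048'
    episode_url = '/'.join([base_url, season_id, video_id])
    # -> 'https://www.vvvvid.it/show/156/psyco-pass/437/489048',
    # which matches VVVVIDIE._VALID_URL and is extracted as a single episode.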
From 9a6885f335f798cb467b3da2cafac5a134df1173 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 15:54:01 +0100
Subject: [PATCH 224/384] [vvvvid] improve info extraction

---
 haruhi_dl/extractor/vvvvid.py | 78 +++++++++++++++++++++++------------
 1 file changed, 52 insertions(+), 26 deletions(-)

diff --git a/haruhi_dl/extractor/vvvvid.py b/haruhi_dl/extractor/vvvvid.py
index 5b8ea3665..014a67e53 100644
--- a/haruhi_dl/extractor/vvvvid.py
+++ b/haruhi_dl/extractor/vvvvid.py
@@ -22,6 +22,16 @@ class VVVVIDIE(InfoExtractor):
             'id': '489048',
             'ext': 'mp4',
             'title': 'Ping Pong',
+            'duration': 239,
+            'series': '"Perché dovrei guardarlo?" di Dario Moccia',
+            'season_id': '437',
+            'season_number': 1,
+            'episode': 'Ping Pong',
+            'episode_number': 1,
+            'episode_id': '3334',
+            'view_count': int,
+            'like_count': int,
+            'repost_count': int,
         },
         'params': {
             'skip_download': True,
@@ -38,6 +48,9 @@ class VVVVIDIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+    }, {
+        'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048',
+        'only_matching': True
     }]
     _conn_id = None

@@ -48,24 +61,34 @@ class VVVVIDIE(InfoExtractor):
     def _download_info(self, show_id, path, video_id, fatal=True):
         response = self._download_json(
-            'https://www.vvvvid.it/vvvvid/ondemand/%s%s' % (show_id, path),
+            'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path),
             video_id, headers=self.geo_verification_headers(), query={
                 'conn_id': self._conn_id,
             }, fatal=fatal)
-        if response['result'] == 'error':
+        if not (response or fatal):
+            return
+        if response.get('result') == 'error':
             raise ExtractorError('%s said: %s' % (
                 self.IE_NAME, response['message']), expected=True)
         return response['data']

+    def _extract_common_video_info(self, video_data):
+        return {
+            'thumbnail': video_data.get('thumbnail'),
+            'episode_number': int_or_none(video_data.get('number')),
+            'episode_id': str_or_none(video_data.get('id')),
+        }
+
     def _real_extract(self, url):
         show_id, season_id, video_id = re.match(self._VALID_URL, url).groups()

         response = self._download_info(
-            show_id, '/season/%s' % season_id, video_id)
+            show_id, 'season/%s' % season_id, video_id)

         vid = int(video_id)
         video_data = list(filter(
             lambda episode: episode.get('video_id') == vid, response))[0]
+        title = video_data['title']

         formats = []
         # vvvvid embed_info decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js
@@ -148,25 +171,25 @@ class VVVVIDIE(InfoExtractor):
                 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id))
         self._sort_formats(formats)

-        return {
+        info = self._extract_common_video_info(video_data)
+        info.update({
             'id': video_id,
-            'title': video_data['title'],
+            'title': title,
             'formats': formats,
-            'thumbnail': video_data.get('thumbnail'),
             'duration': int_or_none(video_data.get('length')),
             'series': video_data.get('show_title'),
             'season_id': season_id,
             'season_number': video_data.get('season_number'),
-            'episode_id': str_or_none(video_data.get('id')),
-            'episode_number': int_or_none(video_data.get('number')),
-            'episode_title': video_data['title'],
+            'episode': title,
             'view_count': int_or_none(video_data.get('views')),
             'like_count': int_or_none(video_data.get('video_likes')),
-        }
+            'repost_count': int_or_none(video_data.get('video_shares')),
+        })
+        return info


 class VVVVIDShowIE(VVVVIDIE):
-    _VALID_URL = r'(?P<base_url>%s(?P<id>\d+)/(?P<show_title>[^/]+))/?(?:$|[\?&].*$)?$' % VVVVIDIE._VALID_URL_BASE
+    _VALID_URL = r'(?P<base_url>%s(?P<id>\d+)(?:/(?P<show_title>[^/?&#]+))?)/?(?:[?#&]|$)' % VVVVIDIE._VALID_URL_BASE
     _TESTS = [{
         'url': 'https://www.vvvvid.it/show/156/psyco-pass',
         'info_dict': {
@@ -175,37 +198,40 @@ class VVVVIDShowIE(VVVVIDIE):
             'description': 'md5:94d572c0bd85894b193b8aebc9a3a806',
         },
         'playlist_count': 46,
+    }, {
+        'url': 'https://www.vvvvid.it/show/156',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
         base_url, show_id, show_title = re.match(self._VALID_URL, url).groups()

-        response = self._download_info(
-            show_id, '/seasons/', show_title)
+        seasons = self._download_info(
+            show_id, 'seasons/', show_title)

-        show_infos = self._download_info(
-            show_id, '/info/', show_title, fatal=False)
+        show_info = self._download_info(
+            show_id, 'info/', show_title, fatal=False)

         entries = []
         for season in (seasons or []):
+            season_number = int_or_none(season.get('number'))
             episodes = season.get('episodes') or []
             for episode in episodes:
                 season_id = str_or_none(episode.get('season_id'))
                 video_id = str_or_none(episode.get('video_id'))
                 if not (season_id and video_id):
                     continue
-
-                video_url = '/'.join([base_url, season_id, video_id])
-
-                entries.append({
-                    '_type': 'url_transparent',
+                info = self._extract_common_video_info(episode)
+                info.update({
+                    '_type': 'url',
                     'ie_key': VVVVIDIE.ie_key(),
-                    'url': video_url,
+                    'url': '/'.join([base_url, season_id, video_id]),
                     'title': episode.get('title'),
-                    'thumbnail': episode.get('thumbnail'),
                     'description': episode.get('description'),
-                    'season_number': int_or_none(episode.get('season_number')),
-                    'episode_number': int_or_none(episode.get('number')),
+                    'season_number': season_number,
+                    'season_id': season_id,
                 })
+                entries.append(info)
+
         return self.playlist_result(
-            entries, show_id, show_infos.get('title'), show_infos.get('description'))
+            entries, show_id, show_info.get('title'), show_info.get('description'))
-- GitLab

From e4f3383802a321ff5f1e44e259e7878ae4fc9fa1 Mon Sep 17 00:00:00 2001
From: ozburo
Date: Fri, 26 Feb 2021 15:54:08 +0100
Subject: [PATCH 225/384] [redditr] Extract all thumbnails

---
 haruhi_dl/extractor/reddit.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/reddit.py b/haruhi_dl/extractor/reddit.py
index 3b2abb262..2d1a1fd99 100644
--- a/haruhi_dl/extractor/reddit.py
+++ b/haruhi_dl/extractor/reddit.py
@@ -9,6 +9,7 @@ from ..utils import (
     float_or_none,
     try_get,
     url_or_none,
+    unescapeHTML,
 )

@@ -118,11 +119,23 @@ class RedditRIE(InfoExtractor):
         else:
             age_limit = None

+        thumbnails = []
+        images = try_get(
+            data, lambda x: x['preview']['images'][0]['resolutions']) or []
+        for image in images:
+            url = url_or_none(unescapeHTML(image['url']))
+            if url is not None:
+                thumbnails.append({
+                    'url': url,
+                    'width': int_or_none(image['width']),
+                    'height': int_or_none(image['height']),
+                })
+
         return {
             '_type': 'url_transparent',
             'url': video_url,
             'title': data.get('title'),
-            'thumbnail': url_or_none(data.get('thumbnail')),
+            'thumbnails': thumbnails,
             'timestamp': float_or_none(data.get('created_utc')),
             'uploader': data.get('author'),
             'duration': int_or_none(try_get(
-- GitLab

From 52fd0e8bb8e3c6b07b00f0e44d9961f59c15616a Mon Sep 17 00:00:00 2001
From: Sergey M․
Date: Fri, 26 Feb 2021 15:54:14 +0100
Subject: [PATCH 226/384] [redditr] Fix review issues and extract source
 thumbnail (closes #27503)

---
 haruhi_dl/extractor/reddit.py | 36 +++++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/haruhi_dl/extractor/reddit.py b/haruhi_dl/extractor/reddit.py
index 2d1a1fd99..222fa0172 100644
--- a/haruhi_dl/extractor/reddit.py
+++ b/haruhi_dl/extractor/reddit.py
@@ -8,8 +8,8 @@ from ..utils import (
     int_or_none,
     float_or_none,
     try_get,
-    url_or_none,
     unescapeHTML,
+    url_or_none,
 )

@@ -57,7 +57,8 @@ class RedditRIE(InfoExtractor):
             'id': 'zv89llsvexdz',
             'ext': 'mp4',
             'title': 'That small heart attack.',
-            'thumbnail': r're:^https?://.*\.jpg$',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)',
+            'thumbnails': 'count:4',
             'timestamp': 1501941939,
             'upload_date': '20170805',
             'uploader': 'Antw87',
@@ -120,16 +121,27 @@ class RedditRIE(InfoExtractor):
             age_limit = None

         thumbnails = []
-        images = try_get(
-            data, lambda x: x['preview']['images'][0]['resolutions']) or []
-        for image in images:
-            url = url_or_none(unescapeHTML(image['url']))
-            if url is not None:
-                thumbnails.append({
-                    'url': url,
-                    'width': int_or_none(image['width']),
-                    'height': int_or_none(image['height']),
-                })
+
+        def add_thumbnail(src):
+            if not isinstance(src, dict):
+                return
+            thumbnail_url = url_or_none(src.get('url'))
+            if not thumbnail_url:
+                return
+            thumbnails.append({
+                'url': unescapeHTML(thumbnail_url),
+                'width': int_or_none(src.get('width')),
+                'height': int_or_none(src.get('height')),
+            })
+
+        for image in try_get(data, lambda x: x['preview']['images']) or []:
+            if not isinstance(image, dict):
+                continue
+            add_thumbnail(image.get('source'))
+            resolutions = image.get('resolutions')
+            if isinstance(resolutions, list):
+                for resolution in resolutions:
+                    add_thumbnail(resolution)

         return {
             '_type': 'url_transparent',
-- GitLab

From 0ade73d5629c2ea85fa9c2fc44b3ec72aae27d80 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 15:54:32 +0100
Subject: [PATCH 227/384] [yandexvideo] fix extraction for Python 3.4

---
 haruhi_dl/extractor/yandexvideo.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/haruhi_dl/extractor/yandexvideo.py b/haruhi_dl/extractor/yandexvideo.py
index ab8c84c93..6a166ec9b 100644
--- a/haruhi_dl/extractor/yandexvideo.py
+++ b/haruhi_dl/extractor/yandexvideo.py
@@ -66,7 +66,7 @@ class YandexVideoIE(InfoExtractor):
         video_id = self._match_id(url)

         player = try_get((self._download_json(
-            'https://frontend.vh.yandex.ru/graphql', video_id, data=b'''{
+            'https://frontend.vh.yandex.ru/graphql', video_id, data=('''{
   player(content_id: "%s") {
     computed_title
     content_url
@@ -86,7 +86,7 @@ class YandexVideoIE(InfoExtractor):
     title
     views_count
   }
-}''' % video_id.encode(), fatal=False)), lambda x: x['player']['content'])
+}''' % video_id).encode(), fatal=False)), lambda x: x['player']['content'])
         if not player or player.get('error'):
             player = self._download_json(
                 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id,
-- GitLab
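The Python 3.4 fix above hinges on a detail of the bytes type: %-formatting on
bytes objects only exists from Python 3.5 on (PEP 461), so the GraphQL body has
to be built as str first and encoded afterwards. A stand-alone illustration,
with the query abbreviated:

    # Why the change matters; query text is abbreviated for illustration.
    video_id = '4dbb36ec4e0526d58f9f2dc8f0ecf374'
    query = '{ player(content_id: "%s") { title } }'
    data = (query % video_id).encode()    # works on 2.7 and every 3.x
    # query.encode() % video_id.encode()  # TypeError on Python 3.0-3.4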
From 2aafa2f712d4d1eb447bb44356ed9a6e6d63ac4c Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 15:54:41 +0100
Subject: [PATCH 228/384] [vvvvid] skip unplayable episodes and extract akamai
 formats(closes #27599)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 haruhi_dl/extractor/vvvvid.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/haruhi_dl/extractor/vvvvid.py b/haruhi_dl/extractor/vvvvid.py
index 014a67e53..145805492 100644
--- a/haruhi_dl/extractor/vvvvid.py
+++ b/haruhi_dl/extractor/vvvvid.py
@@ -152,7 +152,6 @@ class VVVVIDIE(InfoExtractor):
                 embed_code = ds(embed_code)
             video_type = video_data.get('video_type')
             if video_type in ('video/rcs', 'video/kenc'):
-                embed_code = re.sub(r'https?://([^/]+)/z/', r'https://\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8')
                 if video_type == 'video/kenc':
                     kenc = self._download_json(
                         'https://www.vvvvid.it/kenc', video_id, query={
@@ -163,9 +162,7 @@
                     kenc_message = kenc.get('message')
                     if kenc_message:
                         embed_code += '?' + ds(kenc_message)
-                formats.extend(self._extract_m3u8_formats(
-                    embed_code, video_id, 'mp4',
-                    m3u8_id='hls', fatal=False))
+                formats.extend(self._extract_akamai_formats(embed_code, video_id))
             else:
                 formats.extend(self._extract_wowza_formats(
                     'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id))
-- GitLab

From fc156473d92c90f0dad50a6cc779ac47718abf58 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 15:54:49 +0100
Subject: [PATCH 229/384] [sky] add support for Sports News articles and
 Brightcove videos(closes #13054)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 haruhi_dl/extractor/extractors.py | 1 +
 haruhi_dl/extractor/sky.py | 101 +++++++++++++++++++++++-------
 2 files changed, 79 insertions(+), 23 deletions(-)

diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index 2b81187ca..0d92c6d0c 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -1097,6 +1097,7 @@ from .skynewsarabia import (
 from .sky import (
     SkyNewsIE,
     SkySportsIE,
+    SkySportsNewsIE,
 )
 from .slideshare import SlideshareIE
 from .slideslive import SlidesLiveIE
diff --git a/haruhi_dl/extractor/sky.py b/haruhi_dl/extractor/sky.py
index 681691004..ff2c977a0 100644
--- a/haruhi_dl/extractor/sky.py
+++ b/haruhi_dl/extractor/sky.py
@@ -1,6 +1,8 @@
 # coding: utf-8
 from __future__ import unicode_literals

+import re
+
 from .common import InfoExtractor
 from ..utils import (
     extract_attributes,
@@ -11,36 +13,59 @@ from ..utils import (

 class SkyBaseIE(InfoExtractor):
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        video_data = extract_attributes(self._search_regex(
-            r'(]+>)',
-            webpage, 'video data'))
-
-        video_url = 'ooyala:%s' % video_data['data-video-id']
-        if video_data.get('data-token-required') == 'true':
-            token_fetch_options = self._parse_json(video_data.get(
-                'data-token-fetch-options', '{}'), video_id, fatal=False) or {}
-            token_fetch_url = token_fetch_options.get('url')
-            if token_fetch_url:
-                embed_token = self._download_webpage(urljoin(
-                    url, token_fetch_url), video_id, fatal=False)
-                if embed_token:
-                    video_url = smuggle_url(
-                        video_url, {'embed_token': embed_token.strip('"')})
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
+    _SDC_EL_REGEX = r'(?s)(<div[^>]+data-(?:component-name|fn)="sdc-(?:articl|sit)e-video"[^>]*>)'
+
+    def _process_ooyala_element(self, webpage, sdc_el, url):
+        sdc = extract_attributes(sdc_el)
+        provider = sdc.get('data-provider')
+        if provider == 'ooyala':
+            video_id = sdc['data-sdc-video-id']
+            video_url = 'ooyala:%s' % video_id
+            ie_key = 'Ooyala'
+            ooyala_el = self._search_regex(
+                r'(<div[^>]+class="[^"]*\bsdc-article-video__media-ooyala\b[^"]*"[^>]+data-video-id="%s"[^>]*>)' % video_id,
                webpage, 'video data', fatal=False)
+            if ooyala_el:
+                ooyala_attrs = extract_attributes(ooyala_el) or {}
+                if ooyala_attrs.get('data-token-required') == 'true':
+                    token_fetch_url = (self._parse_json(ooyala_attrs.get(
+                        'data-token-fetch-options',
'{}'), + video_id, fatal=False) or {}).get('url') + if token_fetch_url: + embed_token = self._download_json(urljoin( + url, token_fetch_url), video_id, fatal=False) + if embed_token: + video_url = smuggle_url( + video_url, {'embed_token': embed_token}) + elif provider == 'brightcove': + video_id = sdc['data-video-id'] + account_id = sdc.get('data-account-id') or '6058004172001' + player_id = sdc.get('data-player-id') or 'RC9PQUaJ6' + video_url = self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id) + ie_key = 'BrightcoveNew' return { '_type': 'url_transparent', 'id': video_id, 'url': video_url, + 'ie_key': ie_key, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + info = self._process_ooyala_element(webpage, self._search_regex( + self._SDC_EL_REGEX, webpage, 'sdc element'), url) + info.update({ 'title': self._og_search_title(webpage), 'description': strip_or_none(self._og_search_description(webpage)), - 'ie_key': 'Ooyala', - } + }) + return info class SkySportsIE(SkyBaseIE): + IE_NAME = 'sky:sports' _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/([^/]+/)*(?P[0-9]+)' _TESTS = [{ 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', @@ -62,15 +87,45 @@ class SkySportsIE(SkyBaseIE): class SkyNewsIE(SkyBaseIE): + IE_NAME = 'sky:news' _VALID_URL = r'https?://news\.sky\.com/video/[0-9a-z-]+-(?P[0-9]+)' _TEST = { 'url': 'https://news.sky.com/video/russian-plane-inspected-after-deadly-fire-11712962', - 'md5': 'd6327e581473cea9976a3236ded370cd', + 'md5': '411e8893fd216c75eaf7e4c65d364115', 'info_dict': { - 'id': '1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM', + 'id': 'ref:1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM', 'ext': 'mp4', 'title': 'Russian plane inspected after deadly fire', 'description': 'The Russian Investigative Committee has released video of the wreckage of a passenger plane which caught fire near Moscow.', + 'uploader_id': '6058004172001', + 'timestamp': 1567112345, + 'upload_date': '20190829', }, - 'add_ie': ['Ooyala'], + 'add_ie': ['BrightcoveNew'], } + + +class SkySportsNewsIE(SkyBaseIE): + IE_NAME = 'sky:sports:news' + _VALID_URL = r'https?://(?:www\.)?skysports\.com/([^/]+/)*news/\d+/(?P\d+)' + _TEST = { + 'url': 'http://www.skysports.com/golf/news/12176/10871916/dustin-johnson-ready-to-conquer-players-championship-at-tpc-sawgrass', + 'info_dict': { + 'id': '10871916', + 'title': 'Dustin Johnson ready to conquer Players Championship at TPC Sawgrass', + 'description': 'Dustin Johnson is confident he can continue his dominant form in 2017 by adding the Players Championship to his list of victories.', + }, + 'playlist_count': 2, + } + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + + entries = [] + for sdc_el in re.findall(self._SDC_EL_REGEX, webpage): + entries.append(self._process_ooyala_element(webpage, sdc_el, url)) + + return self.playlist_result( + entries, article_id, self._og_search_title(webpage), + self._html_search_meta(['og:description', 'description'], webpage)) -- GitLab From a13444f11773850692b4cf9042e353682a3eab53 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:01:11 +0100 Subject: [PATCH 230/384] [arcpublishing] Add new extractor closes #2298 closes #9340 closes #17200 --- haruhi_dl/extractor/arcpublishing.py | 173 ++++++++++++++++++++++++++ haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/generic.py | 16 +++ haruhi_dl/extractor/washingtonpost.py | 101 
+++------------ 4 files changed, 207 insertions(+), 84 deletions(-) create mode 100644 haruhi_dl/extractor/arcpublishing.py diff --git a/haruhi_dl/extractor/arcpublishing.py b/haruhi_dl/extractor/arcpublishing.py new file mode 100644 index 000000000..d1fb1a054 --- /dev/null +++ b/haruhi_dl/extractor/arcpublishing.py @@ -0,0 +1,173 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + parse_iso8601, + try_get, +) + + +class ArcPublishingIE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' + _VALID_URL = r'arcpublishing:(?P[a-z]+):(?P%s)' % _UUID_REGEX + _TESTS = [{ + # https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/ + 'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab', + 'only_matching': True, + }, { + # https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/ + 'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1', + 'only_matching': True, + }, { + # https://www.actionnewsjax.com/video/live-stream/ + 'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a', + 'only_matching': True, + }, { + # https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/ + 'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3', + 'only_matching': True, + }, { + # https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/ + 'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe', + 'only_matching': True, + }, { + # https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/ + 'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e', + 'only_matching': True, + }, { + # https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/ + 'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143', + 'only_matching': True, + }, { + # https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/ + 'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055', + 'only_matching': True, + }, { + # https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/ + 'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d', + 'only_matching': True, + }, { + # https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/ + 'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7', + 'only_matching': True, + }, { + # https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/ + 'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b', + 'only_matching': True, + }, { + # https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html + 'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685', + 'only_matching': True, + }] + _POWA_DEFAULTS = [ + (['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'), + ([ + 'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo', + 'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom', + 'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek', + ], 
'video-api-cdn.%s.arcpublishing.com/api'), + ] + + def _extract_urls(webpage): + entries = [] + # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview + for powa_el in re.findall(r'(]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage): + powa = extract_attributes(powa_el) or {} + org = powa.get('data-org') + uuid = powa.get('data-uuid') + if org and uuid: + entries.append('arcpublishing:%s:%s' % (org, uuid)) + return entries + + def _real_extract(self, url): + org, uuid = re.match(self._VALID_URL, url).groups() + for orgs, tmpl in self._POWA_DEFAULTS: + if org in orgs: + base_api_tmpl = tmpl + break + else: + base_api_tmpl = '%s-prod-cdn.video-api.arcpublishing.com/api' + if org == 'wapo': + org = 'washpost' + video = self._download_json( + 'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org), + uuid, query={'uuid': uuid})[0] + title = video['headlines']['basic'] + is_live = video.get('status') == 'live' + + urls = [] + formats = [] + for s in video.get('streams', []): + s_url = s.get('url') + if not s_url or s_url in urls: + continue + urls.append(s_url) + stream_type = s.get('stream_type') + if stream_type == 'smil': + smil_formats = self._extract_smil_formats( + s_url, uuid, fatal=False) + for f in smil_formats: + if f['url'].endswith('/cfx/st'): + f['app'] = 'cfx/st' + if not f['play_path'].startswith('mp4:'): + f['play_path'] = 'mp4:' + f['play_path'] + if isinstance(f['tbr'], float): + f['vbr'] = f['tbr'] * 1000 + del f['tbr'] + f['format_id'] = 'rtmp-%d' % f['vbr'] + formats.extend(smil_formats) + elif stream_type in ('ts', 'hls'): + m3u8_formats = self._extract_m3u8_formats( + s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False) + if all([f.get('acodec') == 'none' for f in m3u8_formats]): + continue + for f in m3u8_formats: + if f.get('acodec') == 'none': + f['preference'] = -40 + elif f.get('vcodec') == 'none': + f['preference'] = -50 + height = f.get('height') + if not height: + continue + vbr = self._search_regex( + r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None) + if vbr: + f['vbr'] = int(vbr) + formats.extend(m3u8_formats) + else: + vbr = int_or_none(s.get('bitrate')) + formats.append({ + 'format_id': '%s-%d' % (stream_type, vbr) if vbr else stream_type, + 'vbr': vbr, + 'width': int_or_none(s.get('width')), + 'height': int_or_none(s.get('height')), + 'filesize': int_or_none(s.get('filesize')), + 'url': s_url, + 'preference': -1, + }) + self._sort_formats( + formats, ('preference', 'width', 'height', 'vbr', 'filesize', 'tbr', 'ext', 'format_id')) + + subtitles = {} + for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []): + subtitle_url = subtitle.get('url') + if subtitle_url: + subtitles.setdefault('en', []).append({'url': subtitle_url}) + + return { + 'id': uuid, + 'title': self._live_title(title) if is_live else title, + 'thumbnail': try_get(video, lambda x: x['promo_image']['url']), + 'description': try_get(video, lambda x: x['subheadlines']['basic']), + 'formats': formats, + 'duration': int_or_none(video.get('duration'), 100), + 'timestamp': parse_iso8601(video.get('created_date')), + 'subtitles': subtitles, + 'is_live': is_live, + } diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 0d92c6d0c..2a680d4bf 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -66,6 +66,7 @@ from .appletrailers import ( AppleTrailersSectionIE, ) from .archiveorg import ArchiveOrgIE +from 
.arcpublishing import ArcPublishingIE from .arkena import ArkenaIE from .ard import ( ARDBetaMediathekIE, diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index b67f066eb..beb6ad2ad 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -133,6 +133,7 @@ from .rtlnl import RtlNlIE from .xnews import XLinkIE from .libsyn import LibsynIE from .pulsembed import PulsEmbedIE +from .arcpublishing import ArcPublishingIE class GenericIE(InfoExtractor): @@ -2261,6 +2262,20 @@ class GenericIE(InfoExtractor): 'uploader': 'OTT Videos', }, }, + { + # ArcPublishing PoWa video player + 'url': 'https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/', + 'md5': 'b03b2fac8680e1e5a7cc81a5c27e71b3', + 'info_dict': { + 'id': '8c99cb6e-b29c-4bc9-9173-7bf9979225ab', + 'ext': 'mp4', + 'title': 'Senate candidates wave to voters on Anchorage streets', + 'description': 'md5:91f51a6511f090617353dc720318b20e', + 'timestamp': 1604378735, + 'upload_date': '20201103', + 'duration': 1581, + }, + }, ] def report_following_redirect(self, new_url): @@ -2676,6 +2691,7 @@ class GenericIE(InfoExtractor): XLinkIE, LibsynIE, VHXEmbedIE, + ArcPublishingIE, ): try: ie_key = embie.ie_key() diff --git a/haruhi_dl/extractor/washingtonpost.py b/haruhi_dl/extractor/washingtonpost.py index 329907465..7924d80fc 100644 --- a/haruhi_dl/extractor/washingtonpost.py +++ b/haruhi_dl/extractor/washingtonpost.py @@ -4,17 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - int_or_none, - strip_jsonp, -) class WashingtonPostIE(InfoExtractor): IE_NAME = 'washingtonpost' - _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/(?:video|posttv)/(?:[^/]+/)*)(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' - _TEST = { + _TESTS = [{ 'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d', 'md5': '6f537e1334b714eb15f9563bd4b9cdfa', 'info_dict': { @@ -23,10 +19,15 @@ class WashingtonPostIE(InfoExtractor): 'title': 'Egypt finds belongings, debris from plane crash', 'description': 'md5:a17ceee432f215a5371388c1f680bd86', 'upload_date': '20160520', - 'uploader': 'Reuters', - 'timestamp': 1463778452, + 'timestamp': 1463775187, }, - } + }, { + 'url': 'https://www.washingtonpost.com/video/world/egypt-finds-belongings-debris-from-plane-crash/2016/05/20/480ba4ee-1ec7-11e6-82c2-a7dcb313287d_video.html', + 'only_matching': True, + }, { + 'url': 'https://www.washingtonpost.com/posttv/world/iraq-to-track-down-antiquities-after-islamic-state-museum-rampage/2015/02/28/7c57e916-bf86-11e4-9dfb-03366e719af8_video.html', + 'only_matching': True, + }] @classmethod def _extract_urls(cls, webpage, **kwargs): @@ -35,73 +36,8 @@ class WashingtonPostIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % video_id, - video_id, transform_source=strip_jsonp)[0]['contentConfig'] - title = video_data['title'] - - urls = [] - formats = [] - for s in video_data.get('streams', []): - s_url = s.get('url') - if not s_url or s_url in urls: - continue - 
urls.append(s_url) - video_type = s.get('type') - if video_type == 'smil': - continue - elif video_type in ('ts', 'hls') and ('_master.m3u8' in s_url or '_mobile.m3u8' in s_url): - m3u8_formats = self._extract_m3u8_formats( - s_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - for m3u8_format in m3u8_formats: - width = m3u8_format.get('width') - if not width: - continue - vbr = self._search_regex( - r'%d_%d_(\d+)' % (width, m3u8_format['height']), m3u8_format['url'], 'vbr', default=None) - if vbr: - m3u8_format.update({ - 'vbr': int_or_none(vbr), - }) - formats.extend(m3u8_formats) - else: - width = int_or_none(s.get('width')) - vbr = int_or_none(s.get('bitrate')) - has_width = width != 0 - formats.append({ - 'format_id': ( - '%s-%d-%d' % (video_type, width, vbr) - if width - else video_type), - 'vbr': vbr if has_width else None, - 'width': width, - 'height': int_or_none(s.get('height')), - 'acodec': s.get('audioCodec'), - 'vcodec': s.get('videoCodec') if has_width else 'none', - 'filesize': int_or_none(s.get('fileSize')), - 'url': s_url, - 'ext': 'mp4', - 'protocol': 'm3u8_native' if video_type in ('ts', 'hls') else None, - }) - source_media_url = video_data.get('sourceMediaURL') - if source_media_url: - formats.append({ - 'format_id': 'source_media', - 'url': source_media_url, - }) - self._sort_formats( - formats, ('width', 'height', 'vbr', 'filesize', 'tbr', 'format_id')) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('blurb'), - 'uploader': video_data.get('credits', {}).get('source'), - 'formats': formats, - 'duration': int_or_none(video_data.get('videoDuration'), 100), - 'timestamp': int_or_none( - video_data.get('dateConfig', {}).get('dateFirstPublished'), 1000), - } + return self.url_result( + 'arcpublishing:wapo:' + video_id, 'ArcPublishing', video_id) class WashingtonPostArticleIE(InfoExtractor): @@ -121,9 +57,8 @@ class WashingtonPostArticleIE(InfoExtractor): 'title': 'Breaking Points: The Paper Mine', 'duration': 1290, 'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.', - 'uploader': 'The Washington Post', - 'timestamp': 1395527908, - 'upload_date': '20140322', + 'timestamp': 1395440416, + 'upload_date': '20140321', }, }, { 'md5': '1fff6a689d8770966df78c8cb6c8c17c', @@ -133,9 +68,8 @@ class WashingtonPostArticleIE(InfoExtractor): 'title': 'The town bureaucracy sustains', 'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. 
We set out to find out what it\'s like to do paperwork 230 feet underground.',
             'duration': 2220,
-            'timestamp': 1395528005,
-            'upload_date': '20140322',
+            'timestamp': 1395441819,
+            'upload_date': '20140321',
         },
     }],
 }, {
@@ -151,8 +85,7 @@ class WashingtonPostArticleIE(InfoExtractor):
             'ext': 'mp4',
             'description': 'Washington Post transportation reporter Ashley Halsey III explains why a plane\'s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.',
             'upload_date': '20141230',
-            'uploader': 'The Washington Post',
-            'timestamp': 1419974765,
+            'timestamp': 1419972442,
             'title': 'Why black boxes don’t transmit data in real time',
         }
     }]
-- GitLab

From 56a45e91d22b936c33f29d73e42e925eda3026f5 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 16:01:45 +0100
Subject: [PATCH 231/384] [arcpublishing] add missing staticmethod decorator

---
 haruhi_dl/extractor/arcpublishing.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/haruhi_dl/extractor/arcpublishing.py b/haruhi_dl/extractor/arcpublishing.py
index d1fb1a054..ca6a6c4d8 100644
--- a/haruhi_dl/extractor/arcpublishing.py
+++ b/haruhi_dl/extractor/arcpublishing.py
@@ -73,6 +73,7 @@ class ArcPublishingIE(InfoExtractor):
         ], 'video-api-cdn.%s.arcpublishing.com/api'),
     ]

+    @staticmethod
     def _extract_urls(webpage):
         entries = []
         # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview
-- GitLab
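The decorator matters because the generic extractor invokes _extract_urls
directly on the class. On Python 3 a plain function reached through a class is
just a function, but on Python 2 — which this codebase still targets, note the
__future__ imports throughout — it becomes an unbound method and the call
fails. A stripped-down illustration with invented names:

    # Sketch of the failure mode fixed by PATCH 231 (names are illustrative).
    class SomeIE(object):
        def _extract_urls(webpage):   # no @staticmethod
            return []

    SomeIE._extract_urls('<html></html>')
    # Python 3: returns []
    # Python 2: TypeError: unbound method _extract_urls() must be called
    #           with SomeIE instance as first argument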
From 5ccde7fdb33c7afdf37884d32ac36fe43f5c5a16 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 16:01:52 +0100
Subject: [PATCH 232/384] [acast] fix extraction(closes #21444)(closes
 #27612)(closes #27613)

---
 haruhi_dl/extractor/acast.py | 116 ++++++++++++++++-------------------
 1 file changed, 53 insertions(+), 63 deletions(-)

diff --git a/haruhi_dl/extractor/acast.py b/haruhi_dl/extractor/acast.py
index b17c792d2..60378db1b 100644
--- a/haruhi_dl/extractor/acast.py
+++ b/haruhi_dl/extractor/acast.py
@@ -2,21 +2,47 @@
 from __future__ import unicode_literals

 import re
-import functools

 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     clean_html,
-    float_or_none,
     int_or_none,
-    try_get,
-    unified_timestamp,
-    OnDemandPagedList,
+    parse_iso8601,
 )


-class ACastIE(InfoExtractor):
+class ACastBaseIE(InfoExtractor):
+    def _extract_episode(self, episode, show_info):
+        title = episode['title']
+        info = {
+            'id': episode['id'],
+            'display_id': episode.get('episodeUrl'),
+            'url': episode['url'],
+            'title': title,
+            'description': clean_html(episode.get('description') or episode.get('summary')),
+            'thumbnail': episode.get('image'),
+            'timestamp': parse_iso8601(episode.get('publishDate')),
+            'duration': int_or_none(episode.get('duration')),
+            'filesize': int_or_none(episode.get('contentLength')),
+            'season_number': int_or_none(episode.get('season')),
+            'episode': title,
+            'episode_number': int_or_none(episode.get('episode')),
+        }
+        info.update(show_info)
+        return info
+
+    def _extract_show_info(self, show):
+        return {
+            'creator': show.get('author'),
+            'series': show.get('title'),
+        }
+
+    def _call_api(self, path, video_id, query=None):
+        return self._download_json(
+            'https://feeder.acast.com/api/v1/shows/' + path, video_id, query=query)
+
+
+class ACastIE(ACastBaseIE):
     IE_NAME = 'acast'
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
                             (?:(?:embed|www)\.)?acast\.com/|
                             play\.acast\.com/s/
                         )
                         (?P<channel>[^/]+)/(?P<id>[^/#?]+)
                     '''
     _TESTS = [{
         'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna',
-        'md5': '16d936099ec5ca2d5869e3a813ee8dc4',
+        'md5': 'f5598f3ad1e4776fed12ec1407153e4b',
         'info_dict': {
             'id': '2a92b283-1a75-4ad8-8396-499c641de0d9',
             'ext': 'mp3',
             'title': '2. Raggarmordet - Röster ur det förflutna',
-            'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4',
+            'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67',
             'timestamp': 1477346700,
             'upload_date': '20161024',
-            'duration': 2766.602563,
+            'duration': 2766,
             'creator': 'Anton Berg & Martin Johnson',
             'series': 'Spår',
             'episode': '2. Raggarmordet - Röster ur det förflutna',
@@ -45,7 +71,7 @@
         'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015',
         'only_matching': True,
     }, {
-        'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22',
+        'url': 'https://play.acast.com/s/rattegangspodden/s04e09styckmordetihelenelund-del2-2',
        'only_matching': True,
     }, {
         'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9',
@@ -54,40 +80,14 @@

     def _real_extract(self, url):
         channel, display_id = re.match(self._VALID_URL, url).groups()
-        s = self._download_json(
-            'https://feeder.acast.com/api/v1/shows/%s/episodes/%s' % (channel, display_id),
-            display_id)
-        media_url = s['url']
-        if re.search(r'[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}', display_id):
-            episode_url = s.get('episodeUrl')
-            if episode_url:
-                display_id = episode_url
-            else:
-                channel, display_id = re.match(self._VALID_URL, s['link']).groups()
-        cast_data = self._download_json(
-            'https://play-api.acast.com/splash/%s/%s' % (channel, display_id),
-            display_id)['result']
-        e = cast_data['episode']
-        title = e.get('name') or s['title']
-        return {
-            'id': compat_str(e['id']),
-            'display_id': display_id,
-            'url': media_url,
-            'title': title,
-            'description': e.get('summary') or clean_html(e.get('description') or s.get('description')),
-            'thumbnail': e.get('image'),
-            'timestamp': unified_timestamp(e.get('publishingDate') or s.get('publishDate')),
-            'duration': float_or_none(e.get('duration') or s.get('duration')),
-            'filesize': int_or_none(e.get('contentLength')),
-            'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str),
-            'series': try_get(cast_data, lambda x: x['show']['name'], compat_str),
-            'season_number': int_or_none(e.get('seasonNumber')),
-            'episode': title,
-            'episode_number': int_or_none(e.get('episodeNumber')),
-        }
+        episode = self._call_api(
+            '%s/episodes/%s' % (channel, display_id),
+            display_id, {'showInfo': 'true'})
+        return self._extract_episode(
+            episode, self._extract_show_info(episode.get('show') or {}))


-class ACastChannelIE(InfoExtractor):
+class ACastChannelIE(ACastBaseIE):
     IE_NAME = 'acast:channel'
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
@@ -102,34 +102,24 @@ class ACastChannelIE(InfoExtractor):
         'info_dict': {
             'id': '4efc5294-5385-4847-98bd-519799ce5786',
             'title': 'Today in Focus',
-            'description': 'md5:9ba5564de5ce897faeb12963f4537a64',
+            'description': 'md5:c09ce28c91002ce4ffce71d6504abaae',
         },
-        'playlist_mincount': 35,
+        'playlist_mincount': 200,
     }, {
         'url': 'http://play.acast.com/s/ft-banking-weekly',
         'only_matching': True,
     }]
-    _API_BASE_URL = 'https://play.acast.com/api/'
-    _PAGE_SIZE = 10

     @classmethod
     def suitable(cls, url):
         return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url)

-    def _fetch_page(self, channel_slug, page):
-        casts = self._download_json(
-            self._API_BASE_URL + 'channels/%s/acasts?page=%s' % (channel_slug, page),
-            channel_slug, note='Download page %d of channel
data' % page) - for cast in casts: - yield self.url_result( - 'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']), - 'ACast', cast['id']) - def _real_extract(self, url): - channel_slug = self._match_id(url) - channel_data = self._download_json( - self._API_BASE_URL + 'channels/%s' % channel_slug, channel_slug) - entries = OnDemandPagedList(functools.partial( - self._fetch_page, channel_slug), self._PAGE_SIZE) - return self.playlist_result(entries, compat_str( - channel_data['id']), channel_data['name'], channel_data.get('description')) + show_slug = self._match_id(url) + show = self._call_api(show_slug, show_slug) + show_info = self._extract_show_info(show) + entries = [] + for episode in (show.get('episodes') or []): + entries.append(self._extract_episode(episode, show_info)) + return self.playlist_result( + entries, show.get('id'), show.get('title'), show.get('description')) -- GitLab From 51535e0624cb22b61fadfbed528d1bccca4e16cf Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:02:04 +0100 Subject: [PATCH 233/384] [stitcher] fix extraction(closes #20811)(closes #27606) --- haruhi_dl/extractor/stitcher.py | 60 ++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/haruhi_dl/extractor/stitcher.py b/haruhi_dl/extractor/stitcher.py index 97d1ff681..b8b5711b1 100644 --- a/haruhi_dl/extractor/stitcher.py +++ b/haruhi_dl/extractor/stitcher.py @@ -4,25 +4,28 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, + clean_html, + ExtractorError, int_or_none, - js_to_json, - unescapeHTML, + str_or_none, + try_get, ) class StitcherIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P[^/#?&]+?)-)?(?P\d+)(?:[/#?&]|$)' + _VALID_URL = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/(?:[^/]+/)+e(?:pisode)?/(?:(?P[^/#?&]+?)-)?(?P\d+)(?:[/#?&]|$)' _TESTS = [{ 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', - 'md5': '391dd4e021e6edeb7b8e68fbf2e9e940', + 'md5': 'e9635098e0da10b21a0e2b85585530f6', 'info_dict': { 'id': '40789481', 'ext': 'mp3', 'title': 'Machine Learning Mastery and Cancer Clusters', - 'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3', + 'description': 'md5:547adb4081864be114ae3831b4c2b42f', 'duration': 1604, 'thumbnail': r're:^https?://.*\.jpg', + 'upload_date': '20180126', + 'timestamp': 1516989316, }, }, { 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', @@ -38,6 +41,7 @@ class StitcherIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Page Not Found', }, { # escaped title 'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true', @@ -45,37 +49,39 @@ class StitcherIE(InfoExtractor): }, { 'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true', 'only_matching': True, + }, { + 'url': 'https://www.stitcher.com/show/threedom/episode/circles-on-a-stick-200212584', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - audio_id = mobj.group('id') - display_id = mobj.group('display_id') or audio_id + display_id, audio_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) + resp = self._download_json( + 'https://api.prod.stitcher.com/episode/' + audio_id, + display_id or audio_id) + episode = try_get(resp, lambda x: 
x['data']['episodes'][0], dict) + if not episode: + raise ExtractorError(resp['errors'][0]['message'], expected=True) - episode = self._parse_json( - js_to_json(self._search_regex( - r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')), - display_id)['config']['episode'] + title = episode['title'].strip() + audio_url = episode['audio_url'] - title = unescapeHTML(episode['title']) - formats = [{ - 'url': episode[episode_key], - 'ext': determine_ext(episode[episode_key]) or 'mp3', - 'vcodec': 'none', - } for episode_key in ('episodeURL',) if episode.get(episode_key)] - description = self._search_regex( - r'Episode Info:\s*([^<]+)<', webpage, 'description', fatal=False) - duration = int_or_none(episode.get('duration')) - thumbnail = episode.get('episodeImage') + thumbnail = None + show_id = episode.get('show_id') + if show_id and episode.get('classic_id') != -1: + thumbnail = 'https://stitcher-classic.imgix.net/feedimages/%s.jpg' % show_id return { 'id': audio_id, 'display_id': display_id, 'title': title, - 'description': description, - 'duration': duration, + 'description': clean_html(episode.get('html_description') or episode.get('description')), + 'duration': int_or_none(episode.get('duration')), 'thumbnail': thumbnail, - 'formats': formats, + 'url': audio_url, + 'vcodec': 'none', + 'timestamp': int_or_none(episode.get('date_created')), + 'season_number': int_or_none(episode.get('season')), + 'season_id': str_or_none(episode.get('season_id')), } -- GitLab From 417963200c033afb95a9fca54eb6b313a6e284a4 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:02:17 +0100 Subject: [PATCH 234/384] [vvvvid] fix season metadata extraction(#18130) --- haruhi_dl/extractor/vvvvid.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/haruhi_dl/extractor/vvvvid.py b/haruhi_dl/extractor/vvvvid.py index 145805492..f4cae7fe9 100644 --- a/haruhi_dl/extractor/vvvvid.py +++ b/haruhi_dl/extractor/vvvvid.py @@ -25,7 +25,6 @@ class VVVVIDIE(InfoExtractor): 'duration': 239, 'series': '"Perché dovrei guardarlo?" 
di Dario Moccia', 'season_id': '437', - 'season_number': 1, 'episode': 'Ping Pong', 'episode_number': 1, 'episode_id': '3334', @@ -75,7 +74,6 @@ class VVVVIDIE(InfoExtractor): def _extract_common_video_info(self, video_data): return { 'thumbnail': video_data.get('thumbnail'), - 'episode_number': int_or_none(video_data.get('number')), 'episode_id': str_or_none(video_data.get('id')), } @@ -145,6 +143,17 @@ class VVVVIDIE(InfoExtractor): return d + info = {} + + def metadata_from_url(r_url): + if not info and r_url: + mobj = re.search(r'_(?:S(\d+))?Ep(\d+)', r_url) + if mobj: + info['episode_number'] = int(mobj.group(2)) + season_number = mobj.group(1) + if season_number: + info['season_number'] = int(season_number) + for quality in ('_sd', ''): embed_code = video_data.get('embed_info' + quality) if not embed_code: @@ -166,9 +175,12 @@ class VVVVIDIE(InfoExtractor): else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) + metadata_from_url(embed_code) + self._sort_formats(formats) - info = self._extract_common_video_info(video_data) + metadata_from_url(video_data.get('thumbnail')) + info.update(self._extract_common_video_info(video_data)) info.update({ 'id': video_id, 'title': title, @@ -176,7 +188,6 @@ class VVVVIDIE(InfoExtractor): 'duration': int_or_none(video_data.get('length')), 'series': video_data.get('show_title'), 'season_id': season_id, - 'season_number': video_data.get('season_number'), 'episode': title, 'view_count': int_or_none(video_data.get('views')), 'like_count': int_or_none(video_data.get('video_likes')), @@ -211,7 +222,6 @@ class VVVVIDShowIE(VVVVIDIE): entries = [] for season in (seasons or []): - season_number = int_or_none(season.get('number')) episodes = season.get('episodes') or [] for episode in episodes: if episode.get('playable') is False: @@ -227,7 +237,6 @@ class VVVVIDShowIE(VVVVIDIE): 'url': '/'.join([base_url, season_id, video_id]), 'title': episode.get('title'), 'description': episode.get('description'), - 'season_number': season_number, 'season_id': season_id, }) entries.append(info) -- GitLab From 973258396df1e68d8fffd85007f012f6988e9597 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:02:33 +0100 Subject: [PATCH 235/384] [nrktv] Switch to playback endpoint mediaelement endpoint is no longer in use. 
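For reference, the playback flow this switches to amounts to two JSON GETs against psapi.nrk.no. A minimal standalone sketch follows (the endpoint paths, the Accept-Encoding header, the preferredCdn values and the nonPlayable check are all taken from this patch; the use of requests and the function names are illustrative assumptions, not the extractor's actual interface):

import requests

PSAPI = 'https://psapi.nrk.no/playback'

def call_playback_api(item, video_id, query=None):
    # plain JSON GET, mirroring NRKBaseIE._call_api in this patch
    resp = requests.get(
        '%s/%s/%s' % (PSAPI, item, video_id), params=query or {},
        headers={'Accept-Encoding': 'gzip, deflate, br'})
    resp.raise_for_status()
    return resp.json()

def fetch(video_id):
    # known values for preferredCdn: akamai, iponly, minicdn and telenor
    manifest = call_playback_api(
        'manifest', video_id, {'preferredCdn': 'akamai'})
    if manifest.get('playability') == 'nonPlayable':
        raise RuntimeError('not playable')
    # formats are built from the manifest's playable assets; title,
    # description and images come from a separate metadata call
    metadata = call_playback_api('metadata', video_id)
    return manifest, metadata
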
--- haruhi_dl/extractor/nrk.py | 273 ++++++++----------------------------- 1 file changed, 57 insertions(+), 216 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index b545f291b..871e4845c 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -6,15 +6,11 @@ import random import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, -) +from ..compat import compat_str from ..utils import ( determine_ext, ExtractorError, int_or_none, - parse_age_limit, parse_duration, try_get, urljoin, @@ -63,60 +59,8 @@ class NRKBaseIE(InfoExtractor): return self._download_json( urljoin('http://psapi.nrk.no/', path), video_id, note or 'Downloading %s JSON' % item, - fatal=fatal, query=query) - - -class NRKIE(NRKBaseIE): - _VALID_URL = r'''(?x) - (?: - nrk:| - https?:// - (?: - (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| - v8[-.]psapi\.nrk\.no/mediaelement/ - ) - ) - (?P[^?\#&]+) - ''' - - _TESTS = [{ - # video - 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': 'f46be075326e23ad0e524edfcb06aeb6', - 'info_dict': { - 'id': '150533', - 'ext': 'mp4', - 'title': 'Dompap og andre fugler i Piip-Show', - 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 262, - } - }, { - # audio - 'url': 'http://www.nrk.no/video/PS*154915', - # MD5 is unstable - 'info_dict': { - 'id': '154915', - 'ext': 'mp4', - 'title': 'Slik høres internett ut når du er blind', - 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', - 'duration': 20, - } - }, { - 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', - 'only_matching': True, - }, { - 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', - 'only_matching': True, - }, { - 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', - 'only_matching': True, - }] + fatal=fatal, query=query, + headers={'Accept-Encoding': 'gzip, deflate, br'}) def _extract_from_playback(self, video_id): path_templ = 'playback/%s/' + video_id @@ -178,6 +122,59 @@ class NRKIE(NRKBaseIE): 'formats': formats, } + +class NRKIE(NRKBaseIE): + _VALID_URL = r'''(?x) + (?: + nrk:| + https?:// + (?: + (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| + v8[-.]psapi\.nrk\.no/mediaelement/ + ) + ) + (?P[^?\#&]+) + ''' + + _TESTS = [{ + # video + 'url': 'http://www.nrk.no/video/PS*150533', + 'md5': 'f46be075326e23ad0e524edfcb06aeb6', + 'info_dict': { + 'id': '150533', + 'ext': 'mp4', + 'title': 'Dompap og andre fugler i Piip-Show', + 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', + 'duration': 262, + } + }, { + # audio + 'url': 'http://www.nrk.no/video/PS*154915', + # MD5 is unstable + 'info_dict': { + 'id': '154915', + 'ext': 'mp4', + 'title': 'Slik høres internett ut når du er blind', + 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', + 'duration': 20, + } + }, { + 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', + 'only_matching': True, + }, { + 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', + 'only_matching': True, + }, { + 'url': 
'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', + 'only_matching': True, + }] + def _real_extract(self, url): video_id = self._match_id(url) return self._extract_from_playback(video_id) @@ -187,7 +184,6 @@ class NRKTVIE(NRKBaseIE): IE_DESC = 'NRK TV and NRK Radio' _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE - _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/program/MDDP12000117', 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', @@ -290,164 +286,9 @@ class NRKTVIE(NRKBaseIE): 'only_matching': True, }] - _api_host = None - - def _extract_from_mediaelement(self, video_id): - api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS - - for api_host in api_hosts: - data = self._download_json( - 'http://%s/mediaelement/%s' % (api_host, video_id), - video_id, 'Downloading mediaelement JSON', - fatal=api_host == api_hosts[-1]) - if not data: - continue - self._api_host = api_host - break - - title = data.get('fullTitle') or data.get('mainTitle') or data['title'] - video_id = data.get('id') or video_id - - urls = [] - entries = [] - - conviva = data.get('convivaStatistics') or {} - live = (data.get('mediaElementType') == 'Live' - or data.get('isLive') is True or conviva.get('isLive')) - - def make_title(t): - return self._live_title(t) if live else t - - media_assets = data.get('mediaAssets') - if media_assets and isinstance(media_assets, list): - def video_id_and_title(idx): - return ((video_id, title) if len(media_assets) == 1 - else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) - for num, asset in enumerate(media_assets, 1): - asset_url = asset.get('url') - if not asset_url or asset_url in urls: - continue - urls.append(asset_url) - formats = self._extract_nrk_formats(asset_url, video_id) - if not formats: - continue - self._sort_formats(formats) - - entry_id, entry_title = video_id_and_title(num) - duration = parse_duration(asset.get('duration')) - subtitles = {} - for subtitle in ('webVtt', 'timedText'): - subtitle_url = asset.get('%sSubtitlesUrl' % subtitle) - if subtitle_url: - subtitles.setdefault('no', []).append({ - 'url': compat_urllib_parse_unquote(subtitle_url) - }) - entries.append({ - 'id': asset.get('carrierId') or entry_id, - 'title': make_title(entry_title), - 'duration': duration, - 'subtitles': subtitles, - 'formats': formats, - 'is_live': live, - }) - - if not entries: - media_url = data.get('mediaUrl') - if media_url and media_url not in urls: - formats = self._extract_nrk_formats(media_url, video_id) - if formats: - self._sort_formats(formats) - duration = parse_duration(data.get('duration')) - entries = [{ - 'id': video_id, - 'title': make_title(title), - 'duration': duration, - 'formats': formats, - 'is_live': live, - }] - - if not entries: - self._raise_error(data) - - series = conviva.get('seriesName') or data.get('seriesTitle') - episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') - - season_number = None - episode_number = None - if data.get('mediaElementType') == 'Episode': - _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \ - data.get('relativeOriginUrl', '') - EPISODENUM_RE = [ - r'/s(?P\d{,2})e(?P\d{,2})\.', - r'/sesong-(?P\d{,2})/episode-(?P\d{,2})', - ] - season_number = int_or_none(self._search_regex( - EPISODENUM_RE, _season_episode, 'season number', - default=None, group='season')) - episode_number = 
int_or_none(self._search_regex( - EPISODENUM_RE, _season_episode, 'episode number', - default=None, group='episode')) - - thumbnails = None - images = data.get('images') - if images and isinstance(images, dict): - web_images = images.get('webImages') - if isinstance(web_images, list): - thumbnails = [{ - 'url': image['imageUrl'], - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - } for image in web_images if image.get('imageUrl')] - - description = data.get('description') - category = data.get('mediaAnalytics', {}).get('category') - - common_info = { - 'description': description, - 'series': series, - 'episode': episode, - 'season_number': season_number, - 'episode_number': episode_number, - 'categories': [category] if category else None, - 'age_limit': parse_age_limit(data.get('legalAge')), - 'thumbnails': thumbnails, - } - - vcodec = 'none' if data.get('mediaType') == 'Audio' else None - - for entry in entries: - entry.update(common_info) - for f in entry['formats']: - f['vcodec'] = vcodec - - points = data.get('shortIndexPoints') - if isinstance(points, list): - chapters = [] - for next_num, point in enumerate(points, start=1): - if not isinstance(point, dict): - continue - start_time = parse_duration(point.get('startPoint')) - if start_time is None: - continue - end_time = parse_duration( - data.get('duration') - if next_num == len(points) - else points[next_num].get('startPoint')) - if end_time is None: - continue - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': point.get('title'), - }) - if chapters and len(entries) == 1: - entries[0]['chapters'] = chapters - - return self.playlist_result(entries, video_id, title, description) - def _real_extract(self, url): video_id = self._match_id(url) - return self._extract_from_mediaelement(video_id) + return self._extract_from_playback(video_id) class NRKTVEpisodeIE(InfoExtractor): -- GitLab From 18be494898ce0befd9da4ab722306b8c93ddc4d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:04:02 +0100 Subject: [PATCH 236/384] [nrk] Improve extraction (closes #27634, closes #27635) + Add support for mp3 formats * Generalize and delegate all item extractors to nrk, beware ie key breakages + Add support for podcasts + Generalize nrk shortcut form to support all kind of ids --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/nrk.py | 236 +++++++++++++++++++++--------- 2 files changed, 166 insertions(+), 71 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 2a680d4bf..086c7d42a 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -823,6 +823,7 @@ from .nrk import ( NRKSkoleIE, NRKTVIE, NRKTVDirekteIE, + NRKRadioPodkastIE, NRKTVEpisodeIE, NRKTVEpisodesIE, NRKTVSeasonIE, diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 871e4845c..9621522d4 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -62,6 +62,75 @@ class NRKBaseIE(InfoExtractor): fatal=fatal, query=query, headers={'Accept-Encoding': 'gzip, deflate, br'}) + +class NRKIE(NRKBaseIE): + _VALID_URL = r'''(?x) + (?: + nrk:| + https?:// + (?: + (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| + v8[-.]psapi\.nrk\.no/mediaelement/ + ) + ) + (?P[^?\#&]+) + ''' + + _TESTS = [{ + # video + 'url': 'http://www.nrk.no/video/PS*150533', + 'md5': 'f46be075326e23ad0e524edfcb06aeb6', + 'info_dict': { + 'id': '150533', + 'ext': 'mp4', 
+ 'title': 'Dompap og andre fugler i Piip-Show', + 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', + 'duration': 262, + } + }, { + # audio + 'url': 'http://www.nrk.no/video/PS*154915', + # MD5 is unstable + 'info_dict': { + 'id': '154915', + 'ext': 'mp4', + 'title': 'Slik høres internett ut når du er blind', + 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', + 'duration': 20, + } + }, { + 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', + 'only_matching': True, + }, { + 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', + 'only_matching': True, + }, { + # podcast + 'url': 'nrk:l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + # clip + 'url': 'nrk:150533', + 'only_matching': True, + }, { + # episode + 'url': 'nrk:MDDP12000117', + 'only_matching': True, + }, { + # direkte + 'url': 'nrk:nrk1', + 'only_matching': True, + }] + def _extract_from_playback(self, video_id): path_templ = 'playback/%s/' + video_id @@ -84,8 +153,15 @@ class NRKBaseIE(InfoExtractor): format_url = url_or_none(asset.get('url')) if not format_url: continue - if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8': + asset_format = (asset.get('format') or '').lower() + if asset_format == 'hls' or determine_ext(format_url) == 'm3u8': formats.extend(self._extract_nrk_formats(format_url, video_id)) + elif asset_format == 'mp3': + formats.append({ + 'url': format_url, + 'format_id': asset_format, + 'vcodec': 'none', + }) self._sort_formats(formats) data = call_playback_api('metadata') @@ -122,65 +198,12 @@ class NRKBaseIE(InfoExtractor): 'formats': formats, } - -class NRKIE(NRKBaseIE): - _VALID_URL = r'''(?x) - (?: - nrk:| - https?:// - (?: - (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| - v8[-.]psapi\.nrk\.no/mediaelement/ - ) - ) - (?P[^?\#&]+) - ''' - - _TESTS = [{ - # video - 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': 'f46be075326e23ad0e524edfcb06aeb6', - 'info_dict': { - 'id': '150533', - 'ext': 'mp4', - 'title': 'Dompap og andre fugler i Piip-Show', - 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 262, - } - }, { - # audio - 'url': 'http://www.nrk.no/video/PS*154915', - # MD5 is unstable - 'info_dict': { - 'id': '154915', - 'ext': 'mp4', - 'title': 'Slik høres internett ut når du er blind', - 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', - 'duration': 20, - } - }, { - 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', - 'only_matching': True, - }, { - 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', - 'only_matching': True, - }, { - 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', - 'only_matching': True, - }] - def _real_extract(self, url): video_id = self._match_id(url) return self._extract_from_playback(video_id) -class NRKTVIE(NRKBaseIE): +class NRKTVIE(InfoExtractor): IE_DESC = 'NRK TV and NRK Radio' _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' _VALID_URL = 
r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE @@ -288,7 +311,8 @@ class NRKTVIE(NRKBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - return self._extract_from_playback(video_id) + return self.url_result( + 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) class NRKTVEpisodeIE(InfoExtractor): @@ -359,8 +383,6 @@ class NRKTVSerieBaseIE(NRKBaseIE): nrk_id = episode.get('prfId') or episode.get('episodeId') if not nrk_id or not isinstance(nrk_id, compat_str): continue - if not re.match(NRKTVIE._EPISODE_RE, nrk_id): - continue entries.append(self.url_result( 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) return entries @@ -372,6 +394,10 @@ class NRKTVSerieBaseIE(NRKBaseIE): if embedded.get(asset_key): return asset_key + @staticmethod + def _catalog_name(serie_kind): + return 'podcast' if serie_kind in ('podcast', 'podkast') else 'series' + def _entries(self, data, display_id): for page_num in itertools.count(1): embedded = data.get('_embedded') or data @@ -405,7 +431,16 @@ class NRKTVSerieBaseIE(NRKBaseIE): class NRKTVSeasonIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?Ptv|radio)\.nrk\.no/serie/(?P[^/]+)/(?:sesong/)?(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + (?Ptv|radio)\.nrk\.no/ + (?Pserie|pod[ck]ast)/ + (?P[^/]+)/ + (?: + (?:sesong/)?(?P\d+)| + sesong/(?P[^/?#&]+) + ) + ''' _TESTS = [{ 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', 'info_dict': { @@ -441,19 +476,34 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): # 180 entries, single page 'url': 'https://tv.nrk.no/serie/spangas/sesong/1', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/diagnose-kverulant', + 'info_dict': { + 'id': 'hele_historien/diagnose-kverulant', + 'title': 'Diagnose kverulant', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://radio.nrk.no/podkast/loerdagsraadet/sesong/202101', + 'only_matching': True, }] @classmethod def suitable(cls, url): - return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) + return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) or NRKRadioPodkastIE.suitable(url) else super(NRKTVSeasonIE, cls).suitable(url)) def _real_extract(self, url): - domain, serie, season_id = re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + domain = mobj.group('domain') + serie_kind = mobj.group('serie_kind') + serie = mobj.group('serie') + season_id = mobj.group('id') or mobj.group('id_2') display_id = '%s/%s' % (serie, season_id) data = self._call_api( - '%s/catalog/series/%s/seasons/%s' % (domain, serie, season_id), + '%s/catalog/%s/%s/seasons/%s' + % (domain, self._catalog_name(serie_kind), serie, season_id), display_id, 'season', query={'pageSize': 50}) title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id @@ -463,7 +513,7 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): class NRKTVSeriesIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?P(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/serie/(?P[^/]+)' + _VALID_URL = r'https?://(?P(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/(?Pserie|pod[ck]ast)/(?P[^/]+)' _TESTS = [{ # new layout, instalments 'url': 'https://tv.nrk.no/serie/groenn-glede', @@ -523,23 +573,33 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): }, { 'url': 'https://nrksuper.no/serie/labyrint', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers', + 'info_dict': { + 'id': 'ulrikkes_univers', + }, + 'playlist_mincount': 10, + }, { + 'url': 
'https://radio.nrk.no/podkast/ulrikkes_univers/nrkno-poddkast-26588-134079-05042018030000', + 'only_matching': True, }] @classmethod def suitable(cls, url): return ( False if any(ie.suitable(url) - for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE)) + for ie in (NRKTVIE, NRKTVEpisodeIE, NRKRadioPodkastIE, NRKTVSeasonIE)) else super(NRKTVSeriesIE, cls).suitable(url)) def _real_extract(self, url): - site, series_id = re.match(self._VALID_URL, url).groups() + site, serie_kind, series_id = re.match(self._VALID_URL, url).groups() is_radio = site == 'radio.nrk' domain = 'radio' if is_radio else 'tv' size_prefix = 'p' if is_radio else 'embeddedInstalmentsP' series = self._call_api( - '%s/catalog/series/%s' % (domain, series_id), + '%s/catalog/%s/%s' + % (domain, self._catalog_name(serie_kind), series_id), series_id, 'serie', query={size_prefix + 'ageSize': 50}) titles = try_get(series, [ lambda x: x['titles'], @@ -554,12 +614,14 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): embedded_seasons = embedded.get('seasons') or [] if len(linked_seasons) > len(embedded_seasons): for season in linked_seasons: - season_name = season.get('name') - if season_name and isinstance(season_name, compat_str): + season_url = urljoin(url, season.get('href')) + if not season_url: + season_name = season.get('name') + if season_name and isinstance(season_name, compat_str): + season_url = 'https://%s.nrk.no/serie/%s/sesong/%s' % (domain, series_id, season_name) + if season_url: entries.append(self.url_result( - 'https://%s.nrk.no/serie/%s/sesong/%s' - % (domain, series_id, season_name), - ie=NRKTVSeasonIE.ie_key(), + season_url, ie=NRKTVSeasonIE.ie_key(), video_title=season.get('title'))) else: for season in embedded_seasons: @@ -584,6 +646,38 @@ class NRKTVDirekteIE(NRKTVIE): }] +class NRKRadioPodkastIE(InfoExtractor): + _VALID_URL = r'https?://radio\.nrk\.no/pod[ck]ast/(?:[^/]+/)+(?Pl_[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + _TESTS = [{ + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'md5': '8d40dab61cea8ab0114e090b029a0565', + 'info_dict': { + 'id': 'MUHH48000314AA', + 'ext': 'mp4', + 'title': '20 spørsmål 23.05.2014', + 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', + 'duration': 1741, + 'series': '20 spørsmål', + 'episode': '23.05.2014', + }, + }, { + 'url': 'https://radio.nrk.no/podcast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/sesong/1/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/bortfoert-i-bergen/l_774d1a2c-7aa7-4965-8d1a-2c7aa7d9652c', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) + + class NRKPlaylistBaseIE(InfoExtractor): def _extract_description(self, webpage): pass -- GitLab From e1145c77fd95b4df2c3c9e77bd6c4584838897eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:04:08 +0100 Subject: [PATCH 237/384] [nrk] Add more shortcut tests --- haruhi_dl/extractor/nrk.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 9621522d4..61a7c9aad 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -117,18 +117,30 @@ class 
NRKIE(NRKBaseIE): # podcast 'url': 'nrk:l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', 'only_matching': True, + }, { + 'url': 'nrk:podcast/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, }, { # clip 'url': 'nrk:150533', 'only_matching': True, }, { - # episode + 'url': 'nrk:clip/150533', + 'only_matching': True, + }, { + # program 'url': 'nrk:MDDP12000117', 'only_matching': True, + }, { + 'url': 'nrk:program/ENRK10100318', + 'only_matching': True, }, { # direkte 'url': 'nrk:nrk1', 'only_matching': True, + }, { + 'url': 'nrk:channel/nrk1', + 'only_matching': True, }] def _extract_from_playback(self, video_id): -- GitLab From 634ebea93d108d522cfb6816b6552b44e6b878f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:04:15 +0100 Subject: [PATCH 238/384] [nrk] Improve video id extraction --- haruhi_dl/extractor/nrk.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 61a7c9aad..5f12b0d9e 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -151,6 +151,8 @@ class NRKIE(NRKBaseIE): # known values for preferredCdn: akamai, iponly, minicdn and telenor manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) + video_id = try_get(manifest, lambda x: x['id'], compat_str) or video_id + if manifest.get('playability') == 'nonPlayable': self._raise_error(manifest['nonPlayable']) @@ -211,7 +213,7 @@ class NRKIE(NRKBaseIE): } def _real_extract(self, url): - video_id = self._match_id(url) + video_id = self._match_id(url).split('/')[-1] return self._extract_from_playback(video_id) -- GitLab From d9673551d7f3cff0fe8f5f4fa1b101279d475897 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:04:34 +0100 Subject: [PATCH 239/384] [nrk] Inline _extract_from_playback --- haruhi_dl/extractor/nrk.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 5f12b0d9e..520206534 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -143,7 +143,9 @@ class NRKIE(NRKBaseIE): 'only_matching': True, }] - def _extract_from_playback(self, video_id): + def _real_extract(self, url): + video_id = self._match_id(url).split('/')[-1] + path_templ = 'playback/%s/' + video_id def call_playback_api(item, query=None): @@ -212,10 +214,6 @@ class NRKIE(NRKBaseIE): 'formats': formats, } - def _real_extract(self, url): - video_id = self._match_id(url).split('/')[-1] - return self._extract_from_playback(video_id) - class NRKTVIE(InfoExtractor): IE_DESC = 'NRK TV and NRK Radio' -- GitLab From eff203d3aec5451693a95f21f575c540bc4d164d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:04:38 +0100 Subject: [PATCH 240/384] [nrk] Fix age limit extraction --- haruhi_dl/extractor/nrk.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 520206534..d023de7f7 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -204,6 +204,9 @@ class NRKIE(NRKBaseIE): 'height': int_or_none(image.get('pixelHeight')), }) + age_limit = int_or_none(try_get( + data, lambda x: x['legalAge']['body']['rating']['code'])) + return { 'id': video_id, 'title': title, @@ -211,6 +214,7 @@ class NRKIE(NRKBaseIE): 'description': description, 'duration': duration, 
'thumbnails': thumbnails, + 'age_limit': age_limit, 'formats': formats, } -- GitLab From db48c8dbfe36e43a33041b9738d51b4fa803ab77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:04:44 +0100 Subject: [PATCH 241/384] [nrk] Extract subtitles --- haruhi_dl/extractor/nrk.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index d023de7f7..bd96d9d14 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -12,6 +12,7 @@ from ..utils import ( ExtractorError, int_or_none, parse_duration, + str_or_none, try_get, urljoin, url_or_none, @@ -204,6 +205,21 @@ class NRKIE(NRKBaseIE): 'height': int_or_none(image.get('pixelHeight')), }) + subtitles = {} + for sub in try_get(playable, lambda x: x['subtitles'], list) or []: + if not isinstance(sub, dict): + continue + sub_url = url_or_none(sub.get('webVtt')) + if not sub_url: + continue + sub_key = str_or_none(sub.get('language')) or 'nb' + sub_type = str_or_none(sub.get('type')) + if sub_type: + sub_key += '-%s' % sub_type + subtitles.setdefault(sub_key, []).append({ + 'url': sub_url, + }) + age_limit = int_or_none(try_get( data, lambda x: x['legalAge']['body']['rating']['code'])) @@ -216,6 +232,7 @@ class NRKIE(NRKBaseIE): 'thumbnails': thumbnails, 'age_limit': age_limit, 'formats': formats, + 'subtitles': subtitles, } -- GitLab From aa829b6cd3a5f6690c195e69a15020ed872bbfe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:04:49 +0100 Subject: [PATCH 242/384] [nrk] Improve series metadata extraction --- haruhi_dl/extractor/nrk.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index bd96d9d14..20a5d7673 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -223,7 +223,9 @@ class NRKIE(NRKBaseIE): age_limit = int_or_none(try_get( data, lambda x: x['legalAge']['body']['rating']['code'])) - return { + is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' + + info = { 'id': video_id, 'title': title, 'alt_title': alt_title, @@ -235,6 +237,27 @@ class NRKIE(NRKBaseIE): 'subtitles': subtitles, } + if is_series: + series = title + if alt_title: + title += ' - %s' % alt_title + season_number = int_or_none(self._search_regex( + r'Sesong\s+(\d+)', description or '', 'season number', + default=None)) + episode = alt_title if is_series else None + episode_number = int_or_none(self._search_regex( + r'(\d+)\.\s+episode', episode or '', 'episode number', + default=None)) + info.update({ + 'title': title, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + }) + + return info + class NRKTVIE(InfoExtractor): IE_DESC = 'NRK TV and NRK Radio' -- GitLab From c00a4d81ca304960bfcab53261f2506a27e551a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:05:07 +0100 Subject: [PATCH 243/384] [nrktv] Fix tests --- haruhi_dl/extractor/nrk.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 20a5d7673..4fb7df959 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -267,7 +267,7 @@ class NRKTVIE(InfoExtractor): 'url': 
'https://tv.nrk.no/program/MDDP12000117', 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', 'info_dict': { - 'id': 'MDDP12000117AA', + 'id': 'MDDP12000117', 'ext': 'mp4', 'title': 'Alarm Trolltunga', 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', @@ -278,24 +278,25 @@ class NRKTVIE(InfoExtractor): 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'md5': '8d40dab61cea8ab0114e090b029a0565', 'info_dict': { - 'id': 'MUHH48000314AA', + 'id': 'MUHH48000314', 'ext': 'mp4', - 'title': '20 spørsmål 23.05.2014', + 'title': '20 spørsmål - 23. mai 2014', + 'alt_title': '23. mai 2014', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'duration': 1741, 'series': '20 spørsmål', - 'episode': '23.05.2014', + 'episode': '23. mai 2014', }, }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', 'info_dict': { - 'id': 'MDFP15000514CA', + 'id': 'MDFP15000514', 'ext': 'mp4', - 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', + 'title': 'Kunnskapskanalen - Grunnlovsjubiléet - Stor ståhei for ingenting', 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', 'duration': 4605.08, 'series': 'Kunnskapskanalen', - 'episode': '24.05.2014', + 'episode': 'Grunnlovsjubiléet - Stor ståhei for ingenting', }, 'params': { 'skip_download': True, @@ -304,7 +305,7 @@ class NRKTVIE(InfoExtractor): # single playlist video 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', 'info_dict': { - 'id': 'MSPO40010515AH', + 'id': 'MSPO40010515', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', @@ -317,22 +318,23 @@ class NRKTVIE(InfoExtractor): }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', 'info_dict': { - 'id': 'MSPO40010515AH', + 'id': 'MSPO40010515', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', }, 'expected_warnings': ['Failed to download m3u8 information'], + 'skip': 'Ikke tilgjengelig utenfor Norge', }, { 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', 'info_dict': { - 'id': 'KMTE50001317AA', + 'id': 'KMTE50001317', 'ext': 'mp4', - 'title': 'Anno 13:30', + 'title': 'Anno - 13. episode', 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', 'duration': 2340, 'series': 'Anno', - 'episode': '13:30', + 'episode': '13. 
episode', 'season_number': 3, 'episode_number': 13, }, @@ -342,7 +344,7 @@ class NRKTVIE(InfoExtractor): }, { 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', 'info_dict': { - 'id': 'MUHH46000317AA', + 'id': 'MUHH46000317', 'ext': 'mp4', 'title': 'Nytt på Nytt 27.01.2017', 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', -- GitLab From 57a63ed4a12df04093fc94a8e8544f4f34f8c4b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:05:12 +0100 Subject: [PATCH 244/384] [nrk] Improve episode and season number extraction --- haruhi_dl/extractor/nrk.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 4fb7df959..48387420c 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -246,7 +246,9 @@ class NRKIE(NRKBaseIE): default=None)) episode = alt_title if is_series else None episode_number = int_or_none(self._search_regex( - r'(\d+)\.\s+episode', episode or '', 'episode number', + r'^(\d+)\.', episode or '', 'episode number', + default=None)) or int_or_none(self._search_regex( + r'\((\d+)\s*:\s*\d+\)', description or '', 'episode number', default=None)) info.update({ 'title': title, @@ -374,19 +376,19 @@ class NRKTVIE(InfoExtractor): class NRKTVEpisodeIE(InfoExtractor): - _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+/sesong/\d+/episode/\d+)' + _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+/sesong/(?P\d+)/episode/(?P\d+))' _TESTS = [{ 'url': 'https://tv.nrk.no/serie/hellums-kro/sesong/1/episode/2', 'info_dict': { - 'id': 'MUHH36005220BA', + 'id': 'MUHH36005220', 'ext': 'mp4', - 'title': 'Kro, krig og kjærlighet 2:6', - 'description': 'md5:b32a7dc0b1ed27c8064f58b97bda4350', - 'duration': 1563, + 'title': 'Hellums kro - 2. Kro, krig og kjærlighet', + 'description': 'md5:ad92ddffc04cea8ce14b415deef81787', + 'duration': 1563.92, 'series': 'Hellums kro', - 'season_number': 1, + # 'season_number': 1, 'episode_number': 2, - 'episode': '2:6', + 'episode': '2. Kro, krig og kjærlighet', 'age_limit': 6, }, 'params': { @@ -395,15 +397,15 @@ class NRKTVEpisodeIE(InfoExtractor): }, { 'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8', 'info_dict': { - 'id': 'MSUI14000816AA', + 'id': 'MSUI14000816', 'ext': 'mp4', - 'title': 'Backstage 8:30', + 'title': 'Backstage - 8. episode', 'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4', 'duration': 1320, 'series': 'Backstage', 'season_number': 1, 'episode_number': 8, - 'episode': '8:30', + 'episode': '8. 
episode', }, 'params': { 'skip_download': True, @@ -412,7 +414,7 @@ class NRKTVEpisodeIE(InfoExtractor): }] def _real_extract(self, url): - display_id = self._match_id(url) + display_id, season_number, episode_number = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) @@ -424,10 +426,12 @@ class NRKTVEpisodeIE(InfoExtractor): assert re.match(NRKTVIE._EPISODE_RE, nrk_id) info.update({ - '_type': 'url_transparent', + '_type': 'url', 'id': nrk_id, 'url': 'nrk:%s' % nrk_id, 'ie_key': NRKIE.ie_key(), + 'season_number': int(season_number), + 'episode_number': int(episode_number), }) return info -- GitLab From 785078cb0880e52b055e49167a1de13725956068 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:05:21 +0100 Subject: [PATCH 245/384] [nrk] PEP 8 --- haruhi_dl/extractor/nrk.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 48387420c..2873d7938 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -247,9 +247,11 @@ class NRKIE(NRKBaseIE): episode = alt_title if is_series else None episode_number = int_or_none(self._search_regex( r'^(\d+)\.', episode or '', 'episode number', - default=None)) or int_or_none(self._search_regex( - r'\((\d+)\s*:\s*\d+\)', description or '', 'episode number', default=None)) + if not episode_number: + episode_number = int_or_none(self._search_regex( + r'\((\d+)\s*:\s*\d+\)', description or '', + 'episode number', default=None)) info.update({ 'title': title, 'series': series, -- GitLab From b51ed7b039f87362fa087ab61a1f19c7816d5ae1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:05:26 +0100 Subject: [PATCH 246/384] [nrk] Improve series metadata extraction (closes #27473) --- haruhi_dl/extractor/nrk.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 2873d7938..5d33355e7 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -238,16 +238,29 @@ class NRKIE(NRKBaseIE): } if is_series: - series = title + series = season_id = season_number = episode = episode_number = None + programs = self._call_api( + 'programs/%s' % video_id, video_id, 'programs', fatal=False) + if programs and isinstance(programs, dict): + series = str_or_none(programs.get('seriesTitle')) + season_id = str_or_none(programs.get('seasonId')) + season_number = int_or_none(programs.get('seasonNumber')) + episode = str_or_none(programs.get('episodeTitle')) + episode_number = int_or_none(programs.get('episodeNumber')) + if not series: + series = title if alt_title: title += ' - %s' % alt_title - season_number = int_or_none(self._search_regex( - r'Sesong\s+(\d+)', description or '', 'season number', - default=None)) - episode = alt_title if is_series else None - episode_number = int_or_none(self._search_regex( - r'^(\d+)\.', episode or '', 'episode number', - default=None)) + if not season_number: + season_number = int_or_none(self._search_regex( + r'Sesong\s+(\d+)', description or '', 'season number', + default=None)) + if not episode: + episode = alt_title if is_series else None + if not episode_number: + episode_number = int_or_none(self._search_regex( + r'^(\d+)\.', episode or '', 'episode number', + default=None)) if not episode_number: episode_number = int_or_none(self._search_regex( 
r'\((\d+)\s*:\s*\d+\)', description or '', @@ -255,6 +268,7 @@ class NRKIE(NRKBaseIE): info.update({ 'title': title, 'series': series, + 'season_id': season_id, 'season_number': season_number, 'episode': episode, 'episode_number': episode_number, @@ -388,7 +402,7 @@ class NRKTVEpisodeIE(InfoExtractor): 'description': 'md5:ad92ddffc04cea8ce14b415deef81787', 'duration': 1563.92, 'series': 'Hellums kro', - # 'season_number': 1, + 'season_number': 1, 'episode_number': 2, 'episode': '2. Kro, krig og kjærlighet', 'age_limit': 6, -- GitLab From 8e538fc605286405544d7f5f8b8c7bee4c555167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:05:31 +0100 Subject: [PATCH 247/384] [nrk] Fix age limit extraction --- haruhi_dl/extractor/nrk.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 5d33355e7..69178e157 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -220,8 +220,15 @@ class NRKIE(NRKBaseIE): 'url': sub_url, }) - age_limit = int_or_none(try_get( - data, lambda x: x['legalAge']['body']['rating']['code'])) + legal_age = try_get( + data, lambda x: x['legalAge']['body']['rating']['code'], compat_str) + # https://en.wikipedia.org/wiki/Norwegian_Media_Authority + if legal_age == 'A': + age_limit = 0 + elif legal_age.isdigit(): + age_limit = int_or_none(legal_age) + else: + age_limit = None is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' @@ -304,6 +311,7 @@ class NRKTVIE(InfoExtractor): 'duration': 1741, 'series': '20 spørsmål', 'episode': '23. mai 2014', + 'age_limit': 0, }, }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', @@ -315,6 +323,7 @@ class NRKTVIE(InfoExtractor): 'duration': 4605.08, 'series': 'Kunnskapskanalen', 'episode': 'Grunnlovsjubiléet - Stor ståhei for ingenting', + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -327,6 +336,7 @@ class NRKTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -340,6 +350,7 @@ class NRKTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'age_limit': 0, }, 'expected_warnings': ['Failed to download m3u8 information'], 'skip': 'Ikke tilgjengelig utenfor Norge', @@ -355,6 +366,7 @@ class NRKTVIE(InfoExtractor): 'episode': '13. episode', 'season_number': 3, 'episode_number': 13, + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -369,6 +381,7 @@ class NRKTVIE(InfoExtractor): 'duration': 1796, 'series': 'Nytt på nytt', 'episode': '27.01.2017', + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -422,6 +435,7 @@ class NRKTVEpisodeIE(InfoExtractor): 'season_number': 1, 'episode_number': 8, 'episode': '8. 
episode', + 'age_limit': 0, }, 'params': { 'skip_download': True, -- GitLab From 8406b57ac6d2b1cbebae7517d57dafb1bdcb352e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:05:39 +0100 Subject: [PATCH 248/384] [stv] improve episode id extraction(closes #23083) --- haruhi_dl/extractor/stv.py | 42 +++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/haruhi_dl/extractor/stv.py b/haruhi_dl/extractor/stv.py index bae8b71f4..539220a94 100644 --- a/haruhi_dl/extractor/stv.py +++ b/haruhi_dl/extractor/stv.py @@ -8,13 +8,17 @@ from ..utils import ( compat_str, float_or_none, int_or_none, + smuggle_url, + str_or_none, + try_get, ) class STVPlayerIE(InfoExtractor): IE_NAME = 'stv:player' _VALID_URL = r'https?://player\.stv\.tv/(?Pepisode|video)/(?P[a-z0-9]{4})' - _TEST = { + _TESTS = [{ + # shortform 'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/', 'md5': '5adf9439c31d554f8be0707c7abe7e0a', 'info_dict': { @@ -27,7 +31,11 @@ class STVPlayerIE(InfoExtractor): 'uploader_id': '1486976045', }, 'skip': 'this resource is unavailable outside of the UK', - } + }, { + # episodes + 'url': 'https://player.stv.tv/episode/4125/jennifer-saunders-memory-lane', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s' _PTYPE_MAP = { 'episode': 'episodes', @@ -36,11 +44,31 @@ class STVPlayerIE(InfoExtractor): def _real_extract(self, url): ptype, video_id = re.match(self._VALID_URL, url).groups() - resp = self._download_json( - 'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], video_id), - video_id) - result = resp['results'] + webpage = self._download_webpage(url, video_id, fatal=False) or '' + props = (self._parse_json(self._search_regex( + r']+id="__NEXT_DATA__"[^>]*>({.+?})', + webpage, 'next data', default='{}'), video_id, + fatal=False) or {}).get('props') or {} + player_api_cache = try_get( + props, lambda x: x['initialReduxState']['playerApiCache']) or {} + + api_path, resp = None, {} + for k, v in player_api_cache.items(): + if k.startswith('/episodes/') or k.startswith('/shortform/'): + api_path, resp = k, v + break + else: + episode_id = str_or_none(try_get( + props, lambda x: x['pageProps']['episodeId'])) + api_path = '/%s/%s' % (self._PTYPE_MAP[ptype], episode_id or video_id) + + result = resp.get('results') + if not result: + resp = self._download_json( + 'https://player.api.stv.tv/v1' + api_path, video_id) + result = resp['results'] + video = result['video'] video_id = compat_str(video['id']) @@ -57,7 +85,7 @@ class STVPlayerIE(InfoExtractor): return { '_type': 'url_transparent', 'id': video_id, - 'url': self.BRIGHTCOVE_URL_TEMPLATE % video_id, + 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['GB']}), 'description': result.get('summary'), 'duration': float_or_none(video.get('length'), 1000), 'subtitles': subtitles, -- GitLab From 3f43c99d4ad44e4f48ee03f76de3e5d7eb03413a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:05:44 +0100 Subject: [PATCH 249/384] =?UTF-8?q?[stitcher]=20Add=20support=20for=20show?= =?UTF-8?q?s=20and=20show=20metadata=20extraction(closes=20=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …#20510) --- haruhi_dl/extractor/extractors.py | 5 +- haruhi_dl/extractor/stitcher.py | 120 ++++++++++++++++++++++-------- 2 files changed, 92 insertions(+), 33 deletions(-) diff --git 
a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 086c7d42a..2722c0501 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1137,7 +1137,10 @@ from .spike import ( BellatorIE, ParamountNetworkIE, ) -from .stitcher import StitcherIE +from .stitcher import ( + StitcherIE, + StitcherShowIE, +) from .sport5 import Sport5IE from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE diff --git a/haruhi_dl/extractor/stitcher.py b/haruhi_dl/extractor/stitcher.py index b8b5711b1..3dd0d3b5f 100644 --- a/haruhi_dl/extractor/stitcher.py +++ b/haruhi_dl/extractor/stitcher.py @@ -1,19 +1,60 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( clean_html, ExtractorError, int_or_none, str_or_none, try_get, + url_or_none, ) -class StitcherIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/(?:[^/]+/)+e(?:pisode)?/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)' +class StitcherBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/' + + def _call_api(self, path, video_id, query): + resp = self._download_json( + 'https://api.prod.stitcher.com/' + path, + video_id, query=query) + error_message = try_get(resp, lambda x: x['errors'][0]['message']) + if error_message: + raise ExtractorError(error_message, expected=True) + return resp['data'] + + def _extract_description(self, data): + return clean_html(data.get('html_description') or data.get('description')) + + def _extract_audio_url(self, episode): + return url_or_none(episode.get('audio_url') or episode.get('guid')) + + def _extract_show_info(self, show): + return { + 'thumbnail': show.get('image_base_url'), + 'series': show.get('title'), + } + + def _extract_episode(self, episode, audio_url, show_info): + info = { + 'id': compat_str(episode['id']), + 'display_id': episode.get('slug'), + 'title': episode['title'].strip(), + 'description': self._extract_description(episode), + 'duration': int_or_none(episode.get('duration')), + 'url': audio_url, + 'vcodec': 'none', + 'timestamp': int_or_none(episode.get('date_published')), + 'season_number': int_or_none(episode.get('season')), + 'season_id': str_or_none(episode.get('season_id')), + } + info.update(show_info) + return info + + +class StitcherIE(StitcherBaseIE): + _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?:[^/]+/)+e(?:pisode)?/(?:[^/#?&]+-)?(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', 'md5': 'e9635098e0da10b21a0e2b85585530f6', 'info_dict': { 'id': '40789481', 'ext': 'mp3', 'title': 'Machine Learning Mastery and Cancer Clusters', 'description': 'md5:547adb4081864be114ae3831b4c2b42f', 'duration': 1604, 'thumbnail': r're:^https?://.*\.jpg', - 'upload_date': '20180126', - 'timestamp': 1516989316, + 'upload_date': '20151008', + 'timestamp': 1444285800, + 'series': 'Talking Machines', }, }, { 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', @@ -55,33 +97,47 @@ class StitcherIE(StitcherBaseIE): }] def _real_extract(self, url): - display_id, audio_id = re.match(self._VALID_URL, url).groups() + audio_id = self._match_id(url) + data = self._call_api( + 'shows/episodes', audio_id, {'episode_ids': audio_id}) + episode = data['episodes'][0] + audio_url = self._extract_audio_url(episode) + if not audio_url: + self.raise_login_required() + show = try_get(data, lambda x: x['shows'][0], dict) or {} + 
return self._extract_episode( + episode, audio_url, self._extract_show_info(show)) - resp = self._download_json( - 'https://api.prod.stitcher.com/episode/' + audio_id, - display_id or audio_id) - episode = try_get(resp, lambda x: x['data']['episodes'][0], dict) - if not episode: - raise ExtractorError(resp['errors'][0]['message'], expected=True) - title = episode['title'].strip() - audio_url = episode['audio_url'] +class StitcherShowIE(StitcherBaseIE): + _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?P[^/#?&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'http://www.stitcher.com/podcast/the-talking-machines', + 'info_dict': { + 'id': 'the-talking-machines', + 'title': 'Talking Machines', + 'description': 'md5:831f0995e40f26c10231af39cf1ebf0b', + }, + 'playlist_mincount': 106, + }, { + 'url': 'https://www.stitcher.com/show/the-talking-machines', + 'only_matching': True, + }] + + def _real_extract(self, url): + show_slug = self._match_id(url) + data = self._call_api( + 'search/show/%s/allEpisodes' % show_slug, show_slug, {'count': 10000}) + show = try_get(data, lambda x: x['shows'][0], dict) or {} + show_info = self._extract_show_info(show) - thumbnail = None - show_id = episode.get('show_id') - if show_id and episode.get('classic_id') != -1: - thumbnail = 'https://stitcher-classic.imgix.net/feedimages/%s.jpg' % show_id + entries = [] + for episode in (data.get('episodes') or []): + audio_url = self._extract_audio_url(episode) + if not audio_url: + continue + entries.append(self._extract_episode(episode, audio_url, show_info)) - return { - 'id': audio_id, - 'display_id': display_id, - 'title': title, - 'description': clean_html(episode.get('html_description') or episode.get('description')), - 'duration': int_or_none(episode.get('duration')), - 'thumbnail': thumbnail, - 'url': audio_url, - 'vcodec': 'none', - 'timestamp': int_or_none(episode.get('date_created')), - 'season_number': int_or_none(episode.get('season')), - 'season_id': str_or_none(episode.get('season_id')), - } + return self.playlist_result( + entries, show_slug, show.get('title'), + self._extract_description(show)) -- GitLab From 28c4062a5894da0e236352880d741cfc52a9f4b5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:05:50 +0100 Subject: [PATCH 250/384] =?UTF-8?q?[twitter]=20try=20to=20use=20a=20Generi?= =?UTF-8?q?c=20fallback=20for=20unknown=20twitter=20cards(clo=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ses #25982) --- haruhi_dl/extractor/twitter.py | 52 ++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/haruhi_dl/extractor/twitter.py b/haruhi_dl/extractor/twitter.py index 4284487db..a35e1686c 100644 --- a/haruhi_dl/extractor/twitter.py +++ b/haruhi_dl/extractor/twitter.py @@ -251,10 +251,10 @@ class TwitterIE(TwitterBaseIE): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'simon vetugo - BEAT PROD: @suhmeduh #Damndaniel', + 'title': 'simon vertugo - BEAT PROD: @suhmeduh #Damndaniel', 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'simon vetugo', + 'uploader': 'simon vertugo', 'uploader_id': 'simonvertugo', 'duration': 30.0, 'timestamp': 1455777459, @@ -312,6 +312,7 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1492000653, 'upload_date': '20170412', }, + 'skip': 'Account suspended', }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', 'info_dict': { @@ -380,6 +381,14 
@@ class TwitterIE(TwitterBaseIE): # promo_video_website card 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', 'only_matching': True, + }, { + # promo_video_convo card + 'url': 'https://twitter.com/poco_dandy/status/1047395834013384704', + 'only_matching': True, + }, { + # appplayer card + 'url': 'https://twitter.com/poco_dandy/status/1150646424461176832', + 'only_matching': True, }] def _real_extract(self, url): @@ -462,7 +471,25 @@ class TwitterIE(TwitterBaseIE): return try_get(o, lambda x: x[x['type'].lower() + '_value']) card_name = card['name'].split(':')[-1] - if card_name in ('amplify', 'promo_video_website'): + if card_name == 'player': + info.update({ + '_type': 'url', + 'url': get_binding_value('player_url'), + }) + elif card_name == 'periscope_broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('url') or get_binding_value('player_url'), + 'ie_key': PeriscopeIE.ie_key(), + }) + elif card_name == 'broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('broadcast_url'), + 'ie_key': TwitterBroadcastIE.ie_key(), + }) + # amplify, promo_video_website, promo_video_convo, appplayer, ... + else: is_amplify = card_name == 'amplify' vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) @@ -488,25 +515,6 @@ class TwitterIE(TwitterBaseIE): 'duration': int_or_none(get_binding_value( 'content_duration_seconds')), }) - elif card_name == 'player': - info.update({ - '_type': 'url', - 'url': get_binding_value('player_url'), - }) - elif card_name == 'periscope_broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('url') or get_binding_value('player_url'), - 'ie_key': PeriscopeIE.ie_key(), - }) - elif card_name == 'broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('broadcast_url'), - 'ie_key': TwitterBroadcastIE.ie_key(), - }) - else: - raise ExtractorError('Unsupported Twitter Card.') else: expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url']) if not expanded_url: -- GitLab From 3f2bf67bc908cb1b11ccdfc29b69c00f6bde1cf5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:05:55 +0100 Subject: [PATCH 251/384] [twitter] Add support for summary card(closes #25121) --- haruhi_dl/extractor/twitter.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/haruhi_dl/extractor/twitter.py b/haruhi_dl/extractor/twitter.py index a35e1686c..1190d721e 100644 --- a/haruhi_dl/extractor/twitter.py +++ b/haruhi_dl/extractor/twitter.py @@ -488,6 +488,11 @@ class TwitterIE(TwitterBaseIE): 'url': get_binding_value('broadcast_url'), 'ie_key': TwitterBroadcastIE.ie_key(), }) + elif card_name == 'summary': + info.update({ + '_type': 'url', + 'url': get_binding_value('card_url'), + }) # amplify, promo_video_website, promo_video_convo, appplayer, ... 
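# (card names not matched above, e.g. promo_video_website, promo_video_convo
# and appplayer, fall through to this generic branch and are resolved via
# their vmap/player stream URL instead of raising 'Unsupported Twitter Card.')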
else: is_amplify = card_name == 'amplify' -- GitLab From a22e2b59b4d4ce1af561668352252554c7d7ae91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:06:00 +0100 Subject: [PATCH 252/384] [nrktv] Add subtitles test --- haruhi_dl/extractor/nrk.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 69178e157..cafb85616 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -298,6 +298,14 @@ class NRKTVIE(InfoExtractor): 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', 'duration': 2223.44, 'age_limit': 6, + 'subtitles': { + 'nb-nor': [{ + 'ext': 'vtt', + }], + 'nb-ttv': [{ + 'ext': 'vtt', + }] + }, }, }, { 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', -- GitLab From e98e8454c5298961453be56d1c7f8a66a991f2a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:06:04 +0100 Subject: [PATCH 253/384] [xfileshare] Add support for aparat.cam (closes #27651) --- haruhi_dl/extractor/xfileshare.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/haruhi_dl/extractor/xfileshare.py b/haruhi_dl/extractor/xfileshare.py index 20f7013f3..783358fe9 100644 --- a/haruhi_dl/extractor/xfileshare.py +++ b/haruhi_dl/extractor/xfileshare.py @@ -45,6 +45,7 @@ def aa_decode(aa_code): class XFileShareIE(InfoExtractor): _SITES = ( + (r'aparat\.cam', 'Aparat'), (r'clipwatching\.com', 'ClipWatching'), (r'gounlimited\.to', 'GoUnlimited'), (r'govid\.me', 'GoVid'), @@ -78,6 +79,9 @@ class XFileShareIE(InfoExtractor): 'title': 'sample', 'thumbnail': r're:http://.*\.jpg', }, + }, { + 'url': 'https://aparat.cam/n4d6dh0wvlpr', + 'only_matching': True, }] @staticmethod -- GitLab From 017215032a927504449ccf7c578a472139570e89 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:06:12 +0100 Subject: [PATCH 254/384] [utils] add a function to clean podcast URLs --- haruhi_dl/utils.py | 17 +++++++++++++++++ test/test_utils.py | 5 +++++ 2 files changed, 22 insertions(+) diff --git a/haruhi_dl/utils.py b/haruhi_dl/utils.py index 62b59bcdb..d35033b7e 100644 --- a/haruhi_dl/utils.py +++ b/haruhi_dl/utils.py @@ -5708,3 +5708,20 @@ def random_birthday(year_field, month_field, day_field): month_field: str(random_date.month), day_field: str(random_date.day), } + + +def clean_podcast_url(url): + return re.sub(r'''(?x) + (?: + (?: + chtbl\.com/track| + media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/ + play\.podtrac\.com + )/[^/]+| + (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure + flex\.acast\.com| + pd(?: + cn\.co| # https://podcorn.com/analytics-prefix/ + st\.fm # https://podsights.com/docs/ + )/e + )/''', '', url) diff --git a/test/test_utils.py b/test/test_utils.py index dc3dde0c4..d052a23de 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -21,6 +21,7 @@ from haruhi_dl.utils import ( encode_base_n, caesar, clean_html, + clean_podcast_url, date_from_str, DateRange, detect_exe_version, @@ -1470,6 +1471,10 @@ Line 1 self.assertEqual(get_elements_by_attribute('class', 'foo', html), []) self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), []) + def test_clean_podcast_url(self): + self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 
'https://traffic.megaphone.fm/HSW7835899191.mp3') + self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3') + if __name__ == '__main__': unittest.main() -- GitLab From 1e653be1d0bb4efbf7011204da9060b5ad1b0ad4 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:06:16 +0100 Subject: [PATCH 255/384] [stitcher] clean podcast URLs --- haruhi_dl/extractor/stitcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/stitcher.py b/haruhi_dl/extractor/stitcher.py index 3dd0d3b5f..822782507 100644 --- a/haruhi_dl/extractor/stitcher.py +++ b/haruhi_dl/extractor/stitcher.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( clean_html, + clean_podcast_url, ExtractorError, int_or_none, str_or_none, @@ -43,7 +44,7 @@ class StitcherBaseIE(InfoExtractor): 'title': episode['title'].strip(), 'description': self._extract_description(episode), 'duration': int_or_none(episode.get('duration')), - 'url': audio_url, + 'url': clean_podcast_url(audio_url), 'vcodec': 'none', 'timestamp': int_or_none(episode.get('date_published')), 'season_number': int_or_none(episode.get('season')), -- GitLab From 626d26e13accd0d7424b935e33e31c45b92cc851 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:06:22 +0100 Subject: [PATCH 256/384] [acast] clean podcast URLs --- haruhi_dl/extractor/acast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/acast.py b/haruhi_dl/extractor/acast.py index 60378db1b..b9355a2c8 100644 --- a/haruhi_dl/extractor/acast.py +++ b/haruhi_dl/extractor/acast.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + clean_podcast_url, int_or_none, parse_iso8601, ) @@ -17,7 +18,7 @@ class ACastBaseIE(InfoExtractor): info = { 'id': episode['id'], 'display_id': episode.get('episodeUrl'), - 'url': episode['url'], + 'url': clean_podcast_url(episode['url']), 'title': title, 'description': clean_html(episode.get('description') or episode.get('summary')), 'thumbnail': episode.get('image'), -- GitLab From e52adb5328a6b67d4fc889969d07cb40fa75472b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:06:26 +0100 Subject: [PATCH 257/384] [iheart] Add new extractor for iHeartRadio(#27037) --- haruhi_dl/extractor/extractors.py | 4 ++ haruhi_dl/extractor/iheart.py | 97 +++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 haruhi_dl/extractor/iheart.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 2722c0501..265556a21 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -493,6 +493,10 @@ from .ign import ( OneUPIE, PCMagIE, ) +from .iheart import ( + IHeartRadioIE, + IHeartRadioPodcastIE, +) from .imdb import ( ImdbIE, ImdbListIE diff --git a/haruhi_dl/extractor/iheart.py b/haruhi_dl/extractor/iheart.py new file mode 100644 index 000000000..6710baeb4 --- /dev/null +++ b/haruhi_dl/extractor/iheart.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + 
str_or_none, +) + + +class IHeartRadioBaseIE(InfoExtractor): + def _call_api(self, path, video_id, fatal=True, query=None): + return self._download_json( + 'https://api.iheart.com/api/v3/podcast/' + path, + video_id, fatal=fatal, query=query) + + def _extract_episode(self, episode): + return { + 'thumbnail': episode.get('imageUrl'), + 'description': episode.get('description'), + 'timestamp': int_or_none(episode.get('startDate'), 1000), + 'duration': int_or_none(episode.get('duration')), + } + + +class IHeartRadioIE(IHeartRadioBaseIE): + IE_NAME = 'iheartradio' + _VALID_URL = r'(?:https?://(?:www\.)?iheart\.com/podcast/[^/]+/episode/(?P<display_id>[^/?&#]+)-|iheartradio:)(?P<id>\d+)' + _TEST = { + 'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true', + 'md5': 'c8609c92c8688dcb69d8541042b8abca', + 'info_dict': { + 'id': '70346499', + 'ext': 'mp3', + 'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus', + 'description': 'md5:66480b2d25ec93a5f60c0faa3275ce5c', + 'timestamp': 1597741200, + 'upload_date': '20200818', + } + } + + def _real_extract(self, url): + episode_id = self._match_id(url) + episode = self._call_api( + 'episodes/' + episode_id, episode_id)['episode'] + info = self._extract_episode(episode) + print(episode['mediaUrl']) + info.update({ + 'id': episode_id, + 'title': episode['title'], + 'url': clean_podcast_url(episode['mediaUrl']), + }) + return info + + +class IHeartRadioPodcastIE(IHeartRadioBaseIE): + IE_NAME = 'iheartradio:podcast' + _VALID_URL = r'https?://(?:www\.)?iheart(?:podcastnetwork)?\.com/podcast/[^/?&#]+-(?P<id>\d+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://www.iheart.com/podcast/1119-it-could-happen-here-30717896/', + 'info_dict': { + 'id': '30717896', + 'title': 'It Could Happen Here', + 'description': 'md5:5842117412a967eb0b01f8088eb663e2', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://www.iheartpodcastnetwork.com/podcast/105-stuff-you-should-know-26940277', + 'only_matching': True, + }] + + def _real_extract(self, url): + podcast_id = self._match_id(url) + path = 'podcasts/' + podcast_id + episodes = self._call_api( + path + '/episodes', podcast_id, query={'limit': 1000000000})['data'] + + entries = [] + for episode in episodes: + episode_id = str_or_none(episode.get('id')) + if not episode_id: + continue + info = self._extract_episode(episode) + info.update({ + '_type': 'url', + 'id': episode_id, + 'title': episode.get('title'), + 'url': 'iheartradio:' + episode_id, + 'ie_key': IHeartRadioIE.ie_key(), + }) + entries.append(info) + + podcast = self._call_api(path, podcast_id, False) or {} + + return self.playlist_result( + entries, podcast_id, podcast.get('title'), podcast.get('description')) -- GitLab From 1b1752a1b5806055971fc2cf903eb3e2f888b81a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:08:15 +0100 Subject: [PATCH 258/384] [googlepodcasts] Add new extractor --- haruhi_dl/extractor/extractors.py | 4 ++ haruhi_dl/extractor/googlepodcasts.py | 88 +++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 haruhi_dl/extractor/googlepodcasts.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 265556a21..96b039096 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -448,6 +448,10 @@ from .godtube import GodTubeIE from .golem import GolemIE from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE +from .googlepodcasts import
( + GooglePodcastsIE, + GooglePodcastsFeedIE, +) from .googlesearch import GoogleSearchIE from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE diff --git a/haruhi_dl/extractor/googlepodcasts.py b/haruhi_dl/extractor/googlepodcasts.py new file mode 100644 index 000000000..31ad79907 --- /dev/null +++ b/haruhi_dl/extractor/googlepodcasts.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + try_get, + urlencode_postdata, +) + + +class GooglePodcastsBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://podcasts\.google\.com/feed/' + + def _batch_execute(self, func_id, video_id, params): + return json.loads(self._download_json( + 'https://podcasts.google.com/_/PodcastsUi/data/batchexecute', + video_id, data=urlencode_postdata({ + 'f.req': json.dumps([[[func_id, json.dumps(params), None, '1']]]), + }), transform_source=lambda x: self._search_regex(r'(?s)(\[.+\])', x, 'data'))[0][2]) + + def _extract_episode(self, episode): + return { + 'id': episode[4][3], + 'title': episode[8], + 'url': clean_podcast_url(episode[13]), + 'thumbnail': episode[2], + 'description': episode[9], + 'creator': try_get(episode, lambda x: x[14]), + 'timestamp': int_or_none(episode[11]), + 'duration': int_or_none(episode[12]), + 'series': episode[1], + } + + +class GooglePodcastsIE(GooglePodcastsBaseIE): + IE_NAME = 'google:podcasts' + _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<feed_url>[^/]+)/episode/(?P<id>[^/?&#]+)' + _TEST = { + 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA/episode/MzBlNWRlN2UtOWE4Yy00ODcwLTk2M2MtM2JlMmUyNmViOTRh', + 'md5': 'fa56b2ee8bd0703e27e42d4b104c4766', + 'info_dict': { + 'id': '30e5de7e-9a8c-4870-963c-3be2e26eb94a', + 'ext': 'mp3', + 'title': 'WWDTM New Year 2021', + 'description': 'We say goodbye to 2020 with Christine Baranksi, Doug Jones, Jonna Mendez, and Kellee Edwards.', + 'upload_date': '20210102', + 'timestamp': 1609606800, + 'duration': 2901, + 'series': "Wait Wait... Don't Tell Me!", + } + } + + def _real_extract(self, url): + b64_feed_url, b64_guid = re.match(self._VALID_URL, url).groups() + episode = self._batch_execute( + 'oNjqVe', b64_guid, [b64_feed_url, b64_guid])[1] + return self._extract_episode(episode) + + +class GooglePodcastsFeedIE(GooglePodcastsBaseIE): + IE_NAME = 'google:podcasts:feed' + _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<id>[^/?&#]+)/?(?:[?#&]|$)' + _TEST = { + 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA', + 'info_dict': { + 'title': "Wait Wait... Don't Tell Me!", + 'description': "NPR's weekly current events quiz.
Have a laugh and test your news knowledge while figuring out what's real and what we've made up.", + }, + 'playlist_mincount': 20, + } + + def _real_extract(self, url): + b64_feed_url = self._match_id(url) + data = self._batch_execute('ncqJEe', b64_feed_url, [b64_feed_url]) + + entries = [] + for episode in (try_get(data, lambda x: x[1][0]) or []): + entries.append(self._extract_episode(episode)) + + feed = try_get(data, lambda x: x[3]) or [] + return self.playlist_result( + entries, playlist_title=try_get(feed, lambda x: x[0]), + playlist_description=try_get(feed, lambda x: x[2])) -- GitLab From 607b324dfff8cb513e224e61520d711df282ae5d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:08:20 +0100 Subject: [PATCH 259/384] [applepodcasts] Add new extractor(#25918) --- haruhi_dl/extractor/applepodcasts.py | 61 ++++++++++++++++++++++++++++ haruhi_dl/extractor/extractors.py | 1 + 2 files changed, 62 insertions(+) create mode 100644 haruhi_dl/extractor/applepodcasts.py diff --git a/haruhi_dl/extractor/applepodcasts.py b/haruhi_dl/extractor/applepodcasts.py new file mode 100644 index 000000000..95758fece --- /dev/null +++ b/haruhi_dl/extractor/applepodcasts.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + parse_iso8601, + try_get, +) + + +class ApplePodcastsIE(InfoExtractor): + _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', + 'md5': 'df02e6acb11c10e844946a39e7222b08', + 'info_dict': { + 'id': '1000482637777', + 'ext': 'mp3', + 'title': '207 - Whitney Webb Returns', + 'description': 'md5:13a73bade02d2e43737751e3987e1399', + 'upload_date': '20200705', + 'timestamp': 1593921600, + 'duration': 6425, + 'series': 'The Tim Dillon Show', + } + }, { + 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', + 'only_matching': True, + }, { + 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777', + 'only_matching': True, + }, { + 'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777', + 'only_matching': True, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + webpage = self._download_webpage(url, episode_id) + ember_data = self._parse_json(self._search_regex( + r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', + webpage, 'ember data'), episode_id) + episode = ember_data['data']['attributes'] + description = episode.get('description') or {} + + series = None + for inc in (ember_data.get('included') or []): + if inc.get('type') == 'media/podcast': + series = try_get(inc, lambda x: x['attributes']['name']) + + return { + 'id': episode_id, + 'title': episode['name'], + 'url': clean_podcast_url(episode['assetUrl']), + 'description': description.get('standard') or description.get('short'), + 'timestamp': parse_iso8601(episode.get('releaseDateTime')), + 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), + 'series': series, + } diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 96b039096..2b2bd0b7c 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -65,6 +65,7 @@ from .appletrailers import ( AppleTrailersIE, AppleTrailersSectionIE, ) +from .applepodcasts import ApplePodcastsIE from .archiveorg
import ArchiveOrgIE from .arcpublishing import ArcPublishingIE from .arkena import ArkenaIE -- GitLab From 1dc43fd3fc6d741bad0ee412b0ff0149d418e0ad Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:08:27 +0100 Subject: [PATCH 260/384] [googleplus] Remove Extractor(closes #4955)(closes #7400) --- haruhi_dl/extractor/extractors.py | 1 - haruhi_dl/extractor/googleplus.py | 73 ------------------------------- 2 files changed, 74 deletions(-) delete mode 100644 haruhi_dl/extractor/googleplus.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 2b2bd0b7c..2717c6e45 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -448,7 +448,6 @@ from .go import GoIE from .godtube import GodTubeIE from .golem import GolemIE from .googledrive import GoogleDriveIE -from .googleplus import GooglePlusIE from .googlepodcasts import ( GooglePodcastsIE, GooglePodcastsFeedIE, diff --git a/haruhi_dl/extractor/googleplus.py b/haruhi_dl/extractor/googleplus.py deleted file mode 100644 index 6b927bb44..000000000 --- a/haruhi_dl/extractor/googleplus.py +++ /dev/null @@ -1,73 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import codecs - -from .common import InfoExtractor -from ..utils import unified_strdate - - -class GooglePlusIE(InfoExtractor): - IE_DESC = 'Google Plus' - _VALID_URL = r'https?://plus\.google\.com/(?:[^/]+/)*?posts/(?P<id>\w+)' - IE_NAME = 'plus.google' - _TEST = { - 'url': 'https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH', - 'info_dict': { - 'id': 'ZButuJc6CtH', - 'ext': 'flv', - 'title': '嘆きの天使 降臨', - 'upload_date': '20120613', - 'uploader': '井上ヨシマサ', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - # Step 1, Retrieve post webpage to extract further information - webpage = self._download_webpage(url, video_id, 'Downloading entry webpage') - - title = self._og_search_description(webpage).splitlines()[0] - upload_date = unified_strdate(self._html_search_regex( - r'''(?x) - ([0-9]{4}-[0-9]{2}-[0-9]{2})''', - webpage, 'upload date', fatal=False, flags=re.VERBOSE)) - uploader = self._html_search_regex( - r'rel="author".*?>(.*?)</a>', webpage, 'uploader', fatal=False) - - # Step 2, Simulate clicking the image box to launch video - DOMAIN = 'https://plus.google.com/' - video_page = self._search_regex( - r' Date: Fri, 26 Feb 2021 16:08:34 +0100 Subject: [PATCH 261/384] [iheart] remove print statement --- haruhi_dl/extractor/iheart.py | 1 - 1 file changed, 1 deletion(-) diff --git a/haruhi_dl/extractor/iheart.py b/haruhi_dl/extractor/iheart.py index 6710baeb4..7a7295ff4 100644 --- a/haruhi_dl/extractor/iheart.py +++ b/haruhi_dl/extractor/iheart.py @@ -45,7 +45,6 @@ class IHeartRadioIE(IHeartRadioBaseIE): episode = self._call_api( 'episodes/' + episode_id, episode_id)['episode'] info = self._extract_episode(episode) - print(episode['mediaUrl']) info.update({ 'id': episode_id, 'title': episode['title'], -- GitLab From 67ff5da6ea2c9ec7dc57bd43d2f345f4e038f751 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:08:39 +0100 Subject: [PATCH 262/384] [iheart] clean HTML tags from episode description --- haruhi_dl/extractor/iheart.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/iheart.py b/haruhi_dl/extractor/iheart.py index 7a7295ff4..266c67a76 100644 --- a/haruhi_dl/extractor/iheart.py +++ b/haruhi_dl/extractor/iheart.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common
import InfoExtractor from ..utils import ( + clean_html, clean_podcast_url, int_or_none, str_or_none, @@ -18,7 +19,7 @@ class IHeartRadioBaseIE(InfoExtractor): def _extract_episode(self, episode): return { 'thumbnail': episode.get('imageUrl'), - 'description': episode.get('description'), + 'description': clean_html(episode.get('description')), 'timestamp': int_or_none(episode.get('startDate'), 1000), 'duration': int_or_none(episode.get('duration')), } -- GitLab From 25dff12eb19d0bc4851057c970c6ebbf995637d4 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:08:44 +0100 Subject: [PATCH 263/384] [nrk] fix extraction for videos without a legalAge rating --- haruhi_dl/extractor/nrk.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index cafb85616..40dee2162 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -223,12 +223,12 @@ class NRKIE(NRKBaseIE): legal_age = try_get( data, lambda x: x['legalAge']['body']['rating']['code'], compat_str) # https://en.wikipedia.org/wiki/Norwegian_Media_Authority - if legal_age == 'A': - age_limit = 0 - elif legal_age.isdigit(): - age_limit = int_or_none(legal_age) - else: - age_limit = None + age_limit = None + if legal_age: + if legal_age == 'A': + age_limit = 0 + elif legal_age.isdigit(): + age_limit = int_or_none(legal_age) is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' -- GitLab From 055e9eb904a78069da659711520e46ba1b730d9a Mon Sep 17 00:00:00 2001 From: Yurii H Date: Fri, 26 Feb 2021 16:08:50 +0100 Subject: [PATCH 264/384] [iheart] Update test description value (#27037) the description has no HTML tags now. --- haruhi_dl/extractor/iheart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/iheart.py b/haruhi_dl/extractor/iheart.py index 266c67a76..b54c05eeb 100644 --- a/haruhi_dl/extractor/iheart.py +++ b/haruhi_dl/extractor/iheart.py @@ -35,7 +35,7 @@ class IHeartRadioIE(IHeartRadioBaseIE): 'id': '70346499', 'ext': 'mp3', 'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus', - 'description': 'md5:66480b2d25ec93a5f60c0faa3275ce5c', + 'description': 'md5:96cc7297b3a5a9ebae28643801c96fae', 'timestamp': 1597741200, 'upload_date': '20200818', } -- GitLab From 903c90bd4ccead58f6247d2f2fb06ca16148168e Mon Sep 17 00:00:00 2001 From: Kevin O'Connor Date: Fri, 26 Feb 2021 16:08:56 +0100 Subject: [PATCH 265/384] [downloader/hls] Disable decryption in tests (#27660) Tests truncate the download to 10241 bytes, which is not divisible by 16 and cannot be decrypted. Tests don't really care about the decrypted content, just that the data they retrieved is the expected data. Therefore, it's fine to just return the encrypted data to tests. 
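As a rough illustration of the alignment problem (a hypothetical sketch, not part of this patch; dummy key and IV, using the same Crypto.Cipher module hls.py already imports):

    from Crypto.Cipher import AES  # pycryptodome/PyCrypto, as used by hls.py

    cipher = AES.new(b'\x00' * 16, AES.MODE_CBC, b'\x00' * 16)  # dummy 16-byte key and IV
    try:
        # 10241 = 640 * 16 + 1, i.e. one byte past a block boundary
        cipher.decrypt(b'\x00' * 10241)
    except ValueError as err:
        print(err)  # CBC mode rejects ciphertext that is not a multiple of 16 bytes

So the truncated test download can only be compared byte-for-byte; it cannot be fed to the cipher.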
See: #27621 and #27620 --- haruhi_dl/downloader/hls.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/downloader/hls.py b/haruhi_dl/downloader/hls.py index 56c84e113..3aa58e1c0 100644 --- a/haruhi_dl/downloader/hls.py +++ b/haruhi_dl/downloader/hls.py @@ -172,8 +172,12 @@ class HlsFD(FragmentFD): iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) decrypt_info['KEY'] = decrypt_info.get('KEY') or self.hdl.urlopen( self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read() - frag_content = AES.new( - decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) + # Don't decrypt the content in tests since the data is explicitly truncated and it's not a valid block + # size (see https://github.com/hdl-org/haruhi-dl/pull/27660). Tests only care that the correct data was downloaded, + # not what it decrypts to. + if not test: + frag_content = AES.new( + decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) self._append_fragment(ctx, frag_content) # We only download the first fragment during the test if test: -- GitLab From a72df1d2492115e432be97363f6cb2d9f9021414 Mon Sep 17 00:00:00 2001 From: cladmi Date: Fri, 26 Feb 2021 16:09:15 +0100 Subject: [PATCH 266/384] [motherless] Fix recent videos upload date extraction (closes #27661) Videos less than a week old use a '20h ago' or '1d ago' format. I kept the support for 'Ago' with uppercase start as it was already in the code. --- haruhi_dl/extractor/motherless.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/motherless.py b/haruhi_dl/extractor/motherless.py index b1615b4d8..6cc36b308 100644 --- a/haruhi_dl/extractor/motherless.py +++ b/haruhi_dl/extractor/motherless.py @@ -85,18 +85,27 @@ class MotherlessIE(InfoExtractor): or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( - (r'>(\d+)\s+Views<', - r'<strong>Views</strong>\s+([^<]+)<'), + (r'>([\d,.]+)\s+Views<', # 1,234,567 Views + r'<strong>Views</strong>\s+([^<]+)<'), webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( - (r'>(\d+)\s+Favorites<', - r'<strong>Favorited</strong>\s+([^<]+)<'), + (r'>([\d,.]+)\s+Favorites<', # 1,234 Favorites + r'<strong>Favorited</strong>\s+([^<]+)<'), webpage, 'like count', fatal=False)) upload_date = self._html_search_regex( (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', + r'class=["\']count[^>]+>(\d+[hd])\s+[aA]go<', # 20h/1d ago r'<strong>Uploaded</strong>\s+([^<]+)<'), webpage, 'upload date') - if 'Ago' in upload_date: - days = int(re.search(r'([0-9]+)', upload_date).group(1)) - upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') + relative = re.match(r'(\d+)([hd])$', upload_date) + if relative: + delta = int(relative.group(1)) + unit = relative.group(2) + if unit == 'h': + delta_t = datetime.timedelta(hours=delta) + else: # unit == 'd' + delta_t = datetime.timedelta(days=delta) + upload_date = (datetime.datetime.now() - delta_t).strftime('%Y%m%d') else: upload_date = unified_strdate(upload_date) -- GitLab From e94762a1a79d81b580f5c9b69b36c079e4fe9bda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:09:21 +0100 Subject: [PATCH 267/384] =?UTF-8?q?[motherless]=20Fix=20review=20issues=20?= =?UTF-8?q?and=20improve=20extraction=20(closes=20#26495,=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit … closes #27450) --- haruhi_dl/extractor/motherless.py | 52 ++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 18 deletions(-) diff --git a/haruhi_dl/extractor/motherless.py b/haruhi_dl/extractor/motherless.py index 6cc36b308..ef1e081f2 100644 --- a/haruhi_dl/extractor/motherless.py +++ b/haruhi_dl/extractor/motherless.py @@ -61,6 +61,23 @@ class MotherlessIE(InfoExtractor): # no keywords 'url': 'http://motherless.com/8B4BBC1', 'only_matching': True, + }, { + # see https://motherless.com/videos/recent for recent videos with + # uploaded date in "ago" format + 'url': 'https://motherless.com/3C3E2CF', + 'info_dict': { + 'id': '3C3E2CF', + 'ext': 'mp4', + 'title': 'a/ Hot Teens', + 'categories': list, + 'upload_date': '20210104', + 'uploader_id': 'yonbiw', + 'thumbnail': r're:https?://.*\.jpg', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -85,29 +102,28 @@ class MotherlessIE(InfoExtractor): or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( - (r'>([\d,.]+)\s+Views<', # 1,234,567 Views - r'<strong>Views</strong>\s+([^<]+)<'), + (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'), webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( - (r'>([\d,.]+)\s+Favorites<', # 1,234 Favorites + (r'>([\d,.]+)\s+Favorites<', r'<strong>Favorited</strong>\s+([^<]+)<'), webpage, 'like count', fatal=False)) - upload_date = self._html_search_regex( - (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', - r'class=["\']count[^>]+>(\d+[hd])\s+[aA]go<', # 20h/1d ago - r'<strong>Uploaded</strong>\s+([^<]+)<'), webpage, 'upload date') - relative = re.match(r'(\d+)([hd])$', upload_date) - if relative: - delta = int(relative.group(1)) - unit = relative.group(2) - if unit == 'h': - delta_t = datetime.timedelta(hours=delta) - else: # unit == 'd' - delta_t = datetime.timedelta(days=delta) - upload_date = (datetime.datetime.now() - delta_t).strftime('%Y%m%d') - else: - upload_date = unified_strdate(upload_date) + upload_date = unified_strdate(self._search_regex( + r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage, + 'upload date', default=None)) + if not upload_date: + uploaded_ago = self._search_regex( + r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago', + default=None) + if uploaded_ago: + delta = int(uploaded_ago[:-1]) + _AGO_UNITS = { + 'h': 'hours', + 'd': 'days', + } + kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} + upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') comment_count = webpage.count('class="media-comment-contents"') uploader_id = self._html_search_regex( -- GitLab From 7f4e988520f71abf757e7781c04548febe67ff4e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:09:33 +0100 Subject: [PATCH 268/384] [dplay] Add support for Discovery+ domains(closes #27680) --- haruhi_dl/extractor/dplay.py | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/haruhi_dl/extractor/dplay.py b/haruhi_dl/extractor/dplay.py index a7b9db568..47501dbe6 100644 --- a/haruhi_dl/extractor/dplay.py +++ b/haruhi_dl/extractor/dplay.py @@ -17,7 +17,12 @@ from ..utils import ( class DPlayIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?P<domain> - (?:www\.)?(?P<host>dplay\.(?P<country>dk|fi|jp|se|no))| + (?:www\.)?(?P<host>d + (?: + play\.(?P<country>dk|fi|jp|se|no)| + iscoveryplus\.(?P<plus_country>dk|es|fi|it|se|no) + ) + )| (?P<subdomain_country>es|it)\.dplay\.com
)/[^/]+/(?P<id>[^/]+/[^/?#]+)''' @@ -126,6 +131,24 @@ class DPlayIE(InfoExtractor): }, { 'url': 'https://www.dplay.jp/video/gold-rush/24086', 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.no/videoer/i-kongens-klr/sesong-1-episode-7', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.it/videos/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.es/videos/la-fiebre-del-oro/temporada-8-episodio-1', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.fi/videot/shifting-gears-with-aaron-kaufman/episode-16', + 'only_matching': True, }] def _get_disco_api_info(self, url, display_id, disco_host, realm, country): @@ -241,7 +264,7 @@ class DPlayIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') domain = mobj.group('domain').lstrip('www.') - country = mobj.group('country') or mobj.group('subdomain_country') - host = 'disco-api.' + domain if domain.startswith('dplay.') else 'eu2-prod.disco-api.com' + country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country') + host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' return self._get_disco_api_info( url, display_id, host, 'dplay' + country, country) -- GitLab From eb001126da5540c22be16037b01efec4ca58b954 Mon Sep 17 00:00:0