From b85fc0e982c09b85d2f4e90102ee2594931df3fb Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 01:17:49 +0100
Subject: [PATCH 001/384] [cnbc] fix extraction

---
 haruhi_dl/extractor/cnbc.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/haruhi_dl/extractor/cnbc.py b/haruhi_dl/extractor/cnbc.py
index 6889b0f40..7b9f4536a 100644
--- a/haruhi_dl/extractor/cnbc.py
+++ b/haruhi_dl/extractor/cnbc.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
 from .common import InfoExtractor
 from ..utils import smuggle_url
 
@@ -38,7 +39,7 @@ class CNBCIE(InfoExtractor):
 
 
 class CNBCVideoIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)'
+    _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)'
     _TEST = {
         'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html',
         'info_dict': {
@@ -56,11 +57,15 @@ class CNBCVideoIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-        video_id = self._search_regex(
-            r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id,
-            'video id')
+        path, display_id = re.match(self._VALID_URL, url).groups()
+        video_id = self._download_json(
+            'https://webql-redesign.cnbcfm.com/graphql', display_id, query={
+                'query': '''{
+  page(path: "%s") {
+    vcpsId
+  }
+}''' % path,
+            })['data']['page']['vcpsId']
         return self.url_result(
-            'http://video.cnbc.com/gallery/?video=%s' % video_id,
+            'http://video.cnbc.com/gallery/?video=%d' % video_id,
             CNBCIE.ie_key())

From 0f60a7c66cf704acb211dc7559ce043a15d423ca Mon Sep 17 00:00:00 2001
From: Edward Betts
Date: Fri, 26 Feb 2021 13:59:51 +0100
Subject: [PATCH 002/384] [devscripts/make_lazy_extractors] Correct a spelling
 mistake (#26991)

---
 devscripts/make_lazy_extractors.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py
index 0cfdf37ca..32f344201 100644
--- a/devscripts/make_lazy_extractors.py
+++ b/devscripts/make_lazy_extractors.py
@@ -77,7 +77,7 @@ def build_lazy_ie(ie, name):
     return s
 
 
-# find the correct sorting and add the required base classes so that sublcasses
+# find the correct sorting and add the required base classes so that subclasses
 # can be correctly created
 classes = _ALL_CLASSES[:-1]
 ordered_cls = []

From 4d26aa35af598a48a555bfc48bda3594e9416835 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:00:20 +0100
Subject: [PATCH 003/384] [nbc] fix NBCNews/Today/MSNBC extraction

---
 haruhi_dl/extractor/nbc.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/haruhi_dl/extractor/nbc.py b/haruhi_dl/extractor/nbc.py
index 6f3cb3003..ea5f5a315 100644
--- a/haruhi_dl/extractor/nbc.py
+++ b/haruhi_dl/extractor/nbc.py
@@ -10,7 +10,6 @@ from .adobepass import AdobePassIE
 from ..compat import compat_urllib_parse_unquote
 from ..utils import (
     int_or_none,
-    js_to_json,
     parse_duration,
     smuggle_url,
     try_get,
@@ -394,8 +393,8 @@ class NBCNewsIE(ThePlatformIE):
         webpage = self._download_webpage(url, video_id)
 
         data = self._parse_json(self._search_regex(
-            r'window\.__data\s*=\s*({.+});', webpage,
-            'bootstrap json'), video_id, js_to_json)
+            r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
+            webpage, 'bootstrap json'), video_id)['props']['initialState']
         video_data = try_get(data, lambda x: x['video']['current'], dict)
         if not video_data:
             video_data = data['article']['content'][0]['primaryMedia']['video']

From 33c8322b1d56a252f848d8f208fab1ca213c60cc Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:02:17 +0100
Subject: [PATCH 004/384] [usanetwork] fix extraction

---
 haruhi_dl/extractor/usanetwork.py | 82 ++++++-------------------
 1 file changed, 16 insertions(+), 66 deletions(-)

diff --git a/haruhi_dl/extractor/usanetwork.py b/haruhi_dl/extractor/usanetwork.py
index 54c7495cc..e3784e55f 100644
--- a/haruhi_dl/extractor/usanetwork.py
+++ b/haruhi_dl/extractor/usanetwork.py
@@ -1,74 +1,24 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from .adobepass import AdobePassIE
-from ..utils import (
-    NO_DEFAULT,
-    smuggle_url,
-    update_url_query,
-)
+from .nbc import NBCIE
 
 
-class USANetworkIE(AdobePassIE):
-    _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P<id>[^/?#]+)'
-    _TEST = {
-        'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity',
-        'md5': '33c0d2ba381571b414024440d08d57fd',
+class USANetworkIE(NBCIE):
+    _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/[^/]+/video/[^/]+/(?P<id>\d+))'
+    _TESTS = [{
+        'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302',
         'info_dict': {
-            'id': '3086229',
+            'id': '4185302',
             'ext': 'mp4',
-            'title': 'HPE Cybersecurity',
-            'description': 'The more we digitize our world, the more vulnerable we are.',
-            'upload_date': '20160818',
-            'timestamp': 1471535460,
-            'uploader': 'NBCU-USA',
+            'title': 'Intelligence (Trailer)',
+            'description': 'A maverick NSA agent enlists the help of a junior systems analyst in a workplace power grab.',
+            'upload_date': '20200715',
+            'timestamp': 1594785600,
+            'uploader': 'NBCU-MPAT',
         },
-    }
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-
-        def _x(name, default=NO_DEFAULT):
-            return self._search_regex(
-                r'data-%s\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % name,
-                webpage, name, default=default, group='value')
-
-        video_id = _x('mpx-guid')
-        title = _x('episode-title')
-        mpx_account_id = _x('mpx-account-id', '2304992029')
-
-        query = {
-            'mbr': 'true',
-        }
-        if _x('is-full-episode', None) == '1':
-            query['manifest'] = 'm3u'
-
-        if _x('is-entitlement', None) == '1':
-            adobe_pass = {}
-            drupal_settings = self._search_regex(
-                r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
-                webpage, 'drupal settings', fatal=False)
-            if drupal_settings:
-                drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False)
-                if drupal_settings:
-                    adobe_pass = drupal_settings.get('adobePass', {})
-            resource = self._get_mvpd_resource(
-                adobe_pass.get('adobePassResourceId', 'usa'),
-                title, video_id, _x('episode-rating', 'TV-14'))
-            query['auth'] = self._extract_mvpd_auth(
-                url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource)
-
-        info = self._search_json_ld(webpage, video_id, default={})
-        info.update({
-            '_type': 'url_transparent',
-            'url': smuggle_url(update_url_query(
-                'http://link.theplatform.com/s/HNK2IC/media/guid/%s/%s' % (mpx_account_id, video_id),
-                query), {'force_smil_url': True}),
-            'id': video_id,
-            'title': title,
-            'series': _x('show-title', None),
-            'episode': title,
-            'ie_key': 'ThePlatform',
-        })
-        return info
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+    }]

From d52a2bf577a44f6bed6c6cefcadf0229266ca635 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:09:47 +0100
Subject: [PATCH 005/384] [rai] fix RaiPlay extraction

---
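Note: the change below relies on RaiPlay serving the same page as JSON when the
.html suffix is swapped for .json. A minimal standalone sketch of that request,
mirroring the diff that follows (the helper name and plain-urllib usage are
illustrative, not part of the extractor):

    import json
    import urllib.request

    def fetch_raiplay_media(page_url):
        # e.g. .../Report-del-07042014-<uuid>.html -> the same path ending in .json
        json_url = page_url.replace('.html', '.json')
        with urllib.request.urlopen(json_url) as resp:
            media = json.loads(resp.read().decode('utf-8'))
        # media['name'] and media['video']['content_url'] feed the relinker below
        return media['name'], media['video']['content_url']
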
haruhi_dl/extractor/rai.py | 61 +++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/haruhi_dl/extractor/rai.py b/haruhi_dl/extractor/rai.py index 207a6c247..bee2d53f5 100644 --- a/haruhi_dl/extractor/rai.py +++ b/haruhi_dl/extractor/rai.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -16,7 +17,6 @@ from ..utils import ( int_or_none, parse_duration, strip_or_none, - try_get, unescapeHTML, unified_strdate, unified_timestamp, @@ -141,6 +141,7 @@ class RaiPlayIE(RaiBaseIE): 'series': 'La Casa Bianca', 'season': '2016', }, + 'skip': 'This content is not available', }, { 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', @@ -148,14 +149,12 @@ class RaiPlayIE(RaiBaseIE): 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', 'ext': 'mp4', 'title': 'Report del 07/04/2014', - 'alt_title': 'S2013/14 - Puntata del 07/04/2014', - 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', + 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014', + 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 5', - 'creator': 'Rai 5', + 'uploader': 'Rai Gulp', 'duration': 6160, 'series': 'Report', - 'season_number': 5, 'season': '2013/14', }, 'params': { @@ -167,48 +166,51 @@ class RaiPlayIE(RaiBaseIE): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - url, video_id = mobj.group('url', 'id') + url, video_id = re.match(self._VALID_URL, url).groups() media = self._download_json( - '%s?json' % url, video_id, 'Downloading video JSON') + url.replace('.html', '.json'), video_id, 'Downloading video JSON') title = media['name'] video = media['video'] - relinker_info = self._extract_relinker_info(video['contentUrl'], video_id) + relinker_info = self._extract_relinker_info(video['content_url'], video_id) self._sort_formats(relinker_info['formats']) thumbnails = [] - if 'images' in media: - for _, value in media.get('images').items(): - if value: - thumbnails.append({ - 'url': value.replace('[RESOLUTION]', '600x400') - }) + for _, value in media.get('images', {}).items(): + if value: + thumbnails.append({ + 'url': urljoin(url, value), + }) - timestamp = unified_timestamp(try_get( - media, lambda x: x['availabilities'][0]['start'], compat_str)) + date_published = media.get('date_published') + time_published = media.get('time_published') + if date_published and time_published: + date_published += ' ' + time_published subtitles = self._extract_subtitles(url, video.get('subtitles')) + program_info = media.get('program_info') or {} + season = media.get('season') + info = { 'id': video_id, 'title': self._live_title(title) if relinker_info.get( 'is_live') else title, - 'alt_title': media.get('subtitle'), + 'alt_title': strip_or_none(media.get('subtitle')), 'description': media.get('description'), 'uploader': strip_or_none(media.get('channel')), - 'creator': strip_or_none(media.get('editor')), + 'creator': strip_or_none(media.get('editor') or None), 'duration': parse_duration(video.get('duration')), - 'timestamp': timestamp, + 'timestamp': unified_timestamp(date_published), 'thumbnails': thumbnails, - 'series': try_get( - media, lambda x: x['isPartOf']['name'], compat_str), - 'season_number': int_or_none(try_get( - media, lambda x: x['isPartOf']['numeroStagioni'])), - 'season': media.get('stagione') or None, + 'series': program_info.get('name'), + 'season_number': 
int_or_none(season), + 'season': season if (season and not season.isdigit()) else None, + 'episode': media.get('episode_title'), + 'episode_number': int_or_none(media.get('episode')), 'subtitles': subtitles, } @@ -300,7 +302,8 @@ class RaiIE(RaiBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1758, 'upload_date': '20140612', - } + }, + 'skip': 'This content is available only in Italy', }, { # with ContentItem in many metas 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', @@ -316,7 +319,7 @@ class RaiIE(RaiBaseIE): }, { # with ContentItem in og:url 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', - 'md5': '11959b4e44fa74de47011b5799490adf', + 'md5': '6865dd00cf0bbf5772fdd89d59bd768a', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', @@ -338,6 +341,7 @@ class RaiIE(RaiBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20141221', }, + 'skip': 'This content is not available', }, { # initEdizione('ContentItem-...' 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', @@ -360,6 +364,7 @@ class RaiIE(RaiBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'This content is available only in Italy', }, { # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', From 44676b32c351b02b26e50706df448bde8da4e207 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 14:09:54 +0100 Subject: [PATCH 006/384] [bandcamp] fix extraction --- haruhi_dl/extractor/bandcamp.py | 149 +++++++++++++------------------- 1 file changed, 58 insertions(+), 91 deletions(-) diff --git a/haruhi_dl/extractor/bandcamp.py b/haruhi_dl/extractor/bandcamp.py index 9ac93645e..82b605531 100644 --- a/haruhi_dl/extractor/bandcamp.py +++ b/haruhi_dl/extractor/bandcamp.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import random @@ -5,10 +6,7 @@ import re import time from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, float_or_none, @@ -17,30 +15,32 @@ from ..utils import ( parse_filesize, str_or_none, try_get, - unescapeHTML, update_url_query, unified_strdate, unified_timestamp, url_or_none, + urljoin, ) class BandcampIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P[^/?#&]+)' + _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://haruhi-dl.bandcamp.com/track/haruhi-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "haruhi-dl \"'/\\\u00e4\u21ad - haruhi-dl test song \"'/\\\u00e4\u21ad", + 'title': "haruhi-dl \"'/\\ä↭ - haruhi-dl \"'/\\ä↭ - haruhi-dl test song \"'/\\ä↭", 'duration': 9.8485, + 'uploader': 'haruhi-dl "\'/\\ä↭', + 'upload_date': '20121129', + 'timestamp': 1354224127, }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { # free download 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '853e35bf34aa1d6fe2615ae612564b36', 'info_dict': { 'id': '2650410135', 'ext': 'aiff', @@ -79,11 +79,16 @@ class BandcampIE(InfoExtractor): }, }] + def _extract_data_attr(self, webpage, video_id, attr='tralbum', 
fatal=True): + return self._parse_json(self._html_search_regex( + r'data-%s=(["\'])({.+?})\1' % attr, webpage, + attr + ' data', group=2), video_id, fatal=fatal) + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') + title = self._match_id(url) webpage = self._download_webpage(url, title) - thumbnail = self._html_search_meta('og:image', webpage, default=None) + tralbum = self._extract_data_attr(webpage, title) + thumbnail = self._og_search_thumbnail(webpage) track_id = None track = None @@ -91,10 +96,7 @@ class BandcampIE(InfoExtractor): duration = None formats = [] - track_info = self._parse_json( - self._search_regex( - r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', - webpage, 'track info', default='{}'), title) + track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict) if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -111,37 +113,25 @@ class BandcampIE(InfoExtractor): 'abr': int_or_none(abr_str), }) track = track_info.get('title') - track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) + track_id = str_or_none( + track_info.get('track_id') or track_info.get('id')) track_number = int_or_none(track_info.get('track_num')) duration = float_or_none(track_info.get('duration')) - def extract(key): - return self._search_regex( - r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key, - webpage, key, default=None, group='value') - - artist = extract('artist') - album = extract('album_title') + embed = self._extract_data_attr(webpage, title, 'embed', False) + current = tralbum.get('current') or {} + artist = embed.get('artist') or current.get('artist') or tralbum.get('artist') timestamp = unified_timestamp( - extract('publish_date') or extract('album_publish_date')) - release_date = unified_strdate(extract('album_release_date')) + current.get('publish_date') or tralbum.get('album_publish_date')) - download_link = self._search_regex( - r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'download link', default=None, group='url') + download_link = tralbum.get('freeDownloadPage') if download_link: - track_id = self._search_regex( - r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', - webpage, 'track id') + track_id = compat_str(tralbum['id']) download_webpage = self._download_webpage( download_link, track_id, 'Downloading free downloads page') - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, - 'blob', group='blob'), - track_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(download_webpage, track_id, 'blob') info = try_get( blob, (lambda x: x['digital_items'][0], @@ -207,20 +197,20 @@ class BandcampIE(InfoExtractor): 'thumbnail': thumbnail, 'uploader': artist, 'timestamp': timestamp, - 'release_date': release_date, + 'release_date': unified_strdate(tralbum.get('album_release_date')), 'duration': duration, 'track': track, 'track_number': track_number, 'track_id': track_id, 'artist': artist, - 'album': album, + 'album': embed.get('album_title'), 'formats': formats, } -class BandcampAlbumIE(InfoExtractor): +class BandcampAlbumIE(BandcampIE): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?' 
_TESTS = [{ 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -230,7 +220,10 @@ class BandcampAlbumIE(InfoExtractor): 'info_dict': { 'id': '1353101989', 'ext': 'mp3', - 'title': 'Intro', + 'title': 'Blazo - Intro', + 'timestamp': 1311756226, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, { @@ -238,7 +231,10 @@ class BandcampAlbumIE(InfoExtractor): 'info_dict': { 'id': '38097443', 'ext': 'mp3', - 'title': 'Kero One - Keep It Alive (Blazo remix)', + 'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)', + 'timestamp': 1311757238, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, ], @@ -294,41 +290,31 @@ class BandcampAlbumIE(InfoExtractor): else super(BandcampAlbumIE, cls).suitable(url)) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - uploader_id = mobj.group('subdomain') - album_id = mobj.group('album_id') + uploader_id, album_id = re.match(self._VALID_URL, url).groups() playlist_id = album_id or uploader_id webpage = self._download_webpage(url, playlist_id) - track_elements = re.findall( - r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage) - if not track_elements: + tralbum = self._extract_data_attr(webpage, playlist_id) + track_info = tralbum.get('trackinfo') + if not track_info: raise ExtractorError('The page doesn\'t contain any tracks') # Only tracks with duration info have songs entries = [ self.url_result( - compat_urlparse.urljoin(url, t_path), - ie=BandcampIE.ie_key(), - video_title=self._search_regex( - r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', - elem_content, 'track title', fatal=False)) - for elem_content, t_path in track_elements - if self._html_search_meta('duration', elem_content, default=None)] + urljoin(url, t['title_link']), BandcampIE.ie_key(), + str_or_none(t.get('track_id') or t.get('id')), t.get('title')) + for t in track_info + if t.get('duration')] - title = self._html_search_regex( - r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', - webpage, 'title', fatal=False) - if title: - title = title.replace(r'\"', '"') return { '_type': 'playlist', 'uploader_id': uploader_id, 'id': playlist_id, - 'title': title, + 'title': try_get(tralbum, lambda x: x['current']['title'], compat_str), 'entries': entries, } -class BandcampWeeklyIE(InfoExtractor): +class BandcampWeeklyIE(BandcampIE): IE_NAME = 'Bandcamp:weekly' _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' _TESTS = [{ @@ -343,29 +329,23 @@ class BandcampWeeklyIE(InfoExtractor): 'release_date': '20170404', 'series': 'Bandcamp Weekly', 'episode': 'Magic Moments', - 'episode_number': 208, 'episode_id': '224', - } + }, + 'params': { + 'format': 'opus-lo', + }, }, { 'url': 'https://bandcamp.com/?blah/blah@&show=228', 'only_matching': True }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage, - 'blob', group='blob'), - video_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(webpage, show_id, 'blob') - show = blob['bcw_show'] - - # This is desired because any invalid show id redirects to `bandcamp.com` - # which happens to expose the latest Bandcamp Weekly episode. 
- show_id = int_or_none(show.get('show_id')) or int_or_none(video_id) + show = blob['bcw_data'][show_id] formats = [] for format_id, format_url in show['audio_stream'].items(): @@ -390,20 +370,8 @@ class BandcampWeeklyIE(InfoExtractor): if subtitle: title += ' - %s' % subtitle - episode_number = None - seq = blob.get('bcw_seq') - - if seq and isinstance(seq, list): - try: - episode_number = next( - int_or_none(e.get('episode_number')) - for e in seq - if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) - except StopIteration: - pass - return { - 'id': video_id, + 'id': show_id, 'title': title, 'description': show.get('desc') or show.get('short_desc'), 'duration': float_or_none(show.get('audio_duration')), @@ -411,7 +379,6 @@ class BandcampWeeklyIE(InfoExtractor): 'release_date': unified_strdate(show.get('published_date')), 'series': 'Bandcamp Weekly', 'episode': show.get('subtitle'), - 'episode_number': episode_number, - 'episode_id': compat_str(video_id), + 'episode_id': show_id, 'formats': formats } From c62c95923ad915d07631afb2afe0ca2551f52681 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:10:03 +0100 Subject: [PATCH 007/384] [condenast] fix extraction and extract subtitles --- haruhi_dl/extractor/condenast.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/condenast.py b/haruhi_dl/extractor/condenast.py index ed278fefc..d5e77af32 100644 --- a/haruhi_dl/extractor/condenast.py +++ b/haruhi_dl/extractor/condenast.py @@ -16,6 +16,8 @@ from ..utils import ( mimetype2ext, orderedSet, parse_iso8601, + strip_or_none, + try_get, ) @@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor): 'uploader': 'gq', 'upload_date': '20170321', 'timestamp': 1490126427, + 'description': 'How much grimmer would things be if these people were competent?', }, }, { # JS embed @@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor): 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', 'uploader': 'arstechnica', 'upload_date': '20150916', - 'timestamp': 1442434955, + 'timestamp': 1442434920, } }, { 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', @@ -196,6 +199,13 @@ class CondeNastIE(InfoExtractor): }) self._sort_formats(formats) + subtitles = {} + for t, caption in video_info.get('captions', {}).items(): + caption_url = caption.get('src') + if not (t in ('vtt', 'srt', 'tml') and caption_url): + continue + subtitles.setdefault('en', []).append({'url': caption_url}) + return { 'id': video_id, 'formats': formats, @@ -208,6 +218,7 @@ class CondeNastIE(InfoExtractor): 'season': video_info.get('season_title'), 'timestamp': parse_iso8601(video_info.get('premiere_date')), 'categories': video_info.get('categories'), + 'subtitles': subtitles, } def _real_extract(self, url): @@ -225,8 +236,16 @@ class CondeNastIE(InfoExtractor): if url_type == 'series': return self._extract_series(url, webpage) else: - params = self._extract_video_params(webpage, display_id) - info = self._search_json_ld( - webpage, display_id, fatal=False) + video = try_get(self._parse_json(self._search_regex( + r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage, + 'preload state', '{}'), display_id), + lambda x: x['transformed']['video']) + if video: + params = {'videoId': video['id']} + info = {'description': strip_or_none(video.get('description'))} + else: + params = self._extract_video_params(webpage, display_id) + info = self._search_json_ld( + webpage, display_id, 
fatal=False) info.update(self._extract_video(params)) return info From 2901a6439ba3b9a36ae9c6b159d69eceaeb8e55c Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:10:38 +0100 Subject: [PATCH 008/384] [lrt] fix extraction --- haruhi_dl/extractor/lrt.py | 91 +++++++++++++++----------------------- 1 file changed, 36 insertions(+), 55 deletions(-) diff --git a/haruhi_dl/extractor/lrt.py b/haruhi_dl/extractor/lrt.py index f5c997ef4..a89434adb 100644 --- a/haruhi_dl/extractor/lrt.py +++ b/haruhi_dl/extractor/lrt.py @@ -5,28 +5,26 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, - int_or_none, - parse_duration, - remove_end, + clean_html, + merge_dicts, ) class LRTIE(InfoExtractor): IE_NAME = 'lrt.lt' - _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))' _TESTS = [{ # m3u8 download - 'url': 'http://www.lrt.lt/mediateka/irasas/54391/', - 'md5': 'fe44cf7e4ab3198055f2c598fc175cb0', + 'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene', + 'md5': '85cb2bb530f31d91a9c65b479516ade4', 'info_dict': { - 'id': '54391', + 'id': '2000127261', 'ext': 'mp4', - 'title': 'Septynios Kauno dienos', - 'description': 'md5:24d84534c7dc76581e59f5689462411a', - 'duration': 1783, - 'view_count': int, - 'like_count': int, + 'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė', + 'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa', + 'duration': 3035, + 'timestamp': 1604079000, + 'upload_date': '20201030', }, }, { # direct mp3 download @@ -43,52 +41,35 @@ class LRTIE(InfoExtractor): }, }] + def _extract_js_var(self, webpage, var_name, default): + return self._search_regex( + r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name, + webpage, var_name.replace('_', ' '), default, group=2) + def _real_extract(self, url): - video_id = self._match_id(url) + path, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' - LRT') + media_url = self._extract_js_var(webpage, 'main_url', path) + media = self._download_json(self._extract_js_var( + webpage, 'media_info_url', + 'https://www.lrt.lt/servisai/stream_url/vod/media_info/'), + video_id, query={'url': media_url}) + jw_data = self._parse_jwplayer_data( + media['playlist_item'], video_id, base_url=url) - formats = [] - for _, file_url in re.findall( - r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): - ext = determine_ext(file_url) - if ext not in ('m3u8', 'mp3'): + json_ld_data = self._search_json_ld(webpage, video_id) + + tags = [] + for tag in media.get('tags', []): + tag_name = tag.get('name') + if not tag_name: continue - # mp3 served as m3u8 produces stuttered media file - if ext == 'm3u8' and '.mp3' in file_url: - continue - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - file_url, video_id, 'mp4', entry_protocol='m3u8_native', - fatal=False)) - elif ext == 'mp3': - formats.append({ - 'url': file_url, - 'vcodec': 'none', - }) - self._sort_formats(formats) + tags.append(tag_name) - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description(webpage) - duration = parse_duration(self._search_regex( - r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1', - webpage, 'duration', default=None, group='duration')) - - 
view_count = int_or_none(self._html_search_regex( - r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>', - webpage, 'view count', fatal=False, group='count')) - like_count = int_or_none(self._search_regex( - r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<', - webpage, 'like count', fatal=False, group='count')) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, + clean_info = { + 'description': clean_html(media.get('content')), + 'tags': tags, } + + return merge_dicts(clean_info, jw_data, json_ld_data) From bc38ef944526c0217e1c60351d920e22a233b5b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:10:46 +0100 Subject: [PATCH 009/384] [utils] Skip ! prefixed code in js_to_json --- haruhi_dl/utils.py | 5 +++-- test/test_utils.py | 22 ++++++++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/utils.py b/haruhi_dl/utils.py index 6a02c6f0d..2bba1b04c 100644 --- a/haruhi_dl/utils.py +++ b/haruhi_dl/utils.py @@ -4080,7 +4080,7 @@ def js_to_json(code): v = m.group(0) if v in ('true', 'false', 'null'): return v - elif v.startswith('/*') or v.startswith('//') or v == ',': + elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',': return "" if v[0] in ("'", '"'): @@ -4105,7 +4105,8 @@ def js_to_json(code): {comment}|,(?={skip}[\]}}])| (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*| \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| - [0-9]+(?={skip}:) + [0-9]+(?={skip}:)| + !+ '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) diff --git a/test/test_utils.py b/test/test_utils.py index a57863825..fcb86d92a 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -937,6 +937,28 @@ class TestUtil(unittest.TestCase): self.assertEqual(d['x'], 1) self.assertEqual(d['y'], 'a') + # Just drop ! 
prefix for now though this results in a wrong value + on = js_to_json('''{ + a: !0, + b: !1, + c: !!0, + d: !!42.42, + e: !!![], + f: !"abc", + g: !"", + !42: 42 + }''') + self.assertEqual(json.loads(on), { + 'a': 0, + 'b': 1, + 'c': 0, + 'd': 42.42, + 'e': [], + 'f': "abc", + 'g': "", + '42': 42 + }) + on = js_to_json('["abc", "def",]') self.assertEqual(json.loads(on), ['abc', 'def']) From 51dd5a4cc51e420bc93919062f3efeb50e581469 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:10:56 +0100 Subject: [PATCH 010/384] [xtube] Fix extraction (closes #26996) --- haruhi_dl/extractor/xtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/xtube.py b/haruhi_dl/extractor/xtube.py index 01b253dcb..18969058f 100644 --- a/haruhi_dl/extractor/xtube.py +++ b/haruhi_dl/extractor/xtube.py @@ -90,7 +90,7 @@ class XTubeIE(InfoExtractor): title, thumbnail, duration = [None] * 3 config = self._parse_json(self._search_regex( - r'playerConf\s*=\s*({.+?})\s*,\s*\n', webpage, 'config', + r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config', default='{}'), video_id, transform_source=js_to_json, fatal=False) if config: config = config.get('mainRoll') From 058b02f57f04e4b443bd2073eeb56d5f680fbd99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:11:04 +0100 Subject: [PATCH 011/384] =?UTF-8?q?[servus]=20Fix=20extraction=20(closes?= =?UTF-8?q?=20#26872,=20closes=20#26967,=20closes=20#26983,=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … closes #27000) --- haruhi_dl/extractor/servus.py | 106 +++++++++++++++++++++++++++++----- 1 file changed, 91 insertions(+), 15 deletions(-) diff --git a/haruhi_dl/extractor/servus.py b/haruhi_dl/extractor/servus.py index 9401bf2cf..206bc1801 100644 --- a/haruhi_dl/extractor/servus.py +++ b/haruhi_dl/extractor/servus.py @@ -1,9 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + unified_timestamp, + urlencode_postdata, + url_or_none, +) class ServusIE(InfoExtractor): @@ -19,13 +25,22 @@ class ServusIE(InfoExtractor): _TESTS = [{ # new URL schema 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', - 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', + 'md5': '60474d4c21f3eb148838f215c37f02b9', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', 'ext': 'mp4', 'title': 'Die Grünen aus Sicht des Volkes', + 'alt_title': 'Talk im Hangar-7 Voxpops Gruene', 'description': 'md5:1247204d85783afe3682644398ff2ec4', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 62.442, + 'timestamp': 1605193976, + 'upload_date': '20201112', + 'series': 'Talk im Hangar-7', + 'season': 'Season 9', + 'season_number': 9, + 'episode': 'Episode 31 - September 14', + 'episode_number': 31, } }, { # old URL schema @@ -44,26 +59,87 @@ class ServusIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url).upper() - webpage = self._download_webpage(url, video_id) - title = self._search_regex( - (r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', - r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'), - webpage, 'title', default=None, - group='title') or self._og_search_title(webpage) - title = re.sub(r'\s*-\s*Servus TV\s*$', '', title) - description = 
self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + token = self._download_json( + 'https://auth.redbullmediahouse.com/token', video_id, + 'Downloading token', data=urlencode_postdata({ + 'grant_type': 'client_credentials', + }), headers={ + 'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==', + }) + access_token = token['access_token'] + token_type = token.get('token_type', 'Bearer') - formats = self._extract_m3u8_formats( - 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id, - video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + video = self._download_json( + 'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id, + video_id, 'Downloading video JSON', headers={ + 'Authorization': '%s %s' % (token_type, access_token), + }) + + formats = [] + thumbnail = None + for resource in video['resources']: + if not isinstance(resource, dict): + continue + format_url = url_or_none(resource.get('url')) + if not format_url: + continue + extension = resource.get('extension') + type_ = resource.get('type') + if extension == 'jpg' or type_ == 'reference_keyframe': + thumbnail = format_url + continue + ext = determine_ext(format_url) + if type_ == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + elif type_ == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif extension == 'mp4' or ext == 'mp4': + formats.append({ + 'url': format_url, + 'format_id': type_, + 'width': int_or_none(resource.get('width')), + 'height': int_or_none(resource.get('height')), + }) self._sort_formats(formats) + attrs = {} + for attribute in video['attributes']: + if not isinstance(attribute, dict): + continue + key = attribute.get('fieldKey') + value = attribute.get('fieldValue') + if not key or not value: + continue + attrs[key] = value + + title = attrs.get('title_stv') or video_id + alt_title = attrs.get('title') + description = attrs.get('long_description') or attrs.get('short_description') + series = attrs.get('label') + season = attrs.get('season') + episode = attrs.get('chapter') + duration = float_or_none(attrs.get('duration'), scale=1000) + season_number = int_or_none(self._search_regex( + r'Season (\d+)', season or '', 'season number', default=None)) + episode_number = int_or_none(self._search_regex( + r'Episode (\d+)', episode or '', 'episode number', default=None)) + return { 'id': video_id, 'title': title, + 'alt_title': alt_title, 'description': description, 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': unified_timestamp(video.get('lastPublished')), + 'series': series, + 'season': season, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, 'formats': formats, } From ae004ab316f3d44521833f333089c410496c0a93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:11:12 +0100 Subject: [PATCH 012/384] [servus] Add support for pm-wissen.com (closes #25869) --- haruhi_dl/extractor/servus.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/servus.py b/haruhi_dl/extractor/servus.py index 206bc1801..1610ddc2c 100644 --- a/haruhi_dl/extractor/servus.py +++ b/haruhi_dl/extractor/servus.py @@ -18,7 +18,7 @@ class 
ServusIE(InfoExtractor): (?:www\.)? (?: servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| - servustv\.com/videos + (?:servustv|pm-wissen)\.com/videos ) /(?P<id>[aA]{2}-\w+|\d+-\d+) ''' @@ -55,6 +55,9 @@ class ServusIE(InfoExtractor): }, { 'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/', 'only_matching': True, + }, { + 'url': 'https://www.pm-wissen.com/videos/aa-24mus4g2w2112/', + 'only_matching': True, }] def _real_extract(self, url): From 883cf213dc5155e01cde5c2060033589cfb81fba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:11:26 +0100 Subject: [PATCH 013/384] [ndr:embed:base] Extract subtitles (closes #25447, closes #26106) --- haruhi_dl/extractor/ndr.py | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/haruhi_dl/extractor/ndr.py b/haruhi_dl/extractor/ndr.py index 2447c812e..ddd828d92 100644 --- a/haruhi_dl/extractor/ndr.py +++ b/haruhi_dl/extractor/ndr.py @@ -81,6 +81,29 @@ class NDRIE(NDRBaseIE): 'params': { 'skip_download': True, }, + }, { + # with subtitles + 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', + 'info_dict': { + 'id': 'extra18674', + 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', + 'ext': 'mp4', + 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', + 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6', + 'uploader': 'ndrtv', + 'upload_date': '20201113', + 'duration': 1749, + 'subtitles': { + 'de': [{ + 'ext': 'ttml', + 'url': r're:^https://www\.ndr\.de.+', + }], + }, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html', 'only_matching': True, @@ -239,6 +262,20 @@ class NDREmbedBaseIE(InfoExtractor): 'preference': quality_key(thumbnail.get('quality')), }) + subtitles = {} + tracks = config.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + track_url = urljoin(url, track.get('src')) + if not track_url: + continue + subtitles.setdefault(track.get('srclang') or 'de', []).append({ + 'url': track_url, + 'ext': 'ttml', + }) + return { 'id': video_id, 'title': title, @@ -248,6 +285,7 @@ class NDREmbedBaseIE(InfoExtractor): 'duration': duration, 'thumbnails': thumbnails, 'formats': formats, + 'subtitles': subtitles, } From ebc218c4c4fa5db963a4407b80e3e39456eb4326 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:11:39 +0100 Subject: [PATCH 014/384] [lrt] fix extraction with empty tags(closes #20264) --- haruhi_dl/extractor/lrt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/lrt.py b/haruhi_dl/extractor/lrt.py index a89434adb..89d549858 100644 --- a/haruhi_dl/extractor/lrt.py +++ b/haruhi_dl/extractor/lrt.py @@ -61,7 +61,7 @@ class LRTIE(InfoExtractor): json_ld_data = self._search_json_ld(webpage, video_id) tags = [] - for tag in media.get('tags', []): + for tag in (media.get('tags') or []): tag_name = tag.get('name') if not tag_name: continue From 768e8bb238e9bd511f4faf197379d68d85923ff1 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:11:55 +0100 Subject: [PATCH 015/384] [urplay] fix extraction(closes #26828) --- haruhi_dl/extractor/urplay.py | 77 
+++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/haruhi_dl/extractor/urplay.py b/haruhi_dl/extractor/urplay.py index 6030b7cb5..10b817760 100644 --- a/haruhi_dl/extractor/urplay.py +++ b/haruhi_dl/extractor/urplay.py @@ -2,7 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_timestamp +from ..utils import ( + dict_get, + int_or_none, + unified_timestamp, +) class URPlayIE(InfoExtractor): @@ -15,8 +19,8 @@ class URPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd', 'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a', - 'timestamp': 1513512768, - 'upload_date': '20171217', + 'timestamp': 1513292400, + 'upload_date': '20171214', }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', @@ -25,7 +29,7 @@ class URPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'Tripp, Trapp, Träd : Sovkudde', 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', - 'timestamp': 1440093600, + 'timestamp': 1440086400, 'upload_date': '20150820', }, }, { @@ -35,37 +39,58 @@ class URPlayIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - + url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) - urplayer_data = self._parse_json(self._search_regex( - r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id) - host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] + urplayer_data = self._parse_json(self._html_search_regex( + r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['currentProduct'] + episode = urplayer_data['title'] + raw_streaming_info = urplayer_data['streamingInfo']['raw'] + host = self._download_json( + 'http://streaming-loadbalancer.ur.se/loadbalancer.json', + video_id)['redirect'] formats = [] - for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)): - file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr) + for k, v in raw_streaming_info.items(): + if not (k in ('sd', 'hd') and isinstance(v, dict)): + continue + file_http = v.get('location') if file_http: formats.extend(self._extract_wowza_formats( - 'http://%s/%splaylist.m3u8' % (host, file_http), video_id, skip_protocols=['rtmp', 'rtsp'])) + 'http://%s/%splaylist.m3u8' % (host, file_http), + video_id, skip_protocols=['f4m', 'rtmp', 'rtsp'])) self._sort_formats(formats) - subtitles = {} - for subtitle in urplayer_data.get('subtitles', []): - subtitle_url = subtitle.get('file') - kind = subtitle.get('kind') - if not subtitle_url or (kind and kind != 'captions'): - continue - subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({ - 'url': subtitle_url, - }) + image = urplayer_data.get('image') or {} + thumbnails = [] + for k, v in image.items(): + t = { + 'id': k, + 'url': v, + } + wh = k.split('x') + if len(wh) == 2: + t.update({ + 'width': int_or_none(wh[0]), + 'height': int_or_none(wh[1]), + }) + thumbnails.append(t) + + series = urplayer_data.get('series') or {} + series_title = dict_get(series, ('seriesTitle', 'title')) or dict_get(urplayer_data, ('seriesTitle', 'mainTitle')) return { 'id': video_id, - 'title': urplayer_data['title'], - 'description': self._og_search_description(webpage), - 'thumbnail': 
urplayer_data.get('image'), - 'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), webpage, 'timestamp')), - 'series': urplayer_data.get('series_title'), - 'subtitles': subtitles, + 'title': '%s : %s' % (series_title, episode) if series_title else episode, + 'description': urplayer_data.get('description'), + 'thumbnails': thumbnails, + 'timestamp': unified_timestamp(urplayer_data.get('publishedAt')), + 'series': series_title, 'formats': formats, + 'duration': int_or_none(urplayer_data.get('duration')), + 'categories': urplayer_data.get('categories'), + 'tags': urplayer_data.get('keywords'), + 'season': series.get('label'), + 'episode': episode, + 'episode_number': int_or_none(urplayer_data.get('episodeNumber')), } From 4826425743e6e22d7f5c7d01c0f10e29e86382ba Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:12:02 +0100 Subject: [PATCH 016/384] [bandcamp] extract playlist_description(closes #22684) --- haruhi_dl/extractor/bandcamp.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/bandcamp.py b/haruhi_dl/extractor/bandcamp.py index 82b605531..4c6b55035 100644 --- a/haruhi_dl/extractor/bandcamp.py +++ b/haruhi_dl/extractor/bandcamp.py @@ -270,6 +270,7 @@ class BandcampAlbumIE(BandcampIE): 'title': '"Entropy" EP', 'uploader_id': 'jstrecords', 'id': 'entropy-ep', + 'description': 'md5:0ff22959c943622972596062f2f366a5', }, 'playlist_mincount': 3, }, { @@ -279,6 +280,7 @@ class BandcampAlbumIE(BandcampIE): 'id': 'we-are-the-plague', 'title': 'WE ARE THE PLAGUE', 'uploader_id': 'insulters', + 'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f', }, 'playlist_count': 2, }] @@ -305,11 +307,14 @@ class BandcampAlbumIE(BandcampIE): for t in track_info if t.get('duration')] + current = tralbum.get('current') or {} + return { '_type': 'playlist', 'uploader_id': uploader_id, 'id': playlist_id, - 'title': try_get(tralbum, lambda x: x['current']['title'], compat_str), + 'title': current.get('title'), + 'description': current.get('about'), 'entries': entries, } From 14539655d5a3cf2f7fbebca02a427b7beac74fa6 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:12:10 +0100 Subject: [PATCH 017/384] [malltv] fix extraction(closes #27035) --- haruhi_dl/extractor/malltv.py | 60 +++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 14 deletions(-) diff --git a/haruhi_dl/extractor/malltv.py b/haruhi_dl/extractor/malltv.py index 6f4fd927f..fadfd9338 100644 --- a/haruhi_dl/extractor/malltv.py +++ b/haruhi_dl/extractor/malltv.py @@ -1,10 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import merge_dicts +from ..utils import ( + clean_html, + dict_get, + float_or_none, + int_or_none, + merge_dicts, + parse_duration, + try_get, +) class MallTVIE(InfoExtractor): @@ -17,7 +23,7 @@ class MallTVIE(InfoExtractor): 'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', 'ext': 'mp4', 'title': '18 miliard pro neziskovky. 
Opravdu jsou sportovci nebo Člověk v tísni pijavice?', - 'description': 'md5:25fc0ec42a72ba602b602c683fa29deb', + 'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35', 'duration': 216, 'timestamp': 1538870400, 'upload_date': '20181007', @@ -37,20 +43,46 @@ class MallTVIE(InfoExtractor): webpage = self._download_webpage( url, display_id, headers=self.geo_verification_headers()) - SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b' + video = self._parse_json(self._search_regex( + r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);', + webpage, 'video object'), display_id) + video_source = video['VideoSource'] video_id = self._search_regex( - SOURCE_RE, webpage, 'video id', group='id') + r'/([\da-z]+)/index\b', video_source, 'video id') - media = self._parse_html5_media_entries( - url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id, - m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0] + formats = self._extract_m3u8_formats( + video_source + '.m3u8', video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + + subtitles = {} + for s in (video.get('Subtitles') or {}): + s_url = s.get('Url') + if not s_url: + continue + subtitles.setdefault(s.get('Language') or 'cz', []).append({ + 'url': s_url, + }) + + entity_counts = video.get('EntityCounts') or {} + + def get_count(k): + v = entity_counts.get(k + 's') or {} + return int_or_none(dict_get(v, ('Count', 'StrCount'))) info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(media, info, { + return merge_dicts({ 'id': video_id, 'display_id': display_id, - 'title': self._og_search_title(webpage, default=None) or display_id, - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - }) + 'title': video.get('Title'), + 'description': clean_html(video.get('Description')), + 'thumbnail': video.get('ThumbnailUrl'), + 'formats': formats, + 'subtitles': subtitles, + 'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')), + 'view_count': get_count('View'), + 'like_count': get_count('Like'), + 'dislike_count': get_count('Dislike'), + 'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])), + 'comment_count': get_count('Comment'), + }, info) From 9f47f2a04e5c04510a18c15f1d785eceeb428a03 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:12:29 +0100 Subject: [PATCH 018/384] [spiegel] Fix extraction (closes #24206, closes #24767) Code picked from PR #24767 since original repo is not available due to takedown. 
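The rewritten extractor below boils down to locating the JW Platform media id
in the page and delegating to JWPlatformIE. An illustration of what the new
mediaId regex matches (the sample fragment is invented for the example, not
captured from a live spiegel.de page):

    import re

    MEDIA_ID_RE = r'("|["\'])mediaId\1\s*:\s*("|["\'])(?P<id>(?:(?!\2).)+)\2'
    sample = '{"mediaId":"II0BUyxY","playerId":"foo"}'
    assert re.search(MEDIA_ID_RE, sample).group('id') == 'II0BUyxY'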
--- haruhi_dl/extractor/extractors.py | 3 +- haruhi_dl/extractor/spiegel.py | 161 ++++++------------------------ haruhi_dl/extractor/spiegeltv.py | 17 ---- 3 files changed, 29 insertions(+), 152 deletions(-) delete mode 100644 haruhi_dl/extractor/spiegeltv.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 412d02955..1341b84bd 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1101,8 +1101,7 @@ from .spankbang import ( SpankBangPlaylistIE, ) from .spankwire import SpankwireIE -from .spiegel import SpiegelIE, SpiegelArticleIE -from .spiegeltv import SpiegeltvIE +from .spiegel import SpiegelIE from .spike import ( BellatorIE, ParamountNetworkIE, diff --git a/haruhi_dl/extractor/spiegel.py b/haruhi_dl/extractor/spiegel.py index 4df7f4ddc..2da32b9b2 100644 --- a/haruhi_dl/extractor/spiegel.py +++ b/haruhi_dl/extractor/spiegel.py @@ -1,159 +1,54 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .spiegeltv import SpiegeltvIE -from ..compat import compat_urlparse -from ..utils import ( - parse_duration, - strip_or_none, - unified_timestamp, -) +from .jwplatform import JWPlatformIE class SpiegelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', - 'md5': 'b57399839d055fccfeb9a0455c439868', + 'md5': '50c7948883ec85a3e431a0a44b7ad1d6', 'info_dict': { - 'id': '563747', + 'id': 'II0BUyxY', + 'display_id': '1259285', 'ext': 'mp4', - 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', + 'title': 'Vulkan Tungurahua in Ecuador ist wieder aktiv - DER SPIEGEL - Wissenschaft', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', - 'duration': 49, + 'duration': 48.0, 'upload_date': '20130311', - 'timestamp': 1362994320, + 'timestamp': 1362997920, }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', - 'md5': '5b6c2f4add9d62912ed5fc78a1faed80', - 'info_dict': { - 'id': '580988', - 'ext': 'mp4', - 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', - 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', - 'duration': 983, - 'upload_date': '20131115', - 'timestamp': 1384546642, - }, - }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', - 'md5': '97b91083a672d72976faa8433430afb9', - 'info_dict': { - 'id': '601883', - 'ext': 'mp4', - 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. 
Hier kommen seine Antworten auf die besten sechs Fragen.', - 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', - 'upload_date': '20140904', - 'timestamp': 1409834160, - } - }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', 'only_matching': True, }, { - # nexx video + 'url': 'https://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html', + 'only_matching': True, + }, { + 'url': 'https://www.spiegel.de/panorama/urteile-im-goldmuenzenprozess-haftstrafen-fuer-clanmitglieder-a-aae8df48-43c1-4c61-867d-23f0a2d254b7', + 'only_matching': True, + }, { 'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html', 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id - handle = self._request_webpage(metadata_url, video_id) - - # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html - if SpiegeltvIE.suitable(handle.geturl()): - return self.url_result(handle.geturl(), 'Spiegeltv') - - video_data = self._parse_json(self._webpage_read_content( - handle, metadata_url, video_id), video_id) - title = video_data['title'] - nexx_id = video_data['nexxOmniaId'] - domain_id = video_data.get('nexxOmniaDomain') or '748' - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': 'nexx:%s:%s' % (domain_id, nexx_id), - 'title': title, - 'description': strip_or_none(video_data.get('teaser')), - 'duration': parse_duration(video_data.get('duration')), - 'timestamp': unified_timestamp(video_data.get('datum')), - 'ie_key': NexxIE.ie_key(), - } - - -class SpiegelArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' - IE_NAME = 'Spiegel:Article' - IE_DESC = 'Articles on spiegel.de' - _TESTS = [{ + }, { 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html', - 'info_dict': { - 'id': '1516455', - 'ext': 'mp4', - 'title': 'Faszination Badminton: Nennt es bloß nicht Federball', - 'description': 're:^Patrick Kämnitz gehört.{100,}', - 'upload_date': '20140825', - }, - }, { - 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', - 'info_dict': { - - }, - 'playlist_count': 6, - }, { - # Nexx iFrame embed - 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html', - 'info_dict': { - 'id': '161464', - 'ext': 'mp4', - 'title': 'Nervenkitzel Achterbahn', - 'alt_title': 'Karussellbauer in Deutschland', - 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', - 'release_year': 2005, - 'creator': 'SPIEGEL TV', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2761, - 'timestamp': 1394021479, - 'upload_date': '20140305', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - - # Single video on top of the page - video_link = self._search_regex( - r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage, - 'video page URL', default=None) - if video_link: - video_url = compat_urlparse.urljoin( - self.http_scheme() + 
'//spiegel.de/', video_link) - return self.url_result(video_url) - - # Multiple embedded videos - embeds = re.findall( - r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"', - webpage) - entries = [ - self.url_result(compat_urlparse.urljoin( - self.http_scheme() + '//spiegel.de/', embed_path)) - for embed_path in embeds] - if embeds: - return self.playlist_result(entries) - - return self.playlist_from_matches( - NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key()) + media_id = self._html_search_regex( + r'("|["\'])mediaId\1\s*:\s*("|["\'])(?P<id>(?:(?!\2).)+)\2', + webpage, 'media id', group='id') + return { + '_type': 'url_transparent', + 'id': video_id, + 'display_id': video_id, + 'url': 'jwplatform:%s' % media_id, + 'title': self._og_search_title(webpage, default=None), + 'ie_key': JWPlatformIE.ie_key(), + } diff --git a/haruhi_dl/extractor/spiegeltv.py b/haruhi_dl/extractor/spiegeltv.py deleted file mode 100644 index 6ccf4c342..000000000 --- a/haruhi_dl/extractor/spiegeltv.py +++ /dev/null @@ -1,17 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from .nexx import NexxIE - - -class SpiegeltvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)' - _TEST = { - 'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/', - 'only_matching': True, - } - - def _real_extract(self, url): - return self.url_result( - 'https://api.nexx.cloud/v3/748/videos/byid/%s' - % self._match_id(url), ie=NexxIE.ie_key()) From ff92752e7c448d8b7b7c8d3d6f98b7e02ae78726 Mon Sep 17 00:00:00 2001 From: gdzx <6490707+gdzx@users.noreply.github.com> Date: Fri, 26 Feb 2021 14:12:41 +0100 Subject: [PATCH 019/384] [francetv] Add fallback video url extraction (#27047) Fallback on another API endpoint when no video formats are found. 
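The request shape, as a standalone sketch mirroring the code added below (the
function name and plain-urllib usage are illustrative only):

    import json
    import urllib.parse
    import urllib.request

    def fetch_fallback_video(video_id, device_type='desktop'):
        # same endpoint and query parameters the extractor now falls back to
        qs = urllib.parse.urlencode({'device_type': device_type, 'browser': 'chrome'})
        url = ('https://player.webservices.francetelevisions.fr/v1/videos/%s?%s'
               % (video_id, qs))
        with urllib.request.urlopen(url) as resp:
            return json.loads(resp.read().decode('utf-8')).get('video')
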
Closes ytdl-org#22561 --- haruhi_dl/extractor/francetv.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/francetv.py b/haruhi_dl/extractor/francetv.py index 8598576e5..f29fd2666 100644 --- a/haruhi_dl/extractor/francetv.py +++ b/haruhi_dl/extractor/francetv.py @@ -128,17 +128,37 @@ class FranceTVIE(InfoExtractor): is_live = None - formats = [] + videos = [] + for video in info['videos']: if video['statut'] != 'ONLINE': continue + if not video['url']: + continue + videos.append(video) + + if not videos: + for device_type in ['desktop', 'mobile']: + fallback_info = self._download_json( + 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, + video_id, 'Downloading fallback %s video JSON' % device_type, query={ + 'device_type': device_type, + 'browser': 'chrome', + }, fatal=False) + + if fallback_info and fallback_info.get('video'): + videos.append(fallback_info['video']) + + formats = [] + for video in videos: video_url = video['url'] if not video_url: continue if is_live is None: - is_live = (try_get( - video, lambda x: x['plages_ouverture'][0]['direct'], - bool) is True) or '/live.francetv.fr/' in video_url + is_live = ((try_get( + video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True) + or video.get('is_live') is True + or '/live.francetv.fr/' in video_url) format_id = video['format'] ext = determine_ext(video_url) if ext == 'f4m': @@ -154,6 +174,9 @@ class FranceTVIE(InfoExtractor): sign(video_url, format_id), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False)) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, @@ -166,6 +189,7 @@ class FranceTVIE(InfoExtractor): 'url': video_url, 'format_id': format_id, }) + self._sort_formats(formats) title = info['titre'] From 9a4014d3941329610cb6c3b3569d7304601a6969 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:12:48 +0100 Subject: [PATCH 020/384] [francetv] improve info extraction --- haruhi_dl/extractor/francetv.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/haruhi_dl/extractor/francetv.py b/haruhi_dl/extractor/francetv.py index f29fd2666..ab2280630 100644 --- a/haruhi_dl/extractor/francetv.py +++ b/haruhi_dl/extractor/francetv.py @@ -17,6 +17,7 @@ from ..utils import ( parse_duration, try_get, url_or_none, + urljoin, ) from .dailymotion import DailymotionIE @@ -130,10 +131,10 @@ class FranceTVIE(InfoExtractor): videos = [] - for video in info['videos']: - if video['statut'] != 'ONLINE': + for video in (info.get('videos') or []): + if video.get('statut') != 'ONLINE': continue - if not video['url']: + if not video.get('url'): continue videos.append(video) @@ -151,15 +152,15 @@ class FranceTVIE(InfoExtractor): formats = [] for video in videos: - video_url = video['url'] + video_url = video.get('url') if not video_url: continue if is_live is None: - is_live = ((try_get( - video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True) + is_live = (try_get( + video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True or video.get('is_live') is True or '/live.francetv.fr/' in video_url) - format_id = video['format'] + format_id = video.get('format') ext = determine_ext(video_url) if ext == 'f4m': if georestricted: @@ -209,10 +210,10 @@ class 
FranceTVIE(InfoExtractor): return { 'id': video_id, 'title': self._live_title(title) if is_live else title, - 'description': clean_html(info['synopsis']), - 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']), - 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), - 'timestamp': int_or_none(info['diffusion']['timestamp']), + 'description': clean_html(info.get('synopsis')), + 'thumbnail': urljoin('http://pluzz.francetv.fr', info.get('image')), + 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')), + 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])), 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, From e2b997d3bf703e0e4ab7cc7f6ace650b27202ace Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:13:00 +0100 Subject: [PATCH 021/384] =?UTF-8?q?[extractor/common]=20Output=20error=20f?= =?UTF-8?q?or=20invalid=20URLs=20in=20=5Fis=5Fvalid=5Furl=20(re=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …fs #21400, refs #24151, refs #25617, refs #25618, refs #25586, refs #26068, refs #27072) --- haruhi_dl/extractor/common.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py index 699aced61..fb616b05a 100644 --- a/haruhi_dl/extractor/common.py +++ b/haruhi_dl/extractor/common.py @@ -1474,9 +1474,10 @@ class InfoExtractor(object): try: self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) return True - except ExtractorError: + except ExtractorError as e: self.to_screen( - '%s: %s URL is invalid, skipping' % (video_id, item)) + '%s: %s URL is invalid, skipping: %s' + % (video_id, item, error_to_compat_str(e.cause))) return False def http_scheme(self): From f8fb19832619230796df57c2fd258c3c40f6094d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:13:08 +0100 Subject: [PATCH 022/384] [mgtv] fix format extraction(closes #26415) --- haruhi_dl/extractor/mgtv.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/haruhi_dl/extractor/mgtv.py b/haruhi_dl/extractor/mgtv.py index 71fc3ec56..cab3aa045 100644 --- a/haruhi_dl/extractor/mgtv.py +++ b/haruhi_dl/extractor/mgtv.py @@ -17,9 +17,8 @@ from ..utils import ( class MGTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html' + _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html' IE_DESC = '芒果TV' - _GEO_COUNTRIES = ['CN'] _TESTS = [{ 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', @@ -34,14 +33,18 @@ class MGTVIE(InfoExtractor): }, { 'url': 'http://www.mgtv.com/b/301817/3826653.html', 'only_matching': True, + }, { + 'url': 'https://w.mgtv.com/b/301817/3826653.html', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1] try: api_data = self._download_json( 'https://pcweb.api.mgtv.com/player/video', video_id, query={ - 'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1], + 'tk2': tk2, 'video_id': video_id, }, headers=self.geo_verification_headers())['data'] except ExtractorError as e: @@ -56,6 
+59,7 @@ class MGTVIE(InfoExtractor):
         stream_data = self._download_json(
             'https://pcweb.api.mgtv.com/player/getSource', video_id, query={
                 'pm2': api_data['atc']['pm2'],
+                'tk2': tk2,
                 'video_id': video_id,
             }, headers=self.geo_verification_headers())['data']
         stream_domain = stream_data['stream_domain'][0]

From 93064492e9ef1574e9a157afcfd21370996c0818 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?=
 <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 14:13:13 +0100
Subject: [PATCH 023/384] [arte] Extract m3u8 formats (closes #27061)

---
 haruhi_dl/extractor/arte.py | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/arte.py b/haruhi_dl/extractor/arte.py
index 2bd3bfe8a..b80467548 100644
--- a/haruhi_dl/extractor/arte.py
+++ b/haruhi_dl/extractor/arte.py
@@ -11,6 +11,7 @@ from ..utils import (
     qualities,
     try_get,
     unified_strdate,
+    url_or_none,
 )

 # There are different sources of video in arte.tv, the extraction process
@@ -63,8 +64,13 @@ class ArteTVBaseIE(InfoExtractor):
         langcode = LANGS.get(lang, lang)

         formats = []
+        m3u8_formats = []
         for format_id, format_dict in vsr.items():
             f = dict(format_dict)
+            format_url = url_or_none(f.get('url'))
+            streamer = f.get('streamer')
+            if not format_url and not streamer:
+                continue
             versionCode = f.get('versionCode')
             l = re.escape(langcode)

@@ -107,6 +113,15 @@ class ArteTVBaseIE(InfoExtractor):
             else:
                 lang_pref = -1

+            media_type = f.get('mediaType')
+            if media_type == 'hls':
+                m3u8_formats = self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id=format_id, fatal=False)
+                for m3u8_format in m3u8_formats:
+                    m3u8_format['language_preference'] = lang_pref
+                continue
+
             format = {
                 'format_id': format_id,
                 'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
@@ -118,7 +133,7 @@ class ArteTVBaseIE(InfoExtractor):
                 'quality': qfunc(f.get('quality')),
             }

-            if f.get('mediaType') == 'rtmp':
+            if media_type == 'rtmp':
                 format['url'] = f['streamer']
                 format['play_path'] = 'mp4:' + f['url']
                 format['ext'] = 'flv'
@@ -128,6 +143,8 @@ class ArteTVBaseIE(InfoExtractor):
             formats.append(format)

         self._check_formats(formats, video_id)
+
+        formats.extend(m3u8_formats)
         self._sort_formats(formats)

         info_dict['formats'] = formats

From 1451f4f49864c0c07c9db01b9af8fc3730091b02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?=
 <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 14:13:19 +0100
Subject: [PATCH 024/384] [arte] Rework extractors

* Reimplement embed and playlist extractors to delegate to the single
  entrypoint artetv extractor

Beware: the extractor keys change as a result, so download archive entries
recorded under the old keys may break.
* Improve embeds detection (closes #27057) - Remove obsolete code --- haruhi_dl/extractor/arte.py | 154 ++++++++++++++++++------------ haruhi_dl/extractor/extractors.py | 2 +- haruhi_dl/extractor/generic.py | 9 +- 3 files changed, 100 insertions(+), 65 deletions(-) diff --git a/haruhi_dl/extractor/arte.py b/haruhi_dl/extractor/arte.py index b80467548..03abdbfaf 100644 --- a/haruhi_dl/extractor/arte.py +++ b/haruhi_dl/extractor/arte.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( ExtractorError, int_or_none, @@ -14,14 +17,44 @@ from ..utils import ( url_or_none, ) -# There are different sources of video in arte.tv, the extraction process -# is different for each one. The videos usually expire in 7 days, so we can't -# add tests. - class ArteTVBaseIE(InfoExtractor): - def _extract_from_json_url(self, json_url, video_id, lang, title=None): - info = self._download_json(json_url, video_id) + _ARTE_LANGUAGES = 'fr|de|en|es|it|pl' + _API_BASE = 'https://api.arte.tv/api/player/v1' + + +class ArteTVIE(ArteTVBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos| + api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s) + ) + /(?P<id>\d{6}-\d{3}-[AF]) + ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES} + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'info_dict': { + 'id': '088501-000-A', + 'ext': 'mp4', + 'title': 'Mexico: Stealing Petrol to Survive', + 'upload_date': '20190628', + }, + }, { + 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', + 'only_matching': True, + }, { + 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + lang = mobj.group('lang') or mobj.group('lang_2') + + info = self._download_json( + '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id) player_info = info['videoJsonPlayer'] vsr = try_get(player_info, lambda x: x['VSR'], dict) @@ -38,18 +71,11 @@ class ArteTVBaseIE(InfoExtractor): if not upload_date_str: upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] - title = (player_info.get('VTI') or title or player_info['VID']).strip() + title = (player_info.get('VTI') or player_info['VID']).strip() subtitle = player_info.get('VSU', '').strip() if subtitle: title += ' - %s' % subtitle - info_dict = { - 'id': player_info['VID'], - 'title': title, - 'description': player_info.get('VDE'), - 'upload_date': unified_strdate(upload_date_str), - 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), - } qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ']) LANGS = { @@ -64,7 +90,6 @@ class ArteTVBaseIE(InfoExtractor): langcode = LANGS.get(lang, lang) formats = [] - m3u8_formats = [] for format_id, format_dict in vsr.items(): f = dict(format_dict) format_url = url_or_none(f.get('url')) @@ -120,6 +145,7 @@ class ArteTVBaseIE(InfoExtractor): m3u8_id=format_id, fatal=False) for m3u8_format in m3u8_formats: m3u8_format['language_preference'] = lang_pref + formats.extend(m3u8_formats) continue format = { @@ -142,58 +168,50 @@ class ArteTVBaseIE(InfoExtractor): formats.append(format) - self._check_formats(formats, video_id) - - formats.extend(m3u8_formats) 
self._sort_formats(formats) - info_dict['formats'] = formats - return info_dict + return { + 'id': player_info.get('VID') or video_id, + 'title': title, + 'description': player_info.get('VDE'), + 'upload_date': unified_strdate(upload_date_str), + 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), + 'formats': formats, + } -class ArteTVPlus7IE(ArteTVBaseIE): - IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])' - +class ArteTVEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+' _TESTS = [{ - 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A', 'info_dict': { - 'id': '088501-000-A', + 'id': '100605-013-A', 'ext': 'mp4', - 'title': 'Mexico: Stealing Petrol to Survive', - 'upload_date': '20190628', + 'title': 'United we Stream November Lockdown Edition #13', + 'description': 'md5:be40b667f45189632b78c1425c7c2ce1', + 'upload_date': '20201116', }, + }, { + 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A', + 'only_matching': True, }] - def _real_extract(self, url): - lang, video_id = re.match(self._VALID_URL, url).groups() - return self._extract_from_json_url( - 'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id), - video_id, lang) - - -class ArteTVEmbedIE(ArteTVPlus7IE): - IE_NAME = 'arte.tv:embed' - _VALID_URL = r'''(?x) - https://www\.arte\.tv - /player/v3/index\.php\?json_url= - (?P<json_url> - https?://api\.arte\.tv/api/player/v1/config/ - (?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF]) - ) - ''' - - _TESTS = [] + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1', + webpage)] def _real_extract(self, url): - json_url, lang, video_id = re.match(self._VALID_URL, url).groups() - return self._extract_from_json_url(json_url, video_id, lang) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + json_url = qs['json_url'][0] + video_id = ArteTVIE._match_id(json_url) + return self.url_result( + json_url, ie=ArteTVIE.ie_key(), video_id=video_id) class ArteTVPlaylistIE(ArteTVBaseIE): - IE_NAME = 'arte.tv:playlist' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})' - + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', 'info_dict': { @@ -202,17 +220,35 @@ class ArteTVPlaylistIE(ArteTVBaseIE): 'description': 'md5:d322c55011514b3a7241f7fb80d494c2', }, 'playlist_mincount': 6, + }, { + 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/', + 'only_matching': True, }] def _real_extract(self, url): lang, playlist_id = re.match(self._VALID_URL, url).groups() collection = self._download_json( - 'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' - % (lang, playlist_id), playlist_id) + '%s/collectionData/%s/%s?source=videos' + % (self._API_BASE, lang, playlist_id), playlist_id) + entries = [] + for video in collection['videos']: + if not isinstance(video, dict): + continue + 
video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl')) + if not video_url: + continue + video_id = video.get('programId') + entries.append({ + '_type': 'url_transparent', + 'url': video_url, + 'id': video_id, + 'title': video.get('title'), + 'alt_title': video.get('subtitle'), + 'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)), + 'duration': int_or_none(video.get('durationSeconds')), + 'view_count': int_or_none(video.get('views')), + 'ie_key': ArteTVIE.ie_key(), + }) title = collection.get('title') description = collection.get('shortDescription') or collection.get('teaserText') - entries = [ - self._extract_from_json_url( - video['jsonUrl'], video.get('programId') or playlist_id, lang) - for video in collection['videos'] if video.get('jsonUrl')] return self.playlist_result(entries, playlist_id, title, description) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 1341b84bd..6a7fc43f4 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -68,7 +68,7 @@ from .ard import ( ARDMediathekIE, ) from .arte import ( - ArteTVPlus7IE, + ArteTVIE, ArteTVEmbedIE, ArteTVPlaylistIE, ) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index c81247dd0..babc59dcc 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -91,6 +91,7 @@ from .piksel import PikselIE from .videa import VideaIE from .twentymin import TwentyMinutenIE from .ustream import UstreamIE +from .arte import ArteTVEmbedIE from .videopress import VideoPressIE from .rutube import RutubeIE from .limelight import LimelightBaseIE @@ -2751,11 +2752,9 @@ class GenericIE(InfoExtractor): return self.url_result(ustream_url, UstreamIE.ie_key()) # Look for embedded arte.tv player - mobj = re.search( - r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'ArteTVEmbed') + arte_urls = ArteTVEmbedIE._extract_urls(webpage) + if arte_urls: + return self.playlist_from_matches(arte_urls, video_id, video_title) # Look for embedded francetv player mobj = re.search( From 3a32ea072b9837e75e989893229ca64ebc789438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:13:24 +0100 Subject: [PATCH 025/384] =?UTF-8?q?[youporn]=20Fix=20upload=20date=20extra?= =?UTF-8?q?ction=20and=20make=20comment=20count=20optional=20=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …(closes #26986) --- haruhi_dl/extractor/youporn.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/youporn.py b/haruhi_dl/extractor/youporn.py index c178e2f39..901651b8b 100644 --- a/haruhi_dl/extractor/youporn.py +++ b/haruhi_dl/extractor/youporn.py @@ -29,7 +29,6 @@ class YouPornIE(InfoExtractor): 'upload_date': '20101217', 'average_rating': int, 'view_count': int, - 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -48,7 +47,6 @@ class YouPornIE(InfoExtractor): 'upload_date': '20110418', 'average_rating': int, 'view_count': int, - 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -156,7 +154,8 @@ class YouPornIE(InfoExtractor): r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>', webpage, 'uploader', fatal=False) upload_date = 
unified_strdate(self._html_search_regex( - [r'Date\s+[Aa]dded:\s*<span>([^<]+)', + [r'UPLOADED:\s*<span>([^<]+)', + r'Date\s+[Aa]dded:\s*<span>([^<]+)', r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'], webpage, 'upload date', fatal=False)) @@ -171,7 +170,7 @@ class YouPornIE(InfoExtractor): webpage, 'view count', fatal=False, group='count')) comment_count = str_to_int(self._search_regex( r'>All [Cc]omments? \(([\d,.]+)\)', - webpage, 'comment count', fatal=False)) + webpage, 'comment count', default=None)) def extract_tag_box(regex, title): tag_box = self._search_regex(regex, webpage, title, default=None) From 46fce7272c49df0e74683a229a68eb2faeba93e8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:13:32 +0100 Subject: [PATCH 026/384] [mtv] fix mgid extraction(closes #26841) --- haruhi_dl/extractor/mtv.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/haruhi_dl/extractor/mtv.py b/haruhi_dl/extractor/mtv.py index fedd5f46b..df1034fc5 100644 --- a/haruhi_dl/extractor/mtv.py +++ b/haruhi_dl/extractor/mtv.py @@ -349,6 +349,18 @@ class MTVIE(MTVServicesInfoExtractor): 'only_matching': True, }] + @staticmethod + def extract_child_with_type(parent, t): + children = parent['children'] + return next(c for c in children if c.get('type') == t) + + def _extract_mgid(self, webpage): + data = self._parse_json(self._search_regex( + r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) + main_container = self.extract_child_with_type(data, 'MainContainer') + video_player = self.extract_child_with_type(main_container, 'VideoPlayer') + return video_player['props']['media']['video']['config']['uri'] + class MTVJapanIE(MTVServicesInfoExtractor): IE_NAME = 'mtvjapan' From 514683921adf72de4d5b40726fbea5a736f6e16b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:13:40 +0100 Subject: [PATCH 027/384] [vimeo:album] fix extraction(closes #27079) --- haruhi_dl/extractor/vimeo.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/vimeo.py b/haruhi_dl/extractor/vimeo.py index e14551459..e8a4547cd 100644 --- a/haruhi_dl/extractor/vimeo.py +++ b/haruhi_dl/extractor/vimeo.py @@ -946,10 +946,13 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): def _real_extract(self, url): album_id = self._match_id(url) - webpage = self._download_webpage(url, album_id) - viewer = self._parse_json(self._search_regex( - r'bootstrap_data\s*=\s*({.+?})</script>', - webpage, 'bootstrap data'), album_id)['viewer'] + viewer = self._download_json( + 'https://vimeo.com/_rv/viewer', album_id, fatal=False) + if not viewer: + webpage = self._download_webpage(url, album_id) + viewer = self._parse_json(self._search_regex( + r'bootstrap_data\s*=\s*({.+?})</script>', + webpage, 'bootstrap data'), album_id)['viewer'] jwt = viewer['jwt'] album = self._download_json( 'https://api.vimeo.com/albums/' + album_id, From 9a527679ed18644ebc16577442b6fd2373920d5c Mon Sep 17 00:00:00 2001 From: Joost Verdoorn <jpverdoorn@gmail.com> Date: Fri, 26 Feb 2021 14:14:59 +0100 Subject: [PATCH 028/384] [Amara] Add new extractor (#20618) * [Amara] Add new extractor --- haruhi_dl/extractor/amara.py | 76 +++++++++++++++++++++++++++++++ haruhi_dl/extractor/extractors.py | 1 + 2 files changed, 77 insertions(+) create mode 100644 haruhi_dl/extractor/amara.py diff --git a/haruhi_dl/extractor/amara.py b/haruhi_dl/extractor/amara.py new file mode 100644 index 000000000..b222154bd --- /dev/null +++ 
b/haruhi_dl/extractor/amara.py @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals +from .common import InfoExtractor + + +class AmaraIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)' + _TESTS = [ + { + 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', + 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', + 'info_dict': { + 'id': 'h6ZuVdvYnfE', + 'ext': 'mp4', + 'title': 'Why jury trials are becoming less common', + 'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20160813', + 'uploader': 'PBS NewsHour', + 'uploader_id': 'PBSNewsHour' + } + }, + { + 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', + 'md5': '99392c75fa05d432a8f11df03612195e', + 'info_dict': { + 'id': '18622084', + 'ext': 'mov', + 'title': 'Vimeo at CES 2011!', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'timestamp': 1294649110, + 'upload_date': '20110110', + 'uploader': 'Sam Morrill', + 'uploader_id': 'sammorrill' + } + }, + { + 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', + 'md5': 'd3970f08512738ee60c5807311ff5d3f', + 'info_dict': { + 'id': 'ChimamandaAdichie_2009G-transcript', + 'ext': 'mp4', + 'title': 'The danger of a single story', + 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20131206' + } + } + ] + + def get_subtitles_for_language(self, language): + return [{ + 'ext': type, + 'url': language['subtitles_uri'].replace('format=json', 'format=' + type) + } for type in ['vtt', 'srt', 'json']] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._download_json('https://amara.org/api/videos/%s/' % video_id, video_id, query={'format': 'json'}) + + video_url = meta.get('all_urls')[0] + subtitles = dict([(language['code'], self.get_subtitles_for_language(language)) for language in meta.get('languages', []) if language['published']]) + + return { + '_type': 'url_transparent', + 'url': video_url, + 'id': video_id, + 'subtitles': subtitles, + 'title': meta['title'], + 'description': meta.get('description'), + 'thumbnail': meta.get('thumbnail') + } diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 6a7fc43f4..bd6003c93 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -43,6 +43,7 @@ from .airmozilla import AirMozillaIE from .albicla import AlbiclaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE +from .amara import AmaraIE from .amcnetworks import AMCNetworksIE from .americastestkitchen import AmericasTestKitchenIE from .animeondemand import AnimeOnDemandIE From 339f127540188c06c99ec6f09ff03a1a03e21893 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:16:30 +0100 Subject: [PATCH 029/384] [amara] improve extraction --- haruhi_dl/extractor/amara.py | 143 +++++++++++++++++++++-------------- 1 file changed, 85 insertions(+), 58 deletions(-) diff --git a/haruhi_dl/extractor/amara.py b/haruhi_dl/extractor/amara.py index b222154bd..61d469574 100644 --- a/haruhi_dl/extractor/amara.py +++ b/haruhi_dl/extractor/amara.py @@ -1,76 +1,103 @@ # coding: utf-8 from __future__ import unicode_literals + from .common import InfoExtractor +from 
.youtube import YoutubeIE +from .vimeo import VimeoIE +from ..utils import ( + int_or_none, + parse_iso8601, + update_url_query, +) class AmaraIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)' - _TESTS = [ - { - 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', - 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', - 'info_dict': { - 'id': 'h6ZuVdvYnfE', - 'ext': 'mp4', - 'title': 'Why jury trials are becoming less common', - 'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1', - 'thumbnail': r're:^https?://.*\.jpg$', - 'subtitles': dict, - 'upload_date': '20160813', - 'uploader': 'PBS NewsHour', - 'uploader_id': 'PBSNewsHour' - } - }, - { - 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', - 'md5': '99392c75fa05d432a8f11df03612195e', - 'info_dict': { - 'id': '18622084', - 'ext': 'mov', - 'title': 'Vimeo at CES 2011!', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', - 'thumbnail': r're:^https?://.*\.jpg$', - 'subtitles': dict, - 'timestamp': 1294649110, - 'upload_date': '20110110', - 'uploader': 'Sam Morrill', - 'uploader_id': 'sammorrill' - } - }, - { - 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', - 'md5': 'd3970f08512738ee60c5807311ff5d3f', - 'info_dict': { - 'id': 'ChimamandaAdichie_2009G-transcript', - 'ext': 'mp4', - 'title': 'The danger of a single story', - 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', - 'thumbnail': r're:^https?://.*\.jpg$', - 'subtitles': dict, - 'upload_date': '20131206' - } + _TESTS = [{ + # Youtube + 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', + 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', + 'info_dict': { + 'id': 'h6ZuVdvYnfE', + 'ext': 'mp4', + 'title': 'Why jury trials are becoming less common', + 'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20160813', + 'uploader': 'PBS NewsHour', + 'uploader_id': 'PBSNewsHour', + 'timestamp': 1549639570, } - ] - - def get_subtitles_for_language(self, language): - return [{ - 'ext': type, - 'url': language['subtitles_uri'].replace('format=json', 'format=' + type) - } for type in ['vtt', 'srt', 'json']] + }, { + # Vimeo + 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', + 'md5': '99392c75fa05d432a8f11df03612195e', + 'info_dict': { + 'id': '18622084', + 'ext': 'mov', + 'title': 'Vimeo at CES 2011!', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'timestamp': 1294763658, + 'upload_date': '20110111', + 'uploader': 'Sam Morrill', + 'uploader_id': 'sammorrill' + } + }, { + # Direct Link + 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', + 'md5': 'd3970f08512738ee60c5807311ff5d3f', + 'info_dict': { + 'id': 's8KL7I3jLmh6', + 'ext': 'mp4', + 'title': 'The danger of a single story', + 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20091007', + 'timestamp': 1254942511, + } + }] def _real_extract(self, url): video_id = self._match_id(url) - meta = self._download_json('https://amara.org/api/videos/%s/' % video_id, video_id, query={'format': 'json'}) + meta = self._download_json( + 'https://amara.org/api/videos/%s/' % video_id, + video_id, query={'format': 'json'}) + title = meta['title'] + 
video_url = meta['all_urls'][0] - video_url = meta.get('all_urls')[0] - subtitles = dict([(language['code'], self.get_subtitles_for_language(language)) for language in meta.get('languages', []) if language['published']]) + subtitles = {} + for language in (meta.get('languages') or []): + subtitles_uri = language.get('subtitles_uri') + if not (subtitles_uri and language.get('published')): + continue + subtitle = subtitles.setdefault(language.get('code') or 'en', []) + for f in ('json', 'srt', 'vtt'): + subtitle.append({ + 'ext': f, + 'url': update_url_query(subtitles_uri, {'format': f}), + }) - return { - '_type': 'url_transparent', + info = { 'url': video_url, 'id': video_id, 'subtitles': subtitles, - 'title': meta['title'], + 'title': title, 'description': meta.get('description'), - 'thumbnail': meta.get('thumbnail') + 'thumbnail': meta.get('thumbnail'), + 'duration': int_or_none(meta.get('duration')), + 'timestamp': parse_iso8601(meta.get('created')), } + + for ie in (YoutubeIE, VimeoIE): + if ie.suitable(video_url): + info.update({ + '_type': 'url_transparent', + 'ie_key': ie.ie_key(), + }) + break + + return info From 9adedd82f32a81f1439fbf4e85e9985642dffdb9 Mon Sep 17 00:00:00 2001 From: beefchop <32330393+beefchop@users.noreply.github.com> Date: Fri, 26 Feb 2021 14:18:39 +0100 Subject: [PATCH 030/384] [viki] fix stream extraction from mpd (#27092) Co-authored-by: beefchop <beefchop@users.noreply.github.com> --- haruhi_dl/extractor/viki.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/haruhi_dl/extractor/viki.py b/haruhi_dl/extractor/viki.py index b0dcdc0e6..48ab7b944 100644 --- a/haruhi_dl/extractor/viki.py +++ b/haruhi_dl/extractor/viki.py @@ -296,6 +296,9 @@ class VikiIE(VikiBaseIE): if f.get('acodec') == 'none' and f.get('vcodec') != 'none': f['acodec'] = None formats.extend(m3u8_formats) + elif format_id == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, 'mpd-%s' % protocol, fatal=False)) elif format_url.startswith('rtmp'): mobj = re.search( r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$', From ddc62043ed31799433868043188bcd9e90c2ccca Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:18:46 +0100 Subject: [PATCH 031/384] [viki] improve format extraction --- haruhi_dl/extractor/viki.py | 142 +++++++++++++++++++++--------------- 1 file changed, 83 insertions(+), 59 deletions(-) diff --git a/haruhi_dl/extractor/viki.py b/haruhi_dl/extractor/viki.py index 48ab7b944..a003b7af8 100644 --- a/haruhi_dl/extractor/viki.py +++ b/haruhi_dl/extractor/viki.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import hashlib import hmac import itertools @@ -9,6 +10,10 @@ import re import time from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( ExtractorError, int_or_none, @@ -165,19 +170,20 @@ class VikiIE(VikiBaseIE): }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '5fa476a902e902783ac7a4d615cdbc7a', + 'md5': '94e0e34fd58f169f40c184f232356cfe', 'info_dict': { 'id': '44699v', 'ext': 'mp4', 'title': 'Boys Over Flowers - Episode 1', 'description': 'md5:b89cf50038b480b88b5b3c93589a9076', - 'duration': 4204, + 'duration': 4172, 'timestamp': 1270496524, 'upload_date': '20100405', 'uploader': 'group8', 'like_count': int, 'age_limit': 13, - } + }, + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # youtube external 'url': 
'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', @@ -194,14 +200,15 @@ class VikiIE(VikiBaseIE): 'uploader_id': 'ad14065n', 'like_count': int, 'age_limit': 13, - } + }, + 'skip': 'Page not found!', }, { 'url': 'http://www.viki.com/player/44699v', 'only_matching': True, }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': '1713ae35df5a521b31f6dc40730e7c9c', + 'md5': 'adf9e321a0ae5d0aace349efaaff7691', 'info_dict': { 'id': '158036v', 'ext': 'mp4', @@ -217,8 +224,11 @@ class VikiIE(VikiBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - video = self._call_api( - 'videos/%s.json' % video_id, video_id, 'Downloading video JSON') + resp = self._download_json( + 'https://www.viki.com/api/videos/' + video_id, + video_id, 'Downloading video JSON', + headers={'x-viki-app-ver': '4.0.57'}) + video = resp['video'] self._check_errors(video) @@ -265,60 +275,74 @@ class VikiIE(VikiBaseIE): 'subtitles': subtitles, } - streams = self._call_api( - 'videos/%s/streams.json' % video_id, video_id, - 'Downloading video streams JSON') - - if 'external' in streams: - result.update({ - '_type': 'url_transparent', - 'url': streams['external']['url'], - }) - return result - formats = [] - for format_id, stream_dict in streams.items(): - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)) - for protocol, format_dict in stream_dict.items(): - # rtmps URLs does not seem to work - if protocol == 'rtmps': - continue - format_url = format_dict['url'] - if format_id == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', - m3u8_id='m3u8-%s' % protocol, fatal=False) - # Despite CODECS metadata in m3u8 all video-only formats - # are actually video+audio - for f in m3u8_formats: - if f.get('acodec') == 'none' and f.get('vcodec') != 'none': - f['acodec'] = None - formats.extend(m3u8_formats) - elif format_id == 'mpd': - formats.extend(self._extract_mpd_formats( - format_url, video_id, 'mpd-%s' % protocol, fatal=False)) - elif format_url.startswith('rtmp'): - mobj = re.search( - r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$', - format_url) - if not mobj: + + def add_format(format_id, format_dict, protocol='http'): + # rtmps URLs does not seem to work + if protocol == 'rtmps': + return + format_url = format_dict.get('url') + if not format_url: + return + qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query) + stream = qs.get('stream', [None])[0] + if stream: + format_url = base64.b64decode(stream).decode() + if format_id in ('m3u8', 'hls'): + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', + m3u8_id='m3u8-%s' % protocol, fatal=False) + # Despite CODECS metadata in m3u8 all video-only formats + # are actually video+audio + for f in m3u8_formats: + if '_drm/index_' in f['url']: continue - formats.append({ - 'format_id': 'rtmp-%s' % format_id, - 'ext': 'flv', - 'url': mobj.group('url'), - 'play_path': mobj.group('playpath'), - 'app': mobj.group('app'), - 'page_url': url, - }) - else: - formats.append({ - 'url': format_url, - 'format_id': '%s-%s' % (format_id, protocol), - 'height': height, - }) + if f.get('acodec') == 'none' and f.get('vcodec') != 'none': + f['acodec'] = None + formats.append(f) + elif format_id in ('mpd', 'dash'): + formats.extend(self._extract_mpd_formats( + format_url, video_id, 'mpd-%s' % protocol, fatal=False)) + elif 
format_url.startswith('rtmp'):
+            mobj = re.search(
+                r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
+                format_url)
+            if not mobj:
+                return
+            formats.append({
+                'format_id': 'rtmp-%s' % format_id,
+                'ext': 'flv',
+                'url': mobj.group('url'),
+                'play_path': mobj.group('playpath'),
+                'app': mobj.group('app'),
+                'page_url': url,
+            })
+        else:
+            formats.append({
+                'url': format_url,
+                'format_id': '%s-%s' % (format_id, protocol),
+                'height': int_or_none(self._search_regex(
+                    r'^(\d+)[pP]$', format_id, 'height', default=None)),
+            })
+
+        for format_id, format_dict in (resp.get('streams') or {}).items():
+            add_format(format_id, format_dict)
+        if not formats:
+            streams = self._call_api(
+                'videos/%s/streams.json' % video_id, video_id,
+                'Downloading video streams JSON')
+
+            if 'external' in streams:
+                result.update({
+                    '_type': 'url_transparent',
+                    'url': streams['external']['url'],
+                })
+                return result
+
+            for format_id, stream_dict in streams.items():
+                for protocol, format_dict in stream_dict.items():
+                    add_format(format_id, format_dict, protocol)

         self._sort_formats(formats)
         result['formats'] = formats

From 9fd254036be0a95a4d29e0ff1a0d6eb51a2e3785 Mon Sep 17 00:00:00 2001
From: Leonardo Taccari <iamleot@gmail.com>
Date: Fri, 26 Feb 2021 14:18:51 +0100
Subject: [PATCH 032/384] [rai] Fix extraction for recent raiplay.it updates
 (#27077)

- Remove first test of RaiPlayIE: it is no longer available
- Make RaiPlayIE extension-agnostic (passing possible `.json' URLs is now
  supported too)
- Adjust RaiPlayLiveIE to recent raiplay.it updates. Passing it as
  `url_transparent' is no longer supported (there is no longer an accessible
  ContentItem)
- Adjust RaiPlayPlaylistIE to recent raiplay.it updates and instruct it about
  ContentSet-s.
- Update a RaiIE test and remove two tests that are no longer available

Thanks to @remitamine for the review!
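For illustration only, a hedged sketch of the ContentSet traversal described
above (the generator name _set_entries is made up for this sketch; the
endpoint shape and field names follow the diff below, and urljoin is assumed
to come from haruhi_dl.utils):

    def _set_entries(self, base, program, url):
        # Each program JSON carries 'blocks', each block carries 'sets'
        # (the ContentSet-s); every set id resolves to its own JSON list
        # of playable items.
        for block in program.get('blocks', []):
            for s in block.get('sets', []):
                set_id = s.get('id')
                if not set_id:
                    continue
                medias = self._download_json(
                    '%s/%s.json' % (base, set_id), set_id,
                    'Downloading content set JSON', fatal=False)
                if not medias:
                    continue
                for m in medias.get('items', []):
                    yield urljoin(url, m['path_id'])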
--- haruhi_dl/extractor/rai.py | 126 +++++++++++++++---------------------- 1 file changed, 52 insertions(+), 74 deletions(-) diff --git a/haruhi_dl/extractor/rai.py b/haruhi_dl/extractor/rai.py index bee2d53f5..dae7800d2 100644 --- a/haruhi_dl/extractor/rai.py +++ b/haruhi_dl/extractor/rai.py @@ -17,7 +17,6 @@ from ..utils import ( int_or_none, parse_duration, strip_or_none, - unescapeHTML, unified_strdate, unified_timestamp, update_url_query, @@ -122,27 +121,8 @@ class RaiBaseIE(InfoExtractor): class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE._UUID_RE + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)\.(?:html|json)' % RaiBaseIE._UUID_RE _TESTS = [{ - 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', - 'md5': '340aa3b7afb54bfd14a8c11786450d76', - 'info_dict': { - 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', - 'ext': 'mp4', - 'title': 'La Casa Bianca', - 'alt_title': 'S2016 - Puntata del 23/10/2016', - 'description': 'md5:a09d45890850458077d1f68bb036e0a5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 3', - 'creator': 'Rai 3', - 'duration': 3278, - 'timestamp': 1477764300, - 'upload_date': '20161029', - 'series': 'La Casa Bianca', - 'season': '2016', - }, - 'skip': 'This content is not available', - }, { 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { @@ -166,10 +146,11 @@ class RaiPlayIE(RaiBaseIE): }] def _real_extract(self, url): - url, video_id = re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + base, video_id, = mobj.group('base', 'id') media = self._download_json( - url.replace('.html', '.json'), video_id, 'Downloading video JSON') + '%s%s.json' % (base, video_id), video_id, 'Downloading video JSON') title = media['name'] @@ -219,7 +200,7 @@ class RaiPlayIE(RaiBaseIE): class RaiPlayLiveIE(RaiBaseIE): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)' + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))' _TEST = { 'url': 'http://www.raiplay.it/dirette/rainews24', 'info_dict': { @@ -227,7 +208,7 @@ class RaiPlayLiveIE(RaiBaseIE): 'display_id': 'rainews24', 'ext': 'mp4', 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:6eca31500550f9376819f174e5644754', + 'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497', 'uploader': 'Rai News 24', 'creator': 'Rai News 24', 'is_live': True, @@ -238,53 +219,75 @@ class RaiPlayLiveIE(RaiBaseIE): } def _real_extract(self, url): - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + base, display_id, = mobj.group('base', 'id') - webpage = self._download_webpage(url, display_id) + media = self._download_json( + '%s.json' % base, + display_id, 'Downloading channel JSON') - video_id = self._search_regex( - r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE, - webpage, 'content id') + title = media['name'] + video = media['video'] + video_id = media['id'].replace('ContentItem-', '') - return { - '_type': 'url_transparent', - 'ie_key': RaiPlayIE.ie_key(), - 'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, + relinker_info = self._extract_relinker_info(video['content_url'], video_id) + self._sort_formats(relinker_info['formats']) + + info = { 'id': video_id, 'display_id': 
display_id, + 'title': self._live_title(title) if relinker_info.get( + 'is_live') else title, + 'description': media.get('description'), + 'uploader': strip_or_none(media.get('channel')), + 'creator': strip_or_none(media.get('editor')), } + info.update(relinker_info) + return info + class RaiPlayPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)' + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo', - 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', + 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', }, 'playlist_mincount': 12, }] def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + base, playlist_id, = mobj.group('base', 'id') - webpage = self._download_webpage(url, playlist_id) + media = self._download_json( + '%s.json' % base, + playlist_id, 'Downloading program JSON') - title = self._html_search_meta( - ('programma', 'nomeProgramma'), webpage, 'title') - description = unescapeHTML(self._html_search_meta( - ('description', 'og:description'), webpage, 'description')) + title = media.get('name') + description = None + if media.get('program_info') and media['program_info'].get('description'): + description = media['program_info']['description'] entries = [] - for mobj in re.finditer( - r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1', - webpage): - video_url = urljoin(url, mobj.group('path')) - entries.append(self.url_result( - video_url, ie=RaiPlayIE.ie_key(), - video_id=RaiPlayIE._match_id(video_url))) + for b in media.get('blocks', []): + for s in b.get('sets', []): + cs = s.get('id') + if not cs: + continue + medias = self._download_json( + '%s/%s.json' % (base, cs), + cs, 'Downloading content set JSON', fatal=False) + if not medias: + continue + for m in medias['items']: + video_url = urljoin(url, m['path_id']) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) return self.playlist_result(entries, playlist_id, title, description) @@ -329,19 +332,6 @@ class RaiIE(RaiBaseIE): 'duration': 2214, 'upload_date': '20161103', } - }, { - # drawMediaRaiTV(...) - 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': '2dd727e61114e1ee9c47f0da6914e178', - 'info_dict': { - 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'mp4', - 'title': 'Il pacco', - 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20141221', - }, - 'skip': 'This content is not available', }, { # initEdizione('ContentItem-...' 
'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', @@ -353,18 +343,6 @@ class RaiIE(RaiBaseIE): 'upload_date': '20170401', }, 'skip': 'Changes daily', - }, { - # HDS live stream with only relinker URL - 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', - 'info_dict': { - 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', - 'ext': 'flv', - 'title': 'EuroNews', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This content is available only in Italy', }, { # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', From a7bd83e154d2adf01096662e14173d5848305fa8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:18:55 +0100 Subject: [PATCH 033/384] [rai] improve extraction --- haruhi_dl/extractor/rai.py | 82 +++++++++++++------------------------- 1 file changed, 27 insertions(+), 55 deletions(-) diff --git a/haruhi_dl/extractor/rai.py b/haruhi_dl/extractor/rai.py index dae7800d2..b072a0f38 100644 --- a/haruhi_dl/extractor/rai.py +++ b/haruhi_dl/extractor/rai.py @@ -16,7 +16,9 @@ from ..utils import ( GeoRestrictedError, int_or_none, parse_duration, + remove_start, strip_or_none, + try_get, unified_strdate, unified_timestamp, update_url_query, @@ -121,7 +123,7 @@ class RaiBaseIE(InfoExtractor): class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)\.(?:html|json)' % RaiBaseIE._UUID_RE + _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE _TESTS = [{ 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', @@ -146,11 +148,10 @@ class RaiPlayIE(RaiBaseIE): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - base, video_id, = mobj.group('base', 'id') + base, video_id = re.match(self._VALID_URL, url).groups() media = self._download_json( - '%s%s.json' % (base, video_id), video_id, 'Downloading video JSON') + base + '.json', video_id, 'Downloading video JSON') title = media['name'] @@ -177,7 +178,8 @@ class RaiPlayIE(RaiBaseIE): season = media.get('season') info = { - 'id': video_id, + 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, + 'display_id': video_id, 'title': self._live_title(title) if relinker_info.get( 'is_live') else title, 'alt_title': strip_or_none(media.get('subtitle')), @@ -199,9 +201,9 @@ class RaiPlayIE(RaiBaseIE): return info -class RaiPlayLiveIE(RaiBaseIE): +class RaiPlayLiveIE(RaiPlayIE): _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))' - _TEST = { + _TESTS = [{ 'url': 'http://www.raiplay.it/dirette/rainews24', 'info_dict': { 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', @@ -216,35 +218,7 @@ class RaiPlayLiveIE(RaiBaseIE): 'params': { 'skip_download': True, }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - base, display_id, = mobj.group('base', 'id') - - media = self._download_json( - '%s.json' % base, - display_id, 'Downloading channel JSON') - - title = media['name'] - video = media['video'] - video_id = media['id'].replace('ContentItem-', '') - - relinker_info = self._extract_relinker_info(video['content_url'], video_id) - self._sort_formats(relinker_info['formats']) - - info = { - 'id': video_id, - 
'display_id': display_id, - 'title': self._live_title(title) if relinker_info.get( - 'is_live') else title, - 'description': media.get('description'), - 'uploader': strip_or_none(media.get('channel')), - 'creator': strip_or_none(media.get('editor')), - } - - info.update(relinker_info) - return info + }] class RaiPlayPlaylistIE(InfoExtractor): @@ -260,36 +234,34 @@ class RaiPlayPlaylistIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - base, playlist_id, = mobj.group('base', 'id') + base, playlist_id = re.match(self._VALID_URL, url).groups() - media = self._download_json( - '%s.json' % base, - playlist_id, 'Downloading program JSON') - - title = media.get('name') - description = None - if media.get('program_info') and media['program_info'].get('description'): - description = media['program_info']['description'] + program = self._download_json( + base + '.json', playlist_id, 'Downloading program JSON') entries = [] - for b in media.get('blocks', []): - for s in b.get('sets', []): - cs = s.get('id') - if not cs: + for b in (program.get('blocks') or []): + for s in (b.get('sets') or []): + s_id = s.get('id') + if not s_id: continue medias = self._download_json( - '%s/%s.json' % (base, cs), - cs, 'Downloading content set JSON', fatal=False) + '%s/%s.json' % (base, s_id), s_id, + 'Downloading content set JSON', fatal=False) if not medias: continue - for m in medias['items']: - video_url = urljoin(url, m['path_id']) + for m in (medias.get('items') or []): + path_id = m.get('path_id') + if not path_id: + continue + video_url = urljoin(url, path_id) entries.append(self.url_result( video_url, ie=RaiPlayIE.ie_key(), video_id=RaiPlayIE._match_id(video_url))) - return self.playlist_result(entries, playlist_id, title, description) + return self.playlist_result( + entries, playlist_id, program.get('name'), + try_get(program, lambda x: x['program_info']['description'])) class RaiIE(RaiBaseIE): From 493d2796046ac269c105e69704bb86b6e74d3f93 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:19:02 +0100 Subject: [PATCH 034/384] [rai] fix unavailable video format detection --- haruhi_dl/extractor/rai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/rai.py b/haruhi_dl/extractor/rai.py index b072a0f38..06958966f 100644 --- a/haruhi_dl/extractor/rai.py +++ b/haruhi_dl/extractor/rai.py @@ -68,7 +68,7 @@ class RaiBaseIE(InfoExtractor): # This does not imply geo restriction (e.g. 
# http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) - if media_url == 'http://download.rai.it/video_no_available.mp4': + if '/video_no_available.mp4' in media_url: continue ext = determine_ext(media_url) From a7324932923e9fee73a92441e1c033fb1a6c071d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:19:07 +0100 Subject: [PATCH 035/384] [rai] fix protocol relative relinker URLs(closes #22766) --- haruhi_dl/extractor/rai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/rai.py b/haruhi_dl/extractor/rai.py index 06958966f..ecb628f14 100644 --- a/haruhi_dl/extractor/rai.py +++ b/haruhi_dl/extractor/rai.py @@ -424,7 +424,7 @@ class RaiIE(RaiBaseIE): except ExtractorError: pass - relinker_url = self._search_regex( + relinker_url = self._proto_relative_url(self._search_regex( r'''(?x) (?: var\s+videoURL| @@ -436,7 +436,7 @@ class RaiIE(RaiBaseIE): //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 ''', - webpage, 'relinker URL', group='url') + webpage, 'relinker URL', group='url')) relinker_info = self._extract_relinker_info( urljoin(url, relinker_url), video_id) From 3ffb6438448804348c3a0c9f45b16fdbc5118c41 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:19:12 +0100 Subject: [PATCH 036/384] [discoverynetworks] add support new TLC/DMAX URLs(closes #27100) --- haruhi_dl/extractor/discoverynetworks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/discoverynetworks.py b/haruhi_dl/extractor/discoverynetworks.py index 607a54948..c512b95d0 100644 --- a/haruhi_dl/extractor/discoverynetworks.py +++ b/haruhi_dl/extractor/discoverynetworks.py @@ -7,7 +7,7 @@ from .dplay import DPlayIE class DiscoveryNetworksDeIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show)/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)' _TESTS = [{ 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', @@ -29,6 +29,9 @@ class DiscoveryNetworksDeIE(DPlayIE): }, { 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', 'only_matching': True, + }, { + 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', + 'only_matching': True, }] def _real_extract(self, url): From 8175a5e8b18d1ad04ce412eecafa01f65e6f0f53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:19:38 +0100 Subject: [PATCH 037/384] =?UTF-8?q?[YoutubeDL]=20Fix=20--ignore-errors=20f?= =?UTF-8?q?or=20playlists=20with=20generator-based=20en=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …tries of url_transparent (closes #27064) --- haruhi_dl/HaruhiDL.py | 52 +++++++++++++++++++------------- test/test_HaruhiDL.py | 70 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 21 deletions(-) diff --git a/haruhi_dl/HaruhiDL.py b/haruhi_dl/HaruhiDL.py index 0a1a5a5a9..e67c01a9d 100755 --- a/haruhi_dl/HaruhiDL.py +++ b/haruhi_dl/HaruhiDL.py @@ -797,21 +797,14 @@ class HaruhiDL(object): self.report_warning('The program functionality for this 
site has been marked as broken, ' 'and will probably not work.') + return self.__extract_info(url, ie, download, extra_info, process) + else: + self.report_error('no suitable InfoExtractor for URL %s' % url) + + def __handle_extraction_exceptions(func): + def wrapper(self, *args, **kwargs): try: - ie_result = ie.extract(url) - if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) - break - if isinstance(ie_result, list): - # Backwards compatibility: old IE result format - ie_result = { - '_type': 'compat_list', - 'entries': ie_result, - } - self.add_default_extra_info(ie_result, ie, url) - if process: - return self.process_ie_result(ie_result, download, extra_info) - else: - return ie_result + return func(self, *args, **kwargs) except GeoRestrictedError as e: msg = e.msg if e.countries: @@ -819,20 +812,33 @@ class HaruhiDL(object): map(ISO3166Utils.short2full, e.countries)) msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' self.report_error(msg) - break except ExtractorError as e: # An error we somewhat expected self.report_error(compat_str(e), e.format_traceback()) - break except MaxDownloadsReached: raise except Exception as e: if self.params.get('ignoreerrors', False): self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) - break else: raise + return wrapper + + @__handle_extraction_exceptions + def __extract_info(self, url, ie, download, extra_info, process): + ie_result = ie.extract(url) + if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) + return + if isinstance(ie_result, list): + # Backwards compatibility: old IE result format + ie_result = { + '_type': 'compat_list', + 'entries': ie_result, + } + self.add_default_extra_info(ie_result, ie, url) + if process: + return self.process_ie_result(ie_result, download, extra_info) else: - self.report_error('no suitable InfoExtractor for URL %s' % url) + return ie_result def add_default_extra_info(self, ie_result, ie, url): self.add_extra_info(ie_result, { @@ -1007,9 +1013,8 @@ class HaruhiDL(object): self.to_screen('[download] ' + reason) continue - entry_result = self.process_ie_result(entry, - download=download, - extra_info=extra) + entry_result = self.__process_iterable_entry(entry, download, extra) + # TODO: skip failed (empty) entries? 
playlist_results.append(entry_result) ie_result['entries'] = playlist_results self.to_screen('[download] Finished downloading playlist: %s' % playlist) @@ -1038,6 +1043,11 @@ class HaruhiDL(object): else: raise Exception('Invalid result type: %s' % result_type) + @__handle_extraction_exceptions + def __process_iterable_entry(self, entry, download, extra_info): + return self.process_ie_result( + entry, download=download, extra_info=extra_info) + def _build_format_filter(self, filter_spec): " Returns a function to filter the formats according to the filter_spec " diff --git a/test/test_HaruhiDL.py b/test/test_HaruhiDL.py index c6346118a..7b93d0cdb 100644 --- a/test/test_HaruhiDL.py +++ b/test/test_HaruhiDL.py @@ -922,6 +922,76 @@ class TestHaruhiDL(unittest.TestCase): self.assertEqual(downloaded['extractor'], 'testex') self.assertEqual(downloaded['extractor_key'], 'TestEx') + # Test case for https://github.com/hdl-org/haruhi-dl/issues/27064 + def test_ignoreerrors_for_playlist_with_url_transparent_iterable_entries(self): + + class _YDL(YDL): + def __init__(self, *args, **kwargs): + super(_YDL, self).__init__(*args, **kwargs) + + def trouble(self, s, tb=None): + pass + + ydl = _YDL({ + 'format': 'extra', + 'ignoreerrors': True, + }) + + class VideoIE(InfoExtractor): + _VALID_URL = r'video:(?P<id>\d+)' + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = [{ + 'format_id': 'default', + 'url': 'url:', + }] + if video_id == '0': + raise ExtractorError('foo') + if video_id == '2': + formats.append({ + 'format_id': 'extra', + 'url': TEST_URL, + }) + return { + 'id': video_id, + 'title': 'Video %s' % video_id, + 'formats': formats, + } + + class PlaylistIE(InfoExtractor): + _VALID_URL = r'playlist:' + + def _entries(self): + for n in range(3): + video_id = compat_str(n) + yield { + '_type': 'url_transparent', + 'ie_key': VideoIE.ie_key(), + 'id': video_id, + 'url': 'video:%s' % video_id, + 'title': 'Video Transparent %s' % video_id, + } + + def _real_extract(self, url): + return self.playlist_result(self._entries()) + + ydl.add_info_extractor(VideoIE(ydl)) + ydl.add_info_extractor(PlaylistIE(ydl)) + info = ydl.extract_info('playlist:') + entries = info['entries'] + self.assertEqual(len(entries), 3) + self.assertTrue(entries[0] is None) + self.assertTrue(entries[1] is None) + self.assertEqual(len(ydl.downloaded_info_dicts), 1) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(entries[2], downloaded) + self.assertEqual(downloaded['url'], TEST_URL) + self.assertEqual(downloaded['title'], 'Video Transparent 2') + self.assertEqual(downloaded['id'], '2') + self.assertEqual(downloaded['extractor'], 'Video') + self.assertEqual(downloaded['extractor_key'], 'Video') + if __name__ == '__main__': unittest.main() From acfb99b684a5b7cdb9b02691ea6f476b0eff88c6 Mon Sep 17 00:00:00 2001 From: Laura Liberda <laura@selfisekai.rocks> Date: Fri, 26 Feb 2021 14:27:42 +0100 Subject: [PATCH 038/384] improve copykitku patch hook --- devscripts/copykitku-patch-hook.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/devscripts/copykitku-patch-hook.js b/devscripts/copykitku-patch-hook.js index cd6e93a84..1db74257a 100644 --- a/devscripts/copykitku-patch-hook.js +++ b/devscripts/copykitku-patch-hook.js @@ -4,16 +4,18 @@ module.exports = function patchHook(patchContent) { [ + [/(?:youtube-|yt-?)dl\.org/g, 'haruhi.download'], [/youtube_dl/g, 'haruhi_dl'], [/youtube-dl/g, 'haruhi-dl'], [/youtubedl/g, 'haruhidl'], [/YoutubeDL/g, 'HaruhiDL'], [/ytdl/g, 'hdl'], - 
[/(?:youtube-|yt-?)dl\.org/g, 'haruhi.download'], [/yt-dl/g, 'h-dl'], + [/ydl/g, 'hdl'], // prevent from linking to non-existent repository [/github\.com\/ytdl-org\/haruhi-dl/g, 'github.com/ytdl-org/youtube-dl'], + [/github\.com\/rg3\/haruhi-dl/g, 'github.com/ytdl-org/youtube-dl'], // prevent changing the smuggle URLs (for compatibility with ytdl) [/__haruhidl_smuggle/g, '__youtubedl_smuggle'], ].forEach(([regex, replacement]) => patchContent = patchContent.replace(regex, replacement)); From bb0f8c2607eb0cfb1b21e1733a2d70a17f568458 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:27:49 +0100 Subject: [PATCH 039/384] =?UTF-8?q?[downloader/http]=20Fix=20crash=20durin?= =?UTF-8?q?g=20urlopen=20caused=20by=20missing=20reason=20o=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …f URLError --- haruhi_dl/downloader/http.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/downloader/http.py b/haruhi_dl/downloader/http.py index c9c908b34..6d3d8f1e5 100644 --- a/haruhi_dl/downloader/http.py +++ b/haruhi_dl/downloader/http.py @@ -109,7 +109,9 @@ class HttpFD(FileDownloader): try: ctx.data = self.hdl.urlopen(request) except (compat_urllib_error.URLError, ) as err: - if isinstance(err.reason, socket.timeout): + # reason may not be available, e.g. for urllib2.HTTPError on python 2.6 + reason = getattr(err, 'reason', None) + if isinstance(reason, socket.timeout): raise RetryDownload(err) raise err # When trying to resume, Content-Range HTTP header of response has to be checked From f3c426a2ee0db526c1e3a732d67ffc95f6b41d52 Mon Sep 17 00:00:00 2001 From: renalid <renalid@gmail.com> Date: Fri, 26 Feb 2021 14:28:03 +0100 Subject: [PATCH 040/384] [francetv] Update to fix thumbnail URL issue (#27120) Fix the thumbnail URL. The issue was here for many years, never fixed. It's done ! 
:-)

Example: https://www.france.tv/france-2/de-gaulle-l-eclat-et-le-secret/de-gaulle-l-eclat-et-le-secret-saison-1/2035247-solitude.html

Broken thumbnail URL previously generated: http://pluzz.francetv.fr/staticftv/ref_emissions/2020-11-02/EMI_1104da66f533cc7dc5d0d07a181a18c2e2fe1d81_20201014122553940.jpg

Correct thumbnail URL after the fix: https://sivideo.webservices.francetelevisions.fr/staticftv/ref_emissions/2020-11-02/EMI_1104da66f533cc7dc5d0d07a181a18c2e2fe1d81_20201014122553940.jpg
---
 haruhi_dl/extractor/francetv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/francetv.py b/haruhi_dl/extractor/francetv.py
index ab2280630..3cb17751e 100644
--- a/haruhi_dl/extractor/francetv.py
+++ b/haruhi_dl/extractor/francetv.py
@@ -211,7 +211,7 @@ class FranceTVIE(InfoExtractor):
             'id': video_id,
             'title': self._live_title(title) if is_live else title,
             'description': clean_html(info.get('synopsis')),
-            'thumbnail': urljoin('http://pluzz.francetv.fr', info.get('image')),
+            'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', info.get('image')),
             'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')),
             'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])),
             'is_live': is_live,

From 968583c56f3731d30f89f87675014a1e1ab8b0ee Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:28:13 +0100
Subject: [PATCH 041/384] [infoq] fix format extraction(closes #25984)

---
 haruhi_dl/extractor/infoq.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/haruhi_dl/extractor/infoq.py b/haruhi_dl/extractor/infoq.py
index 18249cf9b..0a70a1fb4 100644
--- a/haruhi_dl/extractor/infoq.py
+++ b/haruhi_dl/extractor/infoq.py
@@ -54,7 +54,7 @@ class InfoQIE(BokeCCBaseIE):
 
     def _extract_rtmp_video(self, webpage):
         # The server URL is hardcoded
-        video_url = 'rtmpe://video.infoq.com/cfx/st/'
+        video_url = 'rtmpe://videof.infoq.com/cfx/st/'
 
         # Extract video URL
         encoded_id = self._search_regex(
@@ -86,17 +86,18 @@ class InfoQIE(BokeCCBaseIE):
         return [{
             'format_id': 'http_video',
             'url': http_video_url,
+            'http_headers': {'Referer': 'https://www.infoq.com/'},
         }]
 
     def _extract_http_audio(self, webpage, video_id):
-        fields = self._hidden_inputs(webpage)
+        fields = self._form_hidden_inputs('mp3Form', webpage)
         http_audio_url = fields.get('filename')
         if not http_audio_url:
             return []
 
         # base URL is found in the Location header in the response returned by
         # GET https://www.infoq.com/mp3download.action?filename=... when logged in.
-        http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url)
+        http_audio_url = compat_urlparse.urljoin('http://ress.infoq.com/downloads/mp3downloads/', http_audio_url)
         http_audio_url = update_url_query(http_audio_url, self._extract_cf_auth(webpage))
 
         # audio file seem to be missing some times even if there is a download link

From 9e816eca8f20a327e40e3612937efd4a3172784e Mon Sep 17 00:00:00 2001
From: Mattias Wadman <mattias.wadman@gmail.com>
Date: Fri, 26 Feb 2021 14:32:52 +0100
Subject: [PATCH 042/384] [svt] Extract timestamp and thumbnail in more cases (#27130)

Add timestamp, set to "valid from", which I think can be seen as the publish time.

Add thumbnail in more cases; it seems this was previously only done in the embedded data case for some reason.

Switch the svtplay test URL to an existing video that also has no expiry date.

Also add an additional thumbnail URL test regex.
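
For reference, a minimal sketch of the conversion this relies on, using the unified_timestamp helper that this patch imports from ..utils. The sample "validFrom" value below is hypothetical (a stand-in for a real SVT API response); only the resulting epoch value is taken from the updated test:

    from haruhi_dl.utils import unified_timestamp

    # "validFrom" is an ISO 8601 string in SVT's rights object; this sample
    # value is a hypothetical stand-in for a real API response.
    rights = {'validFrom': '2020-04-05T02:00:00+02:00'}

    # unified_timestamp() parses the string (including the UTC offset) into a
    # Unix epoch integer, and returns None when the key is missing or the
    # value is unparsable, so videos without rights metadata simply get no
    # timestamp instead of raising.
    timestamp = unified_timestamp(rights.get('validFrom'))
    print(timestamp)  # 1586044800, i.e. 2020-04-05 UTC, matching the new test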
--- haruhi_dl/extractor/svt.py | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/haruhi_dl/extractor/svt.py b/haruhi_dl/extractor/svt.py index a5e480f0b..0f8b2d61f 100644 --- a/haruhi_dl/extractor/svt.py +++ b/haruhi_dl/extractor/svt.py @@ -9,6 +9,7 @@ from ..utils import ( determine_ext, dict_get, int_or_none, + unified_timestamp, str_or_none, strip_or_none, try_get, @@ -44,7 +45,8 @@ class SVTBaseIE(InfoExtractor): 'format_id': player_type, 'url': vurl, }) - if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): + rights = try_get(video_info, lambda x: x['rights'], dict) or {} + if not formats and rights.get('geoBlockedSweden'): self.raise_geo_restricted( 'This video is only available in Sweden', countries=self._GEO_COUNTRIES) @@ -70,6 +72,7 @@ class SVTBaseIE(InfoExtractor): episode = video_info.get('episodeTitle') episode_number = int_or_none(video_info.get('episodeNumber')) + timestamp = unified_timestamp(rights.get('validFrom')) duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration'))) age_limit = None adult = dict_get( @@ -84,6 +87,7 @@ class SVTBaseIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, 'duration': duration, + 'timestamp': timestamp, 'age_limit': age_limit, 'series': series, 'season_number': season_number, @@ -140,21 +144,30 @@ class SVTPlayIE(SVTPlayBaseIE): ) ''' _TESTS = [{ - 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', - 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', + 'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen', + 'md5': '2382036fd6f8c994856c323fe51c426e', 'info_dict': { - 'id': '5996901', + 'id': 'jNwpV9P', 'ext': 'mp4', - 'title': 'Flygplan till Haile Selassie', - 'duration': 3527, - 'thumbnail': r're:^https?://.*[\.-]jpg$', + 'title': 'Det h\xe4r \xe4r himlen', + 'timestamp': 1586044800, + 'upload_date': '20200405', + 'duration': 3515, + 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', 'age_limit': 0, 'subtitles': { 'sv': [{ - 'ext': 'wsrt', + 'ext': 'vtt', }] }, }, + 'params': { + 'format': 'bestvideo', + # skip for now due to download test asserts that segment is > 10000 bytes and svt uses + # init segments that are smaller + # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B + 'skip_download': True, + }, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -235,7 +248,10 @@ class SVTPlayIE(SVTPlayBaseIE): r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'), webpage, 'video id') - return self._extract_by_video_id(svt_id, webpage) + info_dict = self._extract_by_video_id(svt_id, webpage) + info_dict['thumbnail'] = thumbnail + + return info_dict class SVTSeriesIE(SVTPlayBaseIE): From 5a94d1b61d23f78644e7aec3b3b45a9ff3358742 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:32:59 +0100 Subject: [PATCH 043/384] [svtplay] Add support for svt.se/barnkanalen (closes #24817) --- haruhi_dl/extractor/svt.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/svt.py b/haruhi_dl/extractor/svt.py index 0f8b2d61f..3b1908a82 100644 --- a/haruhi_dl/extractor/svt.py +++ b/haruhi_dl/extractor/svt.py @@ -139,7 +139,11 @@ class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' _VALID_URL = r'''(?x) (?: - svt:(?P<svt_id>[^/?#&]+)| + (?: + 
svt:| + https?://(?:www\.)?svt\.se/barnkanalen/barnplay/[^/]+/ + ) + (?P<svt_id>[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+) ) ''' @@ -184,6 +188,12 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'svt:14278044', 'only_matching': True, + }, { + 'url': 'https://www.svt.se/barnkanalen/barnplay/kar/eWv5MLX/', + 'only_matching': True, + }, { + 'url': 'svt:eWv5MLX', + 'only_matching': True, }] def _adjust_title(self, info): @@ -375,7 +385,7 @@ class SVTPageIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPageIE, cls).suitable(url) def _real_extract(self, url): path, display_id = re.match(self._VALID_URL, url).groups() From d64e153832c59fb3120cad5643b895286cb6e3a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:33:05 +0100 Subject: [PATCH 044/384] [svtplay] Fix test title --- haruhi_dl/extractor/svt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/svt.py b/haruhi_dl/extractor/svt.py index 3b1908a82..1c2e747c8 100644 --- a/haruhi_dl/extractor/svt.py +++ b/haruhi_dl/extractor/svt.py @@ -153,7 +153,7 @@ class SVTPlayIE(SVTPlayBaseIE): 'info_dict': { 'id': 'jNwpV9P', 'ext': 'mp4', - 'title': 'Det h\xe4r \xe4r himlen', + 'title': 'Det här är himlen', 'timestamp': 1586044800, 'upload_date': '20200405', 'duration': 3515, From 9a5816f425afe7d37c86aeb60f397bf3dc67f927 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:33:38 +0100 Subject: [PATCH 045/384] [pinterest] Add extractor (closes #25747) --- haruhi_dl/extractor/extractors.py | 4 + haruhi_dl/extractor/pinterest.py | 176 ++++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+) create mode 100644 haruhi_dl/extractor/pinterest.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index bd6003c93..27fb2062d 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -880,6 +880,10 @@ from .picarto import ( ) from .piksel import PikselIE from .pinkbike import PinkbikeIE +from .pinterest import ( + PinterestIE, + PinterestCollectionIE, +) from .pladform import PladformIE from .platzi import ( PlatziIE, diff --git a/haruhi_dl/extractor/pinterest.py b/haruhi_dl/extractor/pinterest.py new file mode 100644 index 000000000..2bb4ca660 --- /dev/null +++ b/haruhi_dl/extractor/pinterest.py @@ -0,0 +1,176 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +class PinterestBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' + + def _extract_resource(self, webpage, video_id): + return self._parse_json( + self._search_regex( + r'<script[^>]+\bid=["\']initial-state["\'][^>]*>({.+?})</script>', + webpage, 'application json'), + video_id)['resourceResponses'] + + def _extract_video(self, data, extract_formats=True): + video_id = data['id'] + + 
title = (data.get('title') or data.get('grid_title') or video_id).strip() + + formats = [] + duration = None + if extract_formats: + for format_id, format_dict in data['videos']['video_list'].items(): + if not isinstance(format_dict, dict): + continue + format_url = url_or_none(format_dict.get('url')) + if not format_url: + continue + duration = float_or_none(format_dict.get('duration'), scale=1000) + ext = determine_ext(format_url) + if 'hls' in format_id.lower() or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'width': int_or_none(format_dict.get('width')), + 'height': int_or_none(format_dict.get('height')), + 'duration': duration, + }) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) + + description = data.get('description') or data.get('description_html') or data.get('seo_description') + timestamp = unified_timestamp(data.get('created_at')) + + def _u(field): + return try_get(data, lambda x: x['closeup_attribution'][field], compat_str) + + uploader = _u('full_name') + uploader_id = _u('id') + + repost_count = int_or_none(data.get('repin_count')) + comment_count = int_or_none(data.get('comment_count')) + categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list) + tags = data.get('hashtags') + + thumbnails = [] + images = data.get('images') + if isinstance(images, dict): + for thumbnail_id, thumbnail in images.items(): + if not isinstance(thumbnail, dict): + continue + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'repost_count': repost_count, + 'comment_count': comment_count, + 'categories': categories, + 'tags': tags, + 'formats': formats, + 'extractor_key': PinterestIE.ie_key(), + } + + +class PinterestIE(PinterestBaseIE): + _VALID_URL = r'%s/pin/(?P<id>\d+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.com/pin/664281013778109217/', + 'md5': '6550c2af85d6d9f3fe3b88954d1577fc', + 'info_dict': { + 'id': '664281013778109217', + 'ext': 'mp4', + 'title': 'Origami', + 'description': 'md5:b9d90ddf7848e897882de9e73344f7dd', + 'duration': 57.7, + 'timestamp': 1593073622, + 'upload_date': '20200625', + 'uploader': 'Love origami -I am Dafei', + 'uploader_id': '586523688879454212', + 'repost_count': 50, + 'comment_count': 0, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://co.pinterest.com/pin/824721750502199491/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + data = self._extract_resource(webpage, video_id)[0]['response']['data'] + return self._extract_video(data) + + +class PinterestCollectionIE(PinterestBaseIE): + _VALID_URL = r'%s/[^/]+/(?P<id>[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', + 'info_dict': { + 'id': '585890301462791043', + 'title': 'cool diys', + }, + 'playlist_count': 8, + }] + + @classmethod + def suitable(cls, 
url): + return False if PinterestIE.suitable(url) else super( + PinterestCollectionIE, cls).suitable(url) + + def _real_extract(self, url): + collection_name = self._match_id(url) + webpage = self._download_webpage(url, collection_name) + resource = self._extract_resource(webpage, collection_name)[1] + entries = [] + for item in resource['response']['data']: + if not isinstance(item, dict) or item.get('type') != 'pin': + continue + video_id = item.get('id') + if video_id: + # Some pins may not be available anonymously via pin URL + # video = self._extract_video(item, extract_formats=False) + # video.update({ + # '_type': 'url_transparent', + # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, + # }) + # entries.append(video) + entries.append(self._extract_video(item)) + title = try_get( + resource, lambda x: x['options']['board_title'], compat_str) + collection_id = try_get( + resource, lambda x: x['options']['board_id'], + compat_str) or collection_name + return self.playlist_result( + entries, playlist_id=collection_id, playlist_title=title) From ac852e57a0f46a28cfbfe1aafbfe4ab873bec7a9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:33:51 +0100 Subject: [PATCH 046/384] [extractor/common] add generic support for akamai http format extraction --- haruhi_dl/extractor/common.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py index fb616b05a..32a391a85 100644 --- a/haruhi_dl/extractor/common.py +++ b/haruhi_dl/extractor/common.py @@ -2614,6 +2614,7 @@ class InfoExtractor(object): def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats = [] + hdcore_sign = 'hdcore=3.7.0' f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') hds_host = hosts.get('hds') @@ -2626,6 +2627,7 @@ class InfoExtractor(object): for entry in f4m_formats: entry.update({'extra_param_to_segment_url': hdcore_sign}) formats.extend(f4m_formats) + m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') hls_host = hosts.get('hls') if hls_host: @@ -2633,6 +2635,31 @@ class InfoExtractor(object): formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + + http_host = hosts.get('http') + if http_host and 'hdnea=' not in manifest_url: + REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+' + qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') + qualities_length = len(qualities) + if len(formats) in (qualities_length + 1, qualities_length * 2 + 1): + i = 0 + http_formats = [] + for f in formats: + if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none': + for protocol in ('http', 'https'): + http_f = f.copy() + del http_f['manifest_url'] + http_url = re.sub( + REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url']) + http_f.update({ + 'format_id': http_f['format_id'].replace('hls-', protocol + '-'), + 'url': http_url, + 'protocol': protocol, + }) + http_formats.append(http_f) + i += 1 + formats.extend(http_formats) + return formats def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): From 186e07f960702ff25df8065ee7e9fb61c84495a8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:34:00 +0100 Subject: [PATCH 047/384] [skyit] add support for multiple Sky Italia websites(closes 
#26629) --- haruhi_dl/extractor/extractors.py | 10 ++ haruhi_dl/extractor/skyit.py | 239 ++++++++++++++++++++++++++++++ 2 files changed, 249 insertions(+) create mode 100644 haruhi_dl/extractor/skyit.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 27fb2062d..87d509df5 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1060,6 +1060,16 @@ from .shared import ( from .showroomlive import ShowRoomLiveIE from .sina import SinaIE from .sixplay import SixPlayIE +from .skyit import ( + SkyItPlayerIE, + SkyItVideoIE, + SkyItVideoLiveIE, + SkyItIE, + SkyItAcademyIE, + SkyItArteIE, + CieloTVItIE, + TV8ItIE, +) from .skylinewebcams import SkylineWebcamsIE from .skynewsarabia import ( SkyNewsArabiaIE, diff --git a/haruhi_dl/extractor/skyit.py b/haruhi_dl/extractor/skyit.py new file mode 100644 index 000000000..14a4d8d4c --- /dev/null +++ b/haruhi_dl/extractor/skyit.py @@ -0,0 +1,239 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + dict_get, + int_or_none, + parse_duration, + unified_timestamp, +) + + +class SkyItPlayerIE(InfoExtractor): + IE_NAME = 'player.sky.it' + _VALID_URL = r'https?://player\.sky\.it/player/(?:external|social)\.html\?.*?\bid=(?P<id>\d+)' + _GEO_BYPASS = False + _DOMAIN = 'sky' + _PLAYER_TMPL = 'https://player.sky.it/player/external.html?id=%s&domain=%s' + # http://static.sky.it/static/skyplayer/conf.json + _TOKEN_MAP = { + 'cielo': 'Hh9O7M8ks5yi6nSROL7bKYz933rdf3GhwZlTLMgvy4Q', + 'hotclub': 'kW020K2jq2lk2eKRJD2vWEg832ncx2EivZlTLQput2C', + 'mtv8': 'A5Nn9GGb326CI7vP5e27d7E4PIaQjota', + 'salesforce': 'C6D585FD1615272C98DE38235F38BD86', + 'sitocommerciale': 'VJwfFuSGnLKnd9Phe9y96WkXgYDCguPMJ2dLhGMb2RE', + 'sky': 'F96WlOd8yoFmLQgiqv6fNQRvHZcsWk5jDaYnDvhbiJk', + 'skyacademy': 'A6LAn7EkO2Q26FRy0IAMBekX6jzDXYL3', + 'skyarte': 'LWk29hfiU39NNdq87ePeRach3nzTSV20o0lTv2001Cd', + 'theupfront': 'PRSGmDMsg6QMGc04Obpoy7Vsbn7i2Whp', + } + + def _player_url_result(self, video_id): + return self.url_result( + self._PLAYER_TMPL % (video_id, self._DOMAIN), + SkyItPlayerIE.ie_key(), video_id) + + def _parse_video(self, video, video_id): + title = video['title'] + is_live = video.get('type') == 'live' + hls_url = video.get(('streaming' if is_live else 'hls') + '_url') + if not hls_url and video.get('geoblock' if is_live else 'geob'): + self.raise_geo_restricted(countries=['IT']) + + if is_live: + formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4') + else: + formats = self._extract_akamai_formats( + hls_url, video_id, {'http': 'videoplatform.sky.it'}) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'formats': formats, + 'thumbnail': dict_get(video, ('video_still', 'video_still_medium', 'thumb')), + 'description': video.get('short_desc') or None, + 'timestamp': unified_timestamp(video.get('create_date')), + 'duration': int_or_none(video.get('duration_sec')) or parse_duration(video.get('duration')), + 'is_live': is_live, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + domain = compat_parse_qs(compat_urllib_parse_urlparse( + url).query).get('domain', [None])[0] + token = dict_get(self._TOKEN_MAP, (domain, 'sky')) + video = self._download_json( + 'https://apid.sky.it/vdp/v1/getVideoData', + video_id, query={ + 'caller': 'sky', + 'id': video_id, + 'token': 
token + }, headers=self.geo_verification_headers()) + return self._parse_video(video, video_id) + + +class SkyItVideoIE(SkyItPlayerIE): + IE_NAME = 'video.sky.it' + _VALID_URL = r'https?://(?:masterchef|video|xfactor)\.sky\.it(?:/[^/]+)*/video/[0-9a-z-]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://video.sky.it/news/mondo/video/uomo-ucciso-da-uno-squalo-in-australia-631227', + 'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd', + 'info_dict': { + 'id': '631227', + 'ext': 'mp4', + 'title': 'Uomo ucciso da uno squalo in Australia', + 'timestamp': 1606036192, + 'upload_date': '20201122', + } + }, { + 'url': 'https://xfactor.sky.it/video/x-factor-2020-replay-audizioni-1-615820', + 'only_matching': True, + }, { + 'url': 'https://masterchef.sky.it/video/masterchef-9-cosa-e-successo-nella-prima-puntata-562831', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._player_url_result(video_id) + + +class SkyItVideoLiveIE(SkyItPlayerIE): + IE_NAME = 'video.sky.it:live' + _VALID_URL = r'https?://video\.sky\.it/diretta/(?P<id>[^/?&#]+)' + _TEST = { + 'url': 'https://video.sky.it/diretta/tg24', + 'info_dict': { + 'id': '1', + 'ext': 'mp4', + 'title': r're:Diretta TG24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'description': 'Guarda la diretta streaming di SkyTg24, segui con Sky tutti gli appuntamenti e gli speciali di Tg24.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + asset_id = compat_str(self._parse_json(self._search_regex( + r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', + webpage, 'next data'), display_id)['props']['initialState']['livePage']['content']['asset_id']) + livestream = self._download_json( + 'https://apid.sky.it/vdp/v1/getLivestream', + asset_id, query={'id': asset_id}) + return self._parse_video(livestream, asset_id) + + +class SkyItIE(SkyItPlayerIE): + IE_NAME = 'sky.it' + _VALID_URL = r'https?://(?:sport|tg24)\.sky\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://sport.sky.it/calcio/serie-a/2020/11/21/juventus-cagliari-risultato-gol', + 'info_dict': { + 'id': '631201', + 'ext': 'mp4', + 'title': 'Un rosso alla violenza: in campo per i diritti delle donne', + 'upload_date': '20201121', + 'timestamp': 1605995753, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'https://tg24.sky.it/mondo/2020/11/22/australia-squalo-uccide-uomo', + 'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd', + 'info_dict': { + 'id': '631227', + 'ext': 'mp4', + 'title': 'Uomo ucciso da uno squalo in Australia', + 'timestamp': 1606036192, + 'upload_date': '20201122', + }, + }] + _VIDEO_ID_REGEX = r'data-videoid="(\d+)"' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + self._VIDEO_ID_REGEX, webpage, 'video id') + return self._player_url_result(video_id) + + +class SkyItAcademyIE(SkyItIE): + IE_NAME = 'skyacademy.it' + _VALID_URL = r'https?://(?:www\.)?skyacademy\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.skyacademy.it/eventi-speciali/2019/07/05/a-lezione-di-cinema-con-sky-academy-/', + 'md5': 'ced5c26638b7863190cbc44dd6f6ba08', + 'info_dict': { + 'id': '523458', + 'ext': 'mp4', + 'title': 'Sky Academy "The Best CineCamp 2019"', + 'timestamp': 1562843784, + 'upload_date': '20190711', + } + }] + _DOMAIN = 
'skyacademy' + _VIDEO_ID_REGEX = r'id="news-videoId_(\d+)"' + + +class SkyItArteIE(SkyItIE): + IE_NAME = 'arte.sky.it' + _VALID_URL = r'https?://arte\.sky\.it/video/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://arte.sky.it/video/serie-musei-venezia-collezionismo-12-novembre/', + 'md5': '515aee97b87d7a018b6c80727d3e7e17', + 'info_dict': { + 'id': '627926', + 'ext': 'mp4', + 'title': "Musei Galleria Franchetti alla Ca' d'Oro Palazzo Grimani", + 'upload_date': '20201106', + 'timestamp': 1604664493, + } + }] + _DOMAIN = 'skyarte' + _VIDEO_ID_REGEX = r'(?s)<iframe[^>]+src="(?:https:)?//player\.sky\.it/player/external\.html\?[^"]*\bid=(\d+)' + + +class CieloTVItIE(SkyItIE): + IE_NAME = 'cielotv.it' + _VALID_URL = r'https?://(?:www\.)?cielotv\.it/video/(?P<id>[^.]+)\.html' + _TESTS = [{ + 'url': 'https://www.cielotv.it/video/Il-lunedi-e-sempre-un-dramma.html', + 'md5': 'c4deed77552ba901c2a0d9258320304b', + 'info_dict': { + 'id': '499240', + 'ext': 'mp4', + 'title': 'Il lunedì è sempre un dramma', + 'upload_date': '20190329', + 'timestamp': 1553862178, + } + }] + _DOMAIN = 'cielo' + _VIDEO_ID_REGEX = r'videoId\s*=\s*"(\d+)"' + + +class TV8ItIE(SkyItVideoIE): + IE_NAME = 'tv8.it' + _VALID_URL = r'https?://tv8\.it/showvideo/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://tv8.it/showvideo/630529/ogni-mattina-ucciso-asino-di-andrea-lo-cicero/18-11-2020/', + 'md5': '9ab906a3f75ea342ed928442f9dabd21', + 'info_dict': { + 'id': '630529', + 'ext': 'mp4', + 'title': 'Ogni mattina - Ucciso asino di Andrea Lo Cicero', + 'timestamp': 1605721374, + 'upload_date': '20201118', + } + }] + _DOMAIN = 'mtv8' From abe5d97246c8c782a4d54cc2bf530ee6b1fa36bc Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:34:06 +0100 Subject: [PATCH 048/384] [rumble] add support for embed pages(#10785) --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/rumble.py | 67 +++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 haruhi_dl/extractor/rumble.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 87d509df5..0fc7d1d7f 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1010,6 +1010,7 @@ from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETe from .rtvnh import RTVNHIE from .rtvs import RTVSIE from .ruhd import RUHDIE +from .rumble import RumbleEmbedIE from .rutube import ( RutubeIE, RutubeChannelIE, diff --git a/haruhi_dl/extractor/rumble.py b/haruhi_dl/extractor/rumble.py new file mode 100644 index 000000000..4a0225109 --- /dev/null +++ b/haruhi_dl/extractor/rumble.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + parse_iso8601, + try_get, +) + + +class RumbleEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)' + _TESTS = [{ + 'url': 'https://rumble.com/embed/v5pv5f', + 'md5': '36a18a049856720189f30977ccbb2c34', + 'info_dict': { + 'id': 'v5pv5f', + 'ext': 'mp4', + 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', + 'timestamp': 1571611968, + 'upload_date': '20191020', + } + }, { + 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._download_json( + 'https://rumble.com/embedJS/', video_id, + query={'request': 'video', 
'v': video_id}) + title = video['title'] + + formats = [] + for height, ua in (video.get('ua') or {}).items(): + for i in range(2): + f_url = try_get(ua, lambda x: x[i], compat_str) + if f_url: + ext = determine_ext(f_url) + f = { + 'ext': ext, + 'format_id': '%s-%sp' % (ext, height), + 'height': int_or_none(height), + 'url': f_url, + } + bitrate = try_get(ua, lambda x: x[i + 2]['bitrate']) + if bitrate: + f['tbr'] = int_or_none(bitrate) + formats.append(f) + self._sort_formats(formats) + + author = video.get('author') or {} + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': video.get('i'), + 'timestamp': parse_iso8601(video.get('pubDate')), + 'channel': author.get('name'), + 'channel_url': author.get('url'), + 'duration': int_or_none(video.get('duration')), + } From 7a0255f6e2b5b68efcf5dcc017a124a1fd1eb0c0 Mon Sep 17 00:00:00 2001 From: Jia Rong Yee <28086837+fourjr@users.noreply.github.com> Date: Fri, 26 Feb 2021 14:34:21 +0100 Subject: [PATCH 049/384] [nytimes] Add new cooking.nytimes.com extractor (#27143) * [nytimes] support cooking.nytimes.com, resolves #27112 Co-authored-by: remitamine <remitamine@gmail.com> --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/nytimes.py | 38 +++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 0fc7d1d7f..0a002df66 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -817,6 +817,7 @@ from .ntvru import NTVRuIE from .nytimes import ( NYTimesIE, NYTimesArticleIE, + NYTimesCookingIE, ) from .nuvid import NuvidIE from .nzz import NZZIE diff --git a/haruhi_dl/extractor/nytimes.py b/haruhi_dl/extractor/nytimes.py index fc78ca56c..976b1c694 100644 --- a/haruhi_dl/extractor/nytimes.py +++ b/haruhi_dl/extractor/nytimes.py @@ -221,3 +221,41 @@ class NYTimesArticleIE(NYTimesBaseIE): r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'), webpage, 'podcast data') return self._extract_podcast_from_json(podcast_data, page_id, webpage) + + +class NYTimesCookingIE(NYTimesBaseIE): + _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart', + 'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3', + 'info_dict': { + 'id': '100000004756089', + 'ext': 'mov', + 'timestamp': 1479383008, + 'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON', + 'title': 'Cranberry Tart', + 'upload_date': '20161117', + 'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.', + }, + }, { + 'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey', + 'md5': '4b2e8c70530a89b8d905a2b572316eb8', + 'info_dict': { + 'id': '100000003951728', + 'ext': 'mov', + 'timestamp': 1445509539, + 'description': 'Turkey guide', + 'upload_date': '20151022', + 'title': 'Turkey', + } + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + + webpage = self._download_webpage(url, page_id) + + video_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'video id') + + return self._extract_video_from_id(video_id) From e1c07eb79f62e6305dc2b63766509433852df6a8 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:34:37 +0100 Subject: [PATCH 050/384] [box] Add new extractor(#5949) --- haruhi_dl/extractor/box.py | 98 +++++++++++++++++++++++++++++++ haruhi_dl/extractor/extractors.py | 1 + 2 files 
changed, 99 insertions(+) create mode 100644 haruhi_dl/extractor/box.py diff --git a/haruhi_dl/extractor/box.py b/haruhi_dl/extractor/box.py new file mode 100644 index 000000000..aae82d1af --- /dev/null +++ b/haruhi_dl/extractor/box.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + parse_iso8601, + # try_get, + update_url_query, +) + + +class BoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/]+)/file/(?P<id>\d+)' + _TEST = { + 'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538', + 'md5': '1f81b2fd3960f38a40a3b8823e5fcd43', + 'info_dict': { + 'id': '510727257538', + 'ext': 'mp4', + 'title': 'Garber St. Louis will be 28th MLS team +scarving.mp4', + 'uploader': 'MLS Video', + 'timestamp': 1566320259, + 'upload_date': '20190820', + 'uploader_id': '235196876', + } + } + + def _real_extract(self, url): + shared_name, file_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, file_id) + request_token = self._parse_json(self._search_regex( + r'Box\.config\s*=\s*({.+?});', webpage, + 'Box config'), file_id)['requestToken'] + access_token = self._download_json( + 'https://app.box.com/app-api/enduserapp/elements/tokens', file_id, + 'Downloading token JSON metadata', + data=json.dumps({'fileIDs': [file_id]}).encode(), headers={ + 'Content-Type': 'application/json', + 'X-Request-Token': request_token, + 'X-Box-EndUser-API': 'sharedName=' + shared_name, + })[file_id]['read'] + shared_link = 'https://app.box.com/s/' + shared_name + f = self._download_json( + 'https://api.box.com/2.0/files/' + file_id, file_id, + 'Downloading file JSON metadata', headers={ + 'Authorization': 'Bearer ' + access_token, + 'BoxApi': 'shared_link=' + shared_link, + 'X-Rep-Hints': '[dash]', # TODO: extract `hls` formats + }, query={ + 'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size' + }) + title = f['name'] + + query = { + 'access_token': access_token, + 'shared_link': shared_link + } + + formats = [] + + # for entry in (try_get(f, lambda x: x['representations']['entries'], list) or []): + # entry_url_template = try_get( + # entry, lambda x: x['content']['url_template']) + # if not entry_url_template: + # continue + # representation = entry.get('representation') + # if representation == 'dash': + # TODO: append query to every fragment URL + # formats.extend(self._extract_mpd_formats( + # entry_url_template.replace('{+asset_path}', 'manifest.mpd'), + # file_id, query=query)) + + authenticated_download_url = f.get('authenticated_download_url') + if authenticated_download_url and f.get('is_download_available'): + formats.append({ + 'ext': f.get('extension') or determine_ext(title), + 'filesize': f.get('size'), + 'format_id': 'download', + 'url': update_url_query(authenticated_download_url, query), + }) + + self._sort_formats(formats) + + creator = f.get('created_by') or {} + + return { + 'id': file_id, + 'title': title, + 'formats': formats, + 'description': f.get('description') or None, + 'uploader': creator.get('name'), + 'timestamp': parse_iso8601(f.get('created_at')), + 'uploader_id': creator.get('id'), + } diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 0a002df66..234763076 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py 
@@ -132,6 +132,7 @@ from .blinkx import BlinkxIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bostonglobe import BostonGlobeIE +from .box import BoxIE from .bpb import BpbIE from .br import ( BRIE, From 950c574c2218a74610e9306a00cd8065078eb376 Mon Sep 17 00:00:00 2001 From: renalid <renalid@gmail.com> Date: Fri, 26 Feb 2021 14:35:11 +0100 Subject: [PATCH 051/384] [franceinter] add thumbnail url (#27153) Co-authored-by: remitamine <remitamine@gmail.com> --- haruhi_dl/extractor/franceinter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/haruhi_dl/extractor/franceinter.py b/haruhi_dl/extractor/franceinter.py index 05806895c..a009f4d38 100644 --- a/haruhi_dl/extractor/franceinter.py +++ b/haruhi_dl/extractor/franceinter.py @@ -16,6 +16,7 @@ class FranceInterIE(InfoExtractor): 'ext': 'mp3', 'title': 'Affaire Cahuzac : le contentieux du compte en Suisse', 'description': 'md5:401969c5d318c061f86bda1fa359292b', + 'thumbnail': r're:^https?://.*\.jpg', 'upload_date': '20160907', }, } @@ -31,6 +32,7 @@ class FranceInterIE(InfoExtractor): title = self._og_search_title(webpage) description = self._og_search_description(webpage) + thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) upload_date_str = self._search_regex( r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<', @@ -48,6 +50,7 @@ class FranceInterIE(InfoExtractor): 'id': video_id, 'title': title, 'description': description, + 'thumbnail' : thumbnail, 'upload_date': upload_date, 'formats': [{ 'url': video_url, From 3d030642c704eeefc3665da8fca56710427e2153 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:35:16 +0100 Subject: [PATCH 052/384] [franceinter] flake8 --- haruhi_dl/extractor/franceinter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/franceinter.py b/haruhi_dl/extractor/franceinter.py index a009f4d38..ae822a50e 100644 --- a/haruhi_dl/extractor/franceinter.py +++ b/haruhi_dl/extractor/franceinter.py @@ -50,7 +50,7 @@ class FranceInterIE(InfoExtractor): 'id': video_id, 'title': title, 'description': description, - 'thumbnail' : thumbnail, + 'thumbnail': thumbnail, 'upload_date': upload_date, 'formats': [{ 'url': video_url, From 2a368bc78e70c318847dc69a6d3b0ed55f693eaf Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:35:22 +0100 Subject: [PATCH 053/384] [pinterest] Add support for large collections(more than 25 pins) --- haruhi_dl/extractor/pinterest.py | 87 ++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 31 deletions(-) diff --git a/haruhi_dl/extractor/pinterest.py b/haruhi_dl/extractor/pinterest.py index 2bb4ca660..b249c9eda 100644 --- a/haruhi_dl/extractor/pinterest.py +++ b/haruhi_dl/extractor/pinterest.py @@ -1,6 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -16,12 +19,12 @@ from ..utils import ( class PinterestBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' - def _extract_resource(self, webpage, video_id): - return self._parse_json( - self._search_regex( - r'<script[^>]+\bid=["\']initial-state["\'][^>]*>({.+?})</script>', - webpage, 
'application json'), - video_id)['resourceResponses'] + def _call_api(self, resource, video_id, options): + return self._download_json( + 'https://www.pinterest.com/resource/%sResource/get/' % resource, + video_id, 'Download %s JSON metadata' % resource, query={ + 'data': json.dumps({'options': options}) + })['resource_response'] def _extract_video(self, data, extract_formats=True): video_id = data['id'] @@ -128,13 +131,16 @@ class PinterestIE(PinterestBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - data = self._extract_resource(webpage, video_id)[0]['response']['data'] + data = self._call_api( + 'Pin', video_id, { + 'field_set_key': 'unauth_react_main_pin', + 'id': video_id, + })['data'] return self._extract_video(data) class PinterestCollectionIE(PinterestBaseIE): - _VALID_URL = r'%s/[^/]+/(?P<id>[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE + _VALID_URL = r'%s/(?P<username>[^/]+)/(?P<id>[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE _TESTS = [{ 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', 'info_dict': { @@ -142,6 +148,14 @@ class PinterestCollectionIE(PinterestBaseIE): 'title': 'cool diys', }, 'playlist_count': 8, + }, { + 'url': 'https://www.pinterest.ca/fudohub/videos/', + 'info_dict': { + 'id': '682858430939307450', + 'title': 'VIDEOS', + }, + 'playlist_mincount': 365, + 'skip': 'Test with extract_formats=False', }] @classmethod @@ -150,27 +164,38 @@ class PinterestCollectionIE(PinterestBaseIE): PinterestCollectionIE, cls).suitable(url) def _real_extract(self, url): - collection_name = self._match_id(url) - webpage = self._download_webpage(url, collection_name) - resource = self._extract_resource(webpage, collection_name)[1] + username, slug = re.match(self._VALID_URL, url).groups() + board = self._call_api( + 'Board', slug, { + 'slug': slug, + 'username': username + })['data'] + board_id = board['id'] + options = { + 'board_id': board_id, + 'page_size': 250, + } + bookmark = None entries = [] - for item in resource['response']['data']: - if not isinstance(item, dict) or item.get('type') != 'pin': - continue - video_id = item.get('id') - if video_id: - # Some pins may not be available anonymously via pin URL - # video = self._extract_video(item, extract_formats=False) - # video.update({ - # '_type': 'url_transparent', - # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, - # }) - # entries.append(video) - entries.append(self._extract_video(item)) - title = try_get( - resource, lambda x: x['options']['board_title'], compat_str) - collection_id = try_get( - resource, lambda x: x['options']['board_id'], - compat_str) or collection_name + while True: + if bookmark: + options['bookmarks'] = [bookmark] + board_feed = self._call_api('BoardFeed', board_id, options) + for item in (board_feed.get('data') or []): + if not isinstance(item, dict) or item.get('type') != 'pin': + continue + video_id = item.get('id') + if video_id: + # Some pins may not be available anonymously via pin URL + # video = self._extract_video(item, extract_formats=False) + # video.update({ + # '_type': 'url_transparent', + # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, + # }) + # entries.append(video) + entries.append(self._extract_video(item)) + bookmark = board_feed.get('bookmark') + if not bookmark: + break return self.playlist_result( - entries, playlist_id=collection_id, playlist_title=title) + entries, playlist_id=board_id, playlist_title=board.get('name')) From c7196194719d03e47f5e506e26cde42d1e61ae8c Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:35:49 +0100 Subject: [PATCH 054/384] [nrk] Fix extraction --- haruhi_dl/extractor/nrk.py | 442 +++++++++++++++++++++---------------- 1 file changed, 257 insertions(+), 185 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 84aacbcda..4a395546f 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -9,6 +9,7 @@ from ..compat import ( compat_urllib_parse_unquote, ) from ..utils import ( + determine_ext, ExtractorError, int_or_none, js_to_json, @@ -16,17 +17,269 @@ from ..utils import ( parse_age_limit, parse_duration, try_get, + url_or_none, ) class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] - _api_host = None + +class NRKIE(NRKBaseIE): + _VALID_URL = r'''(?x) + (?: + nrk:| + https?:// + (?: + (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| + v8[-.]psapi\.nrk\.no/mediaelement/ + ) + ) + (?P<id>[^?\#&]+) + ''' + + _TESTS = [{ + # video + 'url': 'http://www.nrk.no/video/PS*150533', + 'md5': '706f34cdf1322577589e369e522b50ef', + 'info_dict': { + 'id': '150533', + 'ext': 'mp4', + 'title': 'Dompap og andre fugler i Piip-Show', + 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', + 'duration': 262, + } + }, { + # audio + 'url': 'http://www.nrk.no/video/PS*154915', + # MD5 is unstable + 'info_dict': { + 'id': '154915', + 'ext': 'flv', + 'title': 'Slik høres internett ut når du er blind', + 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', + 'duration': 20, + } + }, { + 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', + 'only_matching': True, + }, { + 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', + 'only_matching': True, + }] + + def _extract_from_playback(self, video_id): + manifest = self._download_json( + 'http://psapi.nrk.no/playback/manifest/%s' % video_id, + video_id, 'Downloading manifest JSON') + + playable = manifest['playable'] + + formats = [] + for asset in playable['assets']: + if not isinstance(asset, dict): + continue + if asset.get('encrypted'): + continue + format_url = url_or_none(asset.get('url')) + if not format_url: + continue + if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + data = self._download_json( + 'http://psapi.nrk.no/playback/metadata/%s' % video_id, + video_id, 'Downloading metadata JSON') + + preplay = data['preplay'] + titles = preplay['titles'] + title = titles['title'] + alt_title = titles.get('subtitle') + + description = preplay.get('description') + duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) + + thumbnails = [] + for image in try_get( + preplay, lambda x: x['poster']['images'], list) or []: + if not isinstance(image, dict): + continue + image_url = url_or_none(image.get('url')) + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('pixelWidth')), + 'height': int_or_none(image.get('pixelHeight')), + }) 
+ + return { + 'id': video_id, + 'title': title, + 'alt_title': alt_title, + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats, + } def _real_extract(self, url): video_id = self._match_id(url) + return self._extract_from_playback(video_id) + +class NRKTVIE(NRKBaseIE): + IE_DESC = 'NRK TV and NRK Radio' + _EPISODE_RE = r'(?P<id>[a-zA-Z]{4}\d{8})' + _VALID_URL = r'''(?x) + https?:// + (?:tv|radio)\.nrk(?:super)?\.no/ + (?:serie(?:/[^/]+){1,2}|program)/ + (?![Ee]pisodes)%s + (?:/\d{2}-\d{2}-\d{4})? + (?:\#del=(?P<part_id>\d+))? + ''' % _EPISODE_RE + _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') + _TESTS = [{ + 'url': 'https://tv.nrk.no/program/MDDP12000117', + 'md5': '8270824df46ec629b66aeaa5796b36fb', + 'info_dict': { + 'id': 'MDDP12000117AA', + 'ext': 'mp4', + 'title': 'Alarm Trolltunga', + 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', + 'duration': 2223, + 'age_limit': 6, + }, + }, { + 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', + 'md5': '9a167e54d04671eb6317a37b7bc8a280', + 'info_dict': { + 'id': 'MUHH48000314AA', + 'ext': 'mp4', + 'title': '20 spørsmål 23.05.2014', + 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', + 'duration': 1741, + 'series': '20 spørsmål', + 'episode': '23.05.2014', + }, + 'skip': 'NoProgramRights', + }, { + 'url': 'https://tv.nrk.no/program/mdfp15000514', + 'info_dict': { + 'id': 'MDFP15000514CA', + 'ext': 'mp4', + 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', + 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', + 'duration': 4605, + 'series': 'Kunnskapskanalen', + 'episode': '24.05.2014', + }, + 'params': { + 'skip_download': True, + }, + }, { + # single playlist video + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', + 'info_dict': { + 'id': 'MSPO40010515-part2', + 'ext': 'flv', + 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', + 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Video is geo restricted'], + 'skip': 'particular part is not supported currently', + }, { + 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', + 'playlist': [{ + 'info_dict': { + 'id': 'MSPO40010515AH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + 'duration': 772, + 'series': 'Tour de Ski', + 'episode': '06.01.2015', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'info_dict': { + 'id': 'MSPO40010515BH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + 'duration': 6175, + 'series': 'Tour de Ski', + 'episode': '06.01.2015', + }, + 'params': { + 'skip_download': True, + }, + }], + 'info_dict': { + 'id': 'MSPO40010515', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + }, + 'expected_warnings': ['Video is geo restricted'], + }, { + 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', + 'info_dict': { + 'id': 'KMTE50001317AA', + 'ext': 'mp4', + 'title': 'Anno 13:30', + 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', + 'duration': 2340, + 'series': 'Anno', + 'episode': '13:30', + 'season_number': 3, + 'episode_number': 13, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 
'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', + 'info_dict': { + 'id': 'MUHH46000317AA', + 'ext': 'mp4', + 'title': 'Nytt på Nytt 27.01.2017', + 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', + 'duration': 1796, + 'series': 'Nytt på nytt', + 'episode': '27.01.2017', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', + 'only_matching': True, + }] + + _api_host = None + + def _extract_from_mediaelement(self, video_id): api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS for api_host in api_hosts: @@ -195,190 +448,9 @@ class NRKBaseIE(InfoExtractor): return self.playlist_result(entries, video_id, title, description) - -class NRKIE(NRKBaseIE): - _VALID_URL = r'''(?x) - (?: - nrk:| - https?:// - (?: - (?:www\.)?nrk\.no/video/PS\*| - v8[-.]psapi\.nrk\.no/mediaelement/ - ) - ) - (?P<id>[^?#&]+) - ''' - _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no') - _TESTS = [{ - # video - 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': '706f34cdf1322577589e369e522b50ef', - 'info_dict': { - 'id': '150533', - 'ext': 'mp4', - 'title': 'Dompap og andre fugler i Piip-Show', - 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 262, - } - }, { - # audio - 'url': 'http://www.nrk.no/video/PS*154915', - # MD5 is unstable - 'info_dict': { - 'id': '154915', - 'ext': 'flv', - 'title': 'Slik høres internett ut når du er blind', - 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', - 'duration': 20, - } - }, { - 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', - 'only_matching': True, - }, { - 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }] - - -class NRKTVIE(NRKBaseIE): - IE_DESC = 'NRK TV and NRK Radio' - _EPISODE_RE = r'(?P<id>[a-zA-Z]{4}\d{8})' - _VALID_URL = r'''(?x) - https?:// - (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie(?:/[^/]+){1,2}|program)/ - (?![Ee]pisodes)%s - (?:/\d{2}-\d{2}-\d{4})? - (?:\#del=(?P<part_id>\d+))? 
- ''' % _EPISODE_RE - _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') - _TESTS = [{ - 'url': 'https://tv.nrk.no/program/MDDP12000117', - 'md5': '8270824df46ec629b66aeaa5796b36fb', - 'info_dict': { - 'id': 'MDDP12000117AA', - 'ext': 'mp4', - 'title': 'Alarm Trolltunga', - 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', - 'duration': 2223, - 'age_limit': 6, - }, - }, { - 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '9a167e54d04671eb6317a37b7bc8a280', - 'info_dict': { - 'id': 'MUHH48000314AA', - 'ext': 'mp4', - 'title': '20 spørsmål 23.05.2014', - 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', - 'duration': 1741, - 'series': '20 spørsmål', - 'episode': '23.05.2014', - }, - 'skip': 'NoProgramRights', - }, { - 'url': 'https://tv.nrk.no/program/mdfp15000514', - 'info_dict': { - 'id': 'MDFP15000514CA', - 'ext': 'mp4', - 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', - 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', - 'duration': 4605, - 'series': 'Kunnskapskanalen', - 'episode': '24.05.2014', - }, - 'params': { - 'skip_download': True, - }, - }, { - # single playlist video - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', - 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Video is geo restricted'], - 'skip': 'particular part is not supported currently', - }, { - 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', - 'playlist': [{ - 'info_dict': { - 'id': 'MSPO40010515AH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 772, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'info_dict': { - 'id': 'MSPO40010515BH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 6175, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }], - 'info_dict': { - 'id': 'MSPO40010515', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - }, - 'expected_warnings': ['Video is geo restricted'], - }, { - 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', - 'info_dict': { - 'id': 'KMTE50001317AA', - 'ext': 'mp4', - 'title': 'Anno 13:30', - 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', - 'duration': 2340, - 'series': 'Anno', - 'episode': '13:30', - 'season_number': 3, - 'episode_number': 13, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', - 'info_dict': { - 'id': 'MUHH46000317AA', - 'ext': 'mp4', - 'title': 'Nytt på Nytt 27.01.2017', - 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', - 'duration': 1796, - 'series': 'Nytt på nytt', - 'episode': '27.01.2017', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', - 'only_matching': True, - }, { - 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', - 'only_matching': True, - }] + def _real_extract(self, url): + video_id = 
self._match_id(url) + return self._extract_from_mediaelement(video_id) class NRKTVEpisodeIE(InfoExtractor): From 00088ef4b17ea4b261626dc7903094a5c6c25a27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:35:54 +0100 Subject: [PATCH 055/384] =?UTF-8?q?[downloader/fragment]=20Set=20final=20f?= =?UTF-8?q?ile's=20mtime=20according=20to=20last=20fragme=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …nt's Last-Modified header (closes #11718, closes #18384, closes #27138) --- haruhi_dl/downloader/fragment.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/downloader/fragment.py b/haruhi_dl/downloader/fragment.py index 65d8c881d..090941024 100644 --- a/haruhi_dl/downloader/fragment.py +++ b/haruhi_dl/downloader/fragment.py @@ -97,12 +97,15 @@ class FragmentFD(FileDownloader): def _download_fragment(self, ctx, frag_url, info_dict, headers=None): fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index']) - success = ctx['dl'].download(fragment_filename, { + fragment_info_dict = { 'url': frag_url, 'http_headers': headers or info_dict.get('http_headers'), - }) + } + success = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False, None + if fragment_info_dict.get('filetime'): + ctx['fragment_filetime'] = fragment_info_dict.get('filetime') down, frag_sanitized = sanitize_open(fragment_filename, 'rb') ctx['fragment_filename_sanitized'] = frag_sanitized frag_content = down.read() @@ -258,6 +261,13 @@ class FragmentFD(FileDownloader): downloaded_bytes = ctx['complete_frags_downloaded_bytes'] else: self.try_rename(ctx['tmpfilename'], ctx['filename']) + if self.params.get('updatetime', True): + filetime = ctx.get('fragment_filetime') + if filetime: + try: + os.utime(ctx['filename'], (time.time(), filetime)) + except Exception: + pass downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) self._hook_progress({ From 997dc3ca4446b0f9f02189dfdcdbf72befdfe70c Mon Sep 17 00:00:00 2001 From: Joshua Lochner <admin@xenova.com> Date: Fri, 26 Feb 2021 14:36:01 +0100 Subject: [PATCH 056/384] [medaltv] Add new extractor (#27149) --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/medaltv.py | 138 ++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 haruhi_dl/extractor/medaltv.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 234763076..5de842c31 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -640,6 +640,7 @@ from .mastodon import MastodonSHIE from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE from .mdr import MDRIE +from .medaltv import MedalTVIE from .mediaset import MediasetIE from .mediasite import ( MediasiteIE, diff --git a/haruhi_dl/extractor/medaltv.py b/haruhi_dl/extractor/medaltv.py new file mode 100644 index 000000000..06f7b6e92 --- /dev/null +++ b/haruhi_dl/extractor/medaltv.py @@ -0,0 +1,138 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + try_get, + float_or_none, + int_or_none +) + + +class MedalTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://medal.tv/clips/34934644/3Is9zyGMoBMr', + 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', + 
'info_dict': { + 'id': '34934644', + 'ext': 'mp4', + 'title': 'Quad Cold', + 'description': 'Medal,https://medal.tv/desktop/', + 'uploader': 'MowgliSB', + 'timestamp': 1603165266, + 'upload_date': '20201020', + 'uploader_id': 10619174, + } + }, { + 'url': 'https://medal.tv/clips/36787208', + 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', + 'info_dict': { + 'id': '36787208', + 'ext': 'mp4', + 'title': 'u tk me i tk u bigger', + 'description': 'Medal,https://medal.tv/desktop/', + 'uploader': 'Mimicc', + 'timestamp': 1605580939, + 'upload_date': '20201117', + 'uploader_id': 5156321, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + hydration_data = self._search_regex( + r'<script[^>]*>\s*(?:var\s*)?hydrationData\s*=\s*({.+?})\s*</script>', + webpage, 'hydration data', default='{}') + parsed = self._parse_json(hydration_data, video_id) + + clip_info = try_get(parsed, lambda x: x['clips'][video_id], dict) or {} + if not clip_info: + raise ExtractorError('Could not find video information.', + video_id=video_id) + + width = int_or_none(clip_info.get('sourceWidth')) + height = int_or_none(clip_info.get('sourceHeight')) + + aspect_ratio = (width / height) if(width and height) else (16 / 9) + + # ordered from lowest to highest resolution + heights = (144, 240, 360, 480, 720, 1080) + + formats = [] + thumbnails = [] + + for height in heights: + format_key = '{0}p'.format(height) + video_key = 'contentUrl{0}'.format(format_key) + thumbnail_key = 'thumbnail{0}'.format(format_key) + width = int(round(aspect_ratio * height)) + + # Second condition needed as sometimes medal says + # they have a format when in fact it is another format. + format_url = clip_info.get(video_key) + if(format_url and format_key in format_url): + formats.append({ + 'url': format_url, + 'format_id': format_key, + 'width': width, + 'height': height + }) + + thumbnail_url = clip_info.get(thumbnail_key) + if(thumbnail_url and format_key in thumbnail_url): + thumbnails.append({ + 'id': format_key, + 'url': thumbnail_url, + 'width': width, + 'height': height + }) + + # add source to formats + source_url = clip_info.get('contentUrl') + if(source_url): + formats.append({ + 'url': source_url, + 'format_id': 'source', + 'width': width, + 'height': height + }) + + error = clip_info.get('error') + if not formats and error: + if(error == 404): + raise ExtractorError('That clip does not exist.', + expected=True, video_id=video_id) + else: + raise ExtractorError('An unknown error occurred ({0}).'.format(error), + video_id=video_id) + + # Necessary because the id of the author is not known in advance. + # Won't raise an issue if no profile can be found as this is optional. 
+ author_info = try_get(parsed, + lambda x: list(x['profiles'].values())[0], dict + ) or {} + author_id = author_info.get('id') + author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None + + return { + 'id': video_id, + 'title': clip_info.get('contentTitle'), + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clip_info.get('contentDescription'), + + 'uploader': author_info.get('displayName'), + 'timestamp': float_or_none(clip_info.get('created'), 1000), + 'uploader_id': author_id, + 'uploader_url': author_url, + + 'duration': float_or_none(clip_info.get('videoLengthSeconds')), + 'view_count': int_or_none(clip_info.get('views')), + 'like_count': int_or_none(clip_info.get('likes')), + 'comment_count': int_or_none(clip_info.get('comments')) + } From d1114a12e179565e128fa6a384e04a6d45e78391 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:36:07 +0100 Subject: [PATCH 057/384] [medaltv] improve extraction --- haruhi_dl/extractor/medaltv.py | 131 ++++++++++++++++----------------- 1 file changed, 62 insertions(+), 69 deletions(-) diff --git a/haruhi_dl/extractor/medaltv.py b/haruhi_dl/extractor/medaltv.py index 06f7b6e92..1603b55f6 100644 --- a/haruhi_dl/extractor/medaltv.py +++ b/haruhi_dl/extractor/medaltv.py @@ -1,13 +1,16 @@ # coding: utf-8 - from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, - try_get, float_or_none, - int_or_none + int_or_none, + str_or_none, + try_get, ) @@ -45,94 +48,84 @@ class MedalTVIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - hydration_data = self._search_regex( + hydration_data = self._parse_json(self._search_regex( r'<script[^>]*>\s*(?:var\s*)?hydrationData\s*=\s*({.+?})\s*</script>', - webpage, 'hydration data', default='{}') - parsed = self._parse_json(hydration_data, video_id) + webpage, 'hydration data', default='{}'), video_id) - clip_info = try_get(parsed, lambda x: x['clips'][video_id], dict) or {} - if not clip_info: - raise ExtractorError('Could not find video information.', - video_id=video_id) + clip = try_get( + hydration_data, lambda x: x['clips'][video_id], dict) or {} + if not clip: + raise ExtractorError( + 'Could not find video information.', video_id=video_id) - width = int_or_none(clip_info.get('sourceWidth')) - height = int_or_none(clip_info.get('sourceHeight')) + title = clip['contentTitle'] - aspect_ratio = (width / height) if(width and height) else (16 / 9) + source_width = int_or_none(clip.get('sourceWidth')) + source_height = int_or_none(clip.get('sourceHeight')) - # ordered from lowest to highest resolution - heights = (144, 240, 360, 480, 720, 1080) + aspect_ratio = source_width / source_height if source_width and source_height else 16 / 9 - formats = [] - thumbnails = [] - - for height in heights: - format_key = '{0}p'.format(height) - video_key = 'contentUrl{0}'.format(format_key) - thumbnail_key = 'thumbnail{0}'.format(format_key) + def add_item(container, item_url, height, id_key='format_id', item_id=None): + item_id = item_id or '%dp' % height + if item_id not in item_url: + return width = int(round(aspect_ratio * height)) - - # Second condition needed as sometimes medal says - # they have a format when in fact it is another format. 
- format_url = clip_info.get(video_key) - if(format_url and format_key in format_url): - formats.append({ - 'url': format_url, - 'format_id': format_key, - 'width': width, - 'height': height - }) - - thumbnail_url = clip_info.get(thumbnail_key) - if(thumbnail_url and format_key in thumbnail_url): - thumbnails.append({ - 'id': format_key, - 'url': thumbnail_url, - 'width': width, - 'height': height - }) - - # add source to formats - source_url = clip_info.get('contentUrl') - if(source_url): - formats.append({ - 'url': source_url, - 'format_id': 'source', + container.append({ + 'url': item_url, + id_key: item_id, 'width': width, 'height': height }) - error = clip_info.get('error') + formats = [] + thumbnails = [] + for k, v in clip.items(): + if not (v and isinstance(v, compat_str)): + continue + mobj = re.match(r'(contentUrl|thumbnail)(?:(\d+)p)?$', k) + if not mobj: + continue + prefix = mobj.group(1) + height = int_or_none(mobj.group(2)) + if prefix == 'contentUrl': + add_item( + formats, v, height or source_height, + item_id=None if height else 'source') + elif prefix == 'thumbnail': + add_item(thumbnails, v, height, 'id') + + error = clip.get('error') if not formats and error: - if(error == 404): - raise ExtractorError('That clip does not exist.', - expected=True, video_id=video_id) + if error == 404: + raise ExtractorError( + 'That clip does not exist.', + expected=True, video_id=video_id) else: - raise ExtractorError('An unknown error occurred ({0}).'.format(error), - video_id=video_id) + raise ExtractorError( + 'An unknown error occurred ({0}).'.format(error), + video_id=video_id) + + self._sort_formats(formats) # Necessary because the id of the author is not known in advance. # Won't raise an issue if no profile can be found as this is optional. 
- author_info = try_get(parsed, - lambda x: list(x['profiles'].values())[0], dict - ) or {} - author_id = author_info.get('id') + author = try_get( + hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {} + author_id = str_or_none(author.get('id')) author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None return { 'id': video_id, - 'title': clip_info.get('contentTitle'), + 'title': title, 'formats': formats, 'thumbnails': thumbnails, - 'description': clip_info.get('contentDescription'), - - 'uploader': author_info.get('displayName'), - 'timestamp': float_or_none(clip_info.get('created'), 1000), + 'description': clip.get('contentDescription'), + 'uploader': author.get('displayName'), + 'timestamp': float_or_none(clip.get('created'), 1000), 'uploader_id': author_id, 'uploader_url': author_url, - - 'duration': float_or_none(clip_info.get('videoLengthSeconds')), - 'view_count': int_or_none(clip_info.get('views')), - 'like_count': int_or_none(clip_info.get('likes')), - 'comment_count': int_or_none(clip_info.get('comments')) + 'duration': int_or_none(clip.get('videoLengthSeconds')), + 'view_count': int_or_none(clip.get('views')), + 'like_count': int_or_none(clip.get('likes')), + 'comment_count': int_or_none(clip.get('comments')), } From 45eded9bd23596d982a8165e40cc8a44354ff6ed Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:36:12 +0100 Subject: [PATCH 058/384] [bbc] fix BBC News videos extraction --- haruhi_dl/extractor/bbc.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/bbc.py b/haruhi_dl/extractor/bbc.py index 1ff7834bf..fbd79ce4d 100644 --- a/haruhi_dl/extractor/bbc.py +++ b/haruhi_dl/extractor/bbc.py @@ -981,7 +981,7 @@ class BBCIE(BBCCoUkIE): group_id = self._search_regex( r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, webpage, 'group id', default=None) - if playlist_id: + if group_id: return self.url_result( 'https://www.bbc.co.uk/programmes/%s' % group_id, ie=BBCCoUkIE.ie_key()) @@ -1118,6 +1118,39 @@ class BBCIE(BBCCoUkIE): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + initial_data = self._parse_json(self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), playlist_id, fatal=False) + if initial_data: + def parse_media(media): + if not media: + return + for item in (try_get(media, lambda x: x['media']['items'], list) or []): + item_id = item.get('id') + item_title = item.get('title') + if not (item_id and item_title): + continue + formats, subtitles = self._download_media_selector(item_id) + self._sort_formats(formats) + entries.append({ + 'id': item_id, + 'title': item_title, + 'thumbnail': item.get('holdingImageUrl'), + 'formats': formats, + 'subtitles': subtitles, + }) + for resp in (initial_data.get('data') or {}).values(): + name = resp.get('name') + if name == 'media-experience': + parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) + elif name == 'article': + for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): + if block.get('type') != 'media': + continue + parse_media(block.get('model')) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), From 3f6dc5d4ef783523e3a2e3dc473a668470f2293c Mon Sep 
17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:36:19 +0100 Subject: [PATCH 059/384] [bbc] fix BBC Three clip extraction --- haruhi_dl/extractor/bbc.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/haruhi_dl/extractor/bbc.py b/haruhi_dl/extractor/bbc.py index fbd79ce4d..7aa3a11b5 100644 --- a/haruhi_dl/extractor/bbc.py +++ b/haruhi_dl/extractor/bbc.py @@ -1092,10 +1092,26 @@ class BBCIE(BBCCoUkIE): self._search_regex( r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, 'bbcthree config', default='{}'), - playlist_id, transform_source=js_to_json, fatal=False) - if bbc3_config: + playlist_id, transform_source=js_to_json, fatal=False) or {} + payload = bbc3_config.get('payload') or {} + if payload: + clip = payload.get('currentClip') or {} + clip_vpid = clip.get('vpid') + clip_title = clip.get('title') + if clip_vpid and clip_title: + formats, subtitles = self._download_media_selector(clip_vpid) + self._sort_formats(formats) + return { + 'id': clip_vpid, + 'title': clip_title, + 'thumbnail': dict_get(clip, ('poster', 'imageUrl')), + 'description': clip.get('description'), + 'duration': parse_duration(clip.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + } bbc3_playlist = try_get( - bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], + payload, lambda x: x['content']['bbcMedia']['playlist'], dict) if bbc3_playlist: playlist_title = bbc3_playlist.get('title') or playlist_title From 7a49184ca69784612ecee1d8e86bc6a3414816d2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:36:24 +0100 Subject: [PATCH 060/384] [viki] fix video API request(closes #27184) --- haruhi_dl/extractor/viki.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/viki.py b/haruhi_dl/extractor/viki.py index a003b7af8..a311f21ef 100644 --- a/haruhi_dl/extractor/viki.py +++ b/haruhi_dl/extractor/viki.py @@ -20,6 +20,7 @@ from ..utils import ( parse_age_limit, parse_iso8601, sanitized_Request, + std_headers, ) @@ -226,8 +227,10 @@ class VikiIE(VikiBaseIE): resp = self._download_json( 'https://www.viki.com/api/videos/' + video_id, - video_id, 'Downloading video JSON', - headers={'x-viki-app-ver': '4.0.57'}) + video_id, 'Downloading video JSON', headers={ + 'x-client-user-agent': std_headers['User-Agent'], + 'x-viki-app-ver': '4.0.57', + }) video = resp['video'] self._check_errors(video) From 79cd28f514a519a1221f7fc20fed914e4adad1f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:37:00 +0100 Subject: [PATCH 061/384] [spreaker] Add extractor (closes #13480, closes #13877) --- haruhi_dl/extractor/extractors.py | 6 + haruhi_dl/extractor/spreaker.py | 176 ++++++++++++++++++++++++++++++ 2 files changed, 182 insertions(+) create mode 100644 haruhi_dl/extractor/spreaker.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 5de842c31..3e26dfe40 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1129,6 +1129,12 @@ from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE +from .spreaker import ( + SpreakerIE, + SpreakerPageIE, + SpreakerShowIE, + SpreakerShowPageIE, +) from .springboardplatform import SpringboardPlatformIE from .sprout import SproutIE from .srgssr import ( diff --git 
a/haruhi_dl/extractor/spreaker.py b/haruhi_dl/extractor/spreaker.py new file mode 100644 index 000000000..beee6670c --- /dev/null +++ b/haruhi_dl/extractor/spreaker.py @@ -0,0 +1,176 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + int_or_none, + str_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +def _extract_episode(data, episode_id=None): + title = data['title'] + download_url = data['download_url'] + + series = try_get(data, lambda x: x['show']['title'], compat_str) + uploader = try_get(data, lambda x: x['author']['fullname'], compat_str) + + thumbnails = [] + for image in ('image_original', 'image_medium', 'image'): + image_url = url_or_none(data.get('%s_url' % image)) + if image_url: + thumbnails.append({'url': image_url}) + + def stats(key): + return int_or_none(try_get( + data, + (lambda x: x['%ss_count' % key], + lambda x: x['stats']['%ss' % key]))) + + def duration(key): + return float_or_none(data.get(key), scale=1000) + + return { + 'id': compat_str(episode_id or data['episode_id']), + 'url': download_url, + 'display_id': data.get('permalink'), + 'title': title, + 'description': data.get('description'), + 'timestamp': unified_timestamp(data.get('published_at')), + 'uploader': uploader, + 'uploader_id': str_or_none(data.get('author_id')), + 'creator': uploader, + 'duration': duration('duration') or duration('length'), + 'view_count': stats('play'), + 'like_count': stats('like'), + 'comment_count': stats('message'), + 'format': 'MPEG Layer 3', + 'format_id': 'mp3', + 'container': 'mp3', + 'ext': 'mp3', + 'thumbnails': thumbnails, + 'series': series, + 'extractor_key': SpreakerIE.ie_key(), + } + + +class SpreakerIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + api\.spreaker\.com/ + (?: + (?:download/)?episode| + v2/episodes + )/ + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'https://api.spreaker.com/episode/12534508', + 'info_dict': { + 'id': '12534508', + 'display_id': 'swm-ep15-how-to-market-your-music-part-2', + 'ext': 'mp3', + 'title': 'EP:15 | Music Marketing (Likes) - Part 2', + 'description': 'md5:0588c43e27be46423e183076fa071177', + 'timestamp': 1502250336, + 'upload_date': '20170809', + 'uploader': 'SWM', + 'uploader_id': '9780658', + 'duration': 1063.42, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'series': 'Success With Music (SWM)', + }, + }, { + 'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3', + 'only_matching': True, + }, { + 'url': 'https://api.spreaker.com/v2/episodes/12534508?export=episode_segments', + 'only_matching': True, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + data = self._download_json( + 'https://api.spreaker.com/v2/episodes/%s' % episode_id, + episode_id)['response']['episode'] + return _extract_episode(data, episode_id) + + +class SpreakerPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spreaker\.com/user/[^/]+/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/user/9780658/swm-ep15-how-to-market-your-music-part-2', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + episode_id = self._search_regex( + (r'data-episode_id=["\'](?P<id>\d+)', + r'episode_id\s*:\s*(?P<id>\d+)'), webpage, 'episode id') + return self.url_result( + 
'https://api.spreaker.com/episode/%s' % episode_id, + ie=SpreakerIE.ie_key(), video_id=episode_id) + + +class SpreakerShowIE(InfoExtractor): + _VALID_URL = r'https?://api\.spreaker\.com/show/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/show/3-ninjas-podcast', + 'info_dict': { + 'id': '4652058', + }, + 'playlist_mincount': 118, + }] + + def _entries(self, show_id): + for page_num in itertools.count(1): + episodes = self._download_json( + 'https://api.spreaker.com/show/%s/episodes' % show_id, + show_id, note='Downloading JSON page %d' % page_num, query={ + 'page': page_num, + 'max_per_page': 100, + }) + pager = try_get(episodes, lambda x: x['response']['pager'], dict) + if not pager: + break + results = pager.get('results') + if not results or not isinstance(results, list): + break + for result in results: + if not isinstance(result, dict): + continue + yield _extract_episode(result) + if page_num == pager.get('last_page'): + break + + def _real_extract(self, url): + show_id = self._match_id(url) + return self.playlist_result(self._entries(show_id), playlist_id=show_id) + + +class SpreakerShowPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spreaker\.com/show/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/show/success-with-music', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + show_id = self._search_regex( + r'show_id\s*:\s*(?P<id>\d+)', webpage, 'show id') + return self.url_result( + 'https://api.spreaker.com/show/%s' % show_id, + ie=SpreakerShowIE.ie_key(), video_id=show_id) From 37108e29a6af96e058c31452aefec4a236cc520e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:37:08 +0100 Subject: [PATCH 062/384] [spreaker] fix SpreakerShowIE test URL --- haruhi_dl/extractor/spreaker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/spreaker.py b/haruhi_dl/extractor/spreaker.py index beee6670c..6c7e40ae4 100644 --- a/haruhi_dl/extractor/spreaker.py +++ b/haruhi_dl/extractor/spreaker.py @@ -126,7 +126,7 @@ class SpreakerPageIE(InfoExtractor): class SpreakerShowIE(InfoExtractor): _VALID_URL = r'https?://api\.spreaker\.com/show/(?P<id>\d+)' _TESTS = [{ - 'url': 'https://www.spreaker.com/show/3-ninjas-podcast', + 'url': 'https://api.spreaker.com/show/4652058', 'info_dict': { 'id': '4652058', }, From 34a34d7f710f96a67e2de02e89e895fa749a9a89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FAdrian=3D20Heine=3D20n=3DC3=3DA9=3D20Lan?= =?UTF-8?q?g=3F=3D?= <mail@adrianheine.de> Date: Fri, 26 Feb 2021 14:37:14 +0100 Subject: [PATCH 063/384] [videa] Adapt to updates (#26301) closes #25973, closes #25650. 
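The info XML is now served base64-encoded and RC4-encrypted, keyed off a
nonce from the player page. Rough shape of the key derivation, as a
standalone sketch (variable names are ours and simplified; the extractor
code in the diff below is authoritative):

    import random
    import string

    STATIC_SECRET = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p'

    def derive(nonce):
        # nonce is the 64-char "_xt" value scraped from the player page;
        # positions of its first half's characters in STATIC_SECRET pick
        # characters out of its second half to build the expected string
        l, s = nonce[:32], nonce[32:]
        expected = ''.join(
            s[i - (STATIC_SECRET.index(l[i]) - 31)] for i in range(32))
        seed = ''.join(
            random.choice(string.ascii_letters + string.digits)
            for _ in range(8))
        # _s/_t go back as query parameters; expected[16:] + seed forms
        # the key stem, which the "x-videa-xs" response header completes
        # before RC4-decrypting the base64-decoded response body
        return {'_s': seed, '_t': expected[:16]}, expected[16:] + seed

The decrypted XML additionally carries per-source hash values that must
be appended to each source URL as md5/expires query parameters.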
--- haruhi_dl/extractor/videa.py | 62 ++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/videa.py b/haruhi_dl/extractor/videa.py index 5830e7fd7..0f0702852 100644 --- a/haruhi_dl/extractor/videa.py +++ b/haruhi_dl/extractor/videa.py @@ -2,15 +2,24 @@ from __future__ import unicode_literals import re +import random +import string +import struct from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, mimetype2ext, parse_codecs, xpath_element, xpath_text, ) +from ..compat import ( + compat_b64decode, + compat_ord, + compat_parse_qs, +) class VideaIE(InfoExtractor): @@ -60,15 +69,63 @@ class VideaIE(InfoExtractor): r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', webpage)] + def rc4(self, ciphertext, key): + res = b'' + + keyLen = len(key) + S = list(range(256)) + + j = 0 + for i in range(256): + j = (j + S[i] + ord(key[i % keyLen])) % 256 + S[i], S[j] = S[j], S[i] + + i = 0 + j = 0 + for m in range(len(ciphertext)): + i = (i + 1) % 256 + j = (j + S[i]) % 256 + S[i], S[j] = S[j], S[i] + k = S[(S[i] + S[j]) % 256] + res += struct.pack("B", k ^ compat_ord(ciphertext[m])) + + return res + def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id, fatal=True) + error = self._search_regex(r'<p class="error-text">([^<]+)</p>', webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) - info = self._download_xml( + video_src_params_raw = self._search_regex(r'<iframe[^>]+id="videa_player_iframe"[^>]+src="/player\?([^"]+)"', webpage, 'video_src_params') + video_src_params = compat_parse_qs(video_src_params_raw) + player_page = self._download_webpage("https://videa.hu/videojs_player?%s" % video_src_params_raw, video_id, fatal=True) + nonce = self._search_regex(r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce') + random_seed = ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(8)) + static_secret = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' + l = nonce[:32] + s = nonce[32:] + result = '' + for i in range(0, 32): + result += s[i - (static_secret.index(l[i]) - 31)] + + video_src_params['_s'] = random_seed + video_src_params['_t'] = result[:16] + encryption_key_stem = result[16:] + random_seed + + [b64_info, handle] = self._download_webpage_handle( 'http://videa.hu/videaplayer_get_xml.php', video_id, - query={'v': video_id}) + query=video_src_params, fatal=True) + + encrypted_info = compat_b64decode(b64_info) + key = encryption_key_stem + handle.info()['x-videa-xs'] + info_str = self.rc4(encrypted_info, key).decode('utf8') + info = self._parse_xml(info_str, video_id) video = xpath_element(info, './/video', 'video', fatal=True) sources = xpath_element(info, './/video_sources', 'sources', fatal=True) + hash_values = xpath_element(info, './/hash_values', 'hash_values', fatal=True) title = xpath_text(video, './title', fatal=True) @@ -77,6 +134,7 @@ class VideaIE(InfoExtractor): source_url = source.text if not source_url: continue + source_url += '?md5=%s&expires=%s' % (hash_values.find('hash_value_%s' % source.get('name')).text, source.get('exp')) f = parse_codecs(source.get('codecs')) f.update({ 'url': source_url, From de296b234ab6ffc9559b60f9cb401cbc36c11687 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:37:50 +0100 Subject: [PATCH 064/384] 
=?UTF-8?q?[YoutubeDL]=20Write=20static=20debug=20?= =?UTF-8?q?to=20stderr=20and=20respect=20quiet=20for=20dynami=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …c debug (closes #14579, closes #22593) TODO: logging and verbosity needs major refactoring (refs #10894) --- haruhi_dl/HaruhiDL.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/HaruhiDL.py b/haruhi_dl/HaruhiDL.py index e67c01a9d..ffc583e82 100755 --- a/haruhi_dl/HaruhiDL.py +++ b/haruhi_dl/HaruhiDL.py @@ -1614,7 +1614,7 @@ class HaruhiDL(object): if req_format is None: req_format = self._default_format_spec(info_dict, download=download) if self.params.get('verbose'): - self.to_stdout('[debug] Default format spec: %s' % req_format) + self._write_string('[debug] Default format spec: %s\n' % req_format) format_selector = self.build_format_selector(req_format) @@ -1875,7 +1875,7 @@ class HaruhiDL(object): for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): - self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) + self.to_screen('[debug] Invoking downloader on %r' % info.get('url')) return fd.download(name, info) if info_dict.get('requested_formats') is not None: From bbb93695b039b9037399ced819adaaeb05a5e10b Mon Sep 17 00:00:00 2001 From: bopol <bopol@e.email> Date: Fri, 26 Feb 2021 14:37:59 +0100 Subject: [PATCH 065/384] [ina] Add support for mobile URLs (#27229) --- haruhi_dl/extractor/ina.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/ina.py b/haruhi_dl/extractor/ina.py index 12695af27..b3b2683cb 100644 --- a/haruhi_dl/extractor/ina.py +++ b/haruhi_dl/extractor/ina.py @@ -12,7 +12,7 @@ from ..utils import ( class InaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)' _TESTS = [{ 'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', 'md5': 'a667021bf2b41f8dc6049479d9bb38a3', @@ -31,6 +31,9 @@ class InaIE(InfoExtractor): }, { 'url': 'https://www.ina.fr/video/P16173408-video.html', 'only_matching': True, + }, { + 'url': 'http://m.ina.fr/video/I12055569', + 'only_matching': True, }] def _real_extract(self, url): From f44820d71839ecd969158af4fb0c7884b6d6a622 Mon Sep 17 00:00:00 2001 From: Michael Munch <mm.munk@gmail.com> Date: Fri, 26 Feb 2021 14:38:09 +0100 Subject: [PATCH 066/384] [drtv] Extend _VALID_URL (#27243) --- haruhi_dl/extractor/drtv.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/drtv.py b/haruhi_dl/extractor/drtv.py index 390e79f8c..c0036adb6 100644 --- a/haruhi_dl/extractor/drtv.py +++ b/haruhi_dl/extractor/drtv.py @@ -29,7 +29,7 @@ class DRTVIE(InfoExtractor): https?:// (?: (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*| - (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode)/ + (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/ ) (?P<id>[\da-z_-]+) ''' @@ -111,6 +111,9 @@ class DRTVIE(InfoExtractor): }, { 'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769', 'only_matching': True, + }, { + 'url': 'https://www.dr.dk/drtv/program/jagten_220924', + 'only_matching': True, }] def _real_extract(self, url): From 863cae8fe46a2114c5c2ec7409242c75a00d9403 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 
14:38:16 +0100 Subject: [PATCH 067/384] =?UTF-8?q?[yandexmusic:track]=20Fix=20extraction?= =?UTF-8?q?=20(closes=20#26449,=20closes=20#26669,=20clo=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ses #26747, closes #26748, closes #26762) --- haruhi_dl/extractor/yandexmusic.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/yandexmusic.py b/haruhi_dl/extractor/yandexmusic.py index c50bc8156..ffbbdcefa 100644 --- a/haruhi_dl/extractor/yandexmusic.py +++ b/haruhi_dl/extractor/yandexmusic.py @@ -109,8 +109,7 @@ class YandexMusicTrackIE(YandexMusicBaseIE): 'Downloading track location JSON', query={'format': 'json'}) key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + fd_data['path'][1:] + fd_data['s']).encode('utf-8')).hexdigest() - storage = track['storageDir'].split('.') - f_url = 'http://%s/get-mp3/%s/%s?track-id=%s ' % (fd_data['host'], key, fd_data['ts'] + fd_data['path'], storage[1]) + f_url = 'http://%s/get-mp3/%s/%s?track-id=%s ' % (fd_data['host'], key, fd_data['ts'] + fd_data['path'], track['id']) thumbnail = None cover_uri = track.get('albums', [{}])[0].get('coverUri') From 31a2706650940b063a270fed793e05c5bb25e215 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:38:48 +0100 Subject: [PATCH 068/384] [mediaset] add support for movie URLs(closes #27240) --- haruhi_dl/extractor/mediaset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/mediaset.py b/haruhi_dl/extractor/mediaset.py index 933df1495..2c16fc9e2 100644 --- a/haruhi_dl/extractor/mediaset.py +++ b/haruhi_dl/extractor/mediaset.py @@ -23,7 +23,7 @@ class MediasetIE(ThePlatformBaseIE): https?:// (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ (?: - (?:video|on-demand)/(?:[^/]+/)+[^/]+_| + (?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_| player/index\.html\?.*?\bprogramGuid= ) )(?P<id>[0-9A-Z]{16,}) @@ -88,6 +88,9 @@ class MediasetIE(ThePlatformBaseIE): }, { 'url': 'https://www.mediasetplay.mediaset.it/video/grandefratellovip/benedetta-una-doccia-gelata_F309344401044C135', 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/movie/herculeslaleggendahainizio/hercules-la-leggenda-ha-inizio_F305927501000102', + 'only_matching': True, }] @staticmethod From a321724c883cfde0de88d91b1307d049512f2061 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FRoman=3D20Ber=3DC3=3DA1nek=3F=3D?= <zavorka@users.noreply.github.com> Date: Fri, 26 Feb 2021 14:38:55 +0100 Subject: [PATCH 069/384] =?UTF-8?q?[cspan]=20Pass=20Referer=20header=20wit?= =?UTF-8?q?h=20format's=20video=20URL=20(#26032)=20(closes=20=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …#25729) --- haruhi_dl/extractor/cspan.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/haruhi_dl/extractor/cspan.py b/haruhi_dl/extractor/cspan.py index 67d6df4b0..3356cc280 100644 --- a/haruhi_dl/extractor/cspan.py +++ b/haruhi_dl/extractor/cspan.py @@ -165,6 +165,8 @@ class CSpanIE(InfoExtractor): formats = self._extract_m3u8_formats( path, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }] + for f in formats: + f.setdefault('http_headers', {})['Referer'] = url self._sort_formats(formats) entries.append({ 'id': '%s_%d' % (video_id, partnum + 1), From 8c785f84724e8624bf33138dc6215b72e9ad3923 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:39:10 +0100 Subject: [PATCH 070/384] =?UTF-8?q?[cspan]=20Extract=20info=20from=20jwpla?= =?UTF-8?q?yer=20data=20(closes=20#3672,=20closes=20#3734,=20=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …closes #10638, closes #13030, closes #18806, closes #23148, closes #24461, closes #26171, closes #26800, closes #27263) --- haruhi_dl/extractor/cspan.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/cspan.py b/haruhi_dl/extractor/cspan.py index 3356cc280..766942146 100644 --- a/haruhi_dl/extractor/cspan.py +++ b/haruhi_dl/extractor/cspan.py @@ -10,6 +10,8 @@ from ..utils import ( find_xpath_attr, get_element_by_class, int_or_none, + js_to_json, + merge_dicts, smuggle_url, unescapeHTML, ) @@ -98,6 +100,26 @@ class CSpanIE(InfoExtractor): bc_attr['data-bcid']) return self.url_result(smuggle_url(bc_url, {'source_url': url})) + def add_referer(formats): + for f in formats: + f.setdefault('http_headers', {})['Referer'] = url + + # As of 01.12.2020 this path looks to cover all cases making the rest + # of the code unnecessary + jwsetup = self._parse_json( + self._search_regex( + r'(?s)jwsetup\s*=\s*({.+?})\s*;', webpage, 'jwsetup', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if jwsetup: + info = self._parse_jwplayer_data( + jwsetup, video_id, require_title=False, m3u8_id='hls', + base_url=url) + add_referer(info['formats']) + ld_info = self._search_json_ld(webpage, video_id, default={}) + return merge_dicts(info, ld_info) + + # Obsolete # We first look for clipid, because clipprog always appears before patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] results = list(filter(None, (re.search(p, webpage) for p in patterns))) @@ -165,8 +187,7 @@ class CSpanIE(InfoExtractor): formats = self._extract_m3u8_formats( path, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }] - for f in formats: - f.setdefault('http_headers', {})['Referer'] = url + add_referer(formats) self._sort_formats(formats) entries.append({ 'id': '%s_%d' % (video_id, partnum + 1), From 9b24767e1e3951e745f0e8fa10193f83c5f6b4a6 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:39:18 +0100 Subject: [PATCH 071/384] [toggle] Add support for new MeWatch URLs (closes #27256) --- haruhi_dl/extractor/extractors.py | 5 ++- haruhi_dl/extractor/toggle.py | 74 ++++++++++++++++++------------- 2 files changed, 47 insertions(+), 32 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 3e26dfe40..314d7bfe0 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1232,7 +1232,10 @@ from .tnaflix import ( EMPFlixIE, MovieFapIE, ) -from .toggle import ToggleIE +from .toggle import ( + ToggleIE, + MeWatchIE, +) from .tonline import TOnlineIE from .toongoggles import ToonGogglesIE from .toutv import TouTvIE diff --git a/haruhi_dl/extractor/toggle.py b/haruhi_dl/extractor/toggle.py index ca2e36efe..cababa69e 100644 --- a/haruhi_dl/extractor/toggle.py +++ b/haruhi_dl/extractor/toggle.py @@ -11,13 +11,13 @@ from ..utils import ( float_or_none, int_or_none, parse_iso8601, - sanitized_Request, + strip_or_none, ) class ToggleIE(InfoExtractor): IE_NAME = 'toggle' - _VALID_URL = 
r'https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)' + _VALID_URL = r'(?:https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}|toggle:)(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.mewatch.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', 'info_dict': { @@ -96,16 +96,6 @@ class ToggleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - url, video_id, note='Downloading video page') - - api_user = self._search_regex( - r'apiUser\s*:\s*(["\'])(?P<user>.+?)\1', webpage, 'apiUser', - default=self._API_USER, group='user') - api_pass = self._search_regex( - r'apiPass\s*:\s*(["\'])(?P<pass>.+?)\1', webpage, 'apiPass', - default=self._API_PASS, group='pass') - params = { 'initObj': { 'Locale': { @@ -118,17 +108,16 @@ class ToggleIE(InfoExtractor): 'SiteGuid': 0, 'DomainID': '0', 'UDID': '', - 'ApiUser': api_user, - 'ApiPass': api_pass + 'ApiUser': self._API_USER, + 'ApiPass': self._API_PASS }, 'MediaID': video_id, 'mediaType': 0, } - req = sanitized_Request( + info = self._download_json( 'http://tvpapi.as.tvinci.com/v2_9/gateways/jsonpostgw.aspx?m=GetMediaInfo', - json.dumps(params).encode('utf-8')) - info = self._download_json(req, video_id, 'Downloading video info json') + video_id, 'Downloading video info json', data=json.dumps(params).encode('utf-8')) title = info['MediaName'] @@ -172,14 +161,6 @@ class ToggleIE(InfoExtractor): raise ExtractorError('No downloadable videos found', expected=True) self._sort_formats(formats) - duration = int_or_none(info.get('Duration')) - description = info.get('Description') - created_at = parse_iso8601(info.get('CreationDate') or None) - - average_rating = float_or_none(info.get('Rating')) - view_count = int_or_none(info.get('ViewCounter') or info.get('view_counter')) - like_count = int_or_none(info.get('LikeCounter') or info.get('like_counter')) - thumbnails = [] for picture in info.get('Pictures', []): if not isinstance(picture, dict): @@ -199,15 +180,46 @@ class ToggleIE(InfoExtractor): }) thumbnails.append(thumbnail) + def counter(prefix): + return int_or_none( + info.get(prefix + 'Counter') or info.get(prefix.lower() + '_counter')) + return { 'id': video_id, 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': created_at, - 'average_rating': average_rating, - 'view_count': view_count, - 'like_count': like_count, + 'description': strip_or_none(info.get('Description')), + 'duration': int_or_none(info.get('Duration')), + 'timestamp': parse_iso8601(info.get('CreationDate') or None), + 'average_rating': float_or_none(info.get('Rating')), + 'view_count': counter('View'), + 'like_count': counter('Like'), 'thumbnails': thumbnails, 'formats': formats, } + + +class MeWatchIE(InfoExtractor): + IE_NAME = 'mewatch' + _VALID_URL = r'https?://(?:www\.)?mewatch\.sg/watch/[0-9a-zA-Z-]+-(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.mewatch.sg/watch/Recipe-Of-Life-E1-179371', + 'info_dict': { + 'id': '1008625', + 'ext': 'mp4', + 'title': 'Recipe Of Life 味之道', + 'timestamp': 1603306526, + 'description': 'md5:6e88cde8af2068444fc8e1bc3ebf257c', + 'upload_date': '20201021', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + }] + + def _real_extract(self, url): + item_id = self._match_id(url) + custom_id = self._download_json( + 'https://cdn.mewatch.sg/api/items/' + item_id, + item_id, query={'segments': 'all'})['customId'] + return self.url_result( + 'toggle:' + custom_id, ToggleIE.ie_key(), 
custom_id) From 9e5ac5f6291968a68392e5a47c08d62e0229210f Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:39:24 +0100 Subject: [PATCH 072/384] [toggle] Detect DRM protected videos (closes #16479)(closes #20805) --- haruhi_dl/extractor/toggle.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/haruhi_dl/extractor/toggle.py b/haruhi_dl/extractor/toggle.py index cababa69e..91b8023b8 100644 --- a/haruhi_dl/extractor/toggle.py +++ b/haruhi_dl/extractor/toggle.py @@ -84,12 +84,6 @@ class ToggleIE(InfoExtractor): 'only_matching': True, }] - _FORMAT_PREFERENCES = { - 'wvm-STBMain': -10, - 'wvm-iPadMain': -20, - 'wvm-iPhoneMain': -30, - 'wvm-Android': -40, - } _API_USER = 'tvpapi_147' _API_PASS = '11111' @@ -130,11 +124,16 @@ class ToggleIE(InfoExtractor): vid_format = vid_format.replace(' ', '') # if geo-restricted, m3u8 is inaccessible, but mp4 is okay if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( video_url, video_id, ext='mp4', m3u8_id=vid_format, note='Downloading %s m3u8 information' % vid_format, errnote='Failed to download %s m3u8 information' % vid_format, - fatal=False)) + fatal=False) + for f in m3u8_formats: + # Apple FairPlay Streaming + if '/fpshls/' in f['url']: + continue + formats.append(f) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( video_url, video_id, mpd_id=vid_format, @@ -147,16 +146,17 @@ class ToggleIE(InfoExtractor): note='Downloading %s ISM manifest' % vid_format, errnote='Failed to download %s ISM manifest' % vid_format, fatal=False)) - elif ext in ('mp4', 'wvm'): - # wvm are drm-protected files + elif ext == 'mp4': formats.append({ 'ext': ext, 'url': video_url, 'format_id': vid_format, - 'preference': self._FORMAT_PREFERENCES.get(ext + '-' + vid_format) or -1, - 'format_note': 'DRM-protected video' if ext == 'wvm' else None }) if not formats: + for meta in (info.get('Metas') or []): + if meta.get('Key') == 'Encryption' and meta.get('Value') == '1': + raise ExtractorError( + 'This video is DRM protected.', expected=True) # Most likely because geo-blocked raise ExtractorError('No downloadable videos found', expected=True) self._sort_formats(formats) From 0475d9eaff9dcd5cf06bd2c45338e18a8c349e16 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:39:30 +0100 Subject: [PATCH 073/384] [tva] Add support for qub.ca (closes #27235) --- haruhi_dl/extractor/tva.py | 65 ++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 17 deletions(-) diff --git a/haruhi_dl/extractor/tva.py b/haruhi_dl/extractor/tva.py index 443f46e8a..52a4ddf32 100644 --- a/haruhi_dl/extractor/tva.py +++ b/haruhi_dl/extractor/tva.py @@ -4,7 +4,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( float_or_none, + int_or_none, smuggle_url, + strip_or_none, ) @@ -23,7 +25,8 @@ class TVAIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'https://video.tva.ca/details/_5596811470001', 'only_matching': True, @@ -32,26 +35,54 @@ class TVAIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'https://videos.tva.ca/proxy/item/_' + video_id, video_id, headers={ - 'Accept': 'application/json', - }, query={ - 'appId': '5955fc5f23eec60006c951f1', - }) - - def get_attribute(key): - for attribute in 
video_data.get('attributes', []): - if attribute.get('key') == key: - return attribute.get('value') - return None return { '_type': 'url_transparent', 'id': video_id, - 'title': get_attribute('title'), 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['CA']}), - 'description': get_attribute('description'), - 'thumbnail': get_attribute('image-background') or get_attribute('image-landscape'), - 'duration': float_or_none(get_attribute('video-duration'), 1000), 'ie_key': 'BrightcoveNew', } + + +class QubIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?qub\.ca/(?:[^/]+/)*[0-9a-z-]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.qub.ca/tvaplus/tva/alerte-amber/saison-1/episode-01-1000036619', + 'md5': '949490fd0e7aee11d0543777611fbd53', + 'info_dict': { + 'id': '6084352463001', + 'ext': 'mp4', + 'title': 'Épisode 01', + 'uploader_id': '5481942443001', + 'upload_date': '20190907', + 'timestamp': 1567899756, + 'description': 'md5:9c0d7fbb90939420c651fd977df90145', + }, + }, { + 'url': 'https://www.qub.ca/tele/video/lcn-ca-vous-regarde-rev-30s-ap369664-1009357943', + 'only_matching': True, + }] + # reference_id also works with old account_id(5481942443001) + # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5813221784001/default_default/index.html?videoId=ref:%s' + + def _real_extract(self, url): + entity_id = self._match_id(url) + entity = self._download_json( + 'https://www.qub.ca/proxy/pfu/content-delivery-service/v1/entities', + entity_id, query={'id': entity_id}) + video_id = entity['videoId'] + episode = strip_or_none(entity.get('name')) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': episode, + # 'url': self.BRIGHTCOVE_URL_TEMPLATE % entity['referenceId'], + 'url': 'https://videos.tva.ca/details/_' + video_id, + 'description': entity.get('longDescription'), + 'duration': float_or_none(entity.get('durationMillis'), 1000), + 'episode': episode, + 'episode_number': int_or_none(entity.get('episodeNumber')), + # 'ie_key': 'BrightcoveNew', + 'ie_key': TVAIE.ie_key(), + } From 87889f1fe81b3aefe56281179b266b74c4bf1ecd Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:39:36 +0100 Subject: [PATCH 074/384] [extractors] Add QubIE import --- haruhi_dl/extractor/extractors.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 314d7bfe0..a83e8fe6b 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1276,7 +1276,10 @@ from .tv2dk import ( from .tv2hu import TV2HuIE from .tv4 import TV4IE from .tv5mondeplus import TV5MondePlusIE -from .tva import TVAIE +from .tva import ( + TVAIE, + QubIE, +) from .tvanouvelles import ( TVANouvellesIE, TVANouvellesArticleIE, From b789c2b6bb25deec1e6d93122698b9723fb6a128 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:40:26 +0100 Subject: [PATCH 075/384] [tver] Add new extractor (closes #26662)(closes #27284) --- haruhi_dl/extractor/extractors.py | 2 + haruhi_dl/extractor/fujitv.py | 35 ++++++++++++++++ haruhi_dl/extractor/tver.py | 67 +++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+) create mode 100644 haruhi_dl/extractor/fujitv.py create mode 100644 haruhi_dl/extractor/tver.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index a83e8fe6b..46f3b604c 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py 
@@ -410,6 +410,7 @@ from .frontendmasters import ( FrontendMastersLessonIE, FrontendMastersCourseIE ) +from .fujitv import FujiTVFODPlus7IE from .funimation import FunimationIE from .funk import FunkIE from .funkwhale import ( @@ -1288,6 +1289,7 @@ from .tvc import ( TVCIE, TVCArticleIE, ) +from .tver import TVerIE from .tvigle import TvigleIE from .tvland import TVLandIE from .tvn24 import TVN24IE diff --git a/haruhi_dl/extractor/fujitv.py b/haruhi_dl/extractor/fujitv.py new file mode 100644 index 000000000..39685e075 --- /dev/null +++ b/haruhi_dl/extractor/fujitv.py @@ -0,0 +1,35 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class FujiTVFODPlus7IE(InfoExtractor): + _VALID_URL = r'https?://i\.fod\.fujitv\.co\.jp/plus7/web/[0-9a-z]{4}/(?P<id>[0-9a-z]+)' + _BASE_URL = 'http://i.fod.fujitv.co.jp/' + _BITRATE_MAP = { + 300: (320, 180), + 800: (640, 360), + 1200: (1280, 720), + 2000: (1280, 720), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = self._extract_m3u8_formats( + self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id) + for f in formats: + wh = self._BITRATE_MAP.get(f.get('tbr')) + if wh: + f.update({ + 'width': wh[0], + 'height': wh[1], + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + 'thumbnail': self._BASE_URL + 'pc/image/wbtn/wbtn_%s.jpg' % video_id, + } diff --git a/haruhi_dl/extractor/tver.py b/haruhi_dl/extractor/tver.py new file mode 100644 index 000000000..c5299722d --- /dev/null +++ b/haruhi_dl/extractor/tver.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + remove_start, + smuggle_url, + try_get, +) + + +class TVerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P<path>(?:corner|episode|feature)/(?P<id>f?\d+))' + # videos are only available for 7 days + _TESTS = [{ + 'url': 'https://tver.jp/corner/f0062178', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/feature/f0062413', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/episode/79622438', + 'only_matching': True, + }] + _TOKEN = None + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' + + def _real_initialize(self): + self._TOKEN = self._download_json( + 'https://tver.jp/api/access_token.php', None)['token'] + + def _real_extract(self, url): + path, video_id = re.match(self._VALID_URL, url).groups() + main = self._download_json( + 'https://api.tver.jp/v4/' + path, video_id, + query={'token': self._TOKEN})['main'] + p_id = main['publisher_id'] + service = remove_start(main['service'], 'ts_') + info = { + '_type': 'url_transparent', + 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str), + 'episode': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), + } + + if service == 'cx': + info.update({ + 'title': main.get('subtitle') or main['title'], + 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id), + 'ie_key': 'FujiTVFODPlus7', + }) + else: + r_id = main['reference_id'] + if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): + r_id = 'ref:' + r_id + bc_url = smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), + {'geo_countries': ['JP']}) + info.update({ + 'url': bc_url, + 'ie_key': 'BrightcoveNew', + }) + + return info From 
d60195c74b7d1e551bb7915aed6f9d5d3b6c2b7a Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:42:28 +0100
Subject: [PATCH 076/384] [extractor/common] improve Akamai HTTP format
 extraction

- Allow m3u8 manifest without an additional audio format
- Fix extraction for qualities starting with a number

Solution provided by @nixxo based on:
https://stackoverflow.com/a/5984688
---
 haruhi_dl/extractor/common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py
index 32a391a85..3492d8865 100644
--- a/haruhi_dl/extractor/common.py
+++ b/haruhi_dl/extractor/common.py
@@ -2641,7 +2641,7 @@ class InfoExtractor(object):
             REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+'
             qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
             qualities_length = len(qualities)
-            if len(formats) in (qualities_length + 1, qualities_length * 2 + 1):
+            if len(formats) in (qualities_length, qualities_length + 1, qualities_length * 2, qualities_length * 2 + 1):
                 i = 0
                 http_formats = []
                 for f in formats:
@@ -2650,7 +2650,7 @@
                             http_f = f.copy()
                             del http_f['manifest_url']
                             http_url = re.sub(
-                                REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url'])
+                                REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                             http_f.update({
                                 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                                 'url': http_url,

From 0a48eb0c7f60db75c07e2dab821981749ba75c28 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:42:39 +0100
Subject: [PATCH 077/384] [tver] correct episode_number key

---
 haruhi_dl/extractor/tver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/tver.py b/haruhi_dl/extractor/tver.py
index c5299722d..931d4d650 100644
--- a/haruhi_dl/extractor/tver.py
+++ b/haruhi_dl/extractor/tver.py
@@ -43,7 +43,7 @@ class TVerIE(InfoExtractor):
         info = {
             '_type': 'url_transparent',
             'description': try_get(main, lambda x: x['note'][0]['text'], compat_str),
-            'episode': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])),
+            'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])),
         }
 
         if service == 'cx':

From 76263cc89371812634d7d41b52c27b413caefa59 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:42:46 +0100
Subject: [PATCH 078/384] [extractor/common] improve Akamai HTTP formats
 extraction

---
 haruhi_dl/extractor/common.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py
index 3492d8865..2db95d592 100644
--- a/haruhi_dl/extractor/common.py
+++ b/haruhi_dl/extractor/common.py
@@ -2632,20 +2632,20 @@ class InfoExtractor(object):
         hls_host = hosts.get('hls')
         if hls_host:
             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
-        formats.extend(self._extract_m3u8_formats(
+        m3u8_formats = self._extract_m3u8_formats(
             m3u8_url, video_id, 'mp4', 'm3u8_native',
-            m3u8_id='hls', fatal=False))
+            m3u8_id='hls', fatal=False)
+        formats.extend(m3u8_formats)
 
         http_host = hosts.get('http')
-        if http_host and 'hdnea=' not in manifest_url:
-            REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+'
+        if http_host and m3u8_formats and 'hdnea=' not in m3u8_url:
+            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
             qualities = re.match(REPL_REGEX, 
m3u8_url).group(2).split(',') qualities_length = len(qualities) - if len(formats) in (qualities_length, qualities_length + 1, qualities_length * 2, qualities_length * 2 + 1): + if len(m3u8_formats) in (qualities_length, qualities_length + 1): i = 0 - http_formats = [] - for f in formats: - if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none': + for f in m3u8_formats: + if f['vcodec'] != 'none': for protocol in ('http', 'https'): http_f = f.copy() del http_f['manifest_url'] @@ -2656,9 +2656,8 @@ class InfoExtractor(object): 'url': http_url, 'protocol': protocol, }) - http_formats.append(http_f) + formats.append(http_f) i += 1 - formats.extend(http_formats) return formats From 2f04ca9dac9b11744d236e11b4bf0a12f56fddf4 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:42:52 +0100 Subject: [PATCH 079/384] [gamespot] Extract DASH and HTTP formats --- haruhi_dl/extractor/gamespot.py | 110 ++++++++------------------------ 1 file changed, 25 insertions(+), 85 deletions(-) diff --git a/haruhi_dl/extractor/gamespot.py b/haruhi_dl/extractor/gamespot.py index 4236a5ed8..7a1beae3c 100644 --- a/haruhi_dl/extractor/gamespot.py +++ b/haruhi_dl/extractor/gamespot.py @@ -1,16 +1,7 @@ from __future__ import unicode_literals -import re - from .once import OnceIE -from ..compat import ( - compat_urllib_parse_unquote, -) -from ..utils import ( - unescapeHTML, - url_basename, - dict_get, -) +from ..compat import compat_urllib_parse_unquote class GameSpotIE(OnceIE): @@ -24,17 +15,16 @@ class GameSpotIE(OnceIE): 'title': 'Arma 3 - Community Guide: SITREP I', 'description': 'Check out this video where some of the basics of Arma 3 is explained.', }, + 'skip': 'manifest URL give HTTP Error 404: Not Found', }, { 'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/', + 'md5': '173ea87ad762cf5d3bf6163dceb255a6', 'info_dict': { 'id': 'gs-2300-6424837', 'ext': 'mp4', 'title': 'Now Playing - The Witcher 3: Wild Hunt', 'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.', }, - 'params': { - 'skip_download': True, # m3u8 downloads - }, }, { 'url': 'https://www.gamespot.com/videos/embed/6439218/', 'only_matching': True, @@ -49,90 +39,40 @@ class GameSpotIE(OnceIE): def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - data_video_json = self._search_regex( - r'data-video=["\'](.*?)["\']', webpage, 'data video') - data_video = self._parse_json(unescapeHTML(data_video_json), page_id) + data_video = self._parse_json(self._html_search_regex( + r'data-video=(["\'])({.*?})\1', webpage, + 'video data', group=2), page_id) + title = compat_urllib_parse_unquote(data_video['title']) streams = data_video['videoStreams'] - - manifest_url = None formats = [] - f4m_url = streams.get('f4m_stream') - if f4m_url: - manifest_url = f4m_url - formats.extend(self._extract_f4m_formats( - f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False)) - m3u8_url = dict_get(streams, ('m3u8_stream', 'adaptive_stream')) + + m3u8_url = streams.get('adaptive_stream') if m3u8_url: - manifest_url = m3u8_url m3u8_formats = self._extract_m3u8_formats( m3u8_url, page_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - formats.extend(m3u8_formats) - progressive_url = dict_get( - streams, ('progressive_hd', 'progressive_high', 'progressive_low', 'other_lr')) - if progressive_url and manifest_url: - qualities_basename = self._search_regex( - r'/([^/]+)\.csmil/', - 
manifest_url, 'qualities basename', default=None) - if qualities_basename: - QUALITIES_RE = r'((,\d+)+,?)' - qualities = self._search_regex( - QUALITIES_RE, qualities_basename, - 'qualities', default=None) - if qualities: - qualities = list(map(lambda q: int(q), qualities.strip(',').split(','))) - qualities.sort() - http_template = re.sub(QUALITIES_RE, r'%d', qualities_basename) - http_url_basename = url_basename(progressive_url) - if m3u8_formats: - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none', m3u8_formats)) - if len(qualities) == len(m3u8_formats): - for q, m3u8_format in zip(qualities, m3u8_formats): - f = m3u8_format.copy() - f.update({ - 'url': progressive_url.replace( - http_url_basename, http_template % q), - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - for q in qualities: - formats.append({ - 'url': progressive_url.replace( - http_url_basename, http_template % q), - 'ext': 'mp4', - 'format_id': 'http-%d' % q, - 'tbr': q, - }) + for f in m3u8_formats: + formats.append(f) + http_f = f.copy() + del http_f['manifest_url'] + http_f.update({ + 'format_id': f['format_id'].replace('hls-', 'http-'), + 'protocol': 'http', + 'url': f['url'].replace('.m3u8', '.mp4'), + }) + formats.append(http_f) - onceux_json = self._search_regex( - r'data-onceux-options=["\'](.*?)["\']', webpage, 'data video', default=None) - if onceux_json: - onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri') - if onceux_url: - formats.extend(self._extract_once_formats(re.sub( - r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url), - http_formats_preference=-1)) + mpd_url = streams.get('adaptive_dash') + if mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, page_id, mpd_id='dash', fatal=False)) - if not formats: - for quality in ['sd', 'hd']: - # It's actually a link to a flv file - flv_url = streams.get('f4m_{0}'.format(quality)) - if flv_url is not None: - formats.append({ - 'url': flv_url, - 'ext': 'flv', - 'format_id': quality, - }) self._sort_formats(formats) return { - 'id': data_video['guid'], + 'id': data_video.get('guid') or page_id, 'display_id': page_id, - 'title': compat_urllib_parse_unquote(data_video['title']), + 'title': title, 'formats': formats, 'description': self._html_search_meta('description', webpage), 'thumbnail': self._og_search_thumbnail(webpage), From 8c8e98ffdd0a60cd479803740de00832cc7614bb Mon Sep 17 00:00:00 2001 From: Matthew Rayermann <matthew.rayermann@gmail.com> Date: Fri, 26 Feb 2021 14:42:58 +0100 Subject: [PATCH 080/384] [nhk] Add audio clip test to NHK extractor (#27269) --- haruhi_dl/extractor/nhk.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/nhk.py b/haruhi_dl/extractor/nhk.py index de6a707c4..6a61a47d2 100644 --- a/haruhi_dl/extractor/nhk.py +++ b/haruhi_dl/extractor/nhk.py @@ -10,7 +10,7 @@ class NhkVodIE(InfoExtractor): # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. 
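The GameSpot rewrite above drops the old quality-basename heuristics and instead mirrors every HLS variant with a progressive counterpart by swapping the '.m3u8' suffix and the 'hls-' format-id prefix. A standalone sketch of that mapping; the input dict only mimics the relevant keys of what _extract_m3u8_formats returns, and the CDN host is made up:

def add_http_variants(m3u8_formats):
    formats = []
    for f in m3u8_formats:
        formats.append(f)
        http_f = dict(f)
        # progressive twins have no manifest and point straight at an MP4
        http_f.pop('manifest_url', None)
        http_f.update({
            'format_id': f['format_id'].replace('hls-', 'http-'),
            'protocol': 'http',
            'url': f['url'].replace('.m3u8', '.mp4'),
        })
        formats.append(http_f)
    return formats

sample = [{'format_id': 'hls-2500', 'protocol': 'm3u8_native',
           'manifest_url': 'https://cdn.example/master.m3u8',
           'url': 'https://cdn.example/clip-2500.m3u8'}]
for f in add_http_variants(sample):
    print(f['format_id'], f['url'])
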
_TESTS = [{ - # clip + # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', 'md5': '256a1be14f48d960a7e61e2532d95ec3', 'info_dict': { @@ -21,6 +21,19 @@ class NhkVodIE(InfoExtractor): 'timestamp': 1565965194, 'upload_date': '20190816', }, + }, { + # audio clip + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/', + 'info_dict': { + 'id': 'r_inventions-20201104-1-en', + 'ext': 'm4a', + 'title': "Japan's Top Inventions - Miniature Video Cameras", + 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', 'only_matching': True, From ecfc7cb9f199aaf3df9e2a797df1e4c5dcbd36e1 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:43:06 +0100 Subject: [PATCH 081/384] [zdf] extract webm formats(closes #26659) --- haruhi_dl/extractor/zdf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/zdf.py b/haruhi_dl/extractor/zdf.py index 656864b2e..5ed2946c2 100644 --- a/haruhi_dl/extractor/zdf.py +++ b/haruhi_dl/extractor/zdf.py @@ -40,7 +40,7 @@ class ZDFBaseIE(InfoExtractor): class ZDFIE(ZDFBaseIE): _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?]+)\.html' - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') _GEO_COUNTRIES = ['DE'] _TESTS = [{ @@ -119,7 +119,7 @@ class ZDFIE(ZDFBaseIE): if not ptmd_path: ptmd_path = t[ 'http://zdf.de/rels/streams/ptmd-template'].replace( - '{playerId}', 'portal') + '{playerId}', 'ngplayer_2_4') ptmd = self._call_api( urljoin(url, ptmd_path), player, url, video_id, 'metadata') From 841628af91d84f6941e329f1334ddc4b9c6f46d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:43:47 +0100 Subject: [PATCH 082/384] [nrktv] Relax _VALID_URL (closes #27299, closes #26185) --- haruhi_dl/extractor/nrk.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 4a395546f..0c4b126ed 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -146,7 +146,7 @@ class NRKTVIE(NRKBaseIE): _VALID_URL = r'''(?x) https?:// (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie(?:/[^/]+){1,2}|program)/ + (?:serie(?:/[^/]+){1,}|program)/ (?![Ee]pisodes)%s (?:/\d{2}-\d{2}-\d{4})? (?:\#del=(?P<part_id>\d+))? 
@@ -275,6 +275,9 @@ class NRKTVIE(NRKBaseIE): }, { 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201507/NPUB21019315', + 'only_matching': True, }] _api_host = None From 58edf65c1b27fa476dfb695a77de8c138a1d3cc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:43:56 +0100 Subject: [PATCH 083/384] [pornhub] Handle HTTP errors gracefully (closes #26414) --- haruhi_dl/extractor/pornhub.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/pornhub.py b/haruhi_dl/extractor/pornhub.py index d91c869c4..20af84955 100644 --- a/haruhi_dl/extractor/pornhub.py +++ b/haruhi_dl/extractor/pornhub.py @@ -33,7 +33,12 @@ class PornHubBaseIE(InfoExtractor): def dl(*args, **kwargs): return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) - webpage, urlh = dl(*args, **kwargs) + ret = dl(*args, **kwargs) + + if not ret: + return ret + + webpage, urlh = ret if any(re.search(p, webpage) for p in ( r'<body\b[^>]+\bonload=["\']go\(\)', From d439a5df63cedcceb2b64b3796476beaff968635 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:44:04 +0100 Subject: [PATCH 084/384] =?UTF-8?q?[nrk]=20improve=20format=20extraction?= =?UTF-8?q?=20and=20geo-restriction=20detection=20(closes=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … #24221) --- haruhi_dl/extractor/nrk.py | 43 +++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 0c4b126ed..19d820f61 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -24,6 +24,11 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] + def _extract_nrk_formats(self, asset_url, video_id): + return self._extract_m3u8_formats( + re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url), + video_id, 'mp4', 'm3u8_native', fatal=False) + class NRKIE(NRKBaseIE): _VALID_URL = r'''(?x) @@ -94,9 +99,7 @@ class NRKIE(NRKBaseIE): if not format_url: continue if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + formats.extend(self._extract_nrk_formats(format_url, video_id)) self._sort_formats(formats) data = self._download_json( @@ -298,6 +301,7 @@ class NRKTVIE(NRKBaseIE): title = data.get('fullTitle') or data.get('mainTitle') or data['title'] video_id = data.get('id') or video_id + urls = [] entries = [] conviva = data.get('convivaStatistics') or {} @@ -314,19 +318,13 @@ class NRKTVIE(NRKBaseIE): else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) for num, asset in enumerate(media_assets, 1): asset_url = asset.get('url') - if not asset_url: + if not asset_url or asset_url in urls: continue - formats = self._extract_akamai_formats(asset_url, video_id) + formats = extract_nrk_formats(asset_url, video_id) if not formats: continue self._sort_formats(formats) - # Some f4m streams may not work with hdcore in fragments' URLs - for f in formats: - extra_param = f.get('extra_param_to_segment_url') - if extra_param and 'hdcore' in extra_param: - del f['extra_param_to_segment_url'] - entry_id, entry_title = video_id_and_title(num) duration 
= parse_duration(asset.get('duration')) subtitles = {} @@ -346,16 +344,17 @@ class NRKTVIE(NRKBaseIE): if not entries: media_url = data.get('mediaUrl') - if media_url: - formats = self._extract_akamai_formats(media_url, video_id) - self._sort_formats(formats) - duration = parse_duration(data.get('duration')) - entries = [{ - 'id': video_id, - 'title': make_title(title), - 'duration': duration, - 'formats': formats, - }] + if media_url and media_url not in urls: + formats = extract_nrk_formats(media_url, video_id) + if formats: + self._sort_formats(formats) + duration = parse_duration(data.get('duration')) + entries = [{ + 'id': video_id, + 'title': make_title(title), + 'duration': duration, + 'formats': formats, + }] if not entries: MESSAGES = { @@ -366,7 +365,7 @@ class NRKTVIE(NRKBaseIE): } message_type = data.get('messageType', '') # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* - if 'IsGeoBlocked' in message_type: + if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is Trues: self.raise_geo_restricted( msg=MESSAGES.get('ProgramIsGeoBlocked'), countries=self._GEO_COUNTRIES) From 226efefec6259a1d30dc436cc6eff9f693c8d899 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:44:09 +0100 Subject: [PATCH 085/384] [nrk] fix typo --- haruhi_dl/extractor/nrk.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 19d820f61..0f69579c5 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -365,7 +365,7 @@ class NRKTVIE(NRKBaseIE): } message_type = data.get('messageType', '') # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* - if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is Trues: + if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True: self.raise_geo_restricted( msg=MESSAGES.get('ProgramIsGeoBlocked'), countries=self._GEO_COUNTRIES) From 75dc35e41846adf9872089c37a189d1f16f57731 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:44:14 +0100 Subject: [PATCH 086/384] [nrk] fix call to moved method --- haruhi_dl/extractor/nrk.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 0f69579c5..8595f55b1 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -320,7 +320,7 @@ class NRKTVIE(NRKBaseIE): asset_url = asset.get('url') if not asset_url or asset_url in urls: continue - formats = extract_nrk_formats(asset_url, video_id) + formats = self._extract_nrk_formats(asset_url, video_id) if not formats: continue self._sort_formats(formats) @@ -345,7 +345,7 @@ class NRKTVIE(NRKBaseIE): if not entries: media_url = data.get('mediaUrl') if media_url and media_url not in urls: - formats = extract_nrk_formats(media_url, video_id) + formats = self._extract_nrk_formats(media_url, video_id) if formats: self._sort_formats(formats) duration = parse_duration(data.get('duration')) From 08fea1baa17abcfff65899e7a06f3f0c616547c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:44:20 +0100 Subject: [PATCH 087/384] [nrktv:season] Improve extraction --- haruhi_dl/extractor/nrk.py | 99 ++++++++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 16 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py 
b/haruhi_dl/extractor/nrk.py index 8595f55b1..4d5f4c5ba 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor @@ -17,6 +18,7 @@ from ..utils import ( parse_age_limit, parse_duration, try_get, + urljoin, url_or_none, ) @@ -547,44 +549,109 @@ class NRKTVSerieBaseIE(InfoExtractor): return [] entries = [] for episode in entry_list: - nrk_id = episode.get('prfId') + nrk_id = episode.get('prfId') or episode.get('episodeId') if not nrk_id or not isinstance(nrk_id, compat_str): continue + if not re.match(NRKTVIE._EPISODE_RE, nrk_id): + continue entries.append(self.url_result( 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) return entries class NRKTVSeasonIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?P<domain>tv|radio)\.nrk\.no/serie/(?P<serie>[^/]+)/(?:sesong/)?(?P<id>\d+)' + _TESTS = [{ 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', 'info_dict': { - 'id': '1', + 'id': 'backstage/1', 'title': 'Sesong 1', }, 'playlist_mincount': 30, - } + }, { + # no /sesong/ in path + 'url': 'https://tv.nrk.no/serie/lindmo/2016', + 'info_dict': { + 'id': 'lindmo/2016', + 'title': '2016', + }, + 'playlist_mincount': 29, + }, { + # weird nested _embedded in catalog JSON response + 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens/sesong/1', + 'info_dict': { + 'id': 'dickie-dick-dickens/1', + 'title': 'Sesong 1', + }, + 'playlist_mincount': 11, + }, { + # 841 entries, multi page + 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201509', + 'info_dict': { + 'id': 'dagsnytt/201509', + 'title': 'September 2015', + }, + 'playlist_mincount': 841, + }, { + # 180 entries, single page + 'url': 'https://tv.nrk.no/serie/spangas/sesong/1', + 'only_matching': True, + }] @classmethod def suitable(cls, url): return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) else super(NRKTVSeasonIE, cls).suitable(url)) + _ASSETS_KEYS = ('episodes', 'instalments',) + + def _entries(self, data, display_id): + for page_num in itertools.count(1): + embedded = data.get('_embedded') + if not isinstance(embedded, dict): + break + # Extract entries + for asset_key in self._ASSETS_KEYS: + entries = try_get( + embedded, + (lambda x: x[asset_key]['_embedded'][asset_key], + lambda x: x[asset_key]), + list) + for e in self._extract_entries(entries): + yield e + # Find next URL + for asset_key in self._ASSETS_KEYS: + next_url = urljoin( + 'https://psapi.nrk.no/', + try_get( + data, + (lambda x: x['_links']['next']['href'], + lambda x: x['_embedded'][asset_key]['_links']['next']['href']), + compat_str)) + if next_url: + break + if not next_url: + break + data = self._download_json( + next_url, display_id, + 'Downloading season JSON page %d' % page_num, fatal=False) + if not data: + break + def _real_extract(self, url): - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + domain = mobj.group('domain') + serie = mobj.group('serie') + season_id = mobj.group('id') + display_id = '%s/%s' % (serie, season_id) - webpage = self._download_webpage(url, display_id) - - series = self._extract_series(webpage, display_id) - - season = next( - s for s in series['seasons'] - if int(display_id) == s.get('seasonNumber')) - - title = try_get(season, lambda x: x['titles']['title'], compat_str) + data = self._download_json( + 'https://psapi.nrk.no/%s/catalog/series/%s/seasons/%s' + % 
(domain, serie, season_id), display_id, query={'pageSize': 50}) + title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id return self.playlist_result( - self._extract_episodes(season), display_id, title) + self._entries(data, display_id), + display_id, title) class NRKTVSeriesIE(NRKTVSerieBaseIE): From ea80c8f15eb5194dcbd72e75ab1d805a96d1d237 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:44:26 +0100 Subject: [PATCH 088/384] [nrktv:series] Improve extraction --- haruhi_dl/extractor/nrk.py | 138 ++++++++++++++++++++++--------------- 1 file changed, 82 insertions(+), 56 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 4d5f4c5ba..7cfbe7856 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -558,6 +558,46 @@ class NRKTVSerieBaseIE(InfoExtractor): 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) return entries + _ASSETS_KEYS = ('episodes', 'instalments',) + + def _extract_assets_key(self, embedded): + for asset_key in self._ASSETS_KEYS: + if embedded.get(asset_key): + return asset_key + + def _entries(self, data, display_id): + for page_num in itertools.count(1): + embedded = data.get('_embedded') + if not isinstance(embedded, dict): + break + assets_key = self._extract_assets_key(embedded) + if not assets_key: + break + # Extract entries + entries = try_get( + embedded, + (lambda x: x[assets_key]['_embedded'][assets_key], + lambda x: x[assets_key]), + list) + for e in self._extract_entries(entries): + yield e + # Find next URL + next_url = urljoin( + 'https://psapi.nrk.no/', + try_get( + data, + (lambda x: x['_links']['next']['href'], + lambda x: x['_embedded'][assets_key]['_links']['next']['href']), + compat_str)) + if not next_url: + break + data = self._download_json( + next_url, display_id, + 'Downloading %s JSON page %d' % (assets_key, page_num), + fatal=False) + if not data: + break + class NRKTVSeasonIE(NRKTVSerieBaseIE): _VALID_URL = r'https?://(?P<domain>tv|radio)\.nrk\.no/serie/(?P<serie>[^/]+)/(?:sesong/)?(?P<id>\d+)' @@ -603,41 +643,6 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) else super(NRKTVSeasonIE, cls).suitable(url)) - _ASSETS_KEYS = ('episodes', 'instalments',) - - def _entries(self, data, display_id): - for page_num in itertools.count(1): - embedded = data.get('_embedded') - if not isinstance(embedded, dict): - break - # Extract entries - for asset_key in self._ASSETS_KEYS: - entries = try_get( - embedded, - (lambda x: x[asset_key]['_embedded'][asset_key], - lambda x: x[asset_key]), - list) - for e in self._extract_entries(entries): - yield e - # Find next URL - for asset_key in self._ASSETS_KEYS: - next_url = urljoin( - 'https://psapi.nrk.no/', - try_get( - data, - (lambda x: x['_links']['next']['href'], - lambda x: x['_embedded'][asset_key]['_links']['next']['href']), - compat_str)) - if next_url: - break - if not next_url: - break - data = self._download_json( - next_url, display_id, - 'Downloading season JSON page %d' % page_num, fatal=False) - if not data: - break - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) domain = mobj.group('domain') @@ -648,6 +653,7 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): data = self._download_json( 'https://psapi.nrk.no/%s/catalog/series/%s/seasons/%s' % (domain, serie, season_id), display_id, query={'pageSize': 50}) + title = try_get(data, lambda x: 
x['titles']['title'], compat_str) or display_id return self.playlist_result( self._entries(data, display_id), @@ -655,26 +661,9 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): class NRKTVSeriesIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?P<domain>tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)' _TESTS = [{ - 'url': 'https://tv.nrk.no/serie/blank', - 'info_dict': { - 'id': 'blank', - 'title': 'Blank', - 'description': 'md5:7664b4e7e77dc6810cd3bca367c25b6e', - }, - 'playlist_mincount': 30, - }, { - # new layout, seasons - 'url': 'https://tv.nrk.no/serie/backstage', - 'info_dict': { - 'id': 'backstage', - 'title': 'Backstage', - 'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3', - }, - 'playlist_mincount': 60, - }, { # new layout, instalments 'url': 'https://tv.nrk.no/serie/groenn-glede', 'info_dict': { @@ -682,7 +671,30 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'title': 'Grønn glede', 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', }, - 'playlist_mincount': 10, + 'playlist_mincount': 90, + }, { + # new layout, instalments, more entries + 'url': 'https://tv.nrk.no/serie/lindmo', + 'only_matching': True, + }, { + 'url': 'https://tv.nrk.no/serie/blank', + 'info_dict': { + 'id': 'blank', + 'title': 'Blank', + 'description': 'md5:7664b4e7e77dc6810cd3bca367c25b6e', + }, + 'playlist_mincount': 30, + 'expected_warnings': ['HTTP Error 404: Not Found'], + }, { + # new layout, seasons + 'url': 'https://tv.nrk.no/serie/backstage', + 'info_dict': { + 'id': 'backstage', + 'title': 'Backstage', + 'description': 'md5:63692ceb96813d9a207e9910483d948b', + }, + 'playlist_mincount': 60, + 'expected_warnings': ['HTTP Error 404: Not Found'], }, { # old layout 'url': 'https://tv.nrksuper.no/serie/labyrint', @@ -711,16 +723,30 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): else super(NRKTVSeriesIE, cls).suitable(url)) def _real_extract(self, url): - series_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + domain = mobj.group('domain') + series_id = mobj.group('id') + + title = description = None webpage = self._download_webpage(url, series_id) - # New layout (e.g. https://tv.nrk.no/serie/backstage) series = self._extract_series(webpage, series_id, fatal=False) if series: title = try_get(series, lambda x: x['titles']['title'], compat_str) description = try_get( series, lambda x: x['titles']['subtitle'], compat_str) + + data = self._download_json( + 'https://psapi.nrk.no/%s/catalog/series/%s/instalments' + % (domain, series_id), series_id, query={'pageSize': 50}, + fatal=False) + if data: + return self.playlist_result( + self._entries(data, series_id), series_id, title, description) + + # New layout (e.g. 
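The season pagination above walks HAL-style responses: it consumes the assets embedded in the current page, then follows _links.next.href until no next page remains. A reduced sketch of that traversal; the PAGES dict stands in for the JSON pages the extractor would download from psapi.nrk.no:

PAGES = {
    '/p1': {'_embedded': {'episodes': ['ep1', 'ep2']},
            '_links': {'next': {'href': '/p2'}}},
    '/p2': {'_embedded': {'episodes': ['ep3']}},
}

def entries(path):
    while path:
        data = PAGES[path]  # the real code downloads this JSON page
        for e in (data.get('_embedded') or {}).get('episodes') or []:
            yield e
        path = ((data.get('_links') or {}).get('next') or {}).get('href')

print(list(entries('/p1')))  # ['ep1', 'ep2', 'ep3']
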
https://tv.nrk.no/serie/backstage) + if series: entries = [] entries.extend(self._extract_seasons(series.get('seasons'))) entries.extend(self._extract_entries(series.get('instalments'))) From d3b00a0fa6467033f7bfd926e3744b9c739a2fd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:44:34 +0100 Subject: [PATCH 089/384] [nrktv:series] Improve extraction (closes #21926) --- haruhi_dl/extractor/nrk.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 7cfbe7856..4a82b11fd 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -521,7 +521,8 @@ class NRKTVSerieBaseIE(InfoExtractor): config = self._parse_json( self._search_regex( (r'INITIAL_DATA(?:_V\d)?_*\s*=\s*({.+?})\s*;', - r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>'), + r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>', + r'PRELOADED_STATE_*\s*=\s*({.+?})\s*\n'), webpage, 'config', default='{}' if not fatal else NO_DEFAULT), display_id, fatal=False, transform_source=js_to_json) if not config: @@ -531,12 +532,26 @@ class NRKTVSerieBaseIE(InfoExtractor): (lambda x: x['initialState']['series'], lambda x: x['series']), dict) - def _extract_seasons(self, seasons): + def _extract_seasons(self, domain, series_id, seasons): + if isinstance(seasons, dict): + seasons = seasons.get('seasons') if not isinstance(seasons, list): return [] entries = [] for season in seasons: - entries.extend(self._extract_episodes(season)) + if not isinstance(season, dict): + continue + episodes = self._extract_episodes(season) + if episodes: + entries.extend(episodes) + continue + season_name = season.get('name') + if season_name and isinstance(season_name, compat_str): + entries.append(self.url_result( + 'https://%s.nrk.no/serie/%s/sesong/%s' + % (domain, series_id, season_name), + ie=NRKTVSeasonIE.ie_key(), + video_title=season.get('title'))) return entries def _extract_episodes(self, season): @@ -713,6 +728,13 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): }, { 'url': 'https://tv.nrk.no/serie/postmann-pat', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens', + 'info_dict': { + 'id': 'dickie-dick-dickens', + }, + 'playlist_mincount': 8, + 'expected_warnings': ['HTTP Error 404: Not Found'], }] @classmethod @@ -748,7 +770,7 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): # New layout (e.g. 
https://tv.nrk.no/serie/backstage) if series: entries = [] - entries.extend(self._extract_seasons(series.get('seasons'))) + entries.extend(self._extract_seasons(domain, series_id, series.get('seasons'))) entries.extend(self._extract_entries(series.get('instalments'))) entries.extend(self._extract_episodes(series.get('extraMaterial'))) return self.playlist_result(entries, series_id, title, description) From 05fae5e182a41cacc8c1c7f2a09dd16c4260cbb2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:44:39 +0100 Subject: [PATCH 090/384] [nrktv] Relax _VALID_URL --- haruhi_dl/extractor/nrk.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 4a82b11fd..08e331893 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -148,14 +148,7 @@ class NRKIE(NRKBaseIE): class NRKTVIE(NRKBaseIE): IE_DESC = 'NRK TV and NRK Radio' _EPISODE_RE = r'(?P<id>[a-zA-Z]{4}\d{8})' - _VALID_URL = r'''(?x) - https?:// - (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie(?:/[^/]+){1,}|program)/ - (?![Ee]pisodes)%s - (?:/\d{2}-\d{2}-\d{4})? - (?:\#del=(?P<part_id>\d+))? - ''' % _EPISODE_RE + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/program/MDDP12000117', From 0ef2cc2a31abe0595fb1e9788e5e91911caf78bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:44:43 +0100 Subject: [PATCH 091/384] [nrk] Improve error extraction --- haruhi_dl/extractor/nrk.py | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 08e331893..f5e964753 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -31,6 +31,22 @@ class NRKBaseIE(InfoExtractor): re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url), video_id, 'mp4', 'm3u8_native', fatal=False) + def _raise_error(self, data): + MESSAGES = { + 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', + 'ProgramRightsHasExpired': 'Programmet har gått ut', + 'NoProgramRights': 'Ikke tilgjengelig', + 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', + } + message_type = data.get('messageType', '') + # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* + if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True: + self.raise_geo_restricted( + msg=MESSAGES.get('ProgramIsGeoBlocked'), + countries=self._GEO_COUNTRIES) + message = data.get('endUserMessage') or MESSAGES.get(message_type, message_type) + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + class NRKIE(NRKBaseIE): _VALID_URL = r'''(?x) @@ -89,6 +105,9 @@ class NRKIE(NRKBaseIE): 'http://psapi.nrk.no/playback/manifest/%s' % video_id, video_id, 'Downloading manifest JSON') + if manifest.get('playability') == 'nonPlayable': + self._raise_error(manifest['nonPlayable']) + playable = manifest['playable'] formats = [] @@ -352,22 +371,7 @@ class NRKTVIE(NRKBaseIE): }] if not entries: - MESSAGES = { - 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', - 'ProgramRightsHasExpired': 'Programmet har gått ut', - 'NoProgramRights': 'Ikke tilgjengelig', - 
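The _raise_error helper introduced above folds all of NRK's playability failures into one place, keyed on messageType, and treats any *IsGeoBlocked* variant as a geo restriction. A compact sketch of that dispatch, with a plain exception standing in for the extractor's raise_geo_restricted/ExtractorError helpers and the message table trimmed:

MESSAGES = {
    'ProgramRightsHasExpired': 'Programmet har gått ut',
    'NoProgramRights': 'Ikke tilgjengelig',
    'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
}

def raise_error(data):
    message_type = data.get('messageType', '')
    # ChannelIsGeoBlocked* and similar variants also match this substring test
    if 'IsGeoBlocked' in message_type or (data.get('usageRights') or {}).get('isGeoBlocked') is True:
        raise Exception('geo restricted: ' + MESSAGES['ProgramIsGeoBlocked'])
    raise Exception(data.get('endUserMessage') or MESSAGES.get(message_type, message_type))

try:
    raise_error({'messageType': 'ChannelIsGeoBlocked'})
except Exception as e:
    print(e)
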
'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', - } - message_type = data.get('messageType', '') - # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* - if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True: - self.raise_geo_restricted( - msg=MESSAGES.get('ProgramIsGeoBlocked'), - countries=self._GEO_COUNTRIES) - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, MESSAGES.get( - message_type, message_type)), - expected=True) + self._raise_error(data) series = conviva.get('seriesName') or data.get('seriesTitle') episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') From 8e06fa07b9df4c72a51509328bfbe91ecf355692 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:46:00 +0100 Subject: [PATCH 092/384] [teachable:course] Improve extraction (closes #24507, closes #27286) --- haruhi_dl/extractor/teachable.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/teachable.py b/haruhi_dl/extractor/teachable.py index 5557a9925..df305e38a 100644 --- a/haruhi_dl/extractor/teachable.py +++ b/haruhi_dl/extractor/teachable.py @@ -269,7 +269,7 @@ class TeachableCourseIE(TeachableBaseIE): r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)', webpage): li = mobj.group('li') - if 'fa-youtube-play' not in li: + if 'fa-youtube-play' not in li and not re.search(r'\d{1,2}:\d{2}', li): continue lecture_url = self._search_regex( r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li, From b79c74dad91f5ac0915e9b4b55f353911ea27980 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:46:08 +0100 Subject: [PATCH 093/384] [peertube] Recognize audio-only formats (closes #27295) --- haruhi_dl/extractor/peertube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/haruhi_dl/extractor/peertube.py b/haruhi_dl/extractor/peertube.py index f89ccda7f..b5ee25fff 100644 --- a/haruhi_dl/extractor/peertube.py +++ b/haruhi_dl/extractor/peertube.py @@ -132,6 +132,8 @@ class PeerTubeSHIE(SelfhostedInfoExtractor): 'format_id': format_id, 'filesize': file_size, }) + if format_id == '0p': + f['vcodec'] = 'none' formats.append(f) self._sort_formats(formats) From 93e7c99ad6da5f254e3424f27b53a7dfdac5daca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:46:16 +0100 Subject: [PATCH 094/384] [peertube] Extract fps --- haruhi_dl/extractor/peertube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/haruhi_dl/extractor/peertube.py b/haruhi_dl/extractor/peertube.py index b5ee25fff..66aab5c90 100644 --- a/haruhi_dl/extractor/peertube.py +++ b/haruhi_dl/extractor/peertube.py @@ -134,6 +134,8 @@ class PeerTubeSHIE(SelfhostedInfoExtractor): }) if format_id == '0p': f['vcodec'] = 'none' + else: + f['fps'] = int_or_none(file_.get('fps')) formats.append(f) self._sort_formats(formats) From b8975995efcc5354443833aee0f411408d1f4713 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:46:24 +0100 Subject: [PATCH 095/384] [nrk] improve extraction - improve format extraction for old akamai formats - update some of the tests - add is_live value to entry info dict - request instalments only when their available - fix skole extraction --- haruhi_dl/extractor/nrk.py | 252 
++++++++++++------------------------- 1 file changed, 81 insertions(+), 171 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index f5e964753..8b31a6ad2 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -13,8 +13,6 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, - js_to_json, - NO_DEFAULT, parse_age_limit, parse_duration, try_get, @@ -24,9 +22,10 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): - _GEO_COUNTRIES = ['NO'] - def _extract_nrk_formats(self, asset_url, video_id): + if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): + return self._extract_akamai_formats( + re.sub(r'(?:b=\d+-\d+|__a__=off)&?', '', asset_url), video_id) return self._extract_m3u8_formats( re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url), video_id, 'mp4', 'm3u8_native', fatal=False) @@ -47,6 +46,12 @@ class NRKBaseIE(InfoExtractor): message = data.get('endUserMessage') or MESSAGES.get(message_type, message_type) raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) + def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None): + return self._download_json( + urljoin('http://psapi.nrk.no/', path), + video_id, note or 'Downloading %s JSON' % item, + fatal=fatal, query=query) + class NRKIE(NRKBaseIE): _VALID_URL = r'''(?x) @@ -64,7 +69,7 @@ class NRKIE(NRKBaseIE): _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': '706f34cdf1322577589e369e522b50ef', + 'md5': 'f46be075326e23ad0e524edfcb06aeb6', 'info_dict': { 'id': '150533', 'ext': 'mp4', @@ -78,7 +83,7 @@ class NRKIE(NRKBaseIE): # MD5 is unstable 'info_dict': { 'id': '154915', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Slik høres internett ut når du er blind', 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', 'duration': 20, @@ -101,9 +106,9 @@ class NRKIE(NRKBaseIE): }] def _extract_from_playback(self, video_id): - manifest = self._download_json( - 'http://psapi.nrk.no/playback/manifest/%s' % video_id, - video_id, 'Downloading manifest JSON') + path_templ = 'playback/%s/' + video_id + call_playback_api = lambda x: self._call_api(path_templ % x, video_id, x) + manifest = call_playback_api('manifest') if manifest.get('playability') == 'nonPlayable': self._raise_error(manifest['nonPlayable']) @@ -123,9 +128,7 @@ class NRKIE(NRKBaseIE): formats.extend(self._extract_nrk_formats(format_url, video_id)) self._sort_formats(formats) - data = self._download_json( - 'http://psapi.nrk.no/playback/metadata/%s' % video_id, - video_id, 'Downloading metadata JSON') + data = call_playback_api('metadata') preplay = data['preplay'] titles = preplay['titles'] @@ -171,18 +174,18 @@ class NRKTVIE(NRKBaseIE): _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/program/MDDP12000117', - 'md5': '8270824df46ec629b66aeaa5796b36fb', + 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', 'info_dict': { 'id': 'MDDP12000117AA', 'ext': 'mp4', 'title': 'Alarm Trolltunga', 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', - 'duration': 2223, + 'duration': 2223.44, 'age_limit': 6, }, }, { 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '9a167e54d04671eb6317a37b7bc8a280', + 'md5': '8d40dab61cea8ab0114e090b029a0565', 'info_dict': { 'id': 'MUHH48000314AA', 'ext': 'mp4', @@ -200,7 +203,7 @@ class NRKTVIE(NRKBaseIE): 'ext': 'mp4', 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', - 
'duration': 4605, + 'duration': 4605.08, 'series': 'Kunnskapskanalen', 'episode': '24.05.2014', }, @@ -223,39 +226,13 @@ class NRKTVIE(NRKBaseIE): 'skip': 'particular part is not supported currently', }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', - 'playlist': [{ - 'info_dict': { - 'id': 'MSPO40010515AH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 772, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'info_dict': { - 'id': 'MSPO40010515BH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 6175, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }], 'info_dict': { - 'id': 'MSPO40010515', + 'id': 'MSPO40010515AH', + 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + 'description': 'md5:c03aba1e917561eface5214020551b7a', }, - 'expected_warnings': ['Video is geo restricted'], + 'skip': 'Video is geo restricted', }, { 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', 'info_dict': { @@ -286,6 +263,7 @@ class NRKTVIE(NRKBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'ProgramRightsHasExpired', }, { 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', 'only_matching': True, @@ -354,6 +332,7 @@ class NRKTVIE(NRKBaseIE): 'duration': duration, 'subtitles': subtitles, 'formats': formats, + 'is_live': live, }) if not entries: @@ -368,6 +347,7 @@ class NRKTVIE(NRKBaseIE): 'title': make_title(title), 'duration': duration, 'formats': formats, + 'is_live': live, }] if not entries: @@ -513,49 +493,7 @@ class NRKTVEpisodeIE(InfoExtractor): return info -class NRKTVSerieBaseIE(InfoExtractor): - def _extract_series(self, webpage, display_id, fatal=True): - config = self._parse_json( - self._search_regex( - (r'INITIAL_DATA(?:_V\d)?_*\s*=\s*({.+?})\s*;', - r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>', - r'PRELOADED_STATE_*\s*=\s*({.+?})\s*\n'), - webpage, 'config', default='{}' if not fatal else NO_DEFAULT), - display_id, fatal=False, transform_source=js_to_json) - if not config: - return - return try_get( - config, - (lambda x: x['initialState']['series'], lambda x: x['series']), - dict) - - def _extract_seasons(self, domain, series_id, seasons): - if isinstance(seasons, dict): - seasons = seasons.get('seasons') - if not isinstance(seasons, list): - return [] - entries = [] - for season in seasons: - if not isinstance(season, dict): - continue - episodes = self._extract_episodes(season) - if episodes: - entries.extend(episodes) - continue - season_name = season.get('name') - if season_name and isinstance(season_name, compat_str): - entries.append(self.url_result( - 'https://%s.nrk.no/serie/%s/sesong/%s' - % (domain, series_id, season_name), - ie=NRKTVSeasonIE.ie_key(), - video_title=season.get('title'))) - return entries - - def _extract_episodes(self, season): - if not isinstance(season, dict): - return [] - return self._extract_entries(season.get('episodes')) - +class NRKTVSerieBaseIE(NRKBaseIE): def _extract_entries(self, entry_list): if not isinstance(entry_list, list): return [] @@ -579,7 +517,7 @@ class NRKTVSerieBaseIE(InfoExtractor): def _entries(self, data, display_id): for page_num in itertools.count(1): - embedded = 
data.get('_embedded') + embedded = data.get('_embedded') or data if not isinstance(embedded, dict): break assets_key = self._extract_assets_key(embedded) @@ -594,18 +532,16 @@ class NRKTVSerieBaseIE(InfoExtractor): for e in self._extract_entries(entries): yield e # Find next URL - next_url = urljoin( - 'https://psapi.nrk.no/', - try_get( - data, - (lambda x: x['_links']['next']['href'], - lambda x: x['_embedded'][assets_key]['_links']['next']['href']), - compat_str)) - if not next_url: + next_url_path = try_get( + data, + (lambda x: x['_links']['next']['href'], + lambda x: x['_embedded'][assets_key]['_links']['next']['href']), + compat_str) + if not next_url_path: break - data = self._download_json( - next_url, display_id, - 'Downloading %s JSON page %d' % (assets_key, page_num), + data = self._call_api( + next_url_path, display_id, + note='Downloading %s JSON page %d' % (assets_key, page_num), fatal=False) if not data: break @@ -656,15 +592,12 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): else super(NRKTVSeasonIE, cls).suitable(url)) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - domain = mobj.group('domain') - serie = mobj.group('serie') - season_id = mobj.group('id') + domain, serie, season_id = re.match(self._VALID_URL, url).groups() display_id = '%s/%s' % (serie, season_id) - data = self._download_json( - 'https://psapi.nrk.no/%s/catalog/series/%s/seasons/%s' - % (domain, serie, season_id), display_id, query={'pageSize': 50}) + data = self._call_api( + '%s/catalog/series/%s/seasons/%s' % (domain, serie, season_id), + display_id, 'season', query={'pageSize': 50}) title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id return self.playlist_result( @@ -673,8 +606,7 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): class NRKTVSeriesIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?P<domain>tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)' - _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)' + _VALID_URL = r'https?://(?P<domain>(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/serie/(?P<id>[^/]+)' _TESTS = [{ # new layout, instalments 'url': 'https://tv.nrk.no/serie/groenn-glede', @@ -696,7 +628,6 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'description': 'md5:7664b4e7e77dc6810cd3bca367c25b6e', }, 'playlist_mincount': 30, - 'expected_warnings': ['HTTP Error 404: Not Found'], }, { # new layout, seasons 'url': 'https://tv.nrk.no/serie/backstage', @@ -706,14 +637,13 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'description': 'md5:63692ceb96813d9a207e9910483d948b', }, 'playlist_mincount': 60, - 'expected_warnings': ['HTTP Error 404: Not Found'], }, { # old layout 'url': 'https://tv.nrksuper.no/serie/labyrint', 'info_dict': { 'id': 'labyrint', 'title': 'Labyrint', - 'description': 'md5:318b597330fdac5959247c9b69fdb1ec', + 'description': 'I Daidalos sin undersjøiske Labyrint venter spennende oppgaver, skumle robotskapninger og slim.', }, 'playlist_mincount': 3, }, { @@ -729,9 +659,13 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens', 'info_dict': { 'id': 'dickie-dick-dickens', + 'title': 'Dickie Dick Dickens', + 'description': 'md5:19e67411ffe57f7dce08a943d7a0b91f', }, 'playlist_mincount': 8, - 'expected_warnings': ['HTTP Error 404: Not Found'], + }, { + 'url': 'https://nrksuper.no/serie/labyrint', + 'only_matching': True, }] @classmethod @@ -742,57 +676,39 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): else super(NRKTVSeriesIE, cls).suitable(url)) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - 
domain = mobj.group('domain') - series_id = mobj.group('id') + site, series_id = re.match(self._VALID_URL, url).groups() + domain = 'radio' if site == 'radio.nrk' else 'tv' - title = description = None + series = self._call_api( + '%s/catalog/series/%s' % (domain, series_id), series_id, 'serie') + titles = try_get(series, [ + lambda x: x['titles'], + lambda x: x[x['type']]['titles'], + lambda x: x[x['seriesType']]['titles'], + ]) or {} - webpage = self._download_webpage(url, series_id) + entries = [] + entries.extend(self._entries(series, series_id)) + embedded = series.get('_embedded') or {} + linked_seasons = try_get(series, lambda x: x['_links']['seasons']) or [] + embedded_seasons = embedded.get('seasons') or [] + if len(linked_seasons) > len(embedded_seasons): + for season in linked_seasons: + season_name = season.get('name') + if season_name and isinstance(season_name, compat_str): + entries.append(self.url_result( + 'https://%s.nrk.no/serie/%s/sesong/%s' + % (domain, series_id, season_name), + ie=NRKTVSeasonIE.ie_key(), + video_title=season.get('title'))) + else: + for season in embedded_seasons: + entries.extend(self._entries(season, series_id)) + entries.extend(self._entries( + embedded.get('extraMaterial') or {}, series_id)) - series = self._extract_series(webpage, series_id, fatal=False) - if series: - title = try_get(series, lambda x: x['titles']['title'], compat_str) - description = try_get( - series, lambda x: x['titles']['subtitle'], compat_str) - - data = self._download_json( - 'https://psapi.nrk.no/%s/catalog/series/%s/instalments' - % (domain, series_id), series_id, query={'pageSize': 50}, - fatal=False) - if data: - return self.playlist_result( - self._entries(data, series_id), series_id, title, description) - - # New layout (e.g. https://tv.nrk.no/serie/backstage) - if series: - entries = [] - entries.extend(self._extract_seasons(domain, series_id, series.get('seasons'))) - entries.extend(self._extract_entries(series.get('instalments'))) - entries.extend(self._extract_episodes(series.get('extraMaterial'))) - return self.playlist_result(entries, series_id, title, description) - - # Old layout (e.g. 
https://tv.nrksuper.no/serie/labyrint) - entries = [ - self.url_result( - 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( - series=series_id, season=season_id)) - for season_id in re.findall(self._ITEM_RE, webpage) - ] - - title = self._html_search_meta( - 'seriestitle', webpage, - 'title', default=None) or self._og_search_title( - webpage, fatal=False) - if title: - title = self._search_regex( - r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title) - - description = self._html_search_meta( - 'series_description', webpage, - 'description', default=None) or self._og_search_description(webpage) - - return self.playlist_result(entries, series_id, title, description) + return self.playlist_result( + entries, series_id, titles.get('title'), titles.get('subtitle')) class NRKTVDirekteIE(NRKTVIE): @@ -896,14 +812,8 @@ class NRKSkoleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://mimir.nrk.no/plugin/1.0/static?mediaId=%s' % video_id, - video_id) - - nrk_id = self._parse_json( - self._search_regex( - r'<script[^>]+type=["\']application/json["\'][^>]*>({.+?})</script>', - webpage, 'application json'), - video_id)['activeMedia']['psId'] + nrk_id = self._download_json( + 'https://nrkno-skole-prod.kube.nrk.no/skole/api/media/%s' % video_id, + video_id)['psId'] return self.url_result('nrk:%s' % nrk_id) From d88959f3b3a880eb18055d1ace630a080bf7050d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:46:29 +0100 Subject: [PATCH 096/384] [nrk] improve format extraction --- haruhi_dl/extractor/nrk.py | 40 ++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 8b31a6ad2..289a0a3a4 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import itertools +import random import re from .common import InfoExtractor @@ -22,13 +23,26 @@ from ..utils import ( class NRKBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['NO'] + _CDN_REPL_REGEX = r'''(?x):// + (?: + nrkod\d{1,2}-httpcache0-47115-cacheod0\.dna\.ip-only\.net/47115-cacheod0| + nrk-od-no\.telenorcdn\.net| + minicdn-od\.nrk\.no/od/nrkhd-osl-rr\.netwerk\.no/no + )/''' + def _extract_nrk_formats(self, asset_url, video_id): if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): return self._extract_akamai_formats( re.sub(r'(?:b=\d+-\d+|__a__=off)&?', '', asset_url), video_id) - return self._extract_m3u8_formats( - re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url), - video_id, 'mp4', 'm3u8_native', fatal=False) + asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url) + formats = self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', fatal=False) + if not formats and re.search(self._CDN_REPL_REGEX, asset_url): + formats = self._extract_m3u8_formats( + re.sub(self._CDN_REPL_REGEX, '://nrk-od-%02d.akamaized.net/no/' % random.randint(0, 99), asset_url), + video_id, 'mp4', 'm3u8_native', fatal=False) + return formats def _raise_error(self, data): MESSAGES = { @@ -107,8 +121,10 @@ class NRKIE(NRKBaseIE): def _extract_from_playback(self, video_id): path_templ = 'playback/%s/' + video_id - call_playback_api = lambda x: self._call_api(path_templ % x, video_id, x) - manifest = call_playback_api('manifest') + def call_playback_api(item, query=None): + return self._call_api(path_templ % item, 
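The retry added to _extract_nrk_formats above rewrites dead legacy CDN hosts onto a random nrk-od-NN.akamaized.net edge before attempting the manifest a second time. An isolated sketch of just that URL rewrite; the pattern is the one from the patch, while the input URL is invented:

import random
import re

CDN_REPL_REGEX = r'''(?x)://
    (?:
        nrkod\d{1,2}-httpcache0-47115-cacheod0\.dna\.ip-only\.net/47115-cacheod0|
        nrk-od-no\.telenorcdn\.net|
        minicdn-od\.nrk\.no/od/nrkhd-osl-rr\.netwerk\.no/no
    )/'''

url = 'https://nrk-od-no.telenorcdn.net/some/manifest.m3u8'
print(re.sub(CDN_REPL_REGEX,
             '://nrk-od-%02d.akamaized.net/no/' % random.randint(0, 99), url))
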
video_id, item, query=query) + # known values for preferredCdn: akamai, iponly, minicdn and telenor + manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) if manifest.get('playability') == 'nonPlayable': self._raise_error(manifest['nonPlayable']) @@ -195,7 +211,6 @@ class NRKTVIE(NRKBaseIE): 'series': '20 spørsmål', 'episode': '23.05.2014', }, - 'skip': 'NoProgramRights', }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', 'info_dict': { @@ -214,15 +229,15 @@ class NRKTVIE(NRKBaseIE): # single playlist video 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'id': 'MSPO40010515AH', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:c03aba1e917561eface5214020551b7a', }, 'params': { 'skip_download': True, }, - 'expected_warnings': ['Video is geo restricted'], + 'expected_warnings': ['Failed to download m3u8 information'], 'skip': 'particular part is not supported currently', }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', @@ -232,7 +247,7 @@ class NRKTVIE(NRKBaseIE): 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', }, - 'skip': 'Video is geo restricted', + 'expected_warnings': ['Failed to download m3u8 information'], }, { 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', 'info_dict': { @@ -312,6 +327,7 @@ class NRKTVIE(NRKBaseIE): asset_url = asset.get('url') if not asset_url or asset_url in urls: continue + urls.append(asset_url) formats = self._extract_nrk_formats(asset_url, video_id) if not formats: continue From 04ac0950aa22aee204820aa155f2e2df1cc39ef4 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:46:34 +0100 Subject: [PATCH 097/384] [nrk] reduce the number of instalments requests --- haruhi_dl/extractor/nrk.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 289a0a3a4..24993b1c8 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -121,6 +121,7 @@ class NRKIE(NRKBaseIE): def _extract_from_playback(self, video_id): path_templ = 'playback/%s/' + video_id + def call_playback_api(item, query=None): return self._call_api(path_templ % item, video_id, item, query=query) # known values for preferredCdn: akamai, iponly, minicdn and telenor @@ -696,7 +697,8 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): domain = 'radio' if site == 'radio.nrk' else 'tv' series = self._call_api( - '%s/catalog/series/%s' % (domain, series_id), series_id, 'serie') + '%s/catalog/series/%s' % (domain, series_id), + series_id, 'serie', query={'embeddedInstalmentsPageSize': 50}) titles = try_get(series, [ lambda x: x['titles'], lambda x: x[x['type']]['titles'], From 5c239bfc6547c85ee466d6b48f5385562ee9abdf Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:48:24 +0100 Subject: [PATCH 098/384] [nrk] reduce requests for Radio series --- haruhi_dl/extractor/nrk.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 24993b1c8..fdf2d7407 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -694,11 +694,13 @@ class 
NRKTVSeriesIE(NRKTVSerieBaseIE): def _real_extract(self, url): site, series_id = re.match(self._VALID_URL, url).groups() - domain = 'radio' if site == 'radio.nrk' else 'tv' + is_radio = site == 'radio.nrk' + domain = 'radio' if is_radio else 'tv' + size_prefix = 'p' if is_radio else 'embeddedInstalmentsP' series = self._call_api( '%s/catalog/series/%s' % (domain, series_id), - series_id, 'serie', query={'embeddedInstalmentsPageSize': 50}) + series_id, 'serie', query={size_prefix + 'ageSize': 50}) titles = try_get(series, [ lambda x: x['titles'], lambda x: x[x['type']]['titles'], From e76a3363ba73b82b0ca616f4a235a94934ad24b8 Mon Sep 17 00:00:00 2001 From: renalid <renalid@gmail.com> Date: Fri, 26 Feb 2021 14:48:29 +0100 Subject: [PATCH 099/384] [generic] Extract RSS video description (#27177) --- haruhi_dl/extractor/generic.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index babc59dcc..0a6bc25c4 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -204,11 +204,19 @@ class GenericIE(InfoExtractor): { 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', 'info_dict': { - 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - 'ext': 'm4v', - 'upload_date': '20150228', - 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - } + 'id': 'http://podcastfeeds.nbcnews.com/nbcnews/video/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', + 'title': 'MSNBC Rachel Maddow (video)', + 'description': 're:.*her unique approach to storytelling.*', + }, + 'playlist': [{ + 'info_dict': { + 'ext': 'mov', + 'id': 'pdv_maddow_netcast_mov-12-03-2020-223726', + 'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726', + 'description': 're:.*her unique approach to storytelling.*', + 'upload_date': '20201204', + }, + }], }, # RSS feed with enclosures and unsupported link URLs { @@ -2236,6 +2244,7 @@ class GenericIE(InfoExtractor): '_type': 'url_transparent', 'url': next_url, 'title': it.find('title').text, + 'description': xpath_text(it, 'description', default=None), }) return { From 0257cb6e427c8291f07d80a5177addef81e64da7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:48:35 +0100 Subject: [PATCH 100/384] [generic] Extract RSS video timestamp --- haruhi_dl/extractor/generic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index 0a6bc25c4..bdc6271aa 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -30,6 +30,7 @@ from ..utils import ( smuggle_url, unescapeHTML, unified_strdate, + unified_timestamp, unsmuggle_url, UnsupportedError, xpath_text, @@ -2245,6 +2246,8 @@ class GenericIE(InfoExtractor): 'url': next_url, 'title': it.find('title').text, 'description': xpath_text(it, 'description', default=None), + 'timestamp': unified_timestamp( + xpath_text(it, 'pubDate', default=None)), }) return { From 1744410baa2b705aaacf4bbbc1adb61333a6ef7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:48:44 +0100 Subject: [PATCH 101/384] [generic] Extract RSS video itunes metadata --- haruhi_dl/extractor/generic.py | 36 +++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index 
bdc6271aa..0b9ec2b74 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -20,12 +20,14 @@ from ..utils import ( ExtractorError, float_or_none, HEADRequest, + int_or_none, is_html, js_to_json, KNOWN_EXTENSIONS, merge_dicts, mimetype2ext, orderedSet, + parse_duration, sanitized_Request, smuggle_url, unescapeHTML, @@ -33,7 +35,9 @@ from ..utils import ( unified_timestamp, unsmuggle_url, UnsupportedError, + url_or_none, xpath_text, + xpath_with_ns, ) from .commonprotocols import RtmpIE from .brightcove import ( @@ -212,10 +216,12 @@ class GenericIE(InfoExtractor): 'playlist': [{ 'info_dict': { 'ext': 'mov', - 'id': 'pdv_maddow_netcast_mov-12-03-2020-223726', - 'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726', + 'id': 'pdv_maddow_netcast_mov-12-04-2020-224335', + 'title': 're:MSNBC Rachel Maddow', 'description': 're:.*her unique approach to storytelling.*', - 'upload_date': '20201204', + 'timestamp': int, + 'upload_date': compat_str, + 'duration': float, }, }], }, @@ -2226,6 +2232,10 @@ class GenericIE(InfoExtractor): playlist_desc_el = doc.find('./channel/description') playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text + NS_MAP = { + 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', + } + entries = [] for it in doc.findall('./channel/item'): next_url = None @@ -2241,6 +2251,20 @@ class GenericIE(InfoExtractor): if not next_url: continue + def itunes(key): + return xpath_text( + it, xpath_with_ns('./itunes:%s' % key, NS_MAP), + default=None) + + duration = itunes('duration') + explicit = itunes('explicit') + if explicit == 'true': + age_limit = 18 + elif explicit == 'false': + age_limit = 0 + else: + age_limit = None + entries.append({ '_type': 'url_transparent', 'url': next_url, @@ -2248,6 +2272,12 @@ class GenericIE(InfoExtractor): 'description': xpath_text(it, 'description', default=None), 'timestamp': unified_timestamp( xpath_text(it, 'pubDate', default=None)), + 'duration': int_or_none(duration) or parse_duration(duration), + 'thumbnail': url_or_none(itunes('image')), + 'episode': itunes('title'), + 'episode_number': int_or_none(itunes('episode')), + 'season_number': int_or_none(itunes('season')), + 'age_limit': age_limit, }) return { From 371904a4d994d26f751b6e7a25b2f217de5a78f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:50:42 +0100 Subject: [PATCH 102/384] [extractor/common] Extract timestamp from Last-Modified header --- haruhi_dl/extractor/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index 0b9ec2b74..e67e883b0 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -2397,7 +2397,7 @@ class GenericIE(InfoExtractor): info_dict = { 'id': video_id, 'title': self._generic_title(url), - 'upload_date': unified_strdate(head_response.headers.get('Last-Modified')) + 'timestamp': unified_timestamp(head_response.headers.get('Last-Modified')) } # Check for direct link to a video From 96e01843779dd0e49b252280ed55eaaa73229bca Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:50:53 +0100 Subject: [PATCH 103/384] [aenetworks] Fix extraction - Fix Fastly format extraction - Add support for play and watch subdomains - Extract series metadata closes #23363 closes #23390 closes #26795 closes #26985 --- haruhi_dl/extractor/aenetworks.py | 277 ++++++++++++++++++------------ 
haruhi_dl/extractor/extractors.py | 2 + 2 files changed, 165 insertions(+), 114 deletions(-) diff --git a/haruhi_dl/extractor/aenetworks.py b/haruhi_dl/extractor/aenetworks.py index 611b948f5..3d0cf1208 100644 --- a/haruhi_dl/extractor/aenetworks.py +++ b/haruhi_dl/extractor/aenetworks.py @@ -5,20 +5,30 @@ import re from .theplatform import ThePlatformIE from ..utils import ( - extract_attributes, ExtractorError, int_or_none, - smuggle_url, update_url_query, -) -from ..compat import ( - compat_urlparse, + urlencode_postdata, ) class AENetworksBaseIE(ThePlatformIE): + _BASE_URL_REGEX = r'''(?x)https?:// + (?:(?:www|play|watch)\.)? + (?P<domain> + (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| + fyi\.tv + )/''' _THEPLATFORM_KEY = 'crazyjava' _THEPLATFORM_SECRET = 's3cr3t' + _DOMAIN_MAP = { + 'history.com': ('HISTORY', 'history'), + 'aetv.com': ('AETV', 'aetv'), + 'mylifetime.com': ('LIFETIME', 'lifetime'), + 'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'), + 'fyi.tv': ('FYI', 'fyi'), + 'historyvault.com': (None, 'historyvault'), + } def _extract_aen_smil(self, smil_url, video_id, auth=None): query = {'mbr': 'true'} @@ -31,7 +41,7 @@ class AENetworksBaseIE(ThePlatformIE): 'assetTypes': 'high_video_s3' }, { 'assetTypes': 'high_video_s3', - 'switch': 'hls_ingest_fastly' + 'switch': 'hls_high_fastly', }] formats = [] subtitles = {} @@ -61,20 +71,13 @@ class AENetworksBaseIE(ThePlatformIE): class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?P<domain> - (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| - fyi\.tv - )/ - (?: - shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})| - movies/(?P<movie_display_id>[^/]+)(?:/full-movie)?| - specials/(?P<special_display_id>[^/]+)/(?:full-special|preview-)| - collections/[^/]+/(?P<collection_display_id>[^/]+) - ) - ''' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'''(?P<id> + shows/[^/]+/season-\d+/episode-\d+| + (?: + (?:movie|special)s/[^/]+| + (?:shows/[^/]+/)?videos + )/[^/?#&]+ + )''' _TESTS = [{ 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', 'info_dict': { @@ -91,22 +94,23 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.history.com/shows/ancient-aliens/season-1', - 'info_dict': { - 'id': '71889446852', - }, - 'playlist_mincount': 5, - }, { - 'url': 'http://www.mylifetime.com/shows/atlanta-plastic', - 'info_dict': { - 'id': 'SERIES4317', - 'title': 'Atlanta Plastic', - }, - 'playlist_mincount': 2, + 'skip': 'This video is only available for users of participating TV providers.', }, { 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', - 'only_matching': True + 'info_dict': { + 'id': '600587331957', + 'ext': 'mp4', + 'title': 'Inlawful Entry', + 'description': 'md5:57c12115a2b384d883fe64ca50529e08', + 'timestamp': 1452634428, + 'upload_date': '20160112', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'only_matching': True @@ -117,80 +121,152 @@ class AENetworksIE(AENetworksBaseIE): 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', 'only_matching': True }, { - 'url': 'https://www.lifetimemovieclub.com/movies/a-killer-among-us', + 'url': 
'https://watch.lifetimemovieclub.com/movies/10-year-reunion/full-movie', 'only_matching': True }, { 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special', 'only_matching': True - }, { - 'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward', - 'only_matching': True }, { 'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story', 'only_matching': True + }, { + 'url': 'http://www.history.com/videos/history-of-valentines-day', + 'only_matching': True + }, { + 'url': 'https://play.aetv.com/shows/duck-dynasty/videos/best-of-duck-dynasty-getting-quack-in-shape', + 'only_matching': True }] - _DOMAIN_TO_REQUESTOR_ID = { - 'history.com': 'HISTORY', - 'aetv.com': 'AETV', - 'mylifetime.com': 'LIFETIME', - 'lifetimemovieclub.com': 'LIFETIMEMOVIECLUB', - 'fyi.tv': 'FYI', - } def _real_extract(self, url): - domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups() - display_id = show_path or movie_display_id or special_display_id or collection_display_id - webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers()) - if show_path: - url_parts = show_path.split('/') - url_parts_len = len(url_parts) - if url_parts_len == 1: - entries = [] - for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): - entries.append(self.url_result( - compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) - if entries: - return self.playlist_result( - entries, self._html_search_meta('aetn:SeriesId', webpage), - self._html_search_meta('aetn:SeriesTitle', webpage)) - else: - # single season - url_parts_len = 2 - if url_parts_len == 2: - entries = [] - for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage): - episode_attributes = extract_attributes(episode_item) - episode_url = compat_urlparse.urljoin( - url, episode_attributes['data-canonical']) - entries.append(self.url_result( - episode_url, 'AENetworks', - episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id'))) - return self.playlist_result( - entries, self._html_search_meta('aetn:SeasonId', webpage)) - - video_id = self._html_search_meta('aetn:VideoID', webpage) - media_url = self._search_regex( - [r"media_url\s*=\s*'(?P<url>[^']+)'", - r'data-media-url=(?P<url>(?:https?:)?//[^\s>]+)', - r'data-media-url=(["\'])(?P<url>(?:(?!\1).)+?)\1'], - webpage, 'video url', group='url') + domain, canonical = re.match(self._VALID_URL, url).groups() + requestor_id, brand = self._DOMAIN_MAP[domain] + result = self._download_json( + 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, + canonical, query={'filter[canonical]': '/' + canonical})['results'][0] + title = result['title'] + video_id = result['id'] + media_url = result['publicUrl'] theplatform_metadata = self._download_theplatform_metadata(self._search_regex( r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) info = self._parse_theplatform_metadata(theplatform_metadata) auth = None if theplatform_metadata.get('AETN$isBehindWall'): - requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] resource = self._get_mvpd_resource( requestor_id, theplatform_metadata['title'], theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), theplatform_metadata['ratings'][0]['rating']) auth = 
self._extract_mvpd_auth( url, video_id, requestor_id, resource) - info.update(self._search_json_ld(webpage, video_id, fatal=False)) info.update(self._extract_aen_smil(media_url, video_id, auth)) + info.update({ + 'title': title, + 'series': result.get('seriesName'), + 'season_number': int_or_none(result.get('tvSeasonNumber')), + 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), + }) return info +class AENetworksListBaseIE(AENetworksBaseIE): + def _call_api(self, resource, slug, brand, fields): + return self._download_json( + 'https://yoga.appsvcs.aetnd.com/graphql', + slug, query={'brand': brand}, data=urlencode_postdata({ + 'query': '''{ + %s(slug: "%s") { + %s + } +}''' % (resource, slug, fields), + }))['data'][resource] + + def _real_extract(self, url): + domain, slug = re.match(self._VALID_URL, url).groups() + _, brand = self._DOMAIN_MAP[domain] + playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS) + base_url = 'http://watch.%s' % domain + + entries = [] + for item in (playlist.get(self._ITEMS_KEY) or []): + doc = self._get_doc(item) + canonical = doc.get('canonical') + if not canonical: + continue + entries.append(self.url_result( + base_url + canonical, AENetworksIE.ie_key(), doc.get('id'))) + + description = None + if self._PLAYLIST_DESCRIPTION_KEY: + description = playlist.get(self._PLAYLIST_DESCRIPTION_KEY) + + return self.playlist_result( + entries, playlist.get('id'), + playlist.get(self._PLAYLIST_TITLE_KEY), description) + + +class AENetworksCollectionIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:collection' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'(?:[^/]+/)*(?:list|collections)/(?P<id>[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://watch.historyvault.com/list/america-the-story-of-us', + 'info_dict': { + 'id': '282', + 'title': 'America The Story of Us', + }, + 'playlist_mincount': 12, + }, { + 'url': 'https://watch.historyvault.com/shows/america-the-story-of-us-2/season-1/list/america-the-story-of-us', + 'only_matching': True + }, { + 'url': 'https://www.historyvault.com/collections/mysteryquest', + 'only_matching': True + }] + _RESOURCE = 'list' + _ITEMS_KEY = 'items' + _PLAYLIST_TITLE_KEY = 'display_title' + _PLAYLIST_DESCRIPTION_KEY = None + _FIELDS = '''id + display_title + items { + ... 
on ListVideoItem { + doc { + canonical + id + } + } + }''' + + def _get_doc(self, item): + return item.get('doc') or {} + + +class AENetworksShowIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:show' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'shows/(?P<id>[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'http://www.history.com/shows/ancient-aliens', + 'info_dict': { + 'id': 'SH012427480000', + 'title': 'Ancient Aliens', + 'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f', + }, + 'playlist_mincount': 168, + }] + _RESOURCE = 'series' + _ITEMS_KEY = 'episodes' + _PLAYLIST_TITLE_KEY = 'title' + _PLAYLIST_DESCRIPTION_KEY = 'description' + _FIELDS = '''description + id + title + episodes { + canonical + id + }''' + + def _get_doc(self, item): + return item + + class HistoryTopicIE(AENetworksBaseIE): IE_NAME = 'history:topic' IE_DESC = 'History.com Topic' @@ -204,6 +280,7 @@ class HistoryTopicIE(AENetworksBaseIE): 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', 'timestamp': 1375819729, 'upload_date': '20130806', + 'uploader': 'AENE-NEW', }, 'params': { # m3u8 download @@ -212,36 +289,8 @@ class HistoryTopicIE(AENetworksBaseIE): 'add_ie': ['ThePlatform'], }] - def theplatform_url_result(self, theplatform_url, video_id, query): - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': smuggle_url( - update_url_query(theplatform_url, query), - { - 'sig': { - 'key': self._THEPLATFORM_KEY, - 'secret': self._THEPLATFORM_SECRET, - }, - 'force_smil_url': True - }), - 'ie_key': 'ThePlatform', - } - def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'<phoenix-iframe[^>]+src="[^"]+\btpid=(\d+)', webpage, 'tpid') - result = self._download_json( - 'https://feeds.video.aetnd.com/api/v2/history/videos', - video_id, query={'filter[id]': video_id})['results'][0] - title = result['title'] - info = self._extract_aen_smil(result['publicUrl'], video_id) - info.update({ - 'title': title, - 'description': result.get('description'), - 'duration': int_or_none(result.get('duration')), - 'timestamp': int_or_none(result.get('added'), 1000), - }) - return info + return self.url_result( + 'http://www.history.com/videos/' + display_id, + AENetworksIE.ie_key()) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 46f3b604c..7a0706532 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -30,6 +30,8 @@ from .adobetv import ( from .adultswim import AdultSwimIE from .aenetworks import ( AENetworksIE, + AENetworksCollectionIE, + AENetworksShowIE, HistoryTopicIE, ) from .afreecatv import AfreecaTVIE From f717a3cc82662cc677ffede2ccf30a1709bbf007 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 14:51:01 +0100 Subject: [PATCH 104/384] [extractor/generic] Remove unused import --- haruhi_dl/extractor/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index e67e883b0..037fc4d7a 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -31,7 +31,6 @@ from ..utils import ( sanitized_Request, smuggle_url, unescapeHTML, - unified_strdate, unified_timestamp, unsmuggle_url, UnsupportedError, From 32e8c82a3b1f2135cb25ad796102d83a3f0ec69d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com> Date: Fri, 26 
Feb 2021 14:51:20 +0100 Subject: [PATCH 105/384] =?UTF-8?q?[slideslive]=20Add=20support=20for=20yo?= =?UTF-8?q?da=20service=20videos=20and=20extract=20subtitle=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …s (closes #27323) --- haruhi_dl/extractor/slideslive.py | 55 ++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/slideslive.py b/haruhi_dl/extractor/slideslive.py index d9ea76831..cd70841a9 100644 --- a/haruhi_dl/extractor/slideslive.py +++ b/haruhi_dl/extractor/slideslive.py @@ -2,7 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import smuggle_url +from ..utils import ( + bool_or_none, + smuggle_url, + try_get, + url_or_none, +) class SlidesLiveIE(InfoExtractor): @@ -18,8 +23,21 @@ class SlidesLiveIE(InfoExtractor): 'description': 'Watch full version of this video at https://slideslive.com/38902413.', 'uploader': 'SlidesLive Videos - A', 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw', + 'timestamp': 1597615266, 'upload_date': '20170925', } + }, { + # video_service_name = yoda + 'url': 'https://slideslive.com/38935785', + 'md5': '575cd7a6c0acc6e28422fe76dd4bcb1a', + 'info_dict': { + 'id': 'RMraDYN5ozA_', + 'ext': 'mp4', + 'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges', + }, + 'params': { + 'format': 'bestvideo', + }, }, { # video_service_name = youtube 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', @@ -39,18 +57,47 @@ class SlidesLiveIE(InfoExtractor): video_data = self._download_json( 'https://ben.slideslive.com/player/' + video_id, video_id) service_name = video_data['video_service_name'].lower() - assert service_name in ('url', 'vimeo', 'youtube') + assert service_name in ('url', 'yoda', 'vimeo', 'youtube') service_id = video_data['video_service_id'] + subtitles = {} + for sub in try_get(video_data, lambda x: x['subtitles'], list) or []: + if not isinstance(sub, dict): + continue + webvtt_url = url_or_none(sub.get('webvtt_url')) + if not webvtt_url: + continue + lang = sub.get('language') or 'en' + subtitles.setdefault(lang, []).append({ + 'url': webvtt_url, + }) info = { 'id': video_id, 'thumbnail': video_data.get('thumbnail'), - 'url': service_id, + 'is_live': bool_or_none(video_data.get('is_live')), + 'subtitles': subtitles, } - if service_name == 'url': + if service_name in ('url', 'yoda'): info['title'] = video_data['title'] + if service_name == 'url': + info['url'] = service_id + else: + formats = [] + _MANIFEST_PATTERN = 'https://01.cdn.yoda.slideslive.com/%s/master.%s' + formats.extend(self._extract_m3u8_formats( + _MANIFEST_PATTERN % (service_id, 'm3u8'), service_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_mpd_formats( + _MANIFEST_PATTERN % (service_id, 'mpd'), service_id, + mpd_id='dash', fatal=False)) + self._sort_formats(formats) + info.update({ + 'id': service_id, + 'formats': formats, + }) else: info.update({ '_type': 'url_transparent', + 'url': service_id, 'ie_key': service_name.capitalize(), 'title': video_data.get('title'), }) From 96e0370bb26470928eefd2053f7735c094cdf077 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:51:38 +0100 Subject: [PATCH 106/384] =?UTF-8?q?[americastestkitchen]=20Fix=20Extractio?= =?UTF-8?q?n=20and=20add=20support=20for=20Cook's=20Count=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit …ry and Cook's Illustrated closes #17234 closes #27322 --- haruhi_dl/extractor/americastestkitchen.py | 68 +++++++++------------- 1 file changed, 26 insertions(+), 42 deletions(-) diff --git a/haruhi_dl/extractor/americastestkitchen.py b/haruhi_dl/extractor/americastestkitchen.py index 9c9d77ae1..e20f00fc3 100644 --- a/haruhi_dl/extractor/americastestkitchen.py +++ b/haruhi_dl/extractor/americastestkitchen.py @@ -1,33 +1,33 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( clean_html, - int_or_none, - js_to_json, try_get, unified_strdate, ) class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?P<resource_type>episode|videos)/(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', 'info_dict': { 'id': '5b400b9ee338f922cb06450c', - 'title': 'Weeknight Japanese Suppers', + 'title': 'Japanese Suppers', 'ext': 'mp4', - 'description': 'md5:3d0c1a44bb3b27607ce82652db25b4a8', + 'description': 'md5:64e606bfee910627efc4b5f050de92b3', 'thumbnail': r're:^https?://', 'timestamp': 1523664000, 'upload_date': '20180414', - 'release_date': '20180414', + 'release_date': '20180410', 'series': "America's Test Kitchen", 'season_number': 18, - 'episode': 'Weeknight Japanese Suppers', + 'episode': 'Japanese Suppers', 'episode_number': 15, }, 'params': { @@ -36,47 +36,31 @@ class AmericasTestKitchenIE(InfoExtractor): }, { 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + resource_type, video_id = re.match(self._VALID_URL, url).groups() + is_episode = resource_type == 'episode' + if is_episode: + resource_type = 'episodes' - webpage = self._download_webpage(url, video_id) - - video_data = self._parse_json( - self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*</script>', - webpage, 'initial context'), - video_id, js_to_json) - - ep_data = try_get( - video_data, - (lambda x: x['episodeDetail']['content']['data'], - lambda x: x['videoDetail']['content']['data']), dict) - ep_meta = ep_data.get('full_video', {}) - - zype_id = ep_data.get('zype_id') or ep_meta['zype_id'] - - title = ep_data.get('title') or ep_meta.get('title') - description = clean_html(ep_meta.get('episode_description') or ep_data.get( - 'description') or ep_meta.get('description')) - thumbnail = try_get(ep_meta, lambda x: x['photo']['image_url']) - release_date = unified_strdate(ep_data.get('aired_at')) - - season_number = int_or_none(ep_meta.get('season_number')) - episode = ep_meta.get('title') - episode_number = int_or_none(ep_meta.get('episode_number')) + resource = self._download_json( + 'https://www.americastestkitchen.com/api/v6/%s/%s' % (resource_type, video_id), video_id) + video = resource['video'] if is_episode else resource + episode = resource if is_episode else resource.get('episode') or {} return { '_type': 'url_transparent', - 'url': 
'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id, + 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'], 'ie_key': 'Zype', - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'release_date': release_date, - 'series': "America's Test Kitchen", - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, + 'description': clean_html(video.get('description')), + 'release_date': unified_strdate(video.get('publishDate')), + 'series': try_get(episode, lambda x: x['show']['title']), + 'episode': episode.get('title'), } From 7e83a9d619561302e2d4fac857ba17b28b9fec6b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:51:45 +0100 Subject: [PATCH 107/384] [tvplay:home] Fix extraction (closes #21153) --- haruhi_dl/extractor/tvplay.py | 90 ++++++++++++++--------------------- 1 file changed, 35 insertions(+), 55 deletions(-) diff --git a/haruhi_dl/extractor/tvplay.py b/haruhi_dl/extractor/tvplay.py index 3c2450dd0..0d858c025 100644 --- a/haruhi_dl/extractor/tvplay.py +++ b/haruhi_dl/extractor/tvplay.py @@ -12,11 +12,13 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + parse_duration, parse_iso8601, qualities, try_get, update_url_query, url_or_none, + urljoin, ) @@ -414,7 +416,7 @@ class ViafreeIE(InfoExtractor): class TVPlayHomeIE(InfoExtractor): - _VALID_URL = r'https?://tvplay\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/[^/]+/[^/?#&]+-(?P<id>\d+)' + _VALID_URL = r'https?://(?:tv3?)?play\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/(?:[^/]+/)*[^/?#&]+-(?P<id>\d+)' _TESTS = [{ 'url': 'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/', 'info_dict': { @@ -433,80 +435,58 @@ class TVPlayHomeIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [TVPlayIE.ie_key()], }, { 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/', 'only_matching': True, }, { 'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/', 'only_matching': True, + }, { + 'url': 'https://play.tv3.lt/aferistai-10047125', + 'only_matching': True, + }, { + 'url': 'https://tv3play.skaties.lv/vinas-melo-labak-10280317', + 'only_matching': True, + }, { + 'url': 'https://play.tv3.ee/cool-d-ga-mehhikosse-10044354', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + asset = self._download_json( + urljoin(url, '/sb/public/asset/' + video_id), video_id) - video_id = self._search_regex( - r'data-asset-id\s*=\s*["\'](\d{5,})\b', webpage, 'video id') - - if len(video_id) < 8: - return self.url_result( - 'mtg:%s' % video_id, ie=TVPlayIE.ie_key(), video_id=video_id) - - m3u8_url = self._search_regex( - r'data-file\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'm3u8 url', group='url') + m3u8_url = asset['movie']['contentUrl'] + video_id = asset['assetId'] + asset_title = asset['title'] + title = asset_title['title'] formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) - title = self._search_regex( - r'data-title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, - 'title', default=None, group='value') or self._html_search_meta( - 'title', webpage, default=None) or self._og_search_title( - webpage) + 
thumbnails = None + image_url = asset.get('imageUrl') + if image_url: + thumbnails = [{ + 'url': urljoin(url, image_url), + 'ext': 'jpg', + }] - description = self._html_search_meta( - 'description', webpage, - default=None) or self._og_search_description(webpage) - - thumbnail = self._search_regex( - r'data-image\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'thumbnail', default=None, group='url') or self._html_search_meta( - 'thumbnail', webpage, default=None) or self._og_search_thumbnail( - webpage) - - duration = int_or_none(self._search_regex( - r'data-duration\s*=\s*["\'](\d+)', webpage, 'duration', - fatal=False)) - - season = self._search_regex( - (r'data-series-title\s*=\s*(["\'])[^/]+/(?P<value>(?:(?!\1).)+)\1', - r'\bseason\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, - 'season', default=None, group='value') - season_number = int_or_none(self._search_regex( - r'(\d+)(?:[.\s]+sezona|\s+HOOAEG)', season or '', 'season number', - default=None)) - episode = self._search_regex( - (r'\bepisode\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', - r'data-subtitle\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, - 'episode', default=None, group='value') - episode_number = int_or_none(self._search_regex( - r'(?:S[eē]rija|Osa)\s+(\d+)', episode or '', 'episode number', - default=None)) + metadata = asset.get('metadata') or {} return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'season': season, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, + 'description': asset_title.get('summaryLong') or asset_title.get('summaryShort'), + 'thumbnails': thumbnails, + 'duration': parse_duration(asset_title.get('runTime')), + 'series': asset.get('tvSeriesTitle'), + 'season': asset.get('tvSeasonTitle'), + 'season_number': int_or_none(metadata.get('seasonNumber')), + 'episode': asset_title.get('titleBrief'), + 'episode_number': int_or_none(metadata.get('episodeNumber')), 'formats': formats, } From 8b9bc4eeeeaa7cf167d4a3ed8c09f4d6fc77c8ed Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:51:54 +0100 Subject: [PATCH 108/384] [generic] comment out a test now covered by AmericasTestKitchenIE --- haruhi_dl/extractor/generic.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index 037fc4d7a..7cfc7464e 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -2126,23 +2126,23 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, - { - # Zype embed - 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', - 'info_dict': { - 'id': '5b400b834b32992a310622b9', - 'ext': 'mp4', - 'title': 'Smoky Barbecue Favorites', - 'thumbnail': r're:^https?://.*\.jpe?g', - 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', - 'upload_date': '20170909', - 'timestamp': 1504915200, - }, - 'add_ie': [ZypeIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, + # { + # # Zype embed + # 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', + # 'info_dict': { + # 'id': '5b400b834b32992a310622b9', + # 'ext': 'mp4', + # 'title': 'Smoky Barbecue Favorites', + # 'thumbnail': r're:^https?://.*\.jpe?g', + # 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', + # 'upload_date': '20170909', + # 'timestamp': 1504915200, + # }, + # 'add_ie': [ZypeIE.ie_key()], + # 'params': { + # 
'skip_download': True, + # }, + # }, { # videojs embed 'url': 'https://video.sibnet.ru/shell.php?videoid=3422904', From e754d9d1a52762170bb52bbe6b45a502902fb947 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:52:02 +0100 Subject: [PATCH 109/384] [telequebec] Fix Extraction and Add Support for video.telequebec.tv closes #25733 closes #26883 closes #27339 --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/telequebec.py | 160 ++++++++++++++++-------------- 2 files changed, 88 insertions(+), 73 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 7a0706532..bbbbadd8a 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1199,6 +1199,7 @@ from .telequebec import ( TeleQuebecSquatIE, TeleQuebecEmissionIE, TeleQuebecLiveIE, + TeleQuebecVideoIE, ) from .teletask import TeleTaskIE from .telewebion import TelewebionIE diff --git a/haruhi_dl/extractor/telequebec.py b/haruhi_dl/extractor/telequebec.py index b4c485b9b..800d87b70 100644 --- a/haruhi_dl/extractor/telequebec.py +++ b/haruhi_dl/extractor/telequebec.py @@ -12,25 +12,16 @@ from ..utils import ( class TeleQuebecBaseIE(InfoExtractor): + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + @staticmethod - def _result(url, ie_key): + def _brightcove_result(brightcove_id, player_id, account_id='6150020952001'): return { '_type': 'url_transparent', - 'url': smuggle_url(url, {'geo_countries': ['CA']}), - 'ie_key': ie_key, + 'url': smuggle_url(TeleQuebecBaseIE.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, brightcove_id), {'geo_countries': ['CA']}), + 'ie_key': 'BrightcoveNew', } - @staticmethod - def _limelight_result(media_id): - return TeleQuebecBaseIE._result( - 'limelight:media:' + media_id, 'LimelightMedia') - - @staticmethod - def _brightcove_result(brightcove_id): - return TeleQuebecBaseIE._result( - 'http://players.brightcove.net/6150020952001/default_default/index.html?videoId=%s' - % brightcove_id, 'BrightcoveNew') - class TeleQuebecIE(TeleQuebecBaseIE): _VALID_URL = r'''(?x) @@ -44,14 +35,18 @@ class TeleQuebecIE(TeleQuebecBaseIE): # available till 01.01.2023 'url': 'http://zonevideo.telequebec.tv/media/37578/un-petit-choc-et-puis-repart/un-chef-a-la-cabane', 'info_dict': { - 'id': '577116881b4b439084e6b1cf4ef8b1b3', + 'id': '6155972771001', 'ext': 'mp4', 'title': 'Un petit choc et puis repart!', - 'description': 'md5:067bc84bd6afecad85e69d1000730907', + 'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374', + 'timestamp': 1589262469, + 'uploader_id': '6150020952001', + 'upload_date': '20200512', }, 'params': { - 'skip_download': True, + 'format': 'bestvideo', }, + 'add_ie': ['BrightcoveNew'], }, { 'url': 'https://zonevideo.telequebec.tv/media/55267/le-soleil/passe-partout', 'info_dict': { @@ -65,7 +60,6 @@ class TeleQuebecIE(TeleQuebecBaseIE): }, 'params': { 'format': 'bestvideo', - 'skip_download': True, }, 'add_ie': ['BrightcoveNew'], }, { @@ -79,25 +73,20 @@ class TeleQuebecIE(TeleQuebecBaseIE): def _real_extract(self, url): media_id = self._match_id(url) - - media_data = self._download_json( - 'https://mnmedias.api.telequebec.tv/api/v2/media/' + media_id, + media = self._download_json( + 'https://mnmedias.api.telequebec.tv/api/v3/media/' + media_id, media_id)['media'] - - source_id = media_data['streamInfo']['sourceId'] - source = (try_get( - media_data, lambda x: x['streamInfo']['source'], - compat_str) or 'limelight').lower() - if source == 
'brightcove': - info = self._brightcove_result(source_id) - else: - info = self._limelight_result(source_id) + source_id = next(source_info['sourceId'] for source_info in media['streamInfos'] if source_info.get('source') == 'Brightcove') + info = self._brightcove_result(source_id, '22gPKdt7f') + product = media.get('product') or {} + season = product.get('season') or {} info.update({ - 'title': media_data.get('title'), - 'description': try_get( - media_data, lambda x: x['descriptions'][0]['text'], compat_str), - 'duration': int_or_none( - media_data.get('durationInMilliseconds'), 1000), + 'description': try_get(media, lambda x: x['descriptions'][-1]['text'], compat_str), + 'series': try_get(season, lambda x: x['serie']['titre']), + 'season': season.get('name'), + 'season_number': int_or_none(season.get('seasonNo')), + 'episode': product.get('titre'), + 'episode_number': int_or_none(product.get('episodeNo')), }) return info @@ -148,7 +137,7 @@ class TeleQuebecSquatIE(InfoExtractor): } -class TeleQuebecEmissionIE(TeleQuebecBaseIE): +class TeleQuebecEmissionIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: @@ -160,15 +149,16 @@ class TeleQuebecEmissionIE(TeleQuebecBaseIE): _TESTS = [{ 'url': 'http://lindicemcsween.telequebec.tv/emissions/100430013/des-soins-esthetiques-a-377-d-interets-annuels-ca-vous-tente', 'info_dict': { - 'id': '66648a6aef914fe3badda25e81a4d50a', + 'id': '6154476028001', 'ext': 'mp4', - 'title': "Des soins esthétiques à 377 % d'intérêts annuels, ça vous tente?", - 'description': 'md5:369e0d55d0083f1fc9b71ffb640ea014', - 'upload_date': '20171024', - 'timestamp': 1508862118, + 'title': 'Des soins esthétiques à 377 % d’intérêts annuels, ça vous tente?', + 'description': 'md5:cb4d378e073fae6cce1f87c00f84ae9f', + 'upload_date': '20200505', + 'timestamp': 1588713424, + 'uploader_id': '6150020952001', }, 'params': { - 'skip_download': True, + 'format': 'bestvideo', }, }, { 'url': 'http://bancpublic.telequebec.tv/emissions/emission-49/31986/jeunes-meres-sous-pression', @@ -187,26 +177,26 @@ class TeleQuebecEmissionIE(TeleQuebecBaseIE): webpage = self._download_webpage(url, display_id) media_id = self._search_regex( - r'mediaUID\s*:\s*["\'][Ll]imelight_(?P<id>[a-z0-9]{32})', webpage, - 'limelight id') + r'mediaId\s*:\s*(?P<id>\d+)', webpage, 'media id') - info = self._limelight_result(media_id) - info.update({ - 'title': self._og_search_title(webpage, default=None), - 'description': self._og_search_description(webpage, default=None), - }) - return info + return self.url_result( + 'http://zonevideo.telequebec.tv/media/' + media_id, + TeleQuebecIE.ie_key()) -class TeleQuebecLiveIE(InfoExtractor): +class TeleQuebecLiveIE(TeleQuebecBaseIE): _VALID_URL = r'https?://zonevideo\.telequebec\.tv/(?P<id>endirect)' _TEST = { 'url': 'http://zonevideo.telequebec.tv/endirect/', 'info_dict': { - 'id': 'endirect', + 'id': '6159095684001', 'ext': 'mp4', - 'title': 're:^Télé-Québec - En direct [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^Télé-Québec [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, + 'description': 'Canal principal de Télé-Québec', + 'uploader_id': '6150020952001', + 'timestamp': 1590439901, + 'upload_date': '20200525', }, 'params': { 'skip_download': True, @@ -214,25 +204,49 @@ class TeleQuebecLiveIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + return self._brightcove_result('6159095684001', 'skCsmi2Uw') - m3u8_url = None - webpage = self._download_webpage( - 
'https://player.telequebec.tv/Tq_VideoPlayer.js', video_id, - fatal=False) - if webpage: - m3u8_url = self._search_regex( - r'm3U8Url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'm3u8 url', default=None, group='url') - if not m3u8_url: - m3u8_url = 'https://teleqmmd.mmdlive.lldns.net/teleqmmd/f386e3b206814e1f8c8c1c71c0f8e748/manifest.m3u8' - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', m3u8_id='hls') - self._sort_formats(formats) - return { - 'id': video_id, - 'title': self._live_title('Télé-Québec - En direct'), - 'is_live': True, - 'formats': formats, - } +class TeleQuebecVideoIE(TeleQuebecBaseIE): + _VALID_URL = r'https?://video\.telequebec\.tv/player(?:-live)?/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://video.telequebec.tv/player/31110/stream', + 'info_dict': { + 'id': '6202570652001', + 'ext': 'mp4', + 'title': 'Le coût du véhicule le plus vendu au Canada / Tous les frais liés à la procréation assistée', + 'description': 'md5:685a7e4c450ba777c60adb6e71e41526', + 'upload_date': '20201019', + 'timestamp': 1603115930, + 'uploader_id': '6101674910001', + }, + 'params': { + 'format': 'bestvideo', + }, + }, { + 'url': 'https://video.telequebec.tv/player-live/28527', + 'only_matching': True, + }] + + def _call_api(self, path, video_id): + return self._download_json( + 'http://beacon.playback.api.brightcove.com/telequebec/api/assets/' + path, + video_id, query={'device_layout': 'web', 'device_type': 'web'})['data'] + + def _real_extract(self, url): + asset_id = self._match_id(url) + asset = self._call_api(asset_id, asset_id)['asset'] + stream = self._call_api( + asset_id + '/streams/' + asset['streams'][0]['id'], asset_id)['stream'] + stream_url = stream['url'] + account_id = try_get( + stream, lambda x: x['video_provider_details']['account_id']) or '6101674910001' + info = self._brightcove_result(stream_url, 'default', account_id) + info.update({ + 'description': asset.get('long_description') or asset.get('short_description'), + 'series': asset.get('series_original_name'), + 'season_number': int_or_none(asset.get('season_number')), + 'episode': asset.get('original_name'), + 'episode_number': int_or_none(asset.get('episode_number')), + }) + return info From 228d41686d835e4c660131e936b2df862af0cf49 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:52:17 +0100 Subject: [PATCH 110/384] [amcnetworks] Fix free content extraction (closes #20354) --- haruhi_dl/extractor/amcnetworks.py | 50 +++++++++++++++--------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/haruhi_dl/extractor/amcnetworks.py b/haruhi_dl/extractor/amcnetworks.py index 6fb3d6c53..12b6de0bf 100644 --- a/haruhi_dl/extractor/amcnetworks.py +++ b/haruhi_dl/extractor/amcnetworks.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .theplatform import ThePlatformIE from ..utils import ( int_or_none, @@ -11,25 +13,22 @@ from ..utils import ( class AMCNetworksIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<site>amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P<id>(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1', - 'md5': '', + 'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631', 'info_dict': { - 'id': 's3MX01Nl4vPH', + 'id': 
'4Lq1dzOnZGt0', 'ext': 'mp4', - 'title': 'Maron - Season 4 - Step 1', - 'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. Starring Marc Maron and Constance Zimmer.', - 'age_limit': 17, - 'upload_date': '20160505', - 'timestamp': 1462468831, + 'title': "The Graham Norton Show - Season 28 - Tina Fey's Adorable Airline-Themed Family Dinner", + 'description': "It turns out child stewardesses are very generous with the wine! All-new episodes of 'The Graham Norton Show' premiere Fridays at 11/10c on BBC America.", + 'upload_date': '20201120', + 'timestamp': 1605904350, 'uploader': 'AMCN', }, 'params': { # m3u8 download 'skip_download': True, }, - 'skip': 'Requires TV provider accounts', }, { 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', 'only_matching': True, @@ -55,32 +54,33 @@ class AMCNetworksIE(ThePlatformIE): 'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1', 'only_matching': True, }] + _REQUESTOR_ID_MAP = { + 'amc': 'AMC', + 'bbcamerica': 'BBCA', + 'ifc': 'IFC', + 'sundancetv': 'SUNDANCE', + 'wetv': 'WETV', + } def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + site, display_id = re.match(self._VALID_URL, url).groups() + requestor_id = self._REQUESTOR_ID_MAP[site] + properties = self._download_json( + 'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/%s/url/%s' % (requestor_id.lower(), display_id), + display_id)['data']['properties'] query = { 'mbr': 'true', 'manifest': 'm3u', } - media_url = self._search_regex( - r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)', - webpage, 'media url') - theplatform_metadata = self._download_theplatform_metadata(self._search_regex( - r'link\.theplatform\.com/s/([^?]+)', - media_url, 'theplatform_path'), display_id) + tp_path = 'M_UwQC/media/' + properties['videoPid'] + media_url = 'https://link.theplatform.com/s/' + tp_path + theplatform_metadata = self._download_theplatform_metadata(tp_path, display_id) info = self._parse_theplatform_metadata(theplatform_metadata) video_id = theplatform_metadata['pid'] title = theplatform_metadata['title'] rating = try_get( theplatform_metadata, lambda x: x['ratings'][0]['rating']) - auth_required = self._search_regex( - r'window\.authRequired\s*=\s*(true|false);', - webpage, 'auth required') - if auth_required == 'true': - requestor_id = self._search_regex( - r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)', - webpage, 'requestor id') + if properties.get('videoCategory') == 'TVE-Auth': resource = self._get_mvpd_resource( requestor_id, title, video_id, rating) query['auth'] = self._extract_mvpd_auth( From eefe89651dccc952d48413da555a658da325ba16 Mon Sep 17 00:00:00 2001 From: EntranceJew <EntranceJew@gmail.com> Date: Fri, 26 Feb 2021 14:53:07 +0100 Subject: [PATCH 111/384] [tubitv] Extract release year (#27317) --- haruhi_dl/extractor/tubitv.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/haruhi_dl/extractor/tubitv.py b/haruhi_dl/extractor/tubitv.py index a51fa6515..ebfb05c63 100644 --- a/haruhi_dl/extractor/tubitv.py +++ b/haruhi_dl/extractor/tubitv.py @@ -33,6 +33,19 @@ class TubiTvIE(InfoExtractor): }, { 'url': 'http://tubitv.com/movies/383676/tracker', 'only_matching': True, + }, { + 'url': 'https://tubitv.com/movies/560057/penitentiary?start=true', + 'info_dict': { + 'id': '560057', + 'ext': 'mp4', + 'title': 'Penitentiary', + 'description': 
'md5:8d2fc793a93cc1575ff426fdcb8dd3f9', + 'uploader_id': 'd8fed30d4f24fcb22ec294421b9defc2', + 'release_year': 1979, + }, + 'params': { + 'skip_download': True, + }, }] def _login(self): @@ -93,4 +106,5 @@ class TubiTvIE(InfoExtractor): 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), 'uploader_id': video_data.get('publisher_id'), + 'release_year': int_or_none(video_data.get('year')), } From 325ff4c628d0fb57550682554641d461d287d94e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 14:53:18 +0100 Subject: [PATCH 112/384] [beampro] Remove Extractor closes #17290 closes #22871 closes #23020 closes #23061 closes #26099 --- haruhi_dl/extractor/beampro.py | 194 ------------------------------ haruhi_dl/extractor/extractors.py | 4 - 2 files changed, 198 deletions(-) delete mode 100644 haruhi_dl/extractor/beampro.py diff --git a/haruhi_dl/extractor/beampro.py b/haruhi_dl/extractor/beampro.py deleted file mode 100644 index 86abdae00..000000000 --- a/haruhi_dl/extractor/beampro.py +++ /dev/null @@ -1,194 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - clean_html, - compat_str, - float_or_none, - int_or_none, - parse_iso8601, - try_get, - urljoin, -) - - -class BeamProBaseIE(InfoExtractor): - _API_BASE = 'https://mixer.com/api/v1' - _RATINGS = {'family': 0, 'teen': 13, '18+': 18} - - def _extract_channel_info(self, chan): - user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id']) - return { - 'uploader': chan.get('token') or try_get( - chan, lambda x: x['user']['username'], compat_str), - 'uploader_id': compat_str(user_id) if user_id else None, - 'age_limit': self._RATINGS.get(chan.get('audience')), - } - - -class BeamProLiveIE(BeamProBaseIE): - IE_NAME = 'Mixer:live' - _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'http://mixer.com/niterhayven', - 'info_dict': { - 'id': '261562', - 'ext': 'mp4', - 'title': 'Introducing The Witcher 3 // The Grind Starts Now!', - 'description': 'md5:0b161ac080f15fe05d18a07adb44a74d', - 'thumbnail': r're:https://.*\.jpg$', - 'timestamp': 1483477281, - 'upload_date': '20170103', - 'uploader': 'niterhayven', - 'uploader_id': '373396', - 'age_limit': 18, - 'is_live': True, - 'view_count': int, - }, - 'skip': 'niterhayven is offline', - 'params': { - 'skip_download': True, - }, - } - - _MANIFEST_URL_TEMPLATE = '%s/channels/%%s/manifest.%%s' % BeamProBaseIE._API_BASE - - @classmethod - def suitable(cls, url): - return False if BeamProVodIE.suitable(url) else super(BeamProLiveIE, cls).suitable(url) - - def _real_extract(self, url): - channel_name = self._match_id(url) - - chan = self._download_json( - '%s/channels/%s' % (self._API_BASE, channel_name), channel_name) - - if chan.get('online') is False: - raise ExtractorError( - '{0} is offline'.format(channel_name), expected=True) - - channel_id = chan['id'] - - def manifest_url(kind): - return self._MANIFEST_URL_TEMPLATE % (channel_id, kind) - - formats = self._extract_m3u8_formats( - manifest_url('m3u8'), channel_name, ext='mp4', m3u8_id='hls', - fatal=False) - formats.extend(self._extract_smil_formats( - manifest_url('smil'), channel_name, fatal=False)) - self._sort_formats(formats) - - info = { - 'id': compat_str(chan.get('id') or channel_name), - 'title': self._live_title(chan.get('name') or channel_name), - 'description': clean_html(chan.get('description')), - 
'thumbnail': try_get( - chan, lambda x: x['thumbnail']['url'], compat_str), - 'timestamp': parse_iso8601(chan.get('updatedAt')), - 'is_live': True, - 'view_count': int_or_none(chan.get('viewersTotal')), - 'formats': formats, - } - info.update(self._extract_channel_info(chan)) - - return info - - -class BeamProVodIE(BeamProBaseIE): - IE_NAME = 'Mixer:vod' - _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/[^/?#&]+\?.*?\bvod=(?P<id>[^?#&]+)' - _TESTS = [{ - 'url': 'https://mixer.com/willow8714?vod=2259830', - 'md5': 'b2431e6e8347dc92ebafb565d368b76b', - 'info_dict': { - 'id': '2259830', - 'ext': 'mp4', - 'title': 'willow8714\'s Channel', - 'duration': 6828.15, - 'thumbnail': r're:https://.*source\.png$', - 'timestamp': 1494046474, - 'upload_date': '20170506', - 'uploader': 'willow8714', - 'uploader_id': '6085379', - 'age_limit': 13, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://mixer.com/streamer?vod=IxFno1rqC0S_XJ1a2yGgNw', - 'only_matching': True, - }, { - 'url': 'https://mixer.com/streamer?vod=Rh3LY0VAqkGpEQUe2pN-ig', - 'only_matching': True, - }] - - @staticmethod - def _extract_format(vod, vod_type): - if not vod.get('baseUrl'): - return [] - - if vod_type == 'hls': - filename, protocol = 'manifest.m3u8', 'm3u8_native' - elif vod_type == 'raw': - filename, protocol = 'source.mp4', 'https' - else: - assert False - - data = vod.get('data') if isinstance(vod.get('data'), dict) else {} - - format_id = [vod_type] - if isinstance(data.get('Height'), compat_str): - format_id.append('%sp' % data['Height']) - - return [{ - 'url': urljoin(vod['baseUrl'], filename), - 'format_id': '-'.join(format_id), - 'ext': 'mp4', - 'protocol': protocol, - 'width': int_or_none(data.get('Width')), - 'height': int_or_none(data.get('Height')), - 'fps': int_or_none(data.get('Fps')), - 'tbr': int_or_none(data.get('Bitrate'), 1000), - }] - - def _real_extract(self, url): - vod_id = self._match_id(url) - - vod_info = self._download_json( - '%s/recordings/%s' % (self._API_BASE, vod_id), vod_id) - - state = vod_info.get('state') - if state != 'AVAILABLE': - raise ExtractorError( - 'VOD %s is not available (state: %s)' % (vod_id, state), - expected=True) - - formats = [] - thumbnail_url = None - - for vod in vod_info['vods']: - vod_type = vod.get('format') - if vod_type in ('hls', 'raw'): - formats.extend(self._extract_format(vod, vod_type)) - elif vod_type == 'thumbnail': - thumbnail_url = urljoin(vod.get('baseUrl'), 'source.png') - - self._sort_formats(formats) - - info = { - 'id': vod_id, - 'title': vod_info.get('name') or vod_id, - 'duration': float_or_none(vod_info.get('duration')), - 'thumbnail': thumbnail_url, - 'timestamp': parse_iso8601(vod_info.get('createdAt')), - 'view_count': int_or_none(vod_info.get('viewsTotal')), - 'formats': formats, - } - info.update(self._extract_channel_info(vod_info.get('channel') or {})) - - return info diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index bbbbadd8a..cb9c69e8b 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -101,10 +101,6 @@ from .bbc import ( BBCCoUkPlaylistIE, BBCIE, ) -from .beampro import ( - BeamProLiveIE, - BeamProVodIE, -) from .beeg import BeegIE from .behindkink import BehindKinkIE from .bellmedia import BellMediaIE From ec7e1e27c27bd6d71d9743e0c51509ed07702a19 Mon Sep 17 00:00:00 2001 From: Andrey Smirnoff <37037851+mashed-potatoes@users.noreply.github.com> Date: Fri, 26 Feb 2021 14:53:29 +0100 Subject: [PATCH 113/384] 
[smotri] Remove extractor (#27358)

---
 haruhi_dl/extractor/extractors.py |   6 -
 haruhi_dl/extractor/generic.py    |   6 -
 haruhi_dl/extractor/smotri.py     | 416 ------------------------------
 haruhi_dl/options.py              |   2 +-
 4 files changed, 1 insertion(+), 429 deletions(-)
 delete mode 100644 haruhi_dl/extractor/smotri.py

diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index cb9c69e8b..297a5e02b 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -1085,12 +1085,6 @@ from .sky import (
 from .slideshare import SlideshareIE
 from .slideslive import SlidesLiveIE
 from .slutload import SlutloadIE
-from .smotri import (
-    SmotriIE,
-    SmotriCommunityIE,
-    SmotriUserIE,
-    SmotriBroadcastIE,
-)
 from .snotr import SnotrIE
 from .sohu import SohuIE
 from .sonyliv import SonyLIVIE
diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py
index 7cfc7464e..a321bcd6d 100644
--- a/haruhi_dl/extractor/generic.py
+++ b/haruhi_dl/extractor/generic.py
@@ -52,7 +52,6 @@ from .ooyala import OoyalaIE
 from .rutv import RUTVIE
 from .tvc import TVCIE
 from .sportbox import SportBoxIE
-from .smotri import SmotriIE
 from .myvi import MyviIE
 from .condenast import CondeNastIE
 from .udn import UDNEmbedIE
@@ -2804,11 +2803,6 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group('url'))
 
-        # Look for embedded smotri.com player
-        smotri_url = SmotriIE._extract_url(webpage)
-        if smotri_url:
-            return self.url_result(smotri_url, 'Smotri')
-
         # Look for embedded Myvi.ru player
         myvi_url = MyviIE._extract_url(webpage)
         if myvi_url:
diff --git a/haruhi_dl/extractor/smotri.py b/haruhi_dl/extractor/smotri.py
deleted file mode 100644
index 45995f30f..000000000
--- a/haruhi_dl/extractor/smotri.py
+++ /dev/null
@@ -1,416 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-import json
-import hashlib
-import uuid
-
-from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-    int_or_none,
-    sanitized_Request,
-    unified_strdate,
-    urlencode_postdata,
-    xpath_text,
-)
-
-
-class SmotriIE(InfoExtractor):
-    IE_DESC = 'Smotri.com'
-    IE_NAME = 'smotri'
-    _VALID_URL = r'https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})'
-    _NETRC_MACHINE = 'smotri'
-
-    _TESTS = [
-        # real video id 2610366
-        {
-            'url': 'http://smotri.com/video/view/?id=v261036632ab',
-            'md5': '02c0dfab2102984e9c5bb585cc7cc321',
-            'info_dict': {
-                'id': 'v261036632ab',
-                'ext': 'mp4',
-                'title': 'катастрофа с камер видеонаблюдения',
-                'uploader': 'rbc2008',
-                'uploader_id': 'rbc08',
-                'upload_date': '20131118',
-                'thumbnail': 'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg',
-            },
-        },
-        # real video id 57591
-        {
-            'url': 'http://smotri.com/video/view/?id=v57591cb20',
-            'md5': '830266dfc21f077eac5afd1883091bcd',
-            'info_dict': {
-                'id': 'v57591cb20',
-                'ext': 'flv',
-                'title': 'test',
-                'uploader': 'Support Photofile@photofile',
-                'uploader_id': 'support-photofile',
-                'upload_date': '20070704',
-                'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg',
-            },
-        },
-        # video-password, not approved by moderator
-        {
-            'url': 'http://smotri.com/video/view/?id=v1390466a13c',
-            'md5': 'f6331cef33cad65a0815ee482a54440b',
-            'info_dict': {
-                'id': 'v1390466a13c',
-                'ext': 'mp4',
-                'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
-                'uploader': 'timoxa40',
-                'uploader_id': 'timoxa40',
-                'upload_date': '20100404',
-                'thumbnail': 'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg',
-            },
-            'params': {
-                'videopassword': 'qwerty',
-            },
-            'skip': 'Video is not approved by moderator',
-        },
-        # video-password
-        {
-            'url': 'http://smotri.com/video/view/?id=v6984858774#',
-            'md5': 'f11e01d13ac676370fc3b95b9bda11b0',
-            'info_dict': {
-                'id': 'v6984858774',
-                'ext': 'mp4',
-                'title': 'Дача Солженицина ПАРОЛЬ 223322',
-                'uploader': 'psavari1',
-                'uploader_id': 'psavari1',
-                'upload_date': '20081103',
-                'thumbnail': r're:^https?://.*\.jpg$',
-            },
-            'params': {
-                'videopassword': '223322',
-            },
-        },
-        # age limit + video-password, not approved by moderator
-        {
-            'url': 'http://smotri.com/video/view/?id=v15408898bcf',
-            'md5': '91e909c9f0521adf5ee86fbe073aad70',
-            'info_dict': {
-                'id': 'v15408898bcf',
-                'ext': 'flv',
-                'title': 'этот ролик не покажут по ТВ',
-                'uploader': 'zzxxx',
-                'uploader_id': 'ueggb',
-                'upload_date': '20101001',
-                'thumbnail': 'http://frame3.loadup.ru/75/75/1540889.1.3.jpg',
-                'age_limit': 18,
-            },
-            'params': {
-                'videopassword': '333'
-            },
-            'skip': 'Video is not approved by moderator',
-        },
-        # age limit + video-password
-        {
-            'url': 'http://smotri.com/video/view/?id=v7780025814',
-            'md5': 'b4599b068422559374a59300c5337d72',
-            'info_dict': {
-                'id': 'v7780025814',
-                'ext': 'mp4',
-                'title': 'Sexy Beach (пароль 123)',
-                'uploader': 'вАся',
-                'uploader_id': 'asya_prosto',
-                'upload_date': '20081218',
-                'thumbnail': r're:^https?://.*\.jpg$',
-                'age_limit': 18,
-            },
-            'params': {
-                'videopassword': '123'
-            },
-        },
-        # swf player
-        {
-            'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500',
-            'md5': '31099eeb4bc906712c5f40092045108d',
-            'info_dict': {
-                'id': 'v9188090500',
-                'ext': 'mp4',
-                'title': 'Shakira - Don\'t Bother',
-                'uploader': 'HannahL',
-                'uploader_id': 'lisaha95',
-                'upload_date': '20090331',
-                'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg',
-            },
-        },
-    ]
-
-    @classmethod
-    def _extract_url(cls, webpage):
-        mobj = re.search(
-            r'<embed[^>]src=(["\'])(?P<url>http://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)',
-            webpage)
-        if mobj is not None:
-            return mobj.group('url')
-
-        mobj = re.search(
-            r'''(?x)<div\s+class="video_file">http://smotri\.com/video/download/file/[^<]+</div>\s*
-                    <div\s+class="video_image">[^<]+</div>\s*
-                    <div\s+class="video_id">(?P<id>[^<]+)</div>''', webpage)
-        if mobj is not None:
-            return 'http://smotri.com/video/view/?id=%s' % mobj.group('id')
-
-    def _search_meta(self, name, html, display_name=None):
-        if display_name is None:
-            display_name = name
-        return self._html_search_meta(name, html, display_name)
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        video_form = {
-            'ticket': video_id,
-            'video_url': '1',
-            'frame_url': '1',
-            'devid': 'LoadupFlashPlayer',
-            'getvideoinfo': '1',
-        }
-
-        video_password = self._downloader.params.get('videopassword')
-        if video_password:
-            video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest()
-
-        video = self._download_json(
-            'http://smotri.com/video/view/url/bot/',
-            video_id, 'Downloading video JSON',
-            data=urlencode_postdata(video_form),
-            headers={'Content-Type': 'application/x-www-form-urlencoded'})
-
-        video_url = video.get('_vidURL') or video.get('_vidURL_mp4')
-
-        if not video_url:
-            if video.get('_moderate_no'):
-                raise ExtractorError(
-                    'Video %s has not been approved by moderator' % video_id, expected=True)
-
-            if video.get('error'):
-                raise ExtractorError('Video %s does not exist' % video_id, expected=True)
-
-            if video.get('_pass_protected') == 1:
-                msg = ('Invalid video password' if video_password
-                       else 'This video is protected by a password, use the --video-password option')
-                raise ExtractorError(msg, expected=True)
-
-        title = video['title']
-        thumbnail = video.get('_imgURL')
-        upload_date = unified_strdate(video.get('added'))
-        uploader = video.get('userNick')
-        uploader_id = video.get('userLogin')
-        duration = int_or_none(video.get('duration'))
-
-        # Video JSON does not provide enough meta data
-        # We will extract some from the video web page instead
-        webpage_url = 'http://smotri.com/video/view/?id=%s' % video_id
-        webpage = self._download_webpage(webpage_url, video_id, 'Downloading video page')
-
-        # Warning if video is unavailable
-        warning = self._html_search_regex(
-            r'<div[^>]+class="videoUnModer"[^>]*>(.+?)</div>', webpage,
-            'warning message', default=None)
-        if warning is not None:
-            self._downloader.report_warning(
-                'Video %s may not be available; smotri said: %s ' %
-                (video_id, warning))
-
-        # Adult content
-        if 'EroConfirmText">' in webpage:
-            self.report_age_confirmation()
-            confirm_string = self._html_search_regex(
-                r'<a[^>]+href="/video/view/\?id=%s&confirm=([^"]+)"' % video_id,
-                webpage, 'confirm string')
-            confirm_url = webpage_url + '&confirm=%s' % confirm_string
-            webpage = self._download_webpage(
-                confirm_url, video_id,
-                'Downloading video page (age confirmed)')
-            adult_content = True
-        else:
-            adult_content = False
-
-        view_count = self._html_search_regex(
-            r'(?s)Общее количество просмотров.*?<span class="Number">(\d+)</span>',
-            webpage, 'view count', fatal=False)
-
-        return {
-            'id': video_id,
-            'url': video_url,
-            'title': title,
-            'thumbnail': thumbnail,
-            'uploader': uploader,
-            'upload_date': upload_date,
-            'uploader_id': uploader_id,
-            'duration': duration,
-            'view_count': int_or_none(view_count),
-            'age_limit': 18 if adult_content else 0,
-        }
-
-
-class SmotriCommunityIE(InfoExtractor):
-    IE_DESC = 'Smotri.com community videos'
-    IE_NAME = 'smotri:community'
-    _VALID_URL = r'https?://(?:www\.)?smotri\.com/community/video/(?P<id>[0-9A-Za-z_\'-]+)'
-    _TEST = {
-        'url': 'http://smotri.com/community/video/kommuna',
-        'info_dict': {
-            'id': 'kommuna',
-        },
-        'playlist_mincount': 4,
-    }
-
-    def _real_extract(self, url):
-        community_id = self._match_id(url)
-
-        rss = self._download_xml(
-            'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id,
-            community_id, 'Downloading community RSS')
-
-        entries = [
-            self.url_result(video_url.text, SmotriIE.ie_key())
-            for video_url in rss.findall('./channel/item/link')]
-
-        return self.playlist_result(entries, community_id)
-
-
-class SmotriUserIE(InfoExtractor):
-    IE_DESC = 'Smotri.com user videos'
-    IE_NAME = 'smotri:user'
-    _VALID_URL = r'https?://(?:www\.)?smotri\.com/user/(?P<id>[0-9A-Za-z_\'-]+)'
-    _TESTS = [{
-        'url': 'http://smotri.com/user/inspector',
-        'info_dict': {
-            'id': 'inspector',
-            'title': 'Inspector',
-        },
-        'playlist_mincount': 9,
-    }]
-
-    def _real_extract(self, url):
-        user_id = self._match_id(url)
-
-        rss = self._download_xml(
-            'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id,
-            user_id, 'Downloading user RSS')
-
-        entries = [self.url_result(video_url.text, 'Smotri')
-                   for video_url in rss.findall('./channel/item/link')]
-
-        description_text = xpath_text(rss, './channel/description') or ''
-        user_nickname = self._search_regex(
-            '^Видео режиссера (.+)$', description_text,
-            'user nickname', fatal=False)
-
-        return self.playlist_result(entries, user_id, user_nickname)
-
-
-class SmotriBroadcastIE(InfoExtractor):
-    IE_DESC = 'Smotri.com broadcasts'
-    IE_NAME = 'smotri:broadcast'
-    _VALID_URL = r'https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<id>[^/]+))/?.*'
-    _NETRC_MACHINE = 'smotri'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        broadcast_id = mobj.group('id')
-
-        broadcast_url = 'http://' + mobj.group('url')
-        broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page')
-
-        if re.search('>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None:
-            raise ExtractorError(
-                'Broadcast %s does not exist' % broadcast_id, expected=True)
-
-        # Adult content
-        if re.search('EroConfirmText">', broadcast_page) is not None:
-
-            (username, password) = self._get_login_info()
-            if username is None:
-                self.raise_login_required(
-                    'Erotic broadcasts allowed only for registered users')
-
-            login_form = {
-                'login-hint53': '1',
-                'confirm_erotic': '1',
-                'login': username,
-                'password': password,
-            }
-
-            request = sanitized_Request(
-                broadcast_url + '/?no_redirect=1', urlencode_postdata(login_form))
-            request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-            broadcast_page = self._download_webpage(
-                request, broadcast_id, 'Logging in and confirming age')
-
-            if '>Неверный логин или пароль<' in broadcast_page:
-                raise ExtractorError(
-                    'Unable to log in: bad username or password', expected=True)
-
-            adult_content = True
-        else:
-            adult_content = False
-
-        ticket = self._html_search_regex(
-            (r'data-user-file=(["\'])(?P<ticket>(?!\1).+)\1',
-             r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'(?P<ticket>[^']+)'\)"),
-            broadcast_page, 'broadcast ticket', group='ticket')
-
-        broadcast_url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket
-
-        broadcast_password = self._downloader.params.get('videopassword')
-        if broadcast_password:
-            broadcast_url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest()
-
-        broadcast_json_page = self._download_webpage(
-            broadcast_url, broadcast_id, 'Downloading broadcast JSON')
-
-        try:
-            broadcast_json = json.loads(broadcast_json_page)
-
-            protected_broadcast = broadcast_json['_pass_protected'] == 1
-            if protected_broadcast and not broadcast_password:
-                raise ExtractorError(
-                    'This broadcast is protected by a password, use the --video-password option',
-                    expected=True)
-
-            broadcast_offline = broadcast_json['is_play'] == 0
-            if broadcast_offline:
-                raise ExtractorError('Broadcast %s is offline' % broadcast_id, expected=True)
-
-            rtmp_url = broadcast_json['_server']
-            mobj = re.search(r'^rtmp://[^/]+/(?P<app>.+)/?$', rtmp_url)
-            if not mobj:
-                raise ExtractorError('Unexpected broadcast rtmp URL')
-
-            broadcast_playpath = broadcast_json['_streamName']
-            broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL'])
-            broadcast_thumbnail = broadcast_json.get('_imgURL')
-            broadcast_title = self._live_title(broadcast_json['title'])
-            broadcast_description = broadcast_json.get('description')
-            broadcaster_nick = broadcast_json.get('nick')
-            broadcaster_login = broadcast_json.get('login')
-            rtmp_conn = 'S:%s' % uuid.uuid4().hex
-        except KeyError:
-            if protected_broadcast:
-                raise ExtractorError('Bad broadcast password', expected=True)
-            raise ExtractorError('Unexpected broadcast JSON')
-
-        return {
-            'id': broadcast_id,
-            'url': rtmp_url,
-            'title': broadcast_title,
-            'thumbnail': broadcast_thumbnail,
-            'description': broadcast_description,
-            'uploader': broadcaster_nick,
-            'uploader_id': broadcaster_login,
-            'age_limit': 18 if adult_content else 0,
-            'ext': 'flv',
-            'play_path': broadcast_playpath,
-            'player_url': 'http://pics.smotri.com/broadcast_play.swf',
-            'app': broadcast_app,
-            'rtmp_live': True,
-            'rtmp_conn': rtmp_conn,
-            'is_live': True,
-        }
diff --git a/haruhi_dl/options.py b/haruhi_dl/options.py
index acbef1584..76f97f452 100644
--- a/haruhi_dl/options.py
+++ b/haruhi_dl/options.py
@@ -369,7 +369,7 @@ def parseOpts(overrideArguments=None):
     authentication.add_option(
         '--video-password',
         dest='videopassword', metavar='PASSWORD',
-        help='Video password (vimeo, smotri, youku)')
+        help='Video password (vimeo, youku)')
 
     adobe_pass = optparse.OptionGroup(parser, 'Adobe Pass Options')
     adobe_pass.add_option(

From e62320f70a3199917eeb580d43fddd1d594599ba Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:53:36 +0100
Subject: [PATCH 114/384] [facebook] remove hardcoded chrome user-agent

closes #18974
closes #25411
closes #26958
closes #27329
---
 haruhi_dl/extractor/facebook.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py
index 72781bd80..2143d41a7 100644
--- a/haruhi_dl/extractor/facebook.py
+++ b/haruhi_dl/extractor/facebook.py
@@ -54,8 +54,6 @@ class FacebookIE(InfoExtractor):
     _NETRC_MACHINE = 'facebook'
     IE_NAME = 'facebook'
 
-    _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36'
-
     _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
     _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary'
 
@@ -306,9 +304,7 @@ class FacebookIE(InfoExtractor):
             self._login()
 
     def _extract_from_url(self, url, video_id, fatal_if_no_video=True):
-        req = sanitized_Request(url)
-        req.add_header('User-Agent', self._CHROME_USER_AGENT)
-        webpage = self._download_webpage(req, video_id)
+        webpage = self._download_webpage(url, video_id)
 
         video_data = None
 

From feac903afbd615fa3390a8c7b86e5979f750de48 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:53:49 +0100
Subject: [PATCH 115/384] [facebook] try to reduce unnecessary tahoe requests

---
 haruhi_dl/extractor/facebook.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py
index 2143d41a7..6045058c1 100644
--- a/haruhi_dl/extractor/facebook.py
+++ b/haruhi_dl/extractor/facebook.py
@@ -328,11 +328,10 @@ class FacebookIE(InfoExtractor):
                 js_data, lambda x: x['jsmods']['instances'], list) or [])
 
         if not video_data:
-            server_js_data = self._parse_json(
-                self._search_regex(
-                    r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)',
-                    webpage, 'js data', default='{}'),
-                video_id, transform_source=js_to_json, fatal=False)
+            server_js_data = self._parse_json(self._search_regex([
+                r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)',
+                r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"permalink_video_pagelet".*?})\);'
+            ], webpage, 'js data', default='{}'), video_id, js_to_json, False)
            video_data = extract_from_jsmods_instances(server_js_data)
 
         if not video_data:
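The two regexes in the patch above both capture the JSON object that the page passes to bigPipe.onPageletArrive; only when neither yields a VideoConfig does the extractor fall back to an extra tahoe request. A minimal standalone sketch of that fallback chain, assuming a plain-JSON payload (real Facebook pages embed JS-flavoured JSON that would still need js_to_json); the helper name and the toy markup are illustrative, not part of the extractor:

import json
import re

# Patterns mirroring the fallback chain above: the generic
# onPageletArrive payload first, then the permalink_video_pagelet variant.
PAGELET_RES = [
    r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)',
    r'bigPipe\.onPageletArrive\(({.*?"id"\s*:\s*"permalink_video_pagelet".*?})\);',
]


def extract_pagelet_json(webpage):
    # Return the first pagelet payload that parses; a caller would only
    # issue the extra tahoe request when this comes up empty.
    for pattern in PAGELET_RES:
        mobj = re.search(pattern, webpage)
        if mobj:
            try:
                return json.loads(mobj.group(1))
            except ValueError:
                continue
    return None


page = 'bigPipe.onPageletArrive({"id": "permalink_video_pagelet", "jsmods": {"instances": []}});'
assert extract_pagelet_json(page)['id'] == 'permalink_video_pagelet'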
From fa06aa76adc1d20bd39d20b3e378df241daae893 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:53:53 +0100
Subject: [PATCH 116/384] [facebook] Add support for Relay based pages(closes
 #26823)

---
 haruhi_dl/extractor/facebook.py | 71 ++++++++++++++++++++++++++++++---
 1 file changed, 66 insertions(+), 5 deletions(-)

diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py
index 6045058c1..0d3a86b6c 100644
--- a/haruhi_dl/extractor/facebook.py
+++ b/haruhi_dl/extractor/facebook.py
@@ -16,11 +16,13 @@ from ..utils import (
     clean_html,
     error_to_compat_str,
     ExtractorError,
+    float_or_none,
     get_element_by_id,
     int_or_none,
     js_to_json,
     limit_length,
     parse_count,
+    qualities,
     sanitized_Request,
     try_get,
     urlencode_postdata,
@@ -327,6 +329,14 @@ class FacebookIE(InfoExtractor):
             return extract_video_data(try_get(
                 js_data, lambda x: x['jsmods']['instances'], list) or [])
 
+        formats = []
+
+        def extract_dash_manifest(video):
+            dash_manifest = video.get('dash_manifest')
+            if dash_manifest:
+                formats.extend(self._parse_mpd_formats(
+                    compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
+
         if not video_data:
             server_js_data = self._parse_json(self._search_regex([
                 r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)',
@@ -334,6 +344,61 @@ class FacebookIE(InfoExtractor):
             ], webpage, 'js data', default='{}'), video_id, js_to_json, False)
             video_data = extract_from_jsmods_instances(server_js_data)
 
+        if not video_data:
+            graphql_data = self._parse_json(self._search_regex(
+                r'handleWithCustomApplyEach\([^,]+,\s*({.*?"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+".*?})\);',
+                webpage, 'graphql data', default='{}'), video_id, fatal=False) or {}
+            for require in (graphql_data.get('require') or []):
+                if require[0] == 'RelayPrefetchedStreamCache':
+                    def parse_graphql_video(video):
+                        q = qualities(['sd', 'hd'])
+                        for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]:
+                            playable_url = video.get('playable_url' + suffix)
+                            if not playable_url:
+                                continue
+                            formats.append({
+                                'format_id': format_id,
+                                'quality': q(format_id),
+                                'url': playable_url,
+                            })
+                        extract_dash_manifest(video)
+                        self._sort_formats(formats)
+                        v_id = video.get('videoId') or video.get('id') or video_id
+                        info = {
+                            'id': v_id,
+                            'formats': formats,
+                            'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']),
+                            'uploader_id': try_get(video, lambda x: x['owner']['id']),
+                            'timestamp': int_or_none(video.get('publish_time')),
+                            'duration': float_or_none(video.get('playable_duration_in_ms'), 1000),
+                        }
+                        description = try_get(video, lambda x: x['savable_description']['text'])
+                        title = video.get('name')
+                        if title:
+                            info.update({
+                                'title': title,
+                                'description': description,
+                            })
+                        else:
+                            info['title'] = description or 'Facebook video #%s' % v_id
+                        return webpage, info
+
+                    data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
+
+                    attachments = try_get(data, [
+                        lambda x: x['video']['story']['attachments'],
+                        lambda x: x['video']['creation_story']['attachments'],
+                        lambda x: x['node']['comet_sections']['content']['story']['attachments']
+                    ], list) or []
+                    for attachment in attachments:
+                        media = attachment.get('media') or try_get(attachment, lambda x: x['style_type_renderer']['attachment']['media'], dict) or {}
+                        if media.get('__typename') == 'Video':
+                            return parse_graphql_video(media)
+
+                    video = data.get('video') or {}
+                    if video:
+                        return parse_graphql_video(video)
+
         if not video_data:
             if not fatal_if_no_video:
                 return webpage, False
@@ -375,7 +440,6 @@ class FacebookIE(InfoExtractor):
             raise ExtractorError('Cannot parse data')
 
         subtitles = {}
-        formats = []
         for f in video_data:
             format_id = f['stream_type']
             if f and isinstance(f, dict):
@@ -394,10 +458,7 @@ class FacebookIE(InfoExtractor):
                         'url': src,
                         'preference': preference,
                     })
-                dash_manifest = f[0].get('dash_manifest')
-                if dash_manifest:
-                    formats.extend(self._parse_mpd_formats(
-                        compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
+                extract_dash_manifest(f[0])
                 subtitles_src = f[0].get('subtitles_src')
                 if subtitles_src:
                     subtitles.setdefault('en', []).append({'url': subtitles_src})

From 91f1af44a1c740047cdd7929b8f4b2d681994ee4 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:53:58 +0100
Subject: [PATCH 117/384] [facebook] redirect Mobile URLs to Desktop URLs

closes #24831
closes #25624
---
 haruhi_dl/extractor/facebook.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py
index 0d3a86b6c..bcb224c03 100644
--- a/haruhi_dl/extractor/facebook.py
+++ b/haruhi_dl/extractor/facebook.py
@@ -306,7 +306,8 @@ class FacebookIE(InfoExtractor):
             self._login()
 
     def _extract_from_url(self, url, video_id, fatal_if_no_video=True):
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(
+            url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
 
         video_data = None
 

From c9b7b7dd04cd854d61350bbd41837fdf6c2aa230 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:54:03 +0100
Subject: [PATCH 118/384] [itv] remove old extraction method and fix series
 metadata extraction

closes #23177
closes #26897
---
 haruhi_dl/extractor/itv.py | 309 +++++++++++--------------------------
 1 file changed, 91 insertions(+), 218 deletions(-)

diff --git a/haruhi_dl/extractor/itv.py b/haruhi_dl/extractor/itv.py
index ad2f4eca5..08bcc8b68 100644
--- a/haruhi_dl/extractor/itv.py
+++ b/haruhi_dl/extractor/itv.py
@@ -1,29 +1,21 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import uuid
-import xml.etree.ElementTree as etree
 import json
 import re
 
 from .common import InfoExtractor
 from .brightcove import BrightcoveNewIE
-from ..compat import (
-    compat_str,
-    compat_etree_register_namespace,
-)
 from ..utils import (
     determine_ext,
-    ExtractorError,
     extract_attributes,
-    int_or_none,
+    get_element_by_class,
+    JSON_LD_RE,
     merge_dicts,
     parse_duration,
     smuggle_url,
+    strip_or_none,
     url_or_none,
-    xpath_with_ns,
-    xpath_element,
-    xpath_text,
 )
 
 
@@ -31,14 +23,18 @@ class ITVIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)'
     _GEO_COUNTRIES = ['GB']
     _TESTS = [{
-        'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053',
+        'url': 'https://www.itv.com/hub/liar/2a4547a0012',
         'info_dict': {
-            'id': '2a2936a0053',
-            'ext': 'flv',
-            'title': 'Home Movie',
+            'id': '2a4547a0012',
+            'ext': 'mp4',
+            'title': 'Liar - Series 2 - Episode 6',
+            'description': 'md5:d0f91536569dec79ea184f0a44cca089',
+            'series': 'Liar',
+            'season_number': 2,
+            'episode_number': 6,
         },
         'params': {
-            # rtmp download
+            # m3u8 download
             'skip_download': True,
         },
     }, {
@@ -61,220 +57,97 @@ class ITVIE(InfoExtractor):
         params = extract_attributes(self._search_regex(
             r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params'))
 
-        ns_map = {
-            'soapenv': 'http://schemas.xmlsoap.org/soap/envelope/',
-            'tem': 'http://tempuri.org/',
-            'itv': 'http://schemas.datacontract.org/2004/07/Itv.BB.Mercury.Common.Types',
-            'com': 'http://schemas.itv.com/2009/05/Common',
-        }
-        for ns, full_ns in ns_map.items():
-            compat_etree_register_namespace(ns, full_ns)
-
-        def _add_ns(name):
-            return xpath_with_ns(name, ns_map)
-
-        def _add_sub_element(element, name):
-            return etree.SubElement(element, _add_ns(name))
-
-        production_id = (
-            params.get('data-video-autoplay-id')
-            or '%s#001' % (
-                params.get('data-video-episode-id')
-                or video_id.replace('a', '/')))
-
-        req_env = etree.Element(_add_ns('soapenv:Envelope'))
-        _add_sub_element(req_env, 'soapenv:Header')
-        body = _add_sub_element(req_env, 'soapenv:Body')
-        get_playlist = _add_sub_element(body, ('tem:GetPlaylist'))
-        request = _add_sub_element(get_playlist, 'tem:request')
-        _add_sub_element(request, 'itv:ProductionId').text = production_id
-        _add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper()
-        vodcrid = _add_sub_element(request, 'itv:Vodcrid')
-        _add_sub_element(vodcrid, 'com:Id')
-        _add_sub_element(request, 'itv:Partition')
-        user_info = _add_sub_element(get_playlist, 'tem:userInfo')
-        _add_sub_element(user_info, 'itv:Broadcaster').text = 'Itv'
-        _add_sub_element(user_info, 'itv:DM')
-        _add_sub_element(user_info, 'itv:RevenueScienceValue')
-        _add_sub_element(user_info, 'itv:SessionId')
-        _add_sub_element(user_info, 'itv:SsoToken')
-        _add_sub_element(user_info, 'itv:UserToken')
-        site_info = _add_sub_element(get_playlist, 'tem:siteInfo')
-        _add_sub_element(site_info, 'itv:AdvertisingRestriction').text = 'None'
-        _add_sub_element(site_info, 'itv:AdvertisingSite').text = 'ITV'
-        _add_sub_element(site_info, 'itv:AdvertisingType').text = 'Any'
-        _add_sub_element(site_info, 'itv:Area').text = 'ITVPLAYER.VIDEO'
-        _add_sub_element(site_info, 'itv:Category')
-        _add_sub_element(site_info, 'itv:Platform').text = 'DotCom'
-        _add_sub_element(site_info, 'itv:Site').text = 'ItvCom'
-        device_info = _add_sub_element(get_playlist, 'tem:deviceInfo')
-        _add_sub_element(device_info, 'itv:ScreenSize').text = 'Big'
-        player_info = _add_sub_element(get_playlist, 'tem:playerInfo')
-        _add_sub_element(player_info, 'itv:Version').text = '2'
-
+        ios_playlist_url = params.get('data-video-playlist') or params['data-video-id']
+        hmac = params['data-video-hmac']
         headers = self.geo_verification_headers()
         headers.update({
-            'Content-Type': 'text/xml; charset=utf-8',
-            'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist',
+            'Accept': 'application/vnd.itv.vod.playlist.v2+json',
+            'Content-Type': 'application/json',
+            'hmac': hmac.upper(),
         })
+        ios_playlist = self._download_json(
+            ios_playlist_url, video_id, data=json.dumps({
+                'user': {
+                    'itvUserId': '',
+                    'entitlements': [],
+                    'token': ''
+                },
+                'device': {
+                    'manufacturer': 'Safari',
+                    'model': '5',
+                    'os': {
+                        'name': 'Windows NT',
+                        'version': '6.1',
+                        'type': 'desktop'
+                    }
+                },
+                'client': {
+                    'version': '4.1',
+                    'id': 'browser'
+                },
+                'variantAvailability': {
+                    'featureset': {
+                        'min': ['hls', 'aes', 'outband-webvtt'],
+                        'max': ['hls', 'aes', 'outband-webvtt']
+                    },
+                    'platformTag': 'dotcom'
+                }
+            }).encode(), headers=headers)
+        video_data = ios_playlist['Playlist']['Video']
+        ios_base_url = video_data.get('Base')
 
-        info = self._search_json_ld(webpage, video_id, default={})
         formats = []
-        subtitles = {}
-
-        def extract_subtitle(sub_url):
-            ext = determine_ext(sub_url, 'ttml')
-            subtitles.setdefault('en', []).append({
-                'url': sub_url,
-                'ext': 'ttml' if ext == 'xml' else ext,
-            })
-
-        resp_env = self._download_xml(
-            params['data-playlist-url'], video_id,
-            headers=headers, data=etree.tostring(req_env), fatal=False)
-        if resp_env:
-            playlist = xpath_element(resp_env, './/Playlist')
-            if playlist is None:
-                fault_code = xpath_text(resp_env, './/faultcode')
-                fault_string = xpath_text(resp_env, './/faultstring')
-                if fault_code == 'InvalidGeoRegion':
-                    self.raise_geo_restricted(
-                        msg=fault_string, countries=self._GEO_COUNTRIES)
-                elif fault_code not in (
-                        'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'):
-                    raise ExtractorError(
-                        '%s said: %s' % (self.IE_NAME, fault_string), expected=True)
-                info.update({
-                    'title': self._og_search_title(webpage),
-                    'episode_title': params.get('data-video-episode'),
-                    'series': params.get('data-video-title'),
-                })
+        for media_file in (video_data.get('MediaFiles') or []):
+            href = media_file.get('Href')
+            if not href:
+                continue
+            if ios_base_url:
+                href = ios_base_url + href
+            ext = determine_ext(href)
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    href, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
             else:
-                title = xpath_text(playlist, 'EpisodeTitle', default=None)
-                info.update({
-                    'title': title,
-                    'episode_title': title,
-                    'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
-                    'series': xpath_text(playlist, 'ProgrammeTitle'),
-                    'duration': parse_duration(xpath_text(playlist, 'Duration')),
+                formats.append({
+                    'url': href,
                 })
-                video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True)
-                media_files = xpath_element(video_element, 'MediaFiles', fatal=True)
-                rtmp_url = media_files.attrib['base']
-
-                for media_file in media_files.findall('MediaFile'):
-                    play_path = xpath_text(media_file, 'URL')
-                    if not play_path:
-                        continue
-                    tbr = int_or_none(media_file.get('bitrate'), 1000)
-                    f = {
-                        'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
-                        'play_path': play_path,
-                        # Providing this swfVfy allows to avoid truncated downloads
-                        'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
-                        'page_url': url,
-                        'tbr': tbr,
-                        'ext': 'flv',
-                    }
-                    app = self._search_regex(
-                        'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
-                    if app:
-                        f.update({
-                            'url': rtmp_url.split('?', 1)[0],
-                            'app': app,
-                        })
-                    else:
-                        f['url'] = rtmp_url
-                    formats.append(f)
-
-                for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
-                    if caption_url.text:
-                        extract_subtitle(caption_url.text)
-
-        ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id')
-        hmac = params.get('data-video-hmac')
-        if ios_playlist_url and hmac and re.match(r'https?://', ios_playlist_url):
-            headers = self.geo_verification_headers()
-            headers.update({
-                'Accept': 'application/vnd.itv.vod.playlist.v2+json',
-                'Content-Type': 'application/json',
-                'hmac': hmac.upper(),
-            })
-            ios_playlist = self._download_json(
-                ios_playlist_url, video_id, data=json.dumps({
-                    'user': {
-                        'itvUserId': '',
-                        'entitlements': [],
-                        'token': ''
-                    },
-                    'device': {
-                        'manufacturer': 'Safari',
-                        'model': '5',
-                        'os': {
-                            'name': 'Windows NT',
-                            'version': '6.1',
-                            'type': 'desktop'
-                        }
-                    },
-                    'client': {
-                        'version': '4.1',
-                        'id': 'browser'
-                    },
-                    'variantAvailability': {
-                        'featureset': {
-                            'min': ['hls', 'aes', 'outband-webvtt'],
-                            'max': ['hls', 'aes', 'outband-webvtt']
-                        },
-                        'platformTag': 'dotcom'
-                    }
-                }).encode(), headers=headers, fatal=False)
-            if ios_playlist:
-                video_data = ios_playlist.get('Playlist', {}).get('Video', {})
-                ios_base_url = video_data.get('Base')
-                for media_file in video_data.get('MediaFiles', []):
-                    href = media_file.get('Href')
-                    if not href:
-                        continue
-                    if ios_base_url:
-                        href = ios_base_url + href
-                    ext = determine_ext(href)
-                    if ext == 'm3u8':
-                        formats.extend(self._extract_m3u8_formats(
-                            href, video_id, 'mp4', entry_protocol='m3u8_native',
-                            m3u8_id='hls', fatal=False))
-                    else:
-                        formats.append({
-                            'url': href,
-                        })
-                subs = video_data.get('Subtitles')
-                if isinstance(subs, list):
-                    for sub in subs:
-                        if not isinstance(sub, dict):
-                            continue
-                        href = url_or_none(sub.get('Href'))
-                        if href:
-                            extract_subtitle(href)
-                if not info.get('duration'):
-                    info['duration'] = parse_duration(video_data.get('Duration'))
-
         self._sort_formats(formats)
 
-        info.update({
+        subtitles = {}
+        subs = video_data.get('Subtitles') or []
+        for sub in subs:
+            if not isinstance(sub, dict):
+                continue
+            href = url_or_none(sub.get('Href'))
+            if not href:
+                continue
+            subtitles.setdefault('en', []).append({
+                'url': href,
+                'ext': determine_ext(href, 'vtt'),
+            })
+
+        info = self._search_json_ld(webpage, video_id, default={})
+        if not info:
+            json_ld = self._parse_json(self._search_regex(
+                JSON_LD_RE, webpage, 'JSON-LD', '{}',
+                group='json_ld'), video_id, fatal=False)
+            if json_ld and json_ld.get('@type') == 'BreadcrumbList':
+                for ile in (json_ld.get('itemListElement:') or []):
+                    item = ile.get('item:') or {}
+                    if item.get('@type') == 'TVEpisode':
+                        item['@context'] = 'http://schema.org'
+                        info = self._json_ld(item, video_id, fatal=False) or {}
+                        break
+
+        return merge_dicts({
             'id': video_id,
+            'title': self._html_search_meta(['og:title', 'twitter:title'], webpage),
             'formats': formats,
             'subtitles': subtitles,
-        })
-
-        webpage_info = self._search_json_ld(webpage, video_id, default={})
-        if not webpage_info.get('title'):
-            webpage_info['title'] = self._html_search_regex(
-                r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<',
-                webpage, 'title', default=None) or self._og_search_title(
-                webpage, default=None) or self._html_search_meta(
-                'twitter:title', webpage, 'title',
-                default=None) or webpage_info['episode']
-
-        return merge_dicts(info, webpage_info)
+            'duration': parse_duration(video_data.get('Duration')),
+            'description': strip_or_none(get_element_by_class('episode-info__synopsis', webpage)),
+        }, info)
 
 
 class ITVBTCCIE(InfoExtractor):
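The replacement endpoint in the ITV patch above is a plain JSON-over-POST exchange: the page supplies a playlist URL and an hmac attribute, and the service answers with Playlist.Video. A rough Python 3 standalone sketch of that request using only the standard library; the function name is made up, the hmac and URL come from the page's data-video-hmac and data-video-playlist attributes, and the payload is the one shown in the diff:

import json
from urllib.request import Request, urlopen


def fetch_itv_playlist(playlist_url, page_hmac):
    # Both arguments are scraped from the <div id="video"> element of the
    # episode page, as in the extractor above.
    body = {
        'user': {'itvUserId': '', 'entitlements': [], 'token': ''},
        'device': {'manufacturer': 'Safari', 'model': '5',
                   'os': {'name': 'Windows NT', 'version': '6.1', 'type': 'desktop'}},
        'client': {'version': '4.1', 'id': 'browser'},
        'variantAvailability': {
            'featureset': {'min': ['hls', 'aes', 'outband-webvtt'],
                           'max': ['hls', 'aes', 'outband-webvtt']},
            'platformTag': 'dotcom',
        },
    }
    req = Request(playlist_url, data=json.dumps(body).encode(), headers={
        'Accept': 'application/vnd.itv.vod.playlist.v2+json',
        'Content-Type': 'application/json',
        'hmac': page_hmac.upper(),
    })
    # The interesting part of the response is the Playlist.Video object,
    # which carries Base, MediaFiles and Subtitles.
    return json.loads(urlopen(req).read().decode())['Playlist']['Video']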
From 1765b2f8706a13502ee9788ea3b07c780462be72 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:54:08 +0100
Subject: [PATCH 119/384] =?UTF-8?q?[facebook]=20add=20support=20for=20grou?=
 =?UTF-8?q?p=20posts=20with=20multiple=20videos(closes=20#1=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…9131)
---
 haruhi_dl/extractor/facebook.py | 57 ++++++++++++++++++++++++++-------
 1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py
index bcb224c03..2c3e4b251 100644
--- a/haruhi_dl/extractor/facebook.py
+++ b/haruhi_dl/extractor/facebook.py
@@ -72,6 +72,7 @@ class FacebookIE(InfoExtractor):
         },
         'skip': 'Requires logging in',
     }, {
+        # data.video
         'url': 'https://www.facebook.com/video.php?v=274175099429670',
         'info_dict': {
             'id': '274175099429670',
@@ -133,6 +134,7 @@ class FacebookIE(InfoExtractor):
         },
     }, {
         # have 1080P, but only up to 720p in swf params
+        # data.video.story.attachments[].media
        'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
         'md5': '9571fae53d4165bbbadb17a94651dcdc',
         'info_dict': {
@@ -147,6 +149,7 @@ class FacebookIE(InfoExtractor):
         },
     }, {
         # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
+        # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
         'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/',
         'info_dict': {
             'id': '1417995061575415',
@@ -174,6 +177,7 @@ class FacebookIE(InfoExtractor):
             'skip_download': True,
         },
     }, {
+        # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
         'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/',
         'info_dict': {
             'id': '1396382447100162',
@@ -193,18 +197,23 @@ class FacebookIE(InfoExtractor):
         'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf',
         'only_matching': True,
     }, {
+        # data.mediaset.currMedia.edges
         'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater',
         'only_matching': True,
     }, {
+        # data.video.story.attachments[].media
         'url': 'facebook:544765982287235',
         'only_matching': True,
     }, {
+        # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
         'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
         'only_matching': True,
     }, {
+        # data.video.creation_story.attachments[].media
         'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/',
         'only_matching': True,
     }, {
+        # data.video
         'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670',
         'only_matching': True,
     }, {
@@ -212,6 +221,7 @@ class FacebookIE(InfoExtractor):
         'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/',
         'only_matching': True,
     }, {
+        # data.video
         'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/',
         'info_dict': {
             'id': '359649331226507',
@@ -222,6 +232,13 @@ class FacebookIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+    }, {
+        # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
+        'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/',
+        'info_dict': {
+            'id': '106560053808006',
+        },
+        'playlist_count': 2,
     }]
 
     @staticmethod
@@ -330,9 +347,7 @@ class FacebookIE(InfoExtractor):
             return extract_video_data(try_get(
                 js_data, lambda x: x['jsmods']['instances'], list) or [])
 
-        formats = []
-
-        def extract_dash_manifest(video):
+        def extract_dash_manifest(video, formats):
             dash_manifest = video.get('dash_manifest')
             if dash_manifest:
                 formats.extend(self._parse_mpd_formats(
@@ -351,7 +366,10 @@ class FacebookIE(InfoExtractor):
             ], webpage, 'graphql data', default='{}'), video_id, fatal=False) or {}
             for require in (graphql_data.get('require') or []):
                 if require[0] == 'RelayPrefetchedStreamCache':
+                    entries = []
+
                     def parse_graphql_video(video):
+                        formats = []
                         q = qualities(['sd', 'hd'])
                         for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]:
                             playable_url = video.get('playable_url' + suffix)
@@ -362,7 +380,7 @@ class FacebookIE(InfoExtractor):
                                 'quality': q(format_id),
                                 'url': playable_url,
                             })
-                        extract_dash_manifest(video)
+                        extract_dash_manifest(video, formats)
                         self._sort_formats(formats)
                         v_id = video.get('videoId') or video.get('id') or video_id
                         info = {
@@ -382,7 +400,12 @@ class FacebookIE(InfoExtractor):
                             })
                         else:
                             info['title'] = description or 'Facebook video #%s' % v_id
-                        return webpage, info
+                        entries.append(info)
+
+                    def parse_attachment(attachment, key='media'):
+                        media = attachment.get(key) or {}
+                        if media.get('__typename') == 'Video':
+                            return parse_graphql_video(media)
 
                     data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
 
@@ -392,13 +415,22 @@ class FacebookIE(InfoExtractor):
                         lambda x: x['node']['comet_sections']['content']['story']['attachments']
                     ], list) or []
                     for attachment in attachments:
-                        media = attachment.get('media') or try_get(attachment, lambda x: x['style_type_renderer']['attachment']['media'], dict) or {}
-                        if media.get('__typename') == 'Video':
-                            return parse_graphql_video(media)
+                        attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) or attachment
+                        nodes = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
+                        for node in nodes:
+                            parse_attachment(node)
+                        parse_attachment(attachment)
 
-                    video = data.get('video') or {}
-                    if video:
-                        return parse_graphql_video(video)
+                    edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or []
+                    for edge in edges:
+                        parse_attachment(edge, key='node')
+
+                    if not entries:
+                        video = data.get('video') or {}
+                        if video:
+                            parse_graphql_video(video)
+
+                    return webpage, self.playlist_result(entries, video_id)
 
         if not video_data:
             if not fatal_if_no_video:
@@ -440,6 +472,7 @@ class FacebookIE(InfoExtractor):
         if not video_data:
             raise ExtractorError('Cannot parse data')
 
+        formats = []
         subtitles = {}
         for f in video_data:
             format_id = f['stream_type']
@@ -459,7 +492,7 @@ class FacebookIE(InfoExtractor):
                     'url': src,
                     'preference': preference,
                 })
-                extract_dash_manifest(f[0])
+                extract_dash_manifest(f[0], formats)
                 subtitles_src = f[0].get('subtitles_src')
                 if subtitles_src:
                     subtitles.setdefault('en', []).append({'url': subtitles_src})

From 493a5245dc16750e094fb228c40d1fe0614edfbe Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:54:13 +0100
Subject: [PATCH 120/384] [facebook] add support for watch videos(closes
 #22795)

---
 haruhi_dl/extractor/facebook.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py
index 2c3e4b251..417d7a370 100644
--- a/haruhi_dl/extractor/facebook.py
+++ b/haruhi_dl/extractor/facebook.py
@@ -41,7 +41,8 @@ class FacebookIE(InfoExtractor):
                             photo\.php|
                             video\.php|
                             video/embed|
-                            story\.php
+                            story\.php|
+                            watch/?
                         )\?(?:.*?)(?:v|video_id|story_fbid)=|
                         [^/]+/videos/(?:[^/]+/)?|
                         [^/]+/posts/|
@@ -240,6 +240,20 @@ class FacebookIE(InfoExtractor):
             'id': '106560053808006',
         },
         'playlist_count': 2,
+    }, {
+        # data.video_home_www_feed.video_home_sections.edges[].node.feed_section_renderer.section.section_components.edges[].node.feed_unit.attachments
+        'url': 'https://www.facebook.com/watch/?v=125475412191640',
+        'md5': 'a38bed45dd1b2881ea230f3561c914b7',
+        'info_dict': {
+            'id': '373249263226147',
+            'ext': 'mp4',
+            'title': 'شوف بعينيك ماذا يحدث...ماناش نخوف فيكم رانا ننقل لكم مايحدث...',
+            'description': 'شوف بعينيك ماذا يحدث خويا العزيز...ماناش نخوف فيكم رانا ننقل لكم مايحدث...\nتذكروا جيدا ماكنا نقوله لكم منذ سنوات وماكنا نحذركم .',
+            'timestamp': 1550353963,
+            'upload_date': '20190216',
+            'uploader_id': '176917942440142',
+        },
+        'skip': 'Requires logging in',
     }]
 
     @staticmethod
@@ -425,6 +440,14 @@ class FacebookIE(InfoExtractor):
                     for edge in edges:
                         parse_attachment(edge, key='node')
 
+                    video_home_sections = try_get(data, lambda x: x['video_home_www_feed']['video_home_sections']['edges'], list) or []
+                    for video_home_section in video_home_sections:
+                        section_components = try_get(video_home_section, lambda x: x['node']['feed_section_renderer']['section']['section_components']['edges'], list) or []
+                        for section_component in section_components:
+                            attachments = try_get(section_component, lambda x: x['node']['feed_unit']['attachments'], list) or []
+                            for attachment in attachments:
+                                parse_attachment(attachment)
+
                     if not entries:
                         video = data.get('video') or {}
                         if video:

From e51e641c6c80a9d3347a02c52401a21220c4768a Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:54:19 +0100
Subject: [PATCH 121/384] Revert "[facebook] add support for watch
 videos(closes #22795)"

This reverts commit dc65041c224497f46b2984df02c234ce54bdedfd.
---
 haruhi_dl/extractor/facebook.py | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py
index 417d7a370..2c3e4b251 100644
--- a/haruhi_dl/extractor/facebook.py
+++ b/haruhi_dl/extractor/facebook.py
@@ -41,8 +41,7 @@ class FacebookIE(InfoExtractor):
                             photo\.php|
                             video\.php|
                             video/embed|
-                            story\.php|
-                            watch/?
+                            story\.php
                         )\?(?:.*?)(?:v|video_id|story_fbid)=|
                         [^/]+/videos/(?:[^/]+/)?|
                         [^/]+/posts/|
@@ -240,20 +239,6 @@ class FacebookIE(InfoExtractor):
             'id': '106560053808006',
         },
         'playlist_count': 2,
-    }, {
-        # data.video_home_www_feed.video_home_sections.edges[].node.feed_section_renderer.section.section_components.edges[].node.feed_unit.attachments
-        'url': 'https://www.facebook.com/watch/?v=125475412191640',
-        'md5': 'a38bed45dd1b2881ea230f3561c914b7',
-        'info_dict': {
-            'id': '373249263226147',
-            'ext': 'mp4',
-            'title': 'شوف بعينيك ماذا يحدث...ماناش نخوف فيكم رانا ننقل لكم مايحدث...',
-            'description': 'شوف بعينيك ماذا يحدث خويا العزيز...ماناش نخوف فيكم رانا ننقل لكم مايحدث...\nتذكروا جيدا ماكنا نقوله لكم منذ سنوات وماكنا نحذركم .',
-            'timestamp': 1550353963,
-            'upload_date': '20190216',
-            'uploader_id': '176917942440142',
-        },
-        'skip': 'Requires logging in',
     }]
 
     @staticmethod
@@ -440,14 +425,6 @@ class FacebookIE(InfoExtractor):
                     for edge in edges:
                         parse_attachment(edge, key='node')
 
-                    video_home_sections = try_get(data, lambda x: x['video_home_www_feed']['video_home_sections']['edges'], list) or []
-                    for video_home_section in video_home_sections:
-                        section_components = try_get(video_home_section, lambda x: x['node']['feed_section_renderer']['section']['section_components']['edges'], list) or []
-                        for section_component in section_components:
-                            attachments = try_get(section_component, lambda x: x['node']['feed_unit']['attachments'], list) or []
-                            for attachment in attachments:
-                                parse_attachment(attachment)
-
                     if not entries:
                         video = data.get('video') or {}
                         if video:

From 9f4416afd729b1b552a1247f09e710e42a4c5a39 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 14:54:28 +0100
Subject: [PATCH 122/384] [facebook] proper support for watch videos(closes
 #22795)(#27062)

---
 haruhi_dl/extractor/facebook.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py
index 2c3e4b251..c16453776 100644
--- a/haruhi_dl/extractor/facebook.py
+++ b/haruhi_dl/extractor/facebook.py
@@ -41,7 +41,8 @@ class FacebookIE(InfoExtractor):
                             photo\.php|
                             video\.php|
                             video/embed|
-                            story\.php
+                            story\.php|
+                            watch/?
                         )\?(?:.*?)(?:v|video_id|story_fbid)=|
                         [^/]+/videos/(?:[^/]+/)?|
                         [^/]+/posts/|
@@ -239,6 +240,10 @@ class FacebookIE(InfoExtractor):
             'id': '106560053808006',
         },
         'playlist_count': 2,
+    }, {
+        # data.video.story.attachments[].media
+        'url': 'https://www.facebook.com/watch/?v=647537299265662',
+        'only_matching': True,
     }]
 
     @staticmethod
From 96b2d8bb346554b2f75fef9115dfb6db7c69237c Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:08:54 +0100
Subject: [PATCH 123/384] [facebook] add support for Relay post pages(closes
 #26935)

---
 haruhi_dl/extractor/facebook.py | 125 +++++++++++++++++++-------------
 1 file changed, 73 insertions(+), 52 deletions(-)

diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py
index c16453776..82f90d2ac 100644
--- a/haruhi_dl/extractor/facebook.py
+++ b/haruhi_dl/extractor/facebook.py
@@ -26,6 +26,7 @@ from ..utils import (
     sanitized_Request,
     try_get,
     urlencode_postdata,
+    urljoin,
 )
 
 
@@ -244,7 +245,28 @@ class FacebookIE(InfoExtractor):
         # data.video.story.attachments[].media
         'url': 'https://www.facebook.com/watch/?v=647537299265662',
         'only_matching': True,
+    }, {
+        # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
+        'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271',
+        'info_dict': {
+            'id': '10157667649866271',
+        },
+        'playlist_count': 3,
+    }, {
+        # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media
+        'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330',
+        'info_dict': {
+            'id': '117576630041613',
+            'ext': 'mp4',
+            # TODO: title can be extracted from video page
+            'title': 'Facebook video #117576630041613',
+            'uploader_id': '189393014416438',
+            'upload_date': '20201123',
+            'timestamp': 1606162592,
+        },
+        'skip': 'Requires logging in',
     }]
+    _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
 
     @staticmethod
     def _extract_urls(webpage, **kwargs):
@@ -327,18 +349,20 @@ class FacebookIE(InfoExtractor):
     def _real_initialize(self):
         self._login()
 
-    def _extract_from_url(self, url, video_id, fatal_if_no_video=True):
+    def _extract_from_url(self, url, video_id):
         webpage = self._download_webpage(
             url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)
 
         video_data = None
 
         def extract_video_data(instances):
+            video_data = []
             for item in instances:
                 if item[1][0] == 'VideoConfig':
                     video_item = item[2][0]
                     if video_item.get('video_id'):
-                        return video_item['videoData']
+                        video_data.append(video_item['videoData'])
+            return video_data
 
         server_js_data = self._parse_json(self._search_regex(
             r'handleServerJS\(({.+})(?:\);|,")', webpage,
@@ -358,10 +382,18 @@ class FacebookIE(InfoExtractor):
                 formats.extend(self._parse_mpd_formats(
                     compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
 
+        def process_formats(formats):
+            # Downloads with browser's User-Agent are rate limited. Working around
+            # with non-browser User-Agent.
+            for f in formats:
+                f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
+
+            self._sort_formats(formats)
+
         if not video_data:
             server_js_data = self._parse_json(self._search_regex([
-                r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)',
-                r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"permalink_video_pagelet".*?})\);'
+                r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX,
+                r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"%s".*?})\);' % self._SUPPORTED_PAGLETS_REGEX
             ], webpage, 'js data', default='{}'), video_id, js_to_json, False)
             video_data = extract_from_jsmods_instances(server_js_data)
 
@@ -386,7 +418,7 @@ class FacebookIE(InfoExtractor):
                                 'url': playable_url,
                             })
                         extract_dash_manifest(video, formats)
-                        self._sort_formats(formats)
+                        process_formats(formats)
                         v_id = video.get('videoId') or video.get('id') or video_id
                         info = {
                             'id': v_id,
@@ -414,32 +446,37 @@ class FacebookIE(InfoExtractor):
 
                     data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}
 
-                    attachments = try_get(data, [
-                        lambda x: x['video']['story']['attachments'],
-                        lambda x: x['video']['creation_story']['attachments'],
-                        lambda x: x['node']['comet_sections']['content']['story']['attachments']
-                    ], list) or []
-                    for attachment in attachments:
-                        attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) or attachment
-                        nodes = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
-                        for node in nodes:
-                            parse_attachment(node)
-                        parse_attachment(attachment)
+                    nodes = data.get('nodes') or []
+                    node = data.get('node') or {}
+                    if not nodes and node:
+                        nodes.append(node)
+                    for node in nodes:
+                        attachments = try_get(node, lambda x: x['comet_sections']['content']['story']['attachments'], list) or []
+                        for attachment in attachments:
+                            attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict)
+                            ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
+                            for n in ns:
+                                parse_attachment(n)
+                            parse_attachment(attachment)
 
                     edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or []
                     for edge in edges:
                         parse_attachment(edge, key='node')
 
-                    if not entries:
-                        video = data.get('video') or {}
-                        if video:
+                    video = data.get('video') or {}
+                    if video:
+                        attachments = try_get(video, [
+                            lambda x: x['story']['attachments'],
+                            lambda x: x['creation_story']['attachments']
+                        ], list) or []
+                        for attachment in attachments:
+                            parse_attachment(attachment)
+                        if not entries:
                             parse_graphql_video(video)
 
-                    return webpage, self.playlist_result(entries, video_id)
+                    return self.playlist_result(entries, video_id)
 
         if not video_data:
-            if not fatal_if_no_video:
-                return webpage, False
             m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
             if m_msg is not None:
                 raise ExtractorError(
@@ -477,6 +514,17 @@ class FacebookIE(InfoExtractor):
         if not video_data:
             raise ExtractorError('Cannot parse data')
 
+        if len(video_data) > 1:
+            entries = []
+            for v in video_data:
+                video_url = v[0].get('video_url')
+                if not video_url:
+                    continue
+                entries.append(self.url_result(urljoin(
+                    url, video_url), self.ie_key(), v[0].get('video_id')))
+            return self.playlist_result(entries, video_id)
+        video_data = video_data[0]
+
         formats = []
         subtitles = {}
         for f in video_data:
@@ -504,12 +552,7 @@ class FacebookIE(InfoExtractor):
         if not formats:
             raise ExtractorError('Cannot find video formats')
 
-        # Downloads with browser's User-Agent are rate limited. Working around
-        # with non-browser User-Agent.
-        for f in formats:
-            f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
-
-        self._sort_formats(formats)
+        process_formats(formats)
 
         video_title = self._html_search_regex(
             r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage,
@@ -549,35 +592,13 @@ class FacebookIE(InfoExtractor):
             'subtitles': subtitles,
         }
 
-        return webpage, info_dict
+        return info_dict
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url
-        webpage, info_dict = self._extract_from_url(real_url, video_id, fatal_if_no_video=False)
-
-        if info_dict:
-            return info_dict
-
-        if '/posts/' in url:
-            video_id_json = self._search_regex(
-                r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])', webpage, 'video ids', group='ids',
-                default='')
-            if video_id_json:
-                entries = [
-                    self.url_result('facebook:%s' % vid, FacebookIE.ie_key())
-                    for vid in self._parse_json(video_id_json, video_id)]
-                return self.playlist_result(entries, video_id)
-
-            # Single Video?
-            video_id = self._search_regex(r'video_id:\s*"([0-9]+)"', webpage, 'single video id')
-            return self.url_result('facebook:%s' % video_id, FacebookIE.ie_key())
-        else:
-            _, info_dict = self._extract_from_url(
-                self._VIDEO_PAGE_TEMPLATE % video_id,
-                video_id, fatal_if_no_video=True)
-            return info_dict
+        return self._extract_from_url(real_url, video_id)
 
 
 class FacebookPluginsVideoIE(InfoExtractor):
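The one-line HaruhiDL change that follows swaps string concatenation for replace_extension, so that a media extension already present in the output name is replaced rather than blindly chopped. A rough re-creation of the difference; the helper below is a simplified stand-in mirroring the semantics of utils.replace_extension, not the real implementation:

import os


def replace_extension(filename, ext, expected_real_ext=None):
    # Strip the old extension only when it matches the media extension we
    # expect; otherwise keep the full name and just append the new one.
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return '%s.%s' % (name, ext)


# Old behaviour: os.path.splitext chops at the last dot, mangling names
# whose last dotted segment is not an extension at all.
filename, thumb_ext, media_ext = 'Part.1.of.3', 'jpg', 'mp4'
old = os.path.splitext(filename)[0] + '.' + thumb_ext       # 'Part.1.of.jpg'
new = replace_extension(filename, thumb_ext, media_ext)     # 'Part.1.of.3.jpg'
assert (old, new) == ('Part.1.of.jpg', 'Part.1.of.3.jpg')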
From 7818a5cbb613f3a279f7c82355f79b60a624dfd2 Mon Sep 17 00:00:00 2001
From: compujo <2576634+compujo@users.noreply.github.com>
Date: Fri, 26 Feb 2021 15:11:12 +0100
Subject: [PATCH 124/384] =?UTF-8?q?[YoutubeDL]=20Improve=20thumbnails'=20f?=
 =?UTF-8?q?ilenames=20deducing=20(closes=20#26010)=20(#=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…27244)
---
 haruhi_dl/HaruhiDL.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/haruhi_dl/HaruhiDL.py b/haruhi_dl/HaruhiDL.py
index ffc583e82..813d32d76 100755
--- a/haruhi_dl/HaruhiDL.py
+++ b/haruhi_dl/HaruhiDL.py
@@ -2414,7 +2414,7 @@ class HaruhiDL(object):
                 thumb_ext = determine_ext(t['url'], 'jpg')
                 suffix = '_%s' % t['id'] if len(thumbnails) > 1 else ''
                 thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else ''
-                t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext
+                t['filename'] = thumb_filename = replace_extension(filename + suffix, thumb_ext, info_dict.get('ext'))
 
                 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
                     self.to_screen('[%s] %s: Thumbnail %sis already present' %

From cba73be180a795a550eaf8034c3fc68c41bbf233 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:11:21 +0100
Subject: [PATCH 125/384] [facebook] fix embed page extraction

---
 haruhi_dl/extractor/facebook.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py
index 82f90d2ac..c2e2155f5 100644
--- a/haruhi_dl/extractor/facebook.py
+++ b/haruhi_dl/extractor/facebook.py
@@ -358,7 +358,7 @@ class FacebookIE(InfoExtractor):
         def extract_video_data(instances):
             video_data = []
             for item in instances:
-                if item[1][0] == 'VideoConfig':
+                if try_get(item, lambda x: x[1][0]) == 'VideoConfig':
                     video_item = item[2][0]
                     if video_item.get('video_id'):
                         video_data.append(video_item['videoData'])

From fe9f5a795d06dc2dd422f5ff002fa13f021fd8b9 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:11:26 +0100
Subject: [PATCH 126/384] [facebook] Add another regex for handleServerJS

---
 haruhi_dl/extractor/facebook.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py
index c2e2155f5..9113678c4 100644
--- a/haruhi_dl/extractor/facebook.py
+++ b/haruhi_dl/extractor/facebook.py
@@ -365,8 +365,8 @@ class FacebookIE(InfoExtractor):
             return video_data
 
         server_js_data = self._parse_json(self._search_regex(
-            r'handleServerJS\(({.+})(?:\);|,")', webpage,
-            'server js data', default='{}'), video_id, fatal=False)
+            [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'],
+            webpage, 'server js data', default='{}'), video_id, fatal=False)
 
         if server_js_data:
             video_data = extract_video_data(server_js_data.get('instances', []))

From ba0f2c14da7ec1722858342059fe05b26e426047 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?=
 <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 15:11:31 +0100
Subject: [PATCH 127/384] [wdr:page] Add support for kinder.wdr.de (closes
 #27350)

---
 haruhi_dl/extractor/wdr.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/haruhi_dl/extractor/wdr.py b/haruhi_dl/extractor/wdr.py
index cf6f7c7ed..ba97d983b 100644
--- a/haruhi_dl/extractor/wdr.py
+++ b/haruhi_dl/extractor/wdr.py
@@ -105,7 +105,7 @@ class WDRIE(InfoExtractor):
 class WDRPageIE(InfoExtractor):
     _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5'
     _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html'
-    _VALID_URL = r'https?://(?:www\d?\.)?(?:wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
+    _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL
 
     _TESTS = [
         {
@@ -212,7 +212,11 @@ class WDRPageIE(InfoExtractor):
         {
             'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html',
             'only_matching': True,
-        }
+        },
+        {
+            'url': 'https://kinder.wdr.de/tv/die-sendung-mit-dem-elefanten/av/video-folge---astronaut-100.html',
+            'only_matching': True,
+        },
     ]
 
     def _real_extract(self, url):

From 7cebd30677f6e1322be8af37ac448740b077a0f4 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:11:35 +0100
Subject: [PATCH 128/384] =?UTF-8?q?[facebook]=20add=20support=20for=20vide?=
 =?UTF-8?q?os=20attached=20to=20Relay=20based=20story=20pages=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…(#10795)
---
 haruhi_dl/extractor/facebook.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py
index 9113678c4..370365ab8 100644
--- a/haruhi_dl/extractor/facebook.py
+++ b/haruhi_dl/extractor/facebook.py
@@ -265,6 +265,17 @@ class FacebookIE(InfoExtractor):
             'timestamp': 1606162592,
         },
         'skip': 'Requires logging in',
+    }, {
+        # node.comet_sections.content.story.attached_story.attachments.style_type_renderer.attachment.media
+        'url': 'https://www.facebook.com/groups/ateistiskselskab/permalink/10154930137678856/',
+        'info_dict': {
+            'id': '211567722618337',
+            'ext': 'mp4',
+            'title': 'Facebook video #211567722618337',
+            'uploader_id': '127875227654254',
+            'upload_date': '20161122',
+            'timestamp': 1479793574,
+        },
     }]
     _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
 
@@ -462,7 +473,11 @@ class FacebookIE(InfoExtractor):
                     for node in nodes:
-                        attachments = try_get(node, lambda x: x['comet_sections']['content']['story']['attachments'], list) or []
+                        story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {}
+                        attachments = try_get(story, [
+                            lambda x: x['attached_story']['attachments'],
+                            lambda x: x['attachments']
+                        ], list) or []
                         for attachment in attachments:
                             attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict)
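The WDR patch below reads additional subtitles from a captionsHash mapping of extension hint to URL. The shape of that loop, reduced to a self-contained function with a fabricated sample payload (the helper name and data are invented for illustration; the real code uses url_or_none and determine_ext from haruhi_dl.utils):

def extract_captions(media_resource):
    # captionsHash maps an extension hint ('ttml', 'vtt', ...) to a URL;
    # prefer the extension visible in the URL itself when there is one.
    subtitles = {}
    captions_hash = media_resource.get('captionsHash')
    if isinstance(captions_hash, dict):
        for ext, url in captions_hash.items():
            if not isinstance(url, str) or not url.startswith(('http://', 'https://')):
                continue
            basename = url.rpartition('/')[2]
            url_ext = basename.rpartition('.')[2] if '.' in basename else None
            subtitles.setdefault('de', []).append({
                'url': url,
                'ext': url_ext or ext,
            })
    return subtitles


sample = {'captionsHash': {'vtt': 'https://wdrmedien.example/caption.vtt'}}
assert extract_captions(sample) == {
    'de': [{'url': 'https://wdrmedien.example/caption.vtt', 'ext': 'vtt'}]}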
From a8573bb5b2586b49db30803f6d5236d263bc6545 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?=
 <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 15:11:39 +0100
Subject: [PATCH 129/384] =?UTF-8?q?[wdr]=20Extend=20subtitles=20extraction?=
 =?UTF-8?q?=20and=20improve=20overall=20extraction=20(clo=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…ses #22672, closes #22723)
---
 haruhi_dl/extractor/wdr.py | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/haruhi_dl/extractor/wdr.py b/haruhi_dl/extractor/wdr.py
index ba97d983b..2903d189e 100644
--- a/haruhi_dl/extractor/wdr.py
+++ b/haruhi_dl/extractor/wdr.py
@@ -17,6 +17,7 @@ from ..utils import (
     unified_strdate,
     update_url_query,
     urlhandle_detect_ext,
+    url_or_none,
 )
 
 
@@ -42,16 +43,20 @@ class WDRIE(InfoExtractor):
         is_live = metadata.get('mediaType') == 'live'
 
         tracker_data = metadata['trackerData']
+        title = tracker_data['trackerClipTitle']
+
         media_resource = metadata['mediaResource']
 
         formats = []
 
         # check if the metadata contains a direct URL to a file
-        for kind, media_resource in media_resource.items():
+        for kind, media in media_resource.items():
+            if not isinstance(media, dict):
+                continue
             if kind not in ('dflt', 'alt'):
                 continue
 
-            for tag_name, medium_url in media_resource.items():
+            for tag_name, medium_url in media.items():
                 if tag_name not in ('videoURL', 'audioURL'):
                     continue
 
@@ -88,8 +93,16 @@ class WDRIE(InfoExtractor):
                 'url': caption_url,
                 'ext': 'ttml',
             }]
-
-        title = tracker_data['trackerClipTitle']
+        captions_hash = media_resource.get('captionsHash')
+        if isinstance(captions_hash, dict):
+            for ext, format_url in captions_hash.items():
+                format_url = url_or_none(format_url)
+                if not format_url:
+                    continue
+                subtitles.setdefault('de', []).append({
+                    'url': format_url,
+                    'ext': determine_ext(format_url, None) or ext,
+                })
 
         return {
             'id': tracker_data.get('trackerClipId', video_id),

From 19d8f8301369d8b3641299e611220dc79e5d9fb3 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:11:44 +0100
Subject: [PATCH 130/384] [facebook] Add support archived live video
 URLs(closes #15859)

---
 haruhi_dl/extractor/facebook.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py
index 370365ab8..d5afd0051 100644
--- a/haruhi_dl/extractor/facebook.py
+++ b/haruhi_dl/extractor/facebook.py
@@ -43,7 +43,7 @@ class FacebookIE(InfoExtractor):
                             video\.php|
                             video/embed|
                             story\.php|
-                            watch/?
+                            watch(?:/live)?/?
                         )\?(?:.*?)(?:v|video_id|story_fbid)=|
                         [^/]+/videos/(?:[^/]+/)?|
                         [^/]+/posts/|
@@ -276,6 +276,10 @@ class FacebookIE(InfoExtractor):
             'upload_date': '20161122',
             'timestamp': 1479793574,
         },
+    }, {
+        # data.video.creation_story.attachments[].media
+        'url': 'https://www.facebook.com/watch/live/?v=1823658634322275',
+        'only_matching': True,
     }]
     _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'

From 1646a89d7105524cfbec594ae22a161148d86953 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?=
 <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 15:11:51 +0100
Subject: [PATCH 131/384] [ruutu] Extend _VALID_URL (closes #24839)

---
 haruhi_dl/extractor/ruutu.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/haruhi_dl/extractor/ruutu.py b/haruhi_dl/extractor/ruutu.py
index f984040aa..4dbd144bc 100644
--- a/haruhi_dl/extractor/ruutu.py
+++ b/haruhi_dl/extractor/ruutu.py
@@ -13,7 +13,7 @@ from ..utils import (
 
 
 class RuutuIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla|audio)/(?P<id>\d+)'
     _TESTS = [
         {
             'url': 'http://www.ruutu.fi/video/2058907',
@@ -71,8 +71,15 @@ class RuutuIE(InfoExtractor):
             'thumbnail': r're:^https?://.*\.jpg$',
             'age_limit': 0,
             },
-            'expected_warnings': ['HTTP Error 502: Bad Gateway'],
-        }
+            'expected_warnings': [
+                'HTTP Error 502: Bad Gateway',
+                'Failed to download m3u8 information',
+            ],
+        },
+        {
+            'url': 'http://www.supla.fi/audio/2231370',
+            'only_matching': True,
+        },
     ]
 
     def _real_extract(self, url):

From 39031fb5ac0c06f1f2e1b9fc668987763dbf1f87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?=
 <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 15:11:57 +0100
Subject: [PATCH 132/384] [ruutu] Add support for static.nelonenmedia.fi
 (closes #25412)

---
 haruhi_dl/extractor/ruutu.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/ruutu.py b/haruhi_dl/extractor/ruutu.py
index 4dbd144bc..af42e3b12 100644
--- a/haruhi_dl/extractor/ruutu.py
+++ b/haruhi_dl/extractor/ruutu.py
@@ -13,7 +13,14 @@ from ..utils import (
 
 
 class RuutuIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla|audio)/(?P<id>\d+)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla|audio)/|
+                            static\.nelonenmedia\.fi/player/misc/embed_player\.html\?.*?\bnid=
+                        )
+                        (?P<id>\d+)
+                    '''
     _TESTS = [
         {
             'url': 'http://www.ruutu.fi/video/2058907',
@@ -80,6 +87,10 @@ class RuutuIE(InfoExtractor):
             'url': 'http://www.supla.fi/audio/2231370',
             'only_matching': True,
         },
+        {
+            'url': 'https://static.nelonenmedia.fi/player/misc/embed_player.html?nid=3618790',
+            'only_matching': True,
+        },
     ]
 
     def _real_extract(self, url):

From e9de74c42f6e342f1d7127e9b9fec95fd9468c24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?=
 <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 15:12:03 +0100
Subject: [PATCH 133/384] [ruutu] Authenticate format URLs (closes #21031,
 closes #26782)

---
 haruhi_dl/extractor/ruutu.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/haruhi_dl/extractor/ruutu.py b/haruhi_dl/extractor/ruutu.py
index af42e3b12..561669bb2 100644
--- a/haruhi_dl/extractor/ruutu.py
+++ b/haruhi_dl/extractor/ruutu.py
@@ -7,6 +7,7 @@ from ..utils import (
     determine_ext,
     ExtractorError,
     int_or_none,
+    url_or_none,
     xpath_attr,
     xpath_text,
 )
@@ -92,12 +93,13 @@ class RuutuIE(InfoExtractor):
             'only_matching': True,
         },
     ]
+    _API_BASE = 'https://gatling.nelonenmedia.fi'
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         video_xml = self._download_xml(
-            'https://gatling.nelonenmedia.fi/media-xml-cache', video_id,
+            '%s/media-xml-cache' % self._API_BASE, video_id,
             query={'id': video_id})
 
         formats = []
@@ -114,9 +116,18 @@ class RuutuIE(InfoExtractor):
                     continue
                 processed_urls.append(video_url)
                 ext = determine_ext(video_url)
+                auth_video_url = url_or_none(self._download_webpage(
+                    '%s/auth/access/v2' % self._API_BASE, video_id,
+                    note='Downloading authenticated %s stream URL' % ext,
+                    fatal=False, query={'stream': video_url}))
+                if auth_video_url:
+                    processed_urls.append(auth_video_url)
+                    video_url = auth_video_url
                 if ext == 'm3u8':
                     formats.extend(self._extract_m3u8_formats(
-                        video_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+                        video_url, video_id, 'mp4',
+                        entry_protocol='m3u8_native', m3u8_id='hls',
+                        fatal=False))
                 elif ext == 'f4m':
                     formats.extend(self._extract_f4m_formats(
                         video_url, video_id, f4m_id='hds', fatal=False))

From c1f59f3fb6cb89b816ff905bcbd52832b800e097 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?=
 <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 15:12:07 +0100
Subject: [PATCH 134/384] [ruutu] Extract more metadata and detect non-free
 videos (closes #21154)

---
 haruhi_dl/extractor/ruutu.py | 53 +++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 4 deletions(-)

diff --git a/haruhi_dl/extractor/ruutu.py b/haruhi_dl/extractor/ruutu.py
index 561669bb2..c50cd3ecd 100644
--- a/haruhi_dl/extractor/ruutu.py
+++ b/haruhi_dl/extractor/ruutu.py
@@ -6,7 +6,9 @@ from ..compat import compat_urllib_parse_urlparse
 from ..utils import (
     determine_ext,
     ExtractorError,
+    find_xpath_attr,
     int_or_none,
+    unified_strdate,
     url_or_none,
     xpath_attr,
     xpath_text,
@@ -92,6 +94,32 @@ class RuutuIE(InfoExtractor):
             'url': 'https://static.nelonenmedia.fi/player/misc/embed_player.html?nid=3618790',
             'only_matching': True,
         },
+        {
+            # episode
+            'url': 'https://www.ruutu.fi/video/3401964',
+            'info_dict': {
+                'id': '3401964',
+                'ext': 'mp4',
+                'title': 'Temptation Island Suomi - Kausi 5 - Jakso 17',
+                'description': 'md5:87cf01d5e1e88adf0c8a2937d2bd42ba',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 2582,
+                'age_limit': 12,
+                'upload_date': '20190508',
+                'series': 'Temptation Island Suomi',
+                'season_number': 5,
+                'episode_number': 17,
+                'categories': ['Reality ja tositapahtumat', 'Kotimaiset
suosikit', 'Romantiikka ja parisuhde'], + }, + 'params': { + 'skip_download': True, + }, + }, + { + # premium + 'url': 'https://www.ruutu.fi/video/3618715', + 'only_matching': True, + }, ] _API_BASE = 'https://gatling.nelonenmedia.fi' @@ -165,18 +193,35 @@ class RuutuIE(InfoExtractor): extract_formats(video_xml.find('./Clip')) - drm = xpath_text(video_xml, './Clip/DRM', default=None) - if not formats and drm: - raise ExtractorError('This video is DRM protected.', expected=True) + def pv(name): + node = find_xpath_attr( + video_xml, './Clip/PassthroughVariables/variable', 'name', name) + if node is not None: + return node.get('value') + + if not formats: + drm = xpath_text(video_xml, './Clip/DRM', default=None) + if drm: + raise ExtractorError('This video is DRM protected.', expected=True) + ns_st_cds = pv('ns_st_cds') + if ns_st_cds != 'free': + raise ExtractorError('This video is %s.' % ns_st_cds, expected=True) self._sort_formats(formats) + themes = pv('themes') + return { 'id': video_id, 'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True), 'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'), 'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'), - 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')), + 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')) or int_or_none(pv('runtime')), 'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')), + 'upload_date': unified_strdate(pv('date_start')), + 'series': pv('series_name'), + 'season_number': int_or_none(pv('season_number')), + 'episode_number': int_or_none(pv('episode_number')), + 'categories': themes.split(',') if themes else [], 'formats': formats, } From d1a7ceb19afb372beae64e334e97f85b6322f663 Mon Sep 17 00:00:00 2001 From: toniz4 <cassioavila000@gmail.com> Date: Fri, 26 Feb 2021 15:12:33 +0100 Subject: [PATCH 135/384] [youtube] Add some invidious instances (#27373) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Cássio <heyitscassio@cock.li> --- haruhi_dl/extractor/youtube.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/haruhi_dl/extractor/youtube.py b/haruhi_dl/extractor/youtube.py index 2430fa180..dd58b2407 100644 --- a/haruhi_dl/extractor/youtube.py +++ b/haruhi_dl/extractor/youtube.py @@ -293,10 +293,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?invidious\.kabi\.tk/| (?:www\.)?invidious\.13ad\.de/| (?:www\.)?invidious\.mastodon\.host/| + (?:www\.)?invidious\.zapashcanon\.fr/| + (?:www\.)?invidious\.kavin\.rocks/| + (?:www\.)?invidious\.tube/| + (?:www\.)?invidiou\.site/| + (?:www\.)?invidious\.site/| + (?:www\.)?invidious\.xyz/| (?:www\.)?invidious\.nixnet\.xyz/| (?:www\.)?invidious\.drycat\.fr/| (?:www\.)?tube\.poal\.co/| + (?:www\.)?tube\.connect\.cafe/| (?:www\.)?vid\.wxzm\.sx/| + (?:www\.)?vid\.mint\.lgbt/| (?:www\.)?yewtu\.be/| (?:www\.)?yt\.elukerio\.org/| (?:www\.)?yt\.lelux\.fi/| From a1e744970320b513cb66b1fa38f589418978d099 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:12:42 +0100 Subject: [PATCH 136/384] [hotstart] fix and improve extraction - fix format extraction (closes #26690) - extract thumbnail URL (closes #16079, closes #20412) - support country specific playlist URLs (closes #23496) - select the last id in video URL (closes #26412) --- haruhi_dl/extractor/hotstar.py | 98 ++++++++++++++++++++++++---------- 1 file 
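# Aside (not part of the diff): the net effect of the Ruutu authentication
# patch above is one extra round trip per stream URL -- the unsigned URL is
# sent to /auth/access/v2 and whatever comes back is what actually gets
# downloaded. A stdlib-only (Python 3) sketch of that exchange; the endpoint
# and query parameter come from the patch, everything else is simplified:
import urllib.parse
import urllib.request

API_BASE = 'https://gatling.nelonenmedia.fi'

def authenticate_stream_url(stream_url):
    query = urllib.parse.urlencode({'stream': stream_url})
    try:
        with urllib.request.urlopen('%s/auth/access/v2?%s' % (API_BASE, query)) as resp:
            auth_url = resp.read().decode('utf-8').strip()
    except OSError:
        return stream_url  # mirrors fatal=False: fall back to the unsigned URL
    return auth_url if auth_url.startswith('http') else stream_url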
changed, 70 insertions(+), 28 deletions(-) diff --git a/haruhi_dl/extractor/hotstar.py b/haruhi_dl/extractor/hotstar.py index f97eefa3d..1620822b6 100644 --- a/haruhi_dl/extractor/hotstar.py +++ b/haruhi_dl/extractor/hotstar.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import hashlib import hmac +import json import re import time import uuid @@ -25,43 +26,50 @@ from ..utils import ( class HotStarBaseIE(InfoExtractor): _AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee' - def _call_api_impl(self, path, video_id, query): + def _call_api_impl(self, path, video_id, headers, query, data=None): st = int(time.time()) exp = st + 6000 auth = 'st=%d~exp=%d~acl=/*' % (st, exp) auth += '~hmac=' + hmac.new(self._AKAMAI_ENCRYPTION_KEY, auth.encode(), hashlib.sha256).hexdigest() - response = self._download_json( - 'https://api.hotstar.com/' + path, video_id, headers={ - 'hotstarauth': auth, - 'x-country-code': 'IN', - 'x-platform-code': 'JIO', - }, query=query) + h = {'hotstarauth': auth} + h.update(headers) + return self._download_json( + 'https://api.hotstar.com/' + path, + video_id, headers=h, query=query, data=data) + + def _call_api(self, path, video_id, query_name='contentId'): + response = self._call_api_impl(path, video_id, { + 'x-country-code': 'IN', + 'x-platform-code': 'JIO', + }, { + query_name: video_id, + 'tas': 10000, + }) if response['statusCode'] != 'OK': raise ExtractorError( response['body']['message'], expected=True) return response['body']['results'] - def _call_api(self, path, video_id, query_name='contentId'): - return self._call_api_impl(path, video_id, { - query_name: video_id, - 'tas': 10000, - }) - - def _call_api_v2(self, path, video_id): - return self._call_api_impl( - '%s/in/contents/%s' % (path, video_id), video_id, { - 'desiredConfig': 'encryption:plain;ladder:phone,tv;package:hls,dash', - 'client': 'mweb', - 'clientVersion': '6.18.0', - 'deviceId': compat_str(uuid.uuid4()), - 'osName': 'Windows', - 'osVersion': '10', - }) + def _call_api_v2(self, path, video_id, headers, query=None, data=None): + h = {'X-Request-Id': compat_str(uuid.uuid4())} + h.update(headers) + try: + return self._call_api_impl( + path, video_id, h, query, data) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + if e.cause.code == 402: + self.raise_login_required() + message = self._parse_json(e.cause.read().decode(), video_id)['message'] + if message in ('Content not available in region', 'Country is not supported'): + raise self.raise_geo_restricted(message) + raise ExtractorError(message) + raise e class HotStarIE(HotStarBaseIE): IE_NAME = 'hotstar' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P<id>\d{10})' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+[/-])?(?P<id>\d{10})' _TESTS = [{ # contentData 'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273', @@ -92,8 +100,13 @@ class HotStarIE(HotStarBaseIE): # only available via api v2 'url': 'https://www.hotstar.com/tv/ek-bhram-sarvagun-sampanna/s-2116/janhvi-targets-suman/1000234847', 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/tv/start-music/1260005217/cooks-vs-comalis/1100039717', + 'only_matching': True, }] _GEO_BYPASS = False + _DEVICE_ID = None + _USER_TOKEN = None def _real_extract(self, url): video_id = self._match_id(url) @@ -121,7 +134,30 @@ class HotStarIE(HotStarBaseIE): headers = {'Referer': url} formats = [] geo_restricted = False - playback_sets = self._call_api_v2('h/v2/play', 
video_id)['playBackSets'] + + if not self._USER_TOKEN: + self._DEVICE_ID = compat_str(uuid.uuid4()) + self._USER_TOKEN = self._call_api_v2('um/v3/users', video_id, { + 'X-HS-Platform': 'PCTV', + 'Content-Type': 'application/json', + }, data=json.dumps({ + 'device_ids': [{ + 'id': self._DEVICE_ID, + 'type': 'device_id', + }], + }).encode())['user_identity'] + + playback_sets = self._call_api_v2( + 'play/v2/playback/content/' + video_id, video_id, { + 'X-HS-Platform': 'web', + 'X-HS-AppVersion': '6.99.1', + 'X-HS-UserToken': self._USER_TOKEN, + }, query={ + 'device-id': self._DEVICE_ID, + 'desired-config': 'encryption:plain', + 'os-name': 'Windows', + 'os-version': '10', + })['data']['playBackSets'] for playback_set in playback_sets: if not isinstance(playback_set, dict): continue @@ -163,19 +199,22 @@ class HotStarIE(HotStarBaseIE): for f in formats: f.setdefault('http_headers', {}).update(headers) + image = try_get(video_data, lambda x: x['image']['h'], compat_str) + return { 'id': video_id, 'title': title, + 'thumbnail': 'https://img1.hotstarext.com/image/upload/' + image if image else None, 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')), 'formats': formats, 'channel': video_data.get('channelName'), - 'channel_id': video_data.get('channelId'), + 'channel_id': str_or_none(video_data.get('channelId')), 'series': video_data.get('showName'), 'season': video_data.get('seasonName'), 'season_number': int_or_none(video_data.get('seasonNo')), - 'season_id': video_data.get('seasonId'), + 'season_id': str_or_none(video_data.get('seasonId')), 'episode': title, 'episode_number': int_or_none(video_data.get('episodeNo')), } @@ -183,7 +222,7 @@ class HotStarIE(HotStarBaseIE): class HotStarPlaylistIE(HotStarBaseIE): IE_NAME = 'hotstar:playlist' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com/tv/[^/]+/s-\w+/list/[^/]+/t-(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:[a-z]{2}/)?tv/[^/]+/s-\w+/list/[^/]+/t-(?P<id>\w+)' _TESTS = [{ 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26', 'info_dict': { @@ -193,6 +232,9 @@ class HotStarPlaylistIE(HotStarBaseIE): }, { 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480', 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/us/tv/masterchef-india/s-830/list/episodes/t-1_2_830', + 'only_matching': True, }] def _real_extract(self, url): From 0311375dc5bc4c8294b612e61a69000bb1c0dd6b Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:13:05 +0100 Subject: [PATCH 137/384] [itv] clean description from HTML tags (closes #27399) --- haruhi_dl/extractor/itv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/itv.py b/haruhi_dl/extractor/itv.py index 08bcc8b68..e86c40b42 100644 --- a/haruhi_dl/extractor/itv.py +++ b/haruhi_dl/extractor/itv.py @@ -7,6 +7,7 @@ import re from .common import InfoExtractor from .brightcove import BrightcoveNewIE from ..utils import ( + clean_html, determine_ext, extract_attributes, get_element_by_class, @@ -14,7 +15,6 @@ from ..utils import ( merge_dicts, parse_duration, smuggle_url, - strip_or_none, url_or_none, ) @@ -146,7 +146,7 @@ class ITVIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, 'duration': parse_duration(video_data.get('Duration')), - 'description': strip_or_none(get_element_by_class('episode-info__synopsis', 
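# Aside (not part of the diff): the HotStar rewrite above replaces the old
# single h/v2/play call with a two-step anonymous session. A compressed,
# stdlib-only (Python 3) sketch of that flow -- it deliberately omits the
# hotstarauth HMAC header that _call_api_impl still adds, so treat it as an
# illustration of the request shape, not a working client:
import json
import urllib.parse
import urllib.request
import uuid

def fetch_playback_sets(video_id):
    # 1. register a throwaway device id and obtain an anonymous user token
    device_id = str(uuid.uuid4())
    req = urllib.request.Request(
        'https://api.hotstar.com/um/v3/users',
        data=json.dumps({
            'device_ids': [{'id': device_id, 'type': 'device_id'}],
        }).encode(),
        headers={'X-HS-Platform': 'PCTV', 'Content-Type': 'application/json'})
    user_token = json.load(urllib.request.urlopen(req))['user_identity']
    # 2. fetch the playback sets with that token
    query = urllib.parse.urlencode({
        'device-id': device_id,
        'desired-config': 'encryption:plain',
        'os-name': 'Windows',
        'os-version': '10',
    })
    req = urllib.request.Request(
        'https://api.hotstar.com/play/v2/playback/content/%s?%s' % (video_id, query),
        headers={
            'X-HS-Platform': 'web',
            'X-HS-AppVersion': '6.99.1',
            'X-HS-UserToken': user_token,
        })
    return json.load(urllib.request.urlopen(req))['data']['playBackSets']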
webpage)), + 'description': clean_html(get_element_by_class('episode-info__synopsis', webpage)), }, info) From 28bbfdff53d2cb5db81d3de1c09518ecdc3e4c7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:13:12 +0100 Subject: [PATCH 138/384] =?UTF-8?q?[linuxacademy]=20Fix=20authentication?= =?UTF-8?q?=20and=20extraction=20(closes=20#21129,=20clos=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …es #26223, closes #27402) --- haruhi_dl/extractor/linuxacademy.py | 130 +++++++++++++++++++++------- 1 file changed, 100 insertions(+), 30 deletions(-) diff --git a/haruhi_dl/extractor/linuxacademy.py b/haruhi_dl/extractor/linuxacademy.py index 23ca965d9..7ec4a6557 100644 --- a/haruhi_dl/extractor/linuxacademy.py +++ b/haruhi_dl/extractor/linuxacademy.py @@ -8,11 +8,15 @@ from .common import InfoExtractor from ..compat import ( compat_b64decode, compat_HTTPError, + compat_str, ) from ..utils import ( + clean_html, ExtractorError, - orderedSet, - unescapeHTML, + js_to_json, + parse_duration, + try_get, + unified_timestamp, urlencode_postdata, urljoin, ) @@ -28,11 +32,15 @@ class LinuxAcademyIE(InfoExtractor): ) ''' _TESTS = [{ - 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2/module/154', + 'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675', 'info_dict': { - 'id': '1498-2', + 'id': '7971-2', 'ext': 'mp4', - 'title': "Introduction to the Practitioner's Brief", + 'title': 'What Is Data Science', + 'description': 'md5:c574a3c20607144fb36cb65bdde76c99', + 'timestamp': 1607387907, + 'upload_date': '20201208', + 'duration': 304, }, 'params': { 'skip_download': True, @@ -46,7 +54,8 @@ class LinuxAcademyIE(InfoExtractor): 'info_dict': { 'id': '154', 'title': 'AWS Certified Cloud Practitioner', - 'description': 'md5:039db7e60e4aac9cf43630e0a75fa834', + 'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c', + 'duration': 28835, }, 'playlist_count': 41, 'skip': 'Requires Linux Academy account credentials', @@ -74,6 +83,7 @@ class LinuxAcademyIE(InfoExtractor): self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ 'client_id': self._CLIENT_ID, 'response_type': 'token id_token', + 'response_mode': 'web_message', 'redirect_uri': self._ORIGIN_URL, 'scope': 'openid email user_impersonation profile', 'audience': self._ORIGIN_URL, @@ -129,7 +139,13 @@ class LinuxAcademyIE(InfoExtractor): access_token = self._search_regex( r'access_token=([^=&]+)', urlh.geturl(), - 'access token') + 'access token', default=None) + if not access_token: + access_token = self._parse_json( + self._search_regex( + r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page, + 'authorization response'), None, + transform_source=js_to_json)['response']['access_token'] self._download_webpage( 'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s' @@ -144,30 +160,84 @@ class LinuxAcademyIE(InfoExtractor): # course path if course_id: - entries = [ - self.url_result( - urljoin(url, lesson_url), ie=LinuxAcademyIE.ie_key()) - for lesson_url in orderedSet(re.findall( - r'<a[^>]+\bhref=["\'](/cp/courses/lesson/course/\d+/lesson/\d+/module/\d+)', - webpage))] - title = unescapeHTML(self._html_search_regex( - (r'class=["\']course-title["\'][^>]*>(?P<value>[^<]+)', - r'var\s+title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), - webpage, 'title', default=None, group='value')) - description = unescapeHTML(self._html_search_regex( - 
r'var\s+description\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', - webpage, 'description', default=None, group='value')) - return self.playlist_result(entries, course_id, title, description) + module = self._parse_json( + self._search_regex( + r'window\.module\s*=\s*({.+?})\s*;', webpage, 'module'), + item_id) + entries = [] + chapter_number = None + chapter = None + chapter_id = None + for item in module['items']: + if not isinstance(item, dict): + continue + + def type_field(key): + return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower() + type_fields = (type_field('name'), type_field('slug')) + # Move to next module section + if 'section' in type_fields: + chapter = item.get('course_name') + chapter_id = item.get('course_module') + chapter_number = 1 if not chapter_number else chapter_number + 1 + continue + # Skip non-lessons + if 'lesson' not in type_fields: + continue + lesson_url = urljoin(url, item.get('url')) + if not lesson_url: + continue + title = item.get('title') or item.get('lesson_name') + description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text')) + entries.append({ + '_type': 'url_transparent', + 'url': lesson_url, + 'ie_key': LinuxAcademyIE.ie_key(), + 'title': title, + 'description': description, + 'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')), + 'duration': parse_duration(item.get('duration')), + 'chapter': chapter, + 'chapter_id': chapter_id, + 'chapter_number': chapter_number, + }) + return { + '_type': 'playlist', + 'entries': entries, + 'id': course_id, + 'title': module.get('title'), + 'description': module.get('md_desc') or clean_html(module.get('desc')), + 'duration': parse_duration(module.get('duration')), + } # single video path - info = self._extract_jwplayer_data( - webpage, item_id, require_title=False, m3u8_id='hls',) - title = self._search_regex( - (r'>Lecture\s*:\s*(?P<value>[^<]+)', - r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, - 'title', group='value') - info.update({ + m3u8_url = self._parse_json( + self._search_regex( + r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'), + item_id)[0]['file'] + formats = self._extract_m3u8_formats( + m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + info = { 'id': item_id, - 'title': title, - }) + 'formats': formats, + } + lesson = self._parse_json( + self._search_regex( + (r'window\.lesson\s*=\s*({.+?})\s*;', + r'player\.lesson\s*=\s*({.+?})\s*;'), + webpage, 'lesson', default='{}'), item_id, fatal=False) + if lesson: + info.update({ + 'title': lesson.get('lesson_name'), + 'description': lesson.get('md_desc') or clean_html(lesson.get('desc')), + 'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')), + 'duration': parse_duration(lesson.get('duration')), + }) + if not info.get('title'): + info['title'] = self._search_regex( + (r'>Lecture\s*:\s*(?P<value>[^<]+)', + r'lessonName\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, + 'title', group='value') return info From 51d290f5d7d0e339fa72edbb198dd0113dbd1fe4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:13:21 +0100 Subject: [PATCH 139/384] [extractor/common] Document duration meta field for playlists --- haruhi_dl/extractor/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/common.py 
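# Aside (not part of the diff): the core of the Linux Academy course path
# above is a single pass over window.module['items'] -- 'section' items
# advance the chapter bookkeeping, 'lesson' items become playlist entries.
# A reduced sketch of that walk over invented data:
module = {'items': [
    {'type': {'name': 'Section'}, 'course_name': 'Intro', 'course_module': 'm1'},
    {'type': {'name': 'Lesson'}, 'title': 'What Is Data Science', 'url': '/lesson/1'},
    {'type': {'name': 'Lesson'}, 'title': 'Tooling', 'url': '/lesson/2'},
]}
entries = []
chapter = chapter_id = chapter_number = None
for item in module['items']:
    kind = (item.get('type') or {}).get('name', '').lower()
    if kind == 'section':
        chapter = item.get('course_name')
        chapter_id = item.get('course_module')
        chapter_number = (chapter_number or 0) + 1
        continue
    if kind != 'lesson' or not item.get('url'):
        continue
    entries.append({
        'title': item.get('title'),
        'url': item['url'],
        'chapter': chapter,
        'chapter_id': chapter_id,
        'chapter_number': chapter_number,
    })
print(entries)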
b/haruhi_dl/extractor/common.py index 2db95d592..3353987a3 100644 --- a/haruhi_dl/extractor/common.py +++ b/haruhi_dl/extractor/common.py @@ -336,8 +336,8 @@ class InfoExtractor(object): object, each element of which is a valid dictionary by this specification. Additionally, playlists can have "id", "title", "description", "uploader", - "uploader_id", "uploader_url" attributes with the same semantics as videos - (see above). + "uploader_id", "uploader_url", "duration" attributes with the same semantics + as videos (see above). _type "multi_video" indicates that there are multiple videos that From 0f3f3e90466380e73cbe5c85ea2ef95f821843d9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:13:28 +0100 Subject: [PATCH 140/384] =?UTF-8?q?[twitcasting]=20fix=20format=20extracti?= =?UTF-8?q?on=20and=20improve=20info=20extraction(close=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …s #24868) --- haruhi_dl/extractor/twitcasting.py | 72 +++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 21 deletions(-) diff --git a/haruhi_dl/extractor/twitcasting.py b/haruhi_dl/extractor/twitcasting.py index 2dbe89f5b..6596eef9f 100644 --- a/haruhi_dl/extractor/twitcasting.py +++ b/haruhi_dl/extractor/twitcasting.py @@ -1,11 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import urlencode_postdata - import re +from .common import InfoExtractor +from ..utils import ( + clean_html, + float_or_none, + get_element_by_class, + get_element_by_id, + parse_duration, + str_to_int, + unified_timestamp, + urlencode_postdata, +) + class TwitCastingIE(InfoExtractor): _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/movie/(?P<id>\d+)' @@ -17,8 +26,12 @@ class TwitCastingIE(InfoExtractor): 'ext': 'mp4', 'title': 'Live #2357609', 'uploader_id': 'ivetesangalo', - 'description': "Moi! I'm live on TwitCasting from my iPhone.", + 'description': 'Twitter Oficial da cantora brasileira Ivete Sangalo.', 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20110822', + 'timestamp': 1314010824, + 'duration': 32, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -30,8 +43,12 @@ class TwitCastingIE(InfoExtractor): 'ext': 'mp4', 'title': 'Live playing something #3689740', 'uploader_id': 'mttbernardini', - 'description': "I'm live on TwitCasting from my iPad. password: abc (Santa Marinella/Lazio, Italia)", + 'description': 'Salve, io sono Matto (ma con la e). 
Questa è la mia presentazione, in quanto sono letteralmente matto (nel senso di strano), con qualcosa in più.', 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20120212', + 'timestamp': 1329028024, + 'duration': 681, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -40,9 +57,7 @@ class TwitCastingIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - uploader_id = mobj.group('uploader_id') + uploader_id, video_id = re.match(self._VALID_URL, url).groups() video_password = self._downloader.params.get('videopassword') request_data = None @@ -52,30 +67,45 @@ class TwitCastingIE(InfoExtractor): }) webpage = self._download_webpage(url, video_id, data=request_data) - title = self._html_search_regex( - r'(?s)<[^>]+id=["\']movietitle[^>]+>(.+?)</', - webpage, 'title', default=None) or self._html_search_meta( - 'twitter:title', webpage, fatal=True) + title = clean_html(get_element_by_id( + 'movietitle', webpage)) or self._html_search_meta( + ['og:title', 'twitter:title'], webpage, fatal=True) + video_js_data = {} m3u8_url = self._search_regex( - (r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - r'(["\'])(?P<url>http.+?\.m3u8.*?)\1'), - webpage, 'm3u8 url', group='url') + r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'm3u8 url', group='url', default=None) + if not m3u8_url: + video_js_data = self._parse_json(self._search_regex( + r"data-movie-playlist='(\[[^']+\])'", + webpage, 'movie playlist'), video_id)[0] + m3u8_url = video_js_data['source']['url'] + # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + m3u8_url, video_id, 'mp4', m3u8_id='hls') - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'twitter:description', webpage) + thumbnail = video_js_data.get('thumbnailUrl') or self._og_search_thumbnail(webpage) + description = clean_html(get_element_by_id( + 'authorcomment', webpage)) or self._html_search_meta( + ['description', 'og:description', 'twitter:description'], webpage) + duration = float_or_none(video_js_data.get( + 'duration'), 1000) or parse_duration(clean_html( + get_element_by_class('tw-player-duration-time', webpage))) + view_count = str_to_int(self._search_regex( + r'Total\s*:\s*([\d,]+)\s*Views', webpage, 'views', None)) + timestamp = unified_timestamp(self._search_regex( + r'data-toggle="true"[^>]+datetime="([^"]+)"', + webpage, 'datetime', None)) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, + 'timestamp': timestamp, 'uploader_id': uploader_id, + 'duration': duration, + 'view_count': view_count, 'formats': formats, } From a7f325972c2a080e2b7457812f8dbad7b329be8a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:13:34 +0100 Subject: [PATCH 141/384] [downloader/hls] delegate manifests with media initialization to ffmpeg --- haruhi_dl/downloader/hls.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/haruhi_dl/downloader/hls.py b/haruhi_dl/downloader/hls.py index 0cd16db87..56c84e113 100644 --- a/haruhi_dl/downloader/hls.py +++ b/haruhi_dl/downloader/hls.py @@ -42,11 +42,13 @@ class HlsFD(FragmentFD): # no segments will definitely be appended to the end of the playlist. 
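# Aside (not part of the diff): with the #EXT-X-MAP entry this hunk adds to
# UNSUPPORTED_FEATURES, any manifest that declares media initialization now
# fails the native-downloader feature check and is handed to ffmpeg. A
# minimal demonstration of that check over an invented fMP4 manifest:
import re

UNSUPPORTED_FEATURES = (
    r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)',  # encrypted streams
    r'#EXT-X-MAP:',                         # media initialization
)
manifest = '#EXTM3U\n#EXT-X-MAP:URI="init.mp4"\n#EXTINF:4,\nseg0.m4s\n'
can_download_natively = all(
    not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES)
print(can_download_natively)  # False -> delegate this manifest to ffmpeg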
# r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of # # event media playlists [4] + r'#EXT-X-MAP:', # media initialization [5] # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 + # 5. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.5 ) check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES] is_aes128_enc = '#EXT-X-KEY:METHOD=AES-128' in manifest From f42fea540291cb3e8ca551ef1d1d2ff2f22a109d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:13:40 +0100 Subject: [PATCH 142/384] [slideslive] use m3u8 entry protocol for m3u8 formats(closes #27400) --- haruhi_dl/extractor/slideslive.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/slideslive.py b/haruhi_dl/extractor/slideslive.py index cd70841a9..9409a0100 100644 --- a/haruhi_dl/extractor/slideslive.py +++ b/haruhi_dl/extractor/slideslive.py @@ -83,9 +83,10 @@ class SlidesLiveIE(InfoExtractor): else: formats = [] _MANIFEST_PATTERN = 'https://01.cdn.yoda.slideslive.com/%s/master.%s' + # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol formats.extend(self._extract_m3u8_formats( - _MANIFEST_PATTERN % (service_id, 'm3u8'), service_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + _MANIFEST_PATTERN % (service_id, 'm3u8'), + service_id, 'mp4', m3u8_id='hls', fatal=False)) formats.extend(self._extract_mpd_formats( _MANIFEST_PATTERN % (service_id, 'mpd'), service_id, mpd_id='dash', fatal=False)) From ff7c31e4f2c250b8e072879d7c5e9d6e7bbe3183 Mon Sep 17 00:00:00 2001 From: spvkgn <spvkgn@users.noreply.github.com> Date: Fri, 26 Feb 2021 15:13:48 +0100 Subject: [PATCH 143/384] [eporner] Fix hash extraction and extend _VALID_URL (#27396) Co-authored-by: Sergey M <dstftw@gmail.com> --- haruhi_dl/extractor/eporner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/eporner.py b/haruhi_dl/extractor/eporner.py index fe42821c7..709925471 100644 --- a/haruhi_dl/extractor/eporner.py +++ b/haruhi_dl/extractor/eporner.py @@ -16,7 +16,7 @@ from ..utils import ( class EpornerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?eporner\.com/(?:hd-porn|embed)/(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?' + _VALID_URL = r'https?://(?:www\.)?eporner\.com/(?:(?:hd-porn|embed)/|video-)(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?' _TESTS = [{ 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', 'md5': '39d486f046212d8e1b911c52ab4691f8', @@ -45,6 +45,9 @@ class EpornerIE(InfoExtractor): }, { 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', 'only_matching': True, + }, { + 'url': 'https://www.eporner.com/video-FJsA19J3Y3H/one-of-the-greats/', + 'only_matching': True, }] def _real_extract(self, url): @@ -57,7 +60,7 @@ class EpornerIE(InfoExtractor): video_id = self._match_id(urlh.geturl()) hash = self._search_regex( - r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash') + r'hash\s*[:=]\s*["\']([\da-f]{32})', webpage, 'hash') title = self._og_search_title(webpage, default=None) or self._html_search_regex( r'<title>(.+?) 
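# Aside (not part of the diff): the widened eporner pattern above accepts
# the newer /video-<id>/ URLs alongside the /hd-porn/ and /embed/ forms.
# Quick self-check with the test URLs from the patch:
import re

VALID_URL = r'https?://(?:www\.)?eporner\.com/(?:(?:hd-porn|embed)/|video-)(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?'
for u in (
        'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/',
        'https://www.eporner.com/video-FJsA19J3Y3H/one-of-the-greats/'):
    print(re.match(VALID_URL, u).group('id'))  # 95008, then FJsA19J3Y3H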
- EPORNER', webpage, 'title') From 2f63edb44aafa5cf0cc83432a7f7aea50045115a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:13:58 +0100 Subject: [PATCH 144/384] [eporner] Fix embed test URL --- haruhi_dl/extractor/eporner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/eporner.py b/haruhi_dl/extractor/eporner.py index 709925471..920fb417e 100644 --- a/haruhi_dl/extractor/eporner.py +++ b/haruhi_dl/extractor/eporner.py @@ -43,7 +43,7 @@ class EpornerIE(InfoExtractor): 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', 'only_matching': True, }, { - 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', + 'url': 'http://www.eporner.com/embed/3YRUtzMcWn0', 'only_matching': True, }, { 'url': 'https://www.eporner.com/video-FJsA19J3Y3H/one-of-the-greats/', From e4b993e9dbc8b7161a1df8593105d8695b2c0fce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:14:10 +0100 Subject: [PATCH 145/384] =?UTF-8?q?[extractor/common]=20Improve=20JSON-LD?= =?UTF-8?q?=20interaction=20statistic=20extraction=20(=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …refs #23306) --- haruhi_dl/extractor/common.py | 12 +++++++-- test/test_InfoExtractor.py | 50 +++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py index 3353987a3..f845688f5 100644 --- a/haruhi_dl/extractor/common.py +++ b/haruhi_dl/extractor/common.py @@ -1239,8 +1239,16 @@ class InfoExtractor(object): 'ViewAction': 'view', } + def extract_interaction_type(e): + interaction_type = e.get('interactionType') + if isinstance(interaction_type, dict): + interaction_type = interaction_type.get('@type') + return str_or_none(interaction_type) + def extract_interaction_statistic(e): interaction_statistic = e.get('interactionStatistic') + if isinstance(interaction_statistic, dict): + interaction_statistic = [interaction_statistic] if not isinstance(interaction_statistic, list): return for is_e in interaction_statistic: @@ -1248,8 +1256,8 @@ class InfoExtractor(object): continue if is_e.get('@type') != 'InteractionCounter': continue - interaction_type = is_e.get('interactionType') - if not isinstance(interaction_type, compat_str): + interaction_type = extract_interaction_type(is_e) + if not interaction_type: continue # For interaction count some sites provide string instead of # an integer (as per spec) with non digit characters (e.g. 
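# Aside (not part of the diff): the two JSON-LD shapes the change above now
# tolerates -- a bare InteractionCounter object instead of a list, and an
# interactionType that is itself an object. Invented sample data:
stat = {
    '@type': 'InteractionCounter',
    'interactionType': {'@type': 'http://schema.org/WatchAction'},
    'userInteractionCount': 1120958,
}
interaction_type = stat['interactionType']
if isinstance(interaction_type, dict):  # new: unwrap the object form
    interaction_type = interaction_type.get('@type')
print(interaction_type.rpartition('/')[2], stat['userInteractionCount'])
# -> WatchAction 1120958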
",") diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index a7a15b8ae..54ed446d5 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -98,6 +98,56 @@ class TestInfoExtractor(unittest.TestCase): self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) + def test_search_json_ld_realworld(self): + # https://github.com/hdl-org/haruhi-dl/issues/23306 + expect_dict( + self, + self.ie._search_json_ld(r'''<script type="application/ld+json"> +{ +"@context": "http://schema.org/", +"@type": "VideoObject", +"name": "1 On 1 With Kleio", +"url": "https://www.eporner.com/hd-porn/xN49A1cT3eB/1-On-1-With-Kleio/", +"duration": "PT0H12M23S", +"thumbnailUrl": ["https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg", "https://imggen.eporner.com/780814/1920/1080/9.jpg"], +"contentUrl": "https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4", +"embedUrl": "https://www.eporner.com/embed/xN49A1cT3eB/1-On-1-With-Kleio/", +"image": "https://static-eu-cdn.eporner.com/thumbs/static4/7/78/780/780814/9_360.jpg", +"width": "1920", +"height": "1080", +"encodingFormat": "mp4", +"bitrate": "6617kbps", +"isFamilyFriendly": "False", +"description": "Kleio Valentien", +"uploadDate": "2015-12-05T21:24:35+01:00", +"interactionStatistic": { +"@type": "InteractionCounter", +"interactionType": { "@type": "http://schema.org/WatchAction" }, +"userInteractionCount": 1120958 +}, "aggregateRating": { +"@type": "AggregateRating", +"ratingValue": "88", +"ratingCount": "630", +"bestRating": "100", +"worstRating": "0" +}, "actor": [{ +"@type": "Person", +"name": "Kleio Valentien", +"url": "https://www.eporner.com/pornstar/kleio-valentien/" +}]} +</script>''', None), + { + 'title': '1 On 1 With Kleio', + 'description': 'Kleio Valentien', + 'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4', + 'timestamp': 1449347075, + 'duration': 743.0, + 'view_count': 1120958, + 'width': 1920, + 'height': 1080, + }) + + def test_download_json(self): uri = encode_data_uri(b'{"foo": "blah"}', 'application/json') self.assertEqual(self.ie._download_json(uri, None), {'foo': 'blah'}) From c4dfcc3d9c02cdff8b778fdc16fd652c33279163 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:14:15 +0100 Subject: [PATCH 146/384] [eporner] Fix view count extraction and make optional (closes #23306) --- haruhi_dl/extractor/eporner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/eporner.py b/haruhi_dl/extractor/eporner.py index 920fb417e..bfecd3a41 100644 --- a/haruhi_dl/extractor/eporner.py +++ b/haruhi_dl/extractor/eporner.py @@ -118,8 +118,8 @@ class EpornerIE(InfoExtractor): duration = parse_duration(self._html_search_meta( 'duration', webpage, default=None)) view_count = str_to_int(self._search_regex( - r'id="cinemaviews">\s*([0-9,]+)\s*<small>views', - webpage, 'view count', fatal=False)) + r'id=["\']cinemaviews1["\'][^>]*>\s*([0-9,]+)', + webpage, 'view count', default=None)) return merge_dicts(json_ld, { 'id': video_id, From 1339530c44d92db7d2f9652f148af0be7f4ccba3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:15:27 +0100 Subject: [PATCH 147/384] [mdr] Improve extraction (closes #24346, closes #26873) --- haruhi_dl/extractor/mdr.py | 75 
+++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/haruhi_dl/extractor/mdr.py b/haruhi_dl/extractor/mdr.py index 322e5b45a..38afdc789 100644 --- a/haruhi_dl/extractor/mdr.py +++ b/haruhi_dl/extractor/mdr.py @@ -2,12 +2,16 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( determine_ext, int_or_none, parse_duration, parse_iso8601, + url_or_none, xpath_text, ) @@ -66,6 +70,22 @@ class MDRIE(InfoExtractor): 'duration': 3239, 'uploader': 'MITTELDEUTSCHER RUNDFUNK', }, + }, { + # empty bitrateVideo and bitrateAudio + 'url': 'https://www.kika.de/filme/sendung128372_zc-572e3f45_zs-1d9fb70e.html', + 'info_dict': { + 'id': '128372', + 'ext': 'mp4', + 'title': 'Der kleine Wichtel kehrt zurück', + 'description': 'md5:f77fafdff90f7aa1e9dca14f662c052a', + 'duration': 4876, + 'timestamp': 1607823300, + 'upload_date': '20201213', + 'uploader': 'ZDF', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', 'only_matching': True, @@ -91,10 +111,13 @@ class MDRIE(InfoExtractor): title = xpath_text(doc, ['./title', './broadcast/broadcastName'], 'title', fatal=True) + type_ = xpath_text(doc, './type', default=None) + formats = [] processed_urls = [] for asset in doc.findall('./assets/asset'): for source in ( + 'download', 'progressiveDownload', 'dynamicHttpStreamingRedirector', 'adaptiveHttpStreamingRedirector'): @@ -102,63 +125,49 @@ class MDRIE(InfoExtractor): if url_el is None: continue - video_url = url_el.text - if video_url in processed_urls: + video_url = url_or_none(url_el.text) + if not video_url or video_url in processed_urls: continue processed_urls.append(video_url) - vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) - abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) - - ext = determine_ext(url_el.text) + ext = determine_ext(video_url) if ext == 'm3u8': - url_formats = self._extract_m3u8_formats( + formats.extend(self._extract_m3u8_formats( video_url, video_id, 'mp4', entry_protocol='m3u8_native', - preference=0, m3u8_id='HLS', fatal=False) + preference=0, m3u8_id='HLS', fatal=False)) elif ext == 'f4m': - url_formats = self._extract_f4m_formats( + formats.extend(self._extract_f4m_formats( video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, - preference=0, f4m_id='HDS', fatal=False) + preference=0, f4m_id='HDS', fatal=False)) else: media_type = xpath_text(asset, './mediaType', 'media type', default='MP4') vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) filesize = int_or_none(xpath_text(asset, './fileSize', 'file size')) + format_id = [media_type] + if vbr or abr: + format_id.append(compat_str(vbr or abr)) + f = { 'url': video_url, - 'format_id': '%s-%d' % (media_type, vbr or abr), + 'format_id': '-'.join(format_id), 'filesize': filesize, 'abr': abr, - 'preference': 1, + 'vbr': vbr, } if vbr: - width = int_or_none(xpath_text(asset, './frameWidth', 'width')) - height = int_or_none(xpath_text(asset, './frameHeight', 'height')) f.update({ - 'vbr': vbr, - 'width': width, - 'height': height, + 'width': int_or_none(xpath_text(asset, './frameWidth', 'width')), + 'height': int_or_none(xpath_text(asset, './frameHeight', 'height')), }) - url_formats = [f] + if type_ == 'audio': + f['vcodec'] = 
'none' - if not url_formats: - continue - - if not vbr: - for f in url_formats: - abr = f.get('tbr') or abr - if 'tbr' in f: - del f['tbr'] - f.update({ - 'abr': abr, - 'vcodec': 'none', - }) - - formats.extend(url_formats) + formats.append(f) self._sort_formats(formats) From 2c85578a1fe929e0a71d7d66a1e07962d1988868 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:15:34 +0100 Subject: [PATCH 148/384] [mdr] Bypass geo restriction --- haruhi_dl/extractor/mdr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/haruhi_dl/extractor/mdr.py b/haruhi_dl/extractor/mdr.py index 38afdc789..dc6aa9819 100644 --- a/haruhi_dl/extractor/mdr.py +++ b/haruhi_dl/extractor/mdr.py @@ -20,6 +20,8 @@ class MDRIE(InfoExtractor): IE_DESC = 'MDR.DE and KiKA' _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z-]+-?(?P<id>\d+)(?:_.+?)?\.html' + _GEO_COUNTRIES = ['DE'] + _TESTS = [{ # MDR regularly deletes its videos 'url': 'http://www.mdr.de/fakt/video189002.html', From 541e22037b74a0c9a18f847a24e9fb5a46912f87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 15:15:40 +0100 Subject: [PATCH 149/384] [test_InfoExtractor] PEP 8 --- test/test_InfoExtractor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 54ed446d5..d7f42a02d 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -147,7 +147,6 @@ class TestInfoExtractor(unittest.TestCase): 'height': 1080, }) - def test_download_json(self): uri = encode_data_uri(b'{"foo": "blah"}', 'application/json') self.assertEqual(self.ie._download_json(uri, None), {'foo': 'blah'}) From 27765ca68fc7ffd577b012ecb2ee6471105e7070 Mon Sep 17 00:00:00 2001 From: Matthew Rayermann <matthew.rayermann@gmail.com> Date: Fri, 26 Feb 2021 15:15:47 +0100 Subject: [PATCH 150/384] [nhk] Add support for NHK video programs (#27230) --- haruhi_dl/extractor/extractors.py | 5 +- haruhi_dl/extractor/nhk.py | 162 +++++++++++++++++++++--------- 2 files changed, 118 insertions(+), 49 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 297a5e02b..1b5f9b65e 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -756,7 +756,10 @@ from .nexx import ( NexxEmbedIE, ) from .nfl import NFLIE -from .nhk import NhkVodIE +from .nhk import ( + NhkVodIE, + NhkVodProgramIE, +) from .nhl import NHLIE from .nick import ( NickIE, diff --git a/haruhi_dl/extractor/nhk.py b/haruhi_dl/extractor/nhk.py index 6a61a47d2..907db4de9 100644 --- a/haruhi_dl/extractor/nhk.py +++ b/haruhi_dl/extractor/nhk.py @@ -3,14 +3,96 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ExtractorError -class NhkVodIE(InfoExtractor): +class NhkBaseIE(InfoExtractor): + _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json' + + def _get_clean_field(self, episode, key): + return episode.get(key + '_clean') or episode.get(key) + + def _list_episodes(self, m_id, lang, is_video, is_episode): + return self._download_json( + self._API_URL_TEMPLATE % ( + 'v' if is_video else 'r', + 'clip' if m_id[:4] == '9999' else 'esd', + 'episode' if is_episode else 'program', + m_id, lang, '/all' if is_video else ''), + m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] + + def 
_parse_episode_json(self, episode, lang, is_video): + title = episode.get('sub_title_clean') or episode['sub_title'] + + episode_id = None + if is_video: + pgm_id = episode.get('pgm_id') + pgm_no = episode.get('pgm_no') + + if not (pgm_id and pgm_no): + missing_field = 'pgm_id' if not pgm_id else 'pgm_no' + raise ExtractorError('Cannot download episode. Field %s is missing from episode JSON.' % missing_field) + + episode_id = pgm_id + pgm_no + else: + pgm_gr_id = episode.get('pgm_gr_id') + first_onair_date = episode.get('first_onair_date') + first_onair_no = episode.get('first_onair_no') + + if not (pgm_gr_id and first_onair_date and first_onair_no): + missing_field = 'pgm_gr_id' if not pgm_gr_id else 'first_onair_date' if not first_onair_date else 'first_onair_no' + raise ExtractorError('Cannot download episode. Field %s is missing from episode JSON.' % missing_field) + + episode_id = pgm_gr_id + '-' + first_onair_date + '-' + first_onair_no + + series = self._get_clean_field(episode, 'title') + + thumbnails = [] + for s, w, h in [('', 640, 360), ('_l', 1280, 720)]: + img_path = episode.get('image' + s) + if not img_path: + continue + thumbnails.append({ + 'id': '%dp' % h, + 'height': h, + 'width': w, + 'url': 'https://www3.nhk.or.jp' + img_path, + }) + + info = { + 'id': episode_id + '-' + lang, + 'title': '%s - %s' % (series, title) if series and title else title, + 'description': self._get_clean_field(episode, 'description'), + 'thumbnails': thumbnails, + 'series': series, + 'episode': title, + } + + if is_video: + info.update({ + '_type': 'url_transparent', + 'ie_key': 'Piksel', + 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + episode['vod_id'], + }) + else: + audio = episode['audio'] + audio_path = audio['audio'] + info['formats'] = self._extract_m3u8_formats( + 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, + episode_id, 'm4a', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + for f in info['formats']: + f['language'] = lang + + return info + + +class NhkVodIE(NhkBaseIE): _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand/(?P<type>video|audio)/(?P<id>\d{7}|[^/]+?-\d{8}-\d+)' # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. 
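# Aside (not part of the diff): how the refactored NHK base class fills in
# _API_URL_TEMPLATE -- one template now serves video/audio ('v'/'r'),
# clip/regular ('clip'/'esd') and episode/program listings, mirroring the
# _list_episodes() arguments above:
API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json'

def api_url(m_id, lang, is_video, is_episode):
    return API_URL_TEMPLATE % (
        'v' if is_video else 'r',
        'clip' if m_id[:4] == '9999' else 'esd',
        'episode' if is_episode else 'program',
        m_id, lang, '/all' if is_video else '')

print(api_url('9999-011', 'en', True, True))
# -> https://api.nhk.or.jp/nhkworld/vodcliplist/v7a/episode/9999-011/en/all/all.json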
_TESTS = [{ - # video clip + # clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', 'md5': '256a1be14f48d960a7e61e2532d95ec3', 'info_dict': { @@ -47,60 +129,44 @@ class NhkVodIE(InfoExtractor): 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/', 'only_matching': True, }] - _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/episode/%s/%s/all%s.json' def _real_extract(self, url): lang, m_type, episode_id = re.match(self._VALID_URL, url).groups() + if episode_id.isdigit(): episode_id = episode_id[:4] + '-' + episode_id[4:] - is_video = m_type == 'video' - episode = self._download_json( - self._API_URL_TEMPLATE % ( - 'v' if is_video else 'r', - 'clip' if episode_id[:4] == '9999' else 'esd', - episode_id, lang, '/all' if is_video else ''), - episode_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'][0] - title = episode.get('sub_title_clean') or episode['sub_title'] + episode = self._list_episodes(episode_id, lang, m_type == 'video', True)[0] - def get_clean_field(key): - return episode.get(key + '_clean') or episode.get(key) + return self._parse_episode_json(episode, lang, m_type == 'video') - series = get_clean_field('title') - thumbnails = [] - for s, w, h in [('', 640, 360), ('_l', 1280, 720)]: - img_path = episode.get('image' + s) - if not img_path: - continue - thumbnails.append({ - 'id': '%dp' % h, - 'height': h, - 'width': w, - 'url': 'https://www3.nhk.or.jp' + img_path, - }) +class NhkVodProgramIE(NhkBaseIE): + _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand/(program/video)/(?P<id>\w+)' + # Content available only for a limited period of time. Visit + # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. + _TESTS = [{ + # video program + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway', + 'info_dict': { + 'id': 'japanrailway', + 'title': 'Japan Railway Journal', + }, + 'playlist_mincount': 1, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/', + 'only_matching': True, + }] - info = { - 'id': episode_id + '-' + lang, - 'title': '%s - %s' % (series, title) if series and title else title, - 'description': get_clean_field('description'), - 'thumbnails': thumbnails, - 'series': series, - 'episode': title, - } - if is_video: - info.update({ - '_type': 'url_transparent', - 'ie_key': 'Piksel', - 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + episode['vod_id'], - }) + def _real_extract(self, url): + lang, m_type, program_id = re.match(self._VALID_URL, url).groups() + + episodes = self._list_episodes(program_id, lang, True, False) + + if episodes: + return self.playlist_result( + [self._parse_episode_json(episode, lang, True) + for episode in episodes], + self._get_clean_field(episodes[0], 'pgm_gr_id'), self._get_clean_field(episodes[0], 'title')) else: - audio = episode['audio'] - audio_path = audio['audio'] - info['formats'] = self._extract_m3u8_formats( - 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, - episode_id, 'm4a', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False) - for f in info['formats']: - f['language'] = lang - return info + raise ExtractorError('No episodes returned for program with ID: %s' % program_id, expected=True) From 1859fa8ac4cd64819ec3e0739d138ed129c784b7 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:15:53 +0100 Subject: [PATCH 151/384] [nhk:program] Add support for audio 
programs and program clips --- haruhi_dl/extractor/nhk.py | 134 +++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 64 deletions(-) diff --git a/haruhi_dl/extractor/nhk.py b/haruhi_dl/extractor/nhk.py index 907db4de9..c5b406573 100644 --- a/haruhi_dl/extractor/nhk.py +++ b/haruhi_dl/extractor/nhk.py @@ -3,49 +3,39 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import urljoin class NhkBaseIE(InfoExtractor): _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json' + _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand' + _TYPE_REGEX = r'/(?P<type>video|audio)/' - def _get_clean_field(self, episode, key): - return episode.get(key + '_clean') or episode.get(key) - - def _list_episodes(self, m_id, lang, is_video, is_episode): + def _call_api(self, m_id, lang, is_video, is_episode, is_clip): return self._download_json( self._API_URL_TEMPLATE % ( 'v' if is_video else 'r', - 'clip' if m_id[:4] == '9999' else 'esd', + 'clip' if is_clip else 'esd', 'episode' if is_episode else 'program', m_id, lang, '/all' if is_video else ''), - m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] + m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or [] - def _parse_episode_json(self, episode, lang, is_video): + def _extract_episode_info(self, url, episode=None): + fetch_episode = episode is None + lang, m_type, episode_id = re.match(NhkVodIE._VALID_URL, url).groups() + if episode_id.isdigit(): + episode_id = episode_id[:4] + '-' + episode_id[4:] + + is_video = m_type == 'video' + if fetch_episode: + episode = self._call_api( + episode_id, lang, is_video, True, episode_id[:4] == '9999')[0] title = episode.get('sub_title_clean') or episode['sub_title'] - episode_id = None - if is_video: - pgm_id = episode.get('pgm_id') - pgm_no = episode.get('pgm_no') + def get_clean_field(key): + return episode.get(key + '_clean') or episode.get(key) - if not (pgm_id and pgm_no): - missing_field = 'pgm_id' if not pgm_id else 'pgm_no' - raise ExtractorError('Cannot download episode. Field %s is missing from episode JSON.' % missing_field) - - episode_id = pgm_id + pgm_no - else: - pgm_gr_id = episode.get('pgm_gr_id') - first_onair_date = episode.get('first_onair_date') - first_onair_no = episode.get('first_onair_no') - - if not (pgm_gr_id and first_onair_date and first_onair_no): - missing_field = 'pgm_gr_id' if not pgm_gr_id else 'first_onair_date' if not first_onair_date else 'first_onair_no' - raise ExtractorError('Cannot download episode. Field %s is missing from episode JSON.' 
% missing_field) - - episode_id = pgm_gr_id + '-' + first_onair_date + '-' + first_onair_no - - series = self._get_clean_field(episode, 'title') + series = get_clean_field('title') thumbnails = [] for s, w, h in [('', 640, 360), ('_l', 1280, 720)]: @@ -62,37 +52,43 @@ class NhkBaseIE(InfoExtractor): info = { 'id': episode_id + '-' + lang, 'title': '%s - %s' % (series, title) if series and title else title, - 'description': self._get_clean_field(episode, 'description'), + 'description': get_clean_field('description'), 'thumbnails': thumbnails, 'series': series, 'episode': title, } - if is_video: + vod_id = episode['vod_id'] info.update({ '_type': 'url_transparent', 'ie_key': 'Piksel', - 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + episode['vod_id'], + 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id, + 'id': vod_id, }) else: - audio = episode['audio'] - audio_path = audio['audio'] - info['formats'] = self._extract_m3u8_formats( - 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, - episode_id, 'm4a', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False) - for f in info['formats']: - f['language'] = lang - + if fetch_episode: + audio_path = episode['audio']['audio'] + info['formats'] = self._extract_m3u8_formats( + 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, + episode_id, 'm4a', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + for f in info['formats']: + f['language'] = lang + else: + info.update({ + '_type': 'url_transparent', + 'ie_key': NhkVodIE.ie_key(), + 'url': url, + }) return info class NhkVodIE(NhkBaseIE): - _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand/(?P<type>video|audio)/(?P<id>\d{7}|[^/]+?-\d{8}-\d+)' + _VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ - # clip + # video clip 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', 'md5': '256a1be14f48d960a7e61e2532d95ec3', 'info_dict': { @@ -131,42 +127,52 @@ class NhkVodIE(NhkBaseIE): }] def _real_extract(self, url): - lang, m_type, episode_id = re.match(self._VALID_URL, url).groups() - - if episode_id.isdigit(): - episode_id = episode_id[:4] + '-' + episode_id[4:] - - episode = self._list_episodes(episode_id, lang, m_type == 'video', True)[0] - - return self._parse_episode_json(episode, lang, m_type == 'video') + return self._extract_episode_info(url) class NhkVodProgramIE(NhkBaseIE): - _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand/(program/video)/(?P<id>\w+)' - # Content available only for a limited period of time. Visit - # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. + _VALID_URL = r'%s/program%s(?P<id>[0-9a-z]+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' 
% (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) _TESTS = [{ - # video program + # video program episodes 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway', 'info_dict': { 'id': 'japanrailway', 'title': 'Japan Railway Journal', }, 'playlist_mincount': 1, + }, { + # video program clips + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip', + 'info_dict': { + 'id': 'japanrailway', + 'title': 'Japan Railway Journal', + }, + 'playlist_mincount': 5, }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/', 'only_matching': True, + }, { + # audio program + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/', + 'only_matching': True, }] def _real_extract(self, url): - lang, m_type, program_id = re.match(self._VALID_URL, url).groups() + lang, m_type, program_id, episode_type = re.match(self._VALID_URL, url).groups() - episodes = self._list_episodes(program_id, lang, True, False) + episodes = self._call_api( + program_id, lang, m_type == 'video', False, episode_type == 'clip') - if episodes: - return self.playlist_result( - [self._parse_episode_json(episode, lang, True) - for episode in episodes], - self._get_clean_field(episodes[0], 'pgm_gr_id'), self._get_clean_field(episodes[0], 'title')) - else: - raise ExtractorError('No episodes returned for program with ID: %s' % program_id, expected=True) + entries = [] + for episode in episodes: + episode_path = episode.get('url') + if not episode_path: + continue + entries.append(self._extract_episode_info( + urljoin(url, episode_path), episode)) + + program_title = None + if entries: + program_title = entries[0].get('series') + + return self.playlist_result(entries, program_id, program_title) From 1315296aedfd99c406e5f3bdfda93e16b7c827a9 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:16:07 +0100 Subject: [PATCH 152/384] [videomore] add support more.tv (closes #27088) --- haruhi_dl/extractor/videomore.py | 251 ++++++++++++++++--------------- 1 file changed, 133 insertions(+), 118 deletions(-) diff --git a/haruhi_dl/extractor/videomore.py b/haruhi_dl/extractor/videomore.py index e3eda3327..e0c10aa5b 100644 --- a/haruhi_dl/extractor/videomore.py +++ b/haruhi_dl/extractor/videomore.py @@ -4,30 +4,50 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - orderedSet, - parse_duration, - str_or_none, - unified_strdate, - url_or_none, - xpath_element, - xpath_text, +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlparse, ) +from ..utils import ( + ExtractorError, + int_or_none, +) + + +class VideomoreBaseIE(InfoExtractor): + _API_BASE_URL = 'https://more.tv/api/v3/web/' + _VALID_URL_BASE = r'https?://(?:videomore\.ru|more\.tv)/' + + def _download_page_data(self, display_id): + return self._download_json( + self._API_BASE_URL + 'PageData', display_id, query={ + 'url': '/' + display_id, + })['attributes']['response']['data'] + + def _track_url_result(self, track): + track_vod = track['trackVod'] + video_url = track_vod.get('playerLink') or track_vod['link'] + return self.url_result( + video_url, VideomoreIE.ie_key(), track_vod.get('hubId')) class VideomoreIE(InfoExtractor): IE_NAME = 'videomore' _VALID_URL = r'''(?x) videomore:(?P<sid>\d+)$| - https?://(?:player\.)?videomore\.ru/ + https?:// (?: + videomore\.ru/ (?: embed| [^/]+/[^/]+ )/| - 
[^/]*\?.*?\btrack_id= + (?: + (?:player\.)?videomore\.ru| + siren\.more\.tv/player + )/[^/]*\?.*?\btrack_id=| + odysseus\.more.tv/player/(?P<partner_id>\d+)/ ) (?P<id>\d+) (?:[/?#&]|\.(?:xml|json)|$) @@ -47,18 +67,19 @@ class VideomoreIE(InfoExtractor): 'comment_count': int, 'age_limit': 16, }, + 'skip': 'The video is not available for viewing.', }, { 'url': 'http://videomore.ru/embed/259974', 'info_dict': { 'id': '259974', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Молодежка 2 сезон 40 серия', 'series': 'Молодежка', + 'season': '2 сезон', 'episode': '40 серия', 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 2809, + 'duration': 2789, 'view_count': int, - 'comment_count': int, 'age_limit': 16, }, 'params': { @@ -79,6 +100,7 @@ class VideomoreIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'The video is not available for viewing.', }, { 'url': 'http://videomore.ru/elki_3?track_id=364623', 'only_matching': True, @@ -100,7 +122,14 @@ class VideomoreIE(InfoExtractor): }, { 'url': 'https://player.videomore.ru/?partner_id=97&track_id=736234&autoplay=0&userToken=', 'only_matching': True, + }, { + 'url': 'https://odysseus.more.tv/player/1788/352317', + 'only_matching': True, + }, { + 'url': 'https://siren.more.tv/player/config?track_id=352317&partner_id=1788&user_token=', + 'only_matching': True, }] + _GEO_BYPASS = False @staticmethod def _extract_url(webpage): @@ -118,46 +147,73 @@ class VideomoreIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('sid') or mobj.group('id') + partner_id = mobj.group('partner_id') or compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('partner_id', [None])[0] or '97' - video = self._download_xml( - 'http://videomore.ru/video/tracks/%s.xml' % video_id, - video_id, 'Downloading video XML') + item = self._download_json( + 'https://siren.more.tv/player/config', video_id, query={ + 'partner_id': partner_id, + 'track_id': video_id, + })['data']['playlist']['items'][0] - item = xpath_element(video, './/playlist/item', fatal=True) + title = item.get('title') + series = item.get('project_name') + season = item.get('season_name') + episode = item.get('episode_name') + if not title: + title = [] + for v in (series, season, episode): + if v: + title.append(v) + title = ' '.join(title) - title = xpath_text( - item, ('./title', './episode_name'), 'title', fatal=True) + streams = item.get('streams') or [] + for protocol in ('DASH', 'HLS'): + stream_url = item.get(protocol.lower() + '_url') + if stream_url: + streams.append({'protocol': protocol, 'url': stream_url}) - video_url = xpath_text(item, './video_url', 'video url', fatal=True) - formats = self._extract_f4m_formats(video_url, video_id, f4m_id='hds') + formats = [] + for stream in streams: + stream_url = stream.get('url') + if not stream_url: + continue + protocol = stream.get('protocol') + if protocol == 'DASH': + formats.extend(self._extract_mpd_formats( + stream_url, video_id, mpd_id='dash', fatal=False)) + elif protocol == 'HLS': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif protocol == 'MSS': + formats.extend(self._extract_ism_formats( + stream_url, video_id, ism_id='mss', fatal=False)) + + if not formats: + error = item.get('error') + if error: + if error in ('Данное видео недоступно для просмотра на территории этой страны', 'Данное видео доступно для просмотра только на территории России'): + self.raise_geo_restricted(countries=['RU']) + raise 
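
The title fallback in _real_extract() above reduces to the following (sample field values taken from the Molodezhka test metadata):

    # When the playlist item carries no explicit title, one is assembled
    # from whichever of the series/season/episode names are present.
    item = {'project_name': 'Молодежка', 'season_name': '2 сезон', 'episode_name': '40 серия'}
    title = item.get('title') or ' '.join(
        v for v in (item.get('project_name'), item.get('season_name'), item.get('episode_name')) if v)
    assert title == 'Молодежка 2 сезон 40 серия'
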
ExtractorError(error, expected=True) self._sort_formats(formats) - thumbnail = xpath_text(item, './thumbnail_url') - duration = int_or_none(xpath_text(item, './duration')) - view_count = int_or_none(xpath_text(item, './views')) - comment_count = int_or_none(xpath_text(item, './count_comments')) - age_limit = int_or_none(xpath_text(item, './min_age')) - - series = xpath_text(item, './project_name') - episode = xpath_text(item, './episode_name') - return { 'id': video_id, 'title': title, 'series': series, + 'season': season, 'episode': episode, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, - 'age_limit': age_limit, + 'thumbnail': item.get('thumbnail_url'), + 'duration': int_or_none(item.get('duration')), + 'view_count': int_or_none(item.get('views')), + 'age_limit': int_or_none(item.get('min_age')), 'formats': formats, } -class VideomoreVideoIE(InfoExtractor): +class VideomoreVideoIE(VideomoreBaseIE): IE_NAME = 'videomore:video' - _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P<id>[^/?#&]+)(?:/*|[?#&].*?)$' + _VALID_URL = VideomoreBaseIE._VALID_URL_BASE + r'(?P<id>(?:(?:[^/]+/){2})?[^/?#&]+)(?:/*|[?#&].*?)$' _TESTS = [{ # single video with og:video:iframe 'url': 'http://videomore.ru/elki_3', @@ -174,10 +230,25 @@ class VideomoreVideoIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Requires logging in', }, { # season single series with og:video:iframe 'url': 'http://videomore.ru/poslednii_ment/1_sezon/14_seriya', - 'only_matching': True, + 'info_dict': { + 'id': '352317', + 'ext': 'mp4', + 'title': 'Последний мент 1 сезон 14 серия', + 'series': 'Последний мент', + 'season': '1 сезон', + 'episode': '14 серия', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2464, + 'age_limit': 16, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://videomore.ru/sejchas_v_seti/serii_221-240/226_vypusk', 'only_matching': True, @@ -197,9 +268,13 @@ class VideomoreVideoIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'redirects to https://more.tv/' }, { 'url': 'https://videomore.ru/molodezhka/6_sezon/29_seriya?utm_so', 'only_matching': True, + }, { + 'url': 'https://more.tv/poslednii_ment/1_sezon/14_seriya', + 'only_matching': True, }] @classmethod @@ -208,38 +283,25 @@ class VideomoreVideoIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - video_url = self._og_search_property( - 'video:iframe', webpage, 'video url', default=None) - - if not video_url: - video_id = self._search_regex( - (r'config\s*:\s*["\']https?://videomore\.ru/video/tracks/(\d+)\.xml', - r'track-id=["\'](\d+)', - r'xcnt_product_id\s*=\s*(\d+)'), webpage, 'video id') - video_url = 'videomore:%s' % video_id - else: - video_id = None - - return self.url_result( - video_url, ie=VideomoreIE.ie_key(), video_id=video_id) + return self._track_url_result(self._download_page_data(display_id)) -class VideomoreSeasonIE(InfoExtractor): +class VideomoreSeasonIE(VideomoreBaseIE): IE_NAME = 'videomore:season' - _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P<id>[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$' + _VALID_URL = VideomoreBaseIE._VALID_URL_BASE + r'(?!embed)(?P<id>[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$' _TESTS = [{ - 'url': 'http://videomore.ru/molodezhka/sezon_promo', + 'url': 'http://videomore.ru/molodezhka/film_o_filme', 'info_dict': { - 'id': 'molodezhka/sezon_promo', - 'title': 'Молодежка Промо', + 'id': 
'molodezhka/film_o_filme', + 'title': 'Фильм о фильме', }, - 'playlist_mincount': 12, + 'playlist_mincount': 3, }, { 'url': 'http://videomore.ru/molodezhka/sezon_promo?utm_so', 'only_matching': True, + }, { + 'url': 'https://more.tv/molodezhka/film_o_filme', + 'only_matching': True, }] @classmethod @@ -249,59 +311,12 @@ class VideomoreSeasonIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - title = self._og_search_title(webpage) - - data = self._parse_json( - self._html_search_regex( - r'\bclass=["\']seasons-tracks["\'][^>]+\bdata-custom-data=(["\'])(?P<value>{.+?})\1', - webpage, 'data', default='{}', group='value'), - display_id, fatal=False) - + season = self._download_page_data(display_id) + season_id = compat_str(season['id']) + tracks = self._download_json( + self._API_BASE_URL + 'seasons/%s/tracks' % season_id, + season_id)['data'] entries = [] - - if data: - episodes = data.get('episodes') - if isinstance(episodes, list): - for ep in episodes: - if not isinstance(ep, dict): - continue - ep_id = int_or_none(ep.get('id')) - ep_url = url_or_none(ep.get('url')) - if ep_id: - e = { - 'url': 'videomore:%s' % ep_id, - 'id': compat_str(ep_id), - } - elif ep_url: - e = {'url': ep_url} - else: - continue - e.update({ - '_type': 'url', - 'ie_key': VideomoreIE.ie_key(), - 'title': str_or_none(ep.get('title')), - 'thumbnail': url_or_none(ep.get('image')), - 'duration': parse_duration(ep.get('duration')), - 'episode_number': int_or_none(ep.get('number')), - 'upload_date': unified_strdate(ep.get('date')), - }) - entries.append(e) - - if not entries: - entries = [ - self.url_result( - 'videomore:%s' % video_id, ie=VideomoreIE.ie_key(), - video_id=video_id) - for video_id in orderedSet(re.findall( - r':(?:id|key)=["\'](\d+)["\']', webpage))] - - if not entries: - entries = [ - self.url_result(item) for item in re.findall( - r'<a[^>]+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"' - % display_id, webpage)] - - return self.playlist_result(entries, display_id, title) + for track in tracks: + entries.append(self._track_url_result(track)) + return self.playlist_result(entries, display_id, season.get('title')) From dfb69009b9d66a893ba8f56c0afef0e101aa0ea2 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:16:13 +0100 Subject: [PATCH 153/384] [tv5unis] Add new extractor(closes #22399)(closes #24890) --- haruhi_dl/extractor/extractors.py | 4 + haruhi_dl/extractor/tv5unis.py | 121 ++++++++++++++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 haruhi_dl/extractor/tv5unis.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 1b5f9b65e..825a28907 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1273,6 +1273,10 @@ from .tv2dk import ( from .tv2hu import TV2HuIE from .tv4 import TV4IE from .tv5mondeplus import TV5MondePlusIE +from .tv5unis import ( + TV5UnisVideoIE, + TV5UnisIE, +) from .tva import ( TVAIE, QubIE, diff --git a/haruhi_dl/extractor/tv5unis.py b/haruhi_dl/extractor/tv5unis.py new file mode 100644 index 000000000..eabdc2271 --- /dev/null +++ b/haruhi_dl/extractor/tv5unis.py @@ -0,0 +1,121 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_age_limit, + smuggle_url, + try_get, +) + + +class TV5UnisBaseIE(InfoExtractor): + _GEO_COUNTRIES = 
['CA'] + + def _real_extract(self, url): + groups = re.match(self._VALID_URL, url).groups() + product = self._download_json( + 'https://api.tv5unis.ca/graphql', groups[0], query={ + 'query': '''{ + %s(%s) { + collection { + title + } + episodeNumber + rating { + name + } + seasonNumber + tags + title + videoElement { + ... on Video { + mediaId + } + } + } +}''' % (self._GQL_QUERY_NAME, self._gql_args(groups)), + })['data'][self._GQL_QUERY_NAME] + media_id = product['videoElement']['mediaId'] + + return { + '_type': 'url_transparent', + 'id': media_id, + 'title': product.get('title'), + 'url': smuggle_url('limelight:media:' + media_id, {'geo_countries': self._GEO_COUNTRIES}), + 'age_limit': parse_age_limit(try_get(product, lambda x: x['rating']['name'])), + 'tags': product.get('tags'), + 'series': try_get(product, lambda x: x['collection']['title']), + 'season_number': int_or_none(product.get('seasonNumber')), + 'episode_number': int_or_none(product.get('episodeNumber')), + 'ie_key': 'LimelightMedia', + } + + +class TV5UnisVideoIE(TV5UnisBaseIE): + IE_NAME = 'tv5unis:video' + _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/[^/]+/(?P<id>\d+)' + _TEST = { + 'url': 'https://www.tv5unis.ca/videos/bande-annonces/71843', + 'md5': '3d794164928bda97fb87a17e89923d9b', + 'info_dict': { + 'id': 'a883684aecb2486cad9bdc7bbe17f861', + 'ext': 'mp4', + 'title': 'Watatatow', + 'duration': 10.01, + } + } + _GQL_QUERY_NAME = 'productById' + + @staticmethod + def _gql_args(groups): + return 'id: %s' % groups + + +class TV5UnisIE(TV5UnisBaseIE): + IE_NAME = 'tv5unis' + _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/(?P<id>[^/]+)(?:/saisons/(?P<season_number>\d+)/episodes/(?P<episode_number>\d+))?/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://www.tv5unis.ca/videos/watatatow/saisons/6/episodes/1', + 'md5': 'a479907d2e531a73e1f8dc48d6388d02', + 'info_dict': { + 'id': 'e5ee23a586c44612a56aad61accf16ef', + 'ext': 'mp4', + 'title': 'Je ne peux pas lui résister', + 'description': "Atys, le nouveau concierge de l'école, a réussi à ébranler la confiance de Mado en affirmant qu\'une médaille, ce n'est que du métal. Comme Mado essaie de lui prouver que ses valeurs sont solides, il veut la mettre à l'épreuve...", + 'subtitles': { + 'fr': 'count:1', + }, + 'duration': 1370, + 'age_limit': 8, + 'tags': 'count:3', + 'series': 'Watatatow', + 'season_number': 6, + 'episode_number': 1, + }, + }, { + 'url': 'https://www.tv5unis.ca/videos/le-voyage-de-fanny', + 'md5': '9ca80ebb575c681d10cae1adff3d4774', + 'info_dict': { + 'id': '726188eefe094d8faefb13381d42bc06', + 'ext': 'mp4', + 'title': 'Le voyage de Fanny', + 'description': "Fanny, 12 ans, cachée dans un foyer loin de ses parents, s'occupe de ses deux soeurs. 
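
A minimal standalone version of the GraphQL lookup performed above; the endpoint and field names come from the query string in the code, and sending the query as a GET parameter mirrors what _download_json(..., query={...}) does:

    import json
    import urllib.parse
    import urllib.request

    # Product id from the TV5UnisVideoIE test; the response shape beyond
    # data.productById is assumed from the fields selected above.
    gql = '{ productById(id: 71843) { title videoElement { ... on Video { mediaId } } } }'
    res = json.load(urllib.request.urlopen(
        'https://api.tv5unis.ca/graphql?' + urllib.parse.urlencode({'query': gql})))
    media_id = res['data']['productById']['videoElement']['mediaId']
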
Devant fuir, Fanny prend la tête d'un groupe de huit enfants et s'engage dans un dangereux périple à travers la France occupée pour rejoindre la frontière suisse.", + 'subtitles': { + 'fr': 'count:1', + }, + 'duration': 5587.034, + 'tags': 'count:4', + }, + }] + _GQL_QUERY_NAME = 'productByRootProductSlug' + + @staticmethod + def _gql_args(groups): + args = 'rootProductSlug: "%s"' % groups[0] + if groups[1]: + args += ', seasonNumber: %s, episodeNumber: %s' % groups[1:] + return args From 441fbc40561efb679a1354177d276fbb7c9a6c95 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:16:18 +0100 Subject: [PATCH 154/384] [sky] relax SkySports URL regex (closes #27435) --- haruhi_dl/extractor/sky.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/haruhi_dl/extractor/sky.py b/haruhi_dl/extractor/sky.py index ea30d6e62..681691004 100644 --- a/haruhi_dl/extractor/sky.py +++ b/haruhi_dl/extractor/sky.py @@ -41,8 +41,8 @@ class SkyBaseIE(InfoExtractor): class SkySportsIE(SkyBaseIE): - _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/([^/]+/)*(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', 'md5': '77d59166cddc8d3cb7b13e35eaf0f5ec', 'info_dict': { @@ -52,7 +52,13 @@ class SkySportsIE(SkyBaseIE): 'description': 'md5:e88bda94ae15f7720c5cb467e777bb6d', }, 'add_ie': ['Ooyala'], - } + }, { + 'url': 'https://www.skysports.com/watch/video/sports/f1/12160544/abu-dhabi-gp-the-notebook', + 'only_matching': True, + }, { + 'url': 'https://www.skysports.com/watch/video/tv-shows/12118508/rainford-brent-how-ace-programme-helps', + 'only_matching': True, + }] class SkyNewsIE(SkyBaseIE): From 3463c192f6a7594fc9f7092fa6def7c90311512d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:16:29 +0100 Subject: [PATCH 155/384] =?UTF-8?q?[anvato]=20update=20ANVACK=20table=20an?= =?UTF-8?q?d=20add=20experimental=20token=20generator=20for=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … NFL --- haruhi_dl/extractor/anvato.py | 97 ++++++++++++++++--- .../anvato_token_generator/__init__.py | 7 ++ .../anvato_token_generator/common.py | 6 ++ .../extractor/anvato_token_generator/nfl.py | 30 ++++++ 4 files changed, 129 insertions(+), 11 deletions(-) create mode 100644 haruhi_dl/extractor/anvato_token_generator/__init__.py create mode 100644 haruhi_dl/extractor/anvato_token_generator/common.py create mode 100644 haruhi_dl/extractor/anvato_token_generator/nfl.py diff --git a/haruhi_dl/extractor/anvato.py b/haruhi_dl/extractor/anvato.py index 84e841035..a6410311c 100644 --- a/haruhi_dl/extractor/anvato.py +++ b/haruhi_dl/extractor/anvato.py @@ -9,6 +9,7 @@ import re import time from .common import InfoExtractor +from .anvato_token_generator import NFLTokenGenerator from ..aes import aes_encrypt from ..compat import compat_str from ..utils import ( @@ -116,7 +117,76 @@ class AnvatoIE(InfoExtractor): 'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn', 'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W', 'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ', - 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ' + 
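
The relaxed SkySports pattern above can be checked quickly; both sample URLs come from the updated _TESTS:

    import re

    _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/([^/]+/)*(?P<id>[0-9]+)'

    # The optional ([^/]+/)* segment is what now lets category paths through.
    for url, video_id in (
        ('http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', '10328419'),
        ('https://www.skysports.com/watch/video/sports/f1/12160544/abu-dhabi-gp-the-notebook', '12160544'),
    ):
        assert re.match(_VALID_URL, url).group('id') == video_id
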
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', + 'X8POa4zPPaKVZHqmWjuEzfP31b1QM9VN': 'Dn5vOY9ooDw7VSl9qztjZI5o0g08mA0z', + 'M2v78QkBMpNJlSPp9diX5F2PBmBy6Bog': 'ka6K32kyo7nDZfNkjQCGWf1lpApXMd1B', + 'bvJ0dQpav07l0hG5JgfVLF2dv1vARwpP': 'BzoQW24GrJZoJfmNodiJKSPeB9B8NOxj', + 'lxQMLg2XZKuEZaWgsqubBxV9INZ6bryY': 'Vm2Mx6noKds9jB71h6urazwlTG3m9x8l', + '04EnjvXeoSmkbJ9ckPs7oY0mcxv7PlyN': 'aXERQP9LMfQVlEDsgGs6eEA1SWznAQ8P', + 'mQbO2ge6BFRWVPYCYpU06YvNt80XLvAX': 'E2BV1NGmasN5v7eujECVPJgwflnLPm2A', + 'g43oeBzJrCml7o6fa5fRL1ErCdeD8z4K': 'RX34mZ6zVH4Nr6whbxIGLv9WSbxEKo8V', + 'VQrDJoP7mtdBzkxhXbSPwGB1coeElk4x': 'j2VejQx0VFKQepAF7dI0mJLKtOVJE18z', + 'WxA5NzLRjCrmq0NUgaU5pdMDuZO7RJ4w': 'lyY5ADLKaIOLEgAsGQCveEMAcqnx3rY9', + 'M4lpMXB71ie0PjMCjdFzVXq0SeRVqz49': 'n2zVkOqaLIv3GbLfBjcwW51LcveWOZ2e', + 'dyDZGEqN8u8nkJZcJns0oxYmtP7KbGAn': 'VXOEqQW9BtEVLajfZQSLEqxgS5B7qn2D', + 'E7QNjrVY5u5mGvgu67IoDgV1CjEND8QR': 'rz8AaDmdKIkLmPNhB5ILPJnjS5PnlL8d', + 'a4zrqjoKlfzg0dwHEWtP31VqcLBpjm4g': 'LY9J16gwETdGWa3hjBu5o0RzuoQDjqXQ', + 'dQP5BZroMsMVLO1hbmT5r2Enu86GjxA6': '7XR3oOdbPF6x3PRFLDCq9RkgsRjAo48V', + 'M4lKNBO1NFe0PjMCj1tzVXq0SeRVqzA9': 'n2zoRqGLRUv3GbLfBmTwW51LcveWOZYe', + 'nAZ7MZdpGCGg1pqFEbsoJOz2C60mv143': 'dYJgdqA9aT4yojETqGi7yNgoFADxqmXP', + '3y1MERYgOuE9NzbFgwhV6Wv2F0YKvbyz': '081xpZDQgC4VadLTavhWQxrku56DAgXV', + 'bmQvmEXr5HWklBMCZOcpE2Z3HBYwqGyl': 'zxXPbVNyMiMAZldhr9FkOmA0fl4aKr2v', + 'wA7oDNYldfr6050Hwxi52lPZiVlB86Ap': 'ZYK16aA7ni0d3l3c34uwpxD7CbReMm8Q', + 'g43MbKMWmFml7o7sJoSRkXxZiXRvJ3QK': 'RX3oBJonvs4Nr6rUWBCGn3matRGqJPXV', + 'mA9VdlqpLS0raGaSDvtoqNrBTzb8XY4q': '0XN4OjBD3fnW7r7IbmtJB4AyfOmlrE2r', + 'mAajOwgkGt17oGoFmEuklMP9H0GnW54d': 'lXbBLPGyzikNGeGujAuAJGjZiwLRxyXR', + 'vy8vjJ9kbUwrRqRu59Cj5dWZfzYErlAb': 'K8l7gpwaGcBpnAnCLNCmPZRdin3eaQX0', + 'xQMWBpR8oHEZaWaSMGUb0avOHjLVYn4Y': 'm2MrN4vEaf9jB7BFy5Srb40jTrN67AYl', + 'xyKEmVO3miRr6D6UVkt7oB8jtD6aJEAv': 'g2ddDebqDfqdgKgswyUKwGjbTWwzq923', + '7Qk0wa2D9FjKapacoJF27aLvUDKkLGA0': 'b2kgBEkephJaMkMTL7s1PLe4Ua6WyP2P', + '3QLg6nqmNTJ5VvVTo7f508LPidz1xwyY': 'g2L1GgpraipmAOAUqmIbBnPxHOmw4MYa', + '3y1B7zZjXTE9NZNSzZSVNPZaTNLjo6Qz': '081b5G6wzH4VagaURmcWbN5mT4JGEe2V', + 'lAqnwvkw6SG6D8DSqmUg6DRLUp0w3G4x': 'O2pbP0xPDFNJjpjIEvcdryOJtpkVM4X5', + 'awA7xd1N0Hr6050Hw2c52lPZiVlB864p': 'GZYKpn4aoT0d3l3c3PiwpxD7CbReMmXQ', + 'jQVqPLl9YHL1WGWtR1HDgWBGT63qRNyV': '6X03ne6vrU4oWyWUN7tQVoajikxJR3Ye', + 'GQRMR8mL7uZK797t7xH3eNzPIP5dOny1': 'm2vqPWGd4U31zWzSyasDRAoMT1PKRp8o', + 'zydq9RdmRhXLkNkfNoTJlMzaF0lWekQB': '3X7LnvE7vH5nkEkSqLiey793Un7dLB8e', + 'VQrDzwkB2IdBzjzu9MHPbEYkSB50gR4x': 'j2VebLzoKUKQeEesmVh0gM1eIp9jKz8z', + 'mAa2wMamBs17oGoFmktklMP9H0GnW54d': 'lXbgP74xZTkNGeGujVUAJGjZiwLRxy8R', + '7yjB6ZLG6sW8R6RF2xcan1KGfJ5dNoyd': 'wXQkPorvPHZ45N5t4Jf6qwg5Tp4xvw29', + 'a4zPpNeWGuzg0m0iX3tPeanGSkRKWXQg': 'LY9oa3QAyHdGW9Wu3Ri5JGeEik7l1N8Q', + 'k2rneA2M38k25cXDwwSknTJlxPxQLZ6M': '61lyA2aEVDzklfdwmmh31saPxQx2VRjp', + 'bK9Zk4OvPnvxduLgxvi8VUeojnjA02eV': 'o5jANYjbeMb4nfBaQvcLAt1jzLzYx6ze', + '5VD6EydM3R9orHmNMGInGCJwbxbQvGRw': 'w3zjmX7g4vnxzCxElvUEOiewkokXprkZ', + '70X35QbVYVYNPUmP9YfbzI06YqYQk2R1': 'vG4Aj2BMjMjoztB7zeFOnCVPJpJ8lMOa', + '26qYwQVG9p1Bks2GgBckjfDJOXOAMgG1': 'r4ev9X0mv5zqJc0yk5IBDcQOwZw8mnwQ', + 'rvVKpA56MBXWlSxMw3cobT5pdkd4Dm7q': '1J7ZkY53pZ645c93owcLZuveE7E8B3rL', + 'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo': 'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo', + 'jdKqRGF16dKsBviMDae7IGDl7oTjEbVV': 'Q09l7vhlNxPFErIOK6BVCe7KnwUW5DVV', + '3QLkogW1OUJ5VvPsrDH56DY2u7lgZWyY': 'g2LRE1V9espmAOPhE4ubj4ZdUA57yDXa', + 
'wyJvWbXGBSdbkEzhv0CW8meou82aqRy8': 'M2wolPvyBIpQGkbT4juedD4ruzQGdK2y', + '7QkdZrzEkFjKap6IYDU2PB0oCNZORmA0': 'b2kN1l96qhJaMkPs9dt1lpjBfwqZoA8P', + 'pvA05113MHG1w3JTYxc6DVlRCjErVz4O': 'gQXeAbblBUnDJ7vujbHvbRd1cxlz3AXO', + 'mA9blJDZwT0raG1cvkuoeVjLC7ZWd54q': '0XN9jRPwMHnW7rvumgfJZOD9CJgVkWYr', + '5QwRN5qKJTvGKlDTmnf7xwNZcjRmvEy9': 'R2GP6LWBJU1QlnytwGt0B9pytWwAdDYy', + 'eyn5rPPbkfw2KYxH32fG1q58CbLJzM40': 'p2gyqooZnS56JWeiDgfmOy1VugOQEBXn', + '3BABn3b5RfPJGDwilbHe7l82uBoR05Am': '7OYZG7KMVhbPdKJS3xcWEN3AuDlLNmXj', + 'xA5zNGXD3HrmqMlF6OS5pdMDuZO7RJ4w': 'yY5DAm6r1IOLE3BCVMFveEMAcqnx3r29', + 'g43PgW3JZfml7o6fDEURL1ErCdeD8zyK': 'RX3aQn1zrS4Nr6whDgCGLv9WSbxEKo2V', + 'lAqp8WbGgiG6D8LTKJcg3O72CDdre1Qx': 'O2pnm6473HNJjpKuVosd3vVeh975yrX5', + 'wyJbYEDxKSdbkJ6S6RhW8meou82aqRy8': 'M2wPm7EgRSpQGlAh70CedD4ruzQGdKYy', + 'M4lgW28nLCe0PVdtaXszVXq0SeRVqzA9': 'n2zmJvg4jHv3G0ETNgiwW51LcveWOZ8e', + '5Qw3OVvp9FvGKlDTmOC7xwNZcjRmvEQ9': 'R2GzDdml9F1Qlnytw9s0B9pytWwAdD8y', + 'vy8a98X7zCwrRqbHrLUjYzwDiK2b70Qb': 'K8lVwzyjZiBpnAaSGeUmnAgxuGOBxmY0', + 'g4eGjJLLoiqRD3Pf9oT5O03LuNbLRDQp': '6XqD59zzpfN4EwQuaGt67qNpSyRBlnYy', + 'g43OPp9boIml7o6fDOIRL1ErCdeD8z4K': 'RX33alNB4s4Nr6whDPUGLv9WSbxEKoXV', + 'xA2ng9OkBcGKzDbTkKsJlx7dUK8R3dA5': 'z2aPnJvzBfObkwGC3vFaPxeBhxoMqZ8K', + 'xyKEgBajZuRr6DEC0Kt7XpD1cnNW9gAv': 'g2ddlEBvRsqdgKaI4jUK9PrgfMexGZ23', + 'BAogww51jIMa2JnH1BcYpXM5F658RNAL': 'rYWDmm0KptlkGv4FGJFMdZmjs9RDE6XR', + 'BAokpg62VtMa2JnH1mHYpXM5F658RNAL': 'rYWryDnlNslkGv4FG4HMdZmjs9RDE62R', + 'a4z1Px5e2hzg0m0iMMCPeanGSkRKWXAg': 'LY9eorNQGUdGW9WuKKf5JGeEik7l1NYQ', + 'kAx69R58kF9nY5YcdecJdl2pFXP53WyX': 'gXyRxELpbfPvLeLSaRil0mp6UEzbZJ8L', + 'BAoY13nwViMa2J2uo2cY6BlETgmdwryL': 'rYWwKzJmNFlkGvGtNoUM9bzwIJVzB1YR', } _MCP_TO_ACCESS_KEY_TABLE = { @@ -134,6 +204,10 @@ class AnvatoIE(InfoExtractor): 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' } + _TOKEN_GENERATORS = { + 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator, + } + _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' @@ -189,19 +263,20 @@ class AnvatoIE(InfoExtractor): video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii') anvrid = md5_text(time.time() * 1000 * random.random())[:30] - payload = { - 'api': { - 'anvrid': anvrid, - 'anvstk': md5_text('%s|%s|%d|%s' % ( - access_key, anvrid, server_time, - self._ANVACK_TABLE.get(access_key, self._API_KEY))), - 'anvts': server_time, - }, + api = { + 'anvrid': anvrid, + 'anvts': server_time, } + if access_key in self._TOKEN_GENERATORS: + api['anvstk2'] = self._TOKEN_GENERATORS[access_key].generate(self, access_key, video_id) + else: + api['anvstk'] = md5_text('%s|%s|%d|%s' % ( + access_key, anvrid, server_time, + self._ANVACK_TABLE.get(access_key, self._API_KEY))) return self._download_json( video_data_url, video_id, transform_source=strip_jsonp, - data=json.dumps(payload).encode('utf-8')) + data=json.dumps({'api': api}).encode('utf-8')) def _get_anvato_videos(self, access_key, video_id): video_data = self._get_video_json(access_key, video_id) @@ -259,7 +334,7 @@ class AnvatoIE(InfoExtractor): 'description': video_data.get('def_description'), 'tags': video_data.get('def_tags', '').split(','), 'categories': video_data.get('categories'), - 'thumbnail': video_data.get('thumbnail'), + 'thumbnail': video_data.get('src_image_url') or video_data.get('thumbnail'), 'timestamp': int_or_none(video_data.get( 'ts_published') or video_data.get('ts_added')), 'uploader': 
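
For anvacks without a token generator, the legacy anvstk signature built above amounts to this sketch (the key and secret are placeholders, not entries from the table):

    import hashlib
    import random
    import time

    def md5_text(text):
        return hashlib.md5(str(text).encode('utf-8')).hexdigest()

    access_key, secret = 'example_anvack', 'example_secret'
    server_time = int(time.time())
    anvrid = md5_text(time.time() * 1000 * random.random())[:30]
    # Same '|'-joined layout as in _get_video_json() above.
    anvstk = md5_text('%s|%s|%d|%s' % (access_key, anvrid, server_time, secret))
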
video_data.get('mcp_id'), diff --git a/haruhi_dl/extractor/anvato_token_generator/__init__.py b/haruhi_dl/extractor/anvato_token_generator/__init__.py new file mode 100644 index 000000000..6e223db9f --- /dev/null +++ b/haruhi_dl/extractor/anvato_token_generator/__init__.py @@ -0,0 +1,7 @@ +from __future__ import unicode_literals + +from .nfl import NFLTokenGenerator + +__all__ = [ + 'NFLTokenGenerator', +] diff --git a/haruhi_dl/extractor/anvato_token_generator/common.py b/haruhi_dl/extractor/anvato_token_generator/common.py new file mode 100644 index 000000000..b959a903b --- /dev/null +++ b/haruhi_dl/extractor/anvato_token_generator/common.py @@ -0,0 +1,6 @@ +from __future__ import unicode_literals + + +class TokenGenerator: + def generate(self, anvack, mcp_id): + raise NotImplementedError('This method must be implemented by subclasses') diff --git a/haruhi_dl/extractor/anvato_token_generator/nfl.py b/haruhi_dl/extractor/anvato_token_generator/nfl.py new file mode 100644 index 000000000..97a2b245f --- /dev/null +++ b/haruhi_dl/extractor/anvato_token_generator/nfl.py @@ -0,0 +1,30 @@ +from __future__ import unicode_literals + +import json + +from .common import TokenGenerator + + +class NFLTokenGenerator(TokenGenerator): + _AUTHORIZATION = None + + def generate(ie, anvack, mcp_id): + if not NFLTokenGenerator._AUTHORIZATION: + reroute = ie._download_json( + 'https://api.nfl.com/v1/reroute', mcp_id, + data=b'grant_type=client_credentials', + headers={'X-Domain-Id': 100}) + NFLTokenGenerator._AUTHORIZATION = '%s %s' % (reroute.get('token_type') or 'Bearer', reroute['access_token']) + return ie._download_json( + 'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({ + 'query': '''{ + viewer { + mediaToken(anvack: "%s", id: %s) { + token + } + } +}''' % (anvack, mcp_id), + }).encode(), headers={ + 'Authorization': NFLTokenGenerator._AUTHORIZATION, + 'Content-Type': 'application/json', + })['data']['viewer']['mediaToken']['token'] From 7346665442a3f485e8db21aafdbcc21266885e7a Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:16:34 +0100 Subject: [PATCH 156/384] [nfl] fix extraction(closes #22245) --- haruhi_dl/extractor/extractors.py | 5 +- haruhi_dl/extractor/nfl.py | 258 +++++++++++------------------- 2 files changed, 97 insertions(+), 166 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 825a28907..6a78ccb97 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -755,7 +755,10 @@ from .nexx import ( NexxIE, NexxEmbedIE, ) -from .nfl import NFLIE +from .nfl import ( + NFLIE, + NFLArticleIE, +) from .nhk import ( NhkVodIE, NhkVodProgramIE, diff --git a/haruhi_dl/extractor/nfl.py b/haruhi_dl/extractor/nfl.py index 460deb162..e234fad38 100644 --- a/haruhi_dl/extractor/nfl.py +++ b/haruhi_dl/extractor/nfl.py @@ -4,19 +4,15 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlparse, -) from ..utils import ( - ExtractorError, - int_or_none, - remove_end, + clean_html, + determine_ext, + get_element_by_class, ) -class NFLIE(InfoExtractor): - IE_NAME = 'nfl.com' - _VALID_URL = r'''(?x) +class NFLBaseIE(InfoExtractor): + _VALID_URL_BASE = r'''(?x) https?:// (?P<host> (?:www\.)? 
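
A new generator plugs into the registry by subclassing TokenGenerator; note that, as with NFLTokenGenerator, generate() is looked up on the class itself and receives the extractor instance as its first argument. The anvack key below is a placeholder, not a real table entry:

    class StaticTokenGenerator(TokenGenerator):
        def generate(ie, anvack, mcp_id):
            # A real implementation would call out to the partner's auth
            # backend here, as the NFL generator does.
            return 'static-token'

    # Hypothetical registration:
    # AnvatoIE._TOKEN_GENERATORS['example_anvack'] = StaticTokenGenerator
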
@@ -34,15 +30,15 @@ class NFLIE(InfoExtractor): houstontexans| colts| jaguars| - titansonline| + (?:titansonline|tennesseetitans)| denverbroncos| - kcchiefs| + (?:kc)?chiefs| raiders| chargers| dallascowboys| giants| philadelphiaeagles| - redskins| + (?:redskins|washingtonfootball)| chicagobears| detroitlions| packers| @@ -52,180 +48,112 @@ class NFLIE(InfoExtractor): neworleanssaints| buccaneers| azcardinals| - stlouisrams| + (?:stlouis|the)rams| 49ers| seahawks )\.com| .+?\.clubs\.nfl\.com ) )/ - (?:.+?/)* - (?P<id>[^/#?&]+) ''' + _VIDEO_CONFIG_REGEX = r'<script[^>]+id="[^"]*video-config-[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}[^"]*"[^>]*>\s*({.+})' + + def _parse_video_config(self, video_config, display_id): + video_config = self._parse_json(video_config, display_id) + item = video_config['playlist'][0] + mcp_id = item.get('mcpID') + if mcp_id: + info = self.url_result( + 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:' + mcp_id, + 'Anvato', mcp_id) + else: + media_id = item.get('id') or item['entityId'] + title = item['title'] + item_url = item['url'] + info = {'id': media_id} + ext = determine_ext(item_url) + if ext == 'm3u8': + info['formats'] = self._extract_m3u8_formats(item_url, media_id, 'mp4') + self._sort_formats(info['formats']) + else: + info['url'] = item_url + if item.get('audio') is True: + info['vcodec'] = 'none' + is_live = video_config.get('live') is True + thumbnails = None + image_url = item.get(item.get('imageSrc')) or item.get(item.get('posterImage')) + if image_url: + thumbnails = [{ + 'url': image_url, + 'ext': determine_ext(image_url, 'jpg'), + }] + info.update({ + 'title': self._live_title(title) if is_live else title, + 'is_live': is_live, + 'description': clean_html(item.get('description')), + 'thumbnails': thumbnails, + }) + return info + + +class NFLIE(NFLBaseIE): + IE_NAME = 'nfl.com' + _VALID_URL = NFLBaseIE._VALID_URL_BASE + r'(?:videos?|listen|audio)/(?P<id>[^/#?&]+)' _TESTS = [{ - 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', - 'md5': '394ef771ddcd1354f665b471d78ec4c6', + 'url': 'https://www.nfl.com/videos/baker-mayfield-s-game-changing-plays-from-3-td-game-week-14', 'info_dict': { - 'id': '0ap3000000398478', + 'id': '899441', 'ext': 'mp4', - 'title': 'Week 3: Redskins vs. Eagles highlights', - 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', - 'upload_date': '20140921', - 'timestamp': 1411337580, + 'title': "Baker Mayfield's game-changing plays from 3-TD game Week 14", + 'description': 'md5:85e05a3cc163f8c344340f220521136d', + 'upload_date': '20201215', + 'timestamp': 1608009755, 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'NFL', } }, { - 'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266', - 'md5': 'cf85bdb4bc49f6e9d3816d130c78279c', + 'url': 'https://www.chiefs.com/listen/patrick-mahomes-travis-kelce-react-to-win-over-dolphins-the-breakdown', + 'md5': '6886b32c24b463038c760ceb55a34566', 'info_dict': { - 'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266', - 'ext': 'mp4', - 'title': 'LIVE: Post Game vs. 
Browns', - 'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8', - 'upload_date': '20131229', - 'timestamp': 1388354455, - 'thumbnail': r're:^https?://.*\.jpg$', + 'id': 'd87e8790-3e14-11eb-8ceb-ff05c2867f99', + 'ext': 'mp3', + 'title': 'Patrick Mahomes, Travis Kelce React to Win Over Dolphins | The Breakdown', + 'description': 'md5:12ada8ee70e6762658c30e223e095075', } }, { - 'url': 'http://www.nfl.com/news/story/0ap3000000467586/article/patriots-seahawks-involved-in-lategame-skirmish', - 'info_dict': { - 'id': '0ap3000000467607', - 'ext': 'mp4', - 'title': 'Frustrations flare on the field', - 'description': 'Emotions ran high at the end of the Super Bowl on both sides of the ball after a dramatic finish.', - 'timestamp': 1422850320, - 'upload_date': '20150202', - }, - }, { - 'url': 'http://www.patriots.com/video/2015/09/18/10-days-gillette', - 'md5': '4c319e2f625ffd0b481b4382c6fc124c', - 'info_dict': { - 'id': 'n-238346', - 'ext': 'mp4', - 'title': '10 Days at Gillette', - 'description': 'md5:8cd9cd48fac16de596eadc0b24add951', - 'timestamp': 1442618809, - 'upload_date': '20150918', - }, - }, { - # lowercase data-contentid - 'url': 'http://www.steelers.com/news/article-1/Tomlin-on-Ben-getting-Vick-ready/56399c96-4160-48cf-a7ad-1d17d4a3aef7', - 'info_dict': { - 'id': '12693586-6ea9-4743-9c1c-02c59e4a5ef2', - 'ext': 'mp4', - 'title': 'Tomlin looks ahead to Ravens on a short week', - 'description': 'md5:32f3f7b139f43913181d5cbb24ecad75', - 'timestamp': 1443459651, - 'upload_date': '20150928', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood', + 'url': 'https://www.buffalobills.com/video/buffalo-bills-military-recognition-week-14', 'only_matching': True, }, { - 'url': 'http://www.buffalobills.com/video/videos/Rex_Ryan_Show_World_Wide_Rex/b1dcfab2-3190-4bb1-bfc0-d6e603d6601a', + 'url': 'https://www.raiders.com/audio/instant-reactions-raiders-week-14-loss-to-indianapolis-colts-espn-jason-fitz', 'only_matching': True, }] - @staticmethod - def prepend_host(host, url): - if not url.startswith('http'): - if not url.startswith('/'): - url = '/%s' % url - url = 'http://{0:}{1:}'.format(host, url) - return url + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + return self._parse_video_config(self._search_regex( + self._VIDEO_CONFIG_REGEX, webpage, 'video config'), display_id) - @staticmethod - def format_from_stream(stream, protocol, host, path_prefix='', - preference=0, note=None): - url = '{protocol:}://{host:}/{prefix:}{path:}'.format( - protocol=protocol, - host=host, - prefix=path_prefix, - path=stream.get('path'), - ) - return { - 'url': url, - 'vbr': int_or_none(stream.get('rate', 0), 1000), - 'preference': preference, - 'format_note': note, - } + +class NFLArticleIE(NFLBaseIE): + IE_NAME = 'nfl.com:article' + _VALID_URL = NFLBaseIE._VALID_URL_BASE + r'news/(?P<id>[^/#?&]+)' + _TEST = { + 'url': 'https://www.buffalobills.com/news/the-only-thing-we-ve-earned-is-the-noise-bills-coaches-discuss-handling-rising-e', + 'info_dict': { + 'id': 'the-only-thing-we-ve-earned-is-the-noise-bills-coaches-discuss-handling-rising-e', + 'title': "'The only thing we've earned is the noise' | Bills coaches discuss handling rising expectations", + }, + 'playlist_count': 4, + } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, host = mobj.group('id'), mobj.group('host') - - webpage = 
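
The embedded JSON that _parse_video_config() receives is pulled out by _VIDEO_CONFIG_REGEX; a toy page shows the mechanics (the UUID and payload are fabricated, real pages embed a much larger blob):

    import json
    import re

    VIDEO_CONFIG_REGEX = r'<script[^>]+id="[^"]*video-config-[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}[^"]*"[^>]*>\s*({.+})'

    webpage = ('<script id="video-config-01234567-89ab-cdef-0123-456789abcdef">'
               '{"playlist": [{"mcpID": "899441", "title": "Sample clip"}]}</script>')
    config = json.loads(re.search(VIDEO_CONFIG_REGEX, webpage).group(1))
    assert config['playlist'][0]['mcpID'] == '899441'
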
self._download_webpage(url, video_id) - - config_url = NFLIE.prepend_host(host, self._search_regex( - r'(?:(?:config|configURL)\s*:\s*|<nflcs:avplayer[^>]+data-config\s*=\s*)(["\'])(?P<config>.+?)\1', - webpage, 'config URL', default='static/content/static/config/video/config.json', - group='config')) - # For articles, the id in the url is not the video id - video_id = self._search_regex( - r'(?:<nflcs:avplayer[^>]+data-content[Ii]d\s*=\s*|content[Ii]d\s*:\s*)(["\'])(?P<id>(?:(?!\1).)+)\1', - webpage, 'video id', default=video_id, group='id') - config = self._download_json(config_url, video_id, 'Downloading player config') - url_template = NFLIE.prepend_host( - host, '{contentURLTemplate:}'.format(**config)) - video_data = self._download_json( - url_template.format(id=video_id), video_id) - - formats = [] - cdn_data = video_data.get('cdnData', {}) - streams = cdn_data.get('bitrateInfo', []) - if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM': - parts = compat_urllib_parse_urlparse(cdn_data.get('uri')) - protocol, host = parts.scheme, parts.netloc - for stream in streams: - formats.append( - NFLIE.format_from_stream(stream, protocol, host)) - else: - cdns = config.get('cdns') - if not cdns: - raise ExtractorError('Failed to get CDN data', expected=True) - - for name, cdn in cdns.items(): - # LimeLight streams don't seem to work - if cdn.get('name') == 'LIMELIGHT': - continue - - protocol = cdn.get('protocol') - host = remove_end(cdn.get('host', ''), '/') - if not (protocol and host): - continue - - prefix = cdn.get('pathprefix', '') - if prefix and not prefix.endswith('/'): - prefix = '%s/' % prefix - - preference = 0 - if protocol == 'rtmp': - preference = -2 - elif 'prog' in name.lower(): - preference = 1 - - for stream in streams: - formats.append( - NFLIE.format_from_stream(stream, protocol, host, - prefix, preference, name)) - - self._sort_formats(formats) - - thumbnail = None - for q in ('xl', 'l', 'm', 's', 'xs'): - thumbnail = video_data.get('imagePaths', {}).get(q) - if thumbnail: - break - - return { - 'id': video_id, - 'title': video_data.get('headline'), - 'formats': formats, - 'description': video_data.get('caption'), - 'duration': video_data.get('duration'), - 'thumbnail': thumbnail, - 'timestamp': int_or_none(video_data.get('posted'), 1000), - } + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + entries = [] + for video_config in re.findall(self._VIDEO_CONFIG_REGEX, webpage): + entries.append(self._parse_video_config(video_config, display_id)) + title = clean_html(get_element_by_class( + 'nfl-c-article__title', webpage)) or self._html_search_meta( + ['og:title', 'twitter:title'], webpage) + return self.playlist_result(entries, display_id, title) From 794a3becfbbc08a4c6268c38ac884b040a2cccc0 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 15:17:04 +0100 Subject: [PATCH 157/384] [asiancrush] fix extraction and add support for retrocrush.tv closes #25577 closes #25829 --- haruhi_dl/extractor/asiancrush.py | 211 +++++++++++++++++++----------- 1 file changed, 133 insertions(+), 78 deletions(-) diff --git a/haruhi_dl/extractor/asiancrush.py b/haruhi_dl/extractor/asiancrush.py index 0348e680c..66ce7c686 100644 --- a/haruhi_dl/extractor/asiancrush.py +++ b/haruhi_dl/extractor/asiancrush.py @@ -1,27 +1,91 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import re from .common import InfoExtractor from .kaltura import KalturaIE -from ..utils import extract_attributes +from 
..utils import ( + extract_attributes, + int_or_none, + OnDemandPagedList, + parse_age_limit, + strip_or_none, + try_get, +) -class AsianCrushIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?(?P<host>(?:(?:asiancrush|yuyutv|midnightpulp)\.com|cocoro\.tv))' - _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' % _VALID_URL_BASE +class AsianCrushBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?(?P<host>(?:(?:asiancrush|yuyutv|midnightpulp)\.com|(?:cocoro|retrocrush)\.tv))' + _KALTURA_KEYS = [ + 'video_url', 'progressive_url', 'download_url', 'thumbnail_url', + 'widescreen_thumbnail_url', 'screencap_widescreen', + ] + _API_SUFFIX = {'retrocrush.tv': '-ott'} + + def _call_api(self, host, endpoint, video_id, query, resource): + return self._download_json( + 'https://api%s.%s/%s' % (self._API_SUFFIX.get(host, ''), host, endpoint), video_id, + 'Downloading %s JSON metadata' % resource, query=query, + headers=self.geo_verification_headers())['objects'] + + def _download_object_data(self, host, object_id, resource): + return self._call_api( + host, 'search', object_id, {'id': object_id}, resource)[0] + + def _get_object_description(self, obj): + return strip_or_none(obj.get('long_description') or obj.get('short_description')) + + def _parse_video_data(self, video): + title = video['name'] + + entry_id, partner_id = [None] * 2 + for k in self._KALTURA_KEYS: + k_url = video.get(k) + if k_url: + mobj = re.search(r'/p/(\d+)/.+?/entryId/([^/]+)/', k_url) + if mobj: + partner_id, entry_id = mobj.groups() + break + + meta_categories = try_get(video, lambda x: x['meta']['categories'], list) or [] + categories = list(filter(None, [c.get('name') for c in meta_categories])) + + show_info = video.get('show_info') or {} + + return { + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'ie_key': KalturaIE.ie_key(), + 'id': entry_id, + 'title': title, + 'description': self._get_object_description(video), + 'age_limit': parse_age_limit(video.get('mpaa_rating') or video.get('tv_rating')), + 'categories': categories, + 'series': show_info.get('show_name'), + 'season_number': int_or_none(show_info.get('season_num')), + 'season_id': show_info.get('season_id'), + 'episode_number': int_or_none(show_info.get('episode_num')), + } + + +class AsianCrushIE(AsianCrushBaseIE): + _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P<id>\d+)v\b' % AsianCrushBaseIE._VALID_URL_BASE _TESTS = [{ - 'url': 'https://www.asiancrush.com/video/012869v/women-who-flirt/', + 'url': 'https://www.asiancrush.com/video/004289v/women-who-flirt', 'md5': 'c3b740e48d0ba002a42c0b72857beae6', 'info_dict': { 'id': '1_y4tmjm5r', 'ext': 'mp4', 'title': 'Women Who Flirt', - 'description': 'md5:7e986615808bcfb11756eb503a751487', + 'description': 'md5:b65c7e0ae03a85585476a62a186f924c', 'timestamp': 1496936429, 'upload_date': '20170608', 'uploader_id': 'craig@crifkin.com', + 'age_limit': 13, + 'categories': 'count:5', + 'duration': 5812, }, }, { 'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/', @@ -41,67 +105,35 @@ class AsianCrushIE(InfoExtractor): }, { 'url': 'https://www.cocoro.tv/video/the-wonderful-wizard-of-oz/008878v-the-wonderful-wizard-of-oz-ep01/', 'only_matching': True, + }, { + 'url': 'https://www.retrocrush.tv/video/true-tears/012328v-i...gave-away-my-tears', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - video_id = mobj.group('id') + host, video_id = re.match(self._VALID_URL, 
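
The Kaltura partner and entry ids are recovered from whichever media URL is present in the video object; the URL below is shaped like a playManifest link but is illustrative only:

    import re

    k_url = 'https://cdnapisec.kaltura.com/p/513551/sp/51355100/playManifest/entryId/1_y4tmjm5r/format/url'
    partner_id, entry_id = re.search(
        r'/p/(\d+)/.+?/entryId/([^/]+)/', k_url).groups()
    assert (partner_id, entry_id) == ('513551', '1_y4tmjm5r')
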
url).groups() - webpage = self._download_webpage(url, video_id) - - entry_id, partner_id, title = [None] * 3 - - vars = self._parse_json( - self._search_regex( + if host == 'cocoro.tv': + webpage = self._download_webpage(url, video_id) + embed_vars = self._parse_json(self._search_regex( r'iEmbedVars\s*=\s*({.+?})', webpage, 'embed vars', - default='{}'), video_id, fatal=False) - if vars: - entry_id = vars.get('entry_id') - partner_id = vars.get('partner_id') - title = vars.get('vid_label') + default='{}'), video_id, fatal=False) or {} + video_id = embed_vars.get('entry_id') or video_id - if not entry_id: - entry_id = self._search_regex( - r'\bentry_id["\']\s*:\s*["\'](\d+)', webpage, 'entry id') - - player = self._download_webpage( - 'https://api.%s/embeddedVideoPlayer' % host, video_id, - query={'id': entry_id}) - - kaltura_id = self._search_regex( - r'entry_id["\']\s*:\s*(["\'])(?P<id>(?:(?!\1).)+)\1', player, - 'kaltura id', group='id') - - if not partner_id: - partner_id = self._search_regex( - r'/p(?:artner_id)?/(\d+)', player, 'partner id', - default='513551') - - description = self._html_search_regex( - r'(?s)<div[^>]+\bclass=["\']description["\'][^>]*>(.+?)</div>', - webpage, 'description', fatal=False) - - return { - '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (partner_id, kaltura_id), - 'ie_key': KalturaIE.ie_key(), - 'id': video_id, - 'title': title, - 'description': description, - } + video = self._download_object_data(host, video_id, 'video') + return self._parse_video_data(video) -class AsianCrushPlaylistIE(InfoExtractor): - _VALID_URL = r'%s/series/0+(?P<id>\d+)s\b' % AsianCrushIE._VALID_URL_BASE +class AsianCrushPlaylistIE(AsianCrushBaseIE): + _VALID_URL = r'%s/series/0+(?P<id>\d+)s\b' % AsianCrushBaseIE._VALID_URL_BASE _TESTS = [{ - 'url': 'https://www.asiancrush.com/series/012481s/scholar-walks-night/', + 'url': 'https://www.asiancrush.com/series/006447s/fruity-samurai', 'info_dict': { - 'id': '12481', - 'title': 'Scholar Who Walks the Night', - 'description': 'md5:7addd7c5132a09fd4741152d96cce886', + 'id': '6447', + 'title': 'Fruity Samurai', + 'description': 'md5:7535174487e4a202d3872a7fc8f2f154', }, - 'playlist_count': 20, + 'playlist_count': 13, }, { 'url': 'https://www.yuyutv.com/series/013920s/peep-show/', 'only_matching': True, @@ -111,35 +143,58 @@ class AsianCrushPlaylistIE(InfoExtractor): }, { 'url': 'https://www.cocoro.tv/series/008549s/the-wonderful-wizard-of-oz/', 'only_matching': True, + }, { + 'url': 'https://www.retrocrush.tv/series/012355s/true-tears', + 'only_matching': True, }] + _PAGE_SIZE = 1000000000 + + def _fetch_page(self, domain, parent_id, page): + videos = self._call_api( + domain, 'getreferencedobjects', parent_id, { + 'max': self._PAGE_SIZE, + 'object_type': 'video', + 'parent_id': parent_id, + 'start': page * self._PAGE_SIZE, + }, 'page %d' % (page + 1)) + for video in videos: + yield self._parse_video_data(video) def _real_extract(self, url): - playlist_id = self._match_id(url) + host, playlist_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, playlist_id) + if host == 'cocoro.tv': + webpage = self._download_webpage(url, playlist_id) - entries = [] + entries = [] - for mobj in re.finditer( - r'<a[^>]+href=(["\'])(?P<url>%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL, - webpage): - attrs = extract_attributes(mobj.group(0)) - if attrs.get('class') == 'clearfix': - entries.append(self.url_result( - mobj.group('url'), ie=AsianCrushIE.ie_key())) + for mobj in re.finditer( + 
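
The playlist side leans on OnDemandPagedList so pages are only fetched when entries are actually requested; a sketch with a stub fetcher standing in for the Solr-backed _fetch_page():

    import functools

    from haruhi_dl.utils import OnDemandPagedList

    def fetch_page(host, parent_id, page):
        # Stub: a real page yields the dicts produced by _parse_video_data().
        for n in range(3):
            yield {'id': '%s-%d-%d' % (parent_id, page, n)}

    entries = OnDemandPagedList(
        functools.partial(fetch_page, 'asiancrush.com', '6447'), 1000000000)
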
r'<a[^>]+href=(["\'])(?P<url>%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL, + webpage): + attrs = extract_attributes(mobj.group(0)) + if attrs.get('class') == 'clearfix': + entries.append(self.url_result( + mobj.group('url'), ie=AsianCrushIE.ie_key())) - title = self._html_search_regex( - r'(?s)<h1\b[^>]\bid=["\']movieTitle[^>]+>(.+?)</h1>', webpage, - 'title', default=None) or self._og_search_title( - webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', - default=None) or self._search_regex( - r'<title>([^<]+)', webpage, 'title', fatal=False) - if title: - title = re.sub(r'\s*\|\s*.+?$', '', title) + title = self._html_search_regex( + r'(?s)]\bid=["\']movieTitle[^>]+>(.+?)', webpage, + 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', + default=None) or self._search_regex( + r'([^<]+)', webpage, 'title', fatal=False) + if title: + title = re.sub(r'\s*\|\s*.+?$', '', title) - description = self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'twitter:description', webpage, 'description', fatal=False) + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage, 'description', fatal=False) + else: + show = self._download_object_data(host, playlist_id, 'show') + title = show.get('name') + description = self._get_object_description(show) + entries = OnDemandPagedList( + functools.partial(self._fetch_page, host, playlist_id), + self._PAGE_SIZE) return self.playlist_result(entries, playlist_id, title, description) From 597505ed41af92c8c35b954d1cfc37f65dffdaf2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:19:00 +0100 Subject: [PATCH 158/384] [zaq1] Remove extractor --- haruhi_dl/extractor/extractors.py | 1 - haruhi_dl/extractor/zaq1.py | 101 ------------------------------ 2 files changed, 102 deletions(-) delete mode 100644 haruhi_dl/extractor/zaq1.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 6a78ccb97..39fa151e6 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1586,7 +1586,6 @@ from .youtube import ( YoutubeTruncatedURLIE, ) from .zapiks import ZapiksIE -from .zaq1 import Zaq1IE from .zattoo import ( BBVTVIE, EinsUndEinsTVIE, diff --git a/haruhi_dl/extractor/zaq1.py b/haruhi_dl/extractor/zaq1.py deleted file mode 100644 index 889aff5d8..000000000 --- a/haruhi_dl/extractor/zaq1.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_timestamp, -) - - -class Zaq1IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'http://zaq1.pl/video/xev0e', - 'md5': '24a5eb3f052e604ae597c4d0d19b351e', - 'info_dict': { - 'id': 'xev0e', - 'title': 'DJ NA WESELE. 
TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa', - 'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147', - 'ext': 'mp4', - 'duration': 511, - 'timestamp': 1490896361, - 'uploader': 'Anonim', - 'upload_date': '20170330', - 'view_count': int, - } - }, { - # malformed JSON-LD - 'url': 'http://zaq1.pl/video/x81vn', - 'info_dict': { - 'id': 'x81vn', - 'title': 'SEKRETNE ŻYCIE WALTERA MITTY', - 'ext': 'mp4', - 'duration': 6234, - 'timestamp': 1493494860, - 'uploader': 'Anonim', - 'upload_date': '20170429', - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Failed to parse JSON'], - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video_url = self._search_regex( - r'data-video-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'video url', group='url') - - info = self._search_json_ld(webpage, video_id, fatal=False) - - def extract_data(field, name, fatal=False): - return self._search_regex( - r'data-%s=(["\'])(?P(?:(?!\1).)+)\1' % field, - webpage, field, fatal=fatal, group='field') - - if not info.get('title'): - info['title'] = extract_data('file-name', 'title', fatal=True) - - if not info.get('duration'): - info['duration'] = int_or_none(extract_data('duration', 'duration')) - - if not info.get('thumbnail'): - info['thumbnail'] = extract_data('photo-url', 'thumbnail') - - if not info.get('timestamp'): - info['timestamp'] = unified_timestamp(self._html_search_meta( - 'uploadDate', webpage, 'timestamp')) - - if not info.get('interactionCount'): - info['view_count'] = int_or_none(self._html_search_meta( - 'interactionCount', webpage, 'view count')) - - uploader = self._html_search_regex( - r'Wideo dodał:\s*]*>([^<]+)', webpage, 'uploader', - fatal=False) - - width = int_or_none(self._html_search_meta( - 'width', webpage, fatal=False)) - height = int_or_none(self._html_search_meta( - 'height', webpage, fatal=False)) - - info.update({ - 'id': video_id, - 'formats': [{ - 'url': video_url, - 'width': width, - 'height': height, - 'http_headers': { - 'Referer': url, - }, - }], - 'uploader': uploader, - }) - - return info From 76c441edf0ddc26d165b679f4ceb52a96237c7d5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:20:36 +0100 Subject: [PATCH 159/384] [anvato] Disable NFLTokenGenerator(closes #27449) --- haruhi_dl/extractor/anvato.py | 4 ++-- haruhi_dl/extractor/nfl.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/anvato.py b/haruhi_dl/extractor/anvato.py index a6410311c..98c5e6d38 100644 --- a/haruhi_dl/extractor/anvato.py +++ b/haruhi_dl/extractor/anvato.py @@ -9,7 +9,7 @@ import re import time from .common import InfoExtractor -from .anvato_token_generator import NFLTokenGenerator +# from .anvato_token_generator import NFLTokenGenerator from ..aes import aes_encrypt from ..compat import compat_str from ..utils import ( @@ -205,7 +205,7 @@ class AnvatoIE(InfoExtractor): } _TOKEN_GENERATORS = { - 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator, + # 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator, } _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' diff --git a/haruhi_dl/extractor/nfl.py b/haruhi_dl/extractor/nfl.py index e234fad38..871923e4c 100644 --- a/haruhi_dl/extractor/nfl.py +++ b/haruhi_dl/extractor/nfl.py @@ -57,6 +57,7 @@ class NFLBaseIE(InfoExtractor): )/ ''' _VIDEO_CONFIG_REGEX = 
r']+id="[^"]*video-config-[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}[^"]*"[^>]*>\s*({.+})' + _WORKING = False def _parse_video_config(self, video_config, display_id): video_config = self._parse_json(video_config, display_id) From 4f7380c8f5f52840d6210c62bff276c94f463bf3 Mon Sep 17 00:00:00 2001 From: Trevor Nelson <25140503+trevnels@users.noreply.github.com> Date: Fri, 26 Feb 2021 15:20:42 +0100 Subject: [PATCH 160/384] [redditr] Extract duration (#27426) --- haruhi_dl/extractor/reddit.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/haruhi_dl/extractor/reddit.py b/haruhi_dl/extractor/reddit.py index 663f622b3..3b2abb262 100644 --- a/haruhi_dl/extractor/reddit.py +++ b/haruhi_dl/extractor/reddit.py @@ -7,6 +7,7 @@ from ..utils import ( ExtractorError, int_or_none, float_or_none, + try_get, url_or_none, ) @@ -59,6 +60,7 @@ class RedditRIE(InfoExtractor): 'timestamp': 1501941939, 'upload_date': '20170805', 'uploader': 'Antw87', + 'duration': 12, 'like_count': int, 'dislike_count': int, 'comment_count': int, @@ -123,6 +125,10 @@ class RedditRIE(InfoExtractor): 'thumbnail': url_or_none(data.get('thumbnail')), 'timestamp': float_or_none(data.get('created_utc')), 'uploader': data.get('author'), + 'duration': int_or_none(try_get( + data, + (lambda x: x['media']['reddit_video']['duration'], + lambda x: x['secure_media']['reddit_video']['duration']))), 'like_count': int_or_none(data.get('ups')), 'dislike_count': int_or_none(data.get('downs')), 'comment_count': int_or_none(data.get('num_comments')), From c359b1903460fa751a39ccf6fd079a50b84c8af4 Mon Sep 17 00:00:00 2001 From: renalid Date: Fri, 26 Feb 2021 15:20:47 +0100 Subject: [PATCH 161/384] [generic] Fix RSS itunes thumbnail extraction (#27405) --- haruhi_dl/extractor/generic.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index a321bcd6d..181ccd491 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -35,6 +35,7 @@ from ..utils import ( unsmuggle_url, UnsupportedError, url_or_none, + xpath_attr, xpath_text, xpath_with_ns, ) @@ -223,6 +224,30 @@ class GenericIE(InfoExtractor): }, }], }, + # RSS feed with item with description and thumbnails + { + 'url': 'https://anchor.fm/s/dd00e14/podcast/rss', + 'info_dict': { + 'id': 'https://anchor.fm/s/dd00e14/podcast/rss', + 'title': 're:.*100% Hydrogen.*', + 'description': 're:.*In this episode.*', + }, + 'playlist': [{ + 'info_dict': { + 'ext': 'm4a', + 'id': 'c1c879525ce2cb640b344507e682c36d', + 'title': 're:Hydrogen!', + 'description': 're:.*In this episode we are going.*', + 'timestamp': int, + 'upload_date': '20190908', + 'duration': int, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }], + 'params': { + 'skip_download': True, + }, + }, # RSS feed with enclosures and unsupported link URLs { 'url': 'http://www.hellointernet.fm/podcast?format=rss', @@ -2271,7 +2296,7 @@ class GenericIE(InfoExtractor): 'timestamp': unified_timestamp( xpath_text(it, 'pubDate', default=None)), 'duration': int_or_none(duration) or parse_duration(duration), - 'thumbnail': url_or_none(itunes('image')), + 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')), 'episode': itunes('title'), 'episode_number': int_or_none(itunes('episode')), 'season_number': int_or_none(itunes('season')), From 457ef9b4b50b1bc538a2c98f27c7274d768f45a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 
Feb 2021 15:20:52 +0100 Subject: [PATCH 162/384] [generic] Improve RSS age limit extraction --- haruhi_dl/extractor/generic.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index 181ccd491..240de66da 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -238,10 +238,13 @@ class GenericIE(InfoExtractor): 'id': 'c1c879525ce2cb640b344507e682c36d', 'title': 're:Hydrogen!', 'description': 're:.*In this episode we are going.*', - 'timestamp': int, + 'timestamp': 1567977776, 'upload_date': '20190908', - 'duration': int, + 'duration': 459, 'thumbnail': r're:^https?://.*\.jpg$', + 'episode_number': 1, + 'season_number': 1, + 'age_limit': 0, }, }], 'params': { @@ -2280,10 +2283,10 @@ class GenericIE(InfoExtractor): default=None) duration = itunes('duration') - explicit = itunes('explicit') - if explicit == 'true': + explicit = (itunes('explicit') or '').lower() + if explicit in ('true', 'yes'): age_limit = 18 - elif explicit == 'false': + elif explicit in ('false', 'no'): age_limit = 0 else: age_limit = None From 9d2fabe5d43845c5849da50ad11e0d33fefdf6c4 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:21:13 +0100 Subject: [PATCH 163/384] [common] remove unwanted query params from unsigned akamai manifest URLs --- haruhi_dl/extractor/common.py | 9 ++++++++- haruhi_dl/extractor/nrk.py | 3 +-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py index f845688f5..6b8ebbdef 100644 --- a/haruhi_dl/extractor/common.py +++ b/haruhi_dl/extractor/common.py @@ -2621,6 +2621,13 @@ class InfoExtractor(object): return entries def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): + signed = 'hdnea=' in manifest_url + if not signed: + # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html + manifest_url = re.sub( + r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?', + '', manifest_url).strip('?') + formats = [] hdcore_sign = 'hdcore=3.7.0' @@ -2646,7 +2653,7 @@ class InfoExtractor(object): formats.extend(m3u8_formats) http_host = hosts.get('http') - if http_host and m3u8_formats and 'hdnea=' not in m3u8_url: + if http_host and m3u8_formats and not signed: REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+' qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') qualities_length = len(qualities) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index fdf2d7407..b545f291b 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -33,8 +33,7 @@ class NRKBaseIE(InfoExtractor): def _extract_nrk_formats(self, asset_url, video_id): if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): - return self._extract_akamai_formats( - re.sub(r'(?:b=\d+-\d+|__a__=off)&?', '', asset_url), video_id) + return self._extract_akamai_formats(asset_url, video_id) asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url) formats = self._extract_m3u8_formats( asset_url, video_id, 'mp4', 'm3u8_native', fatal=False) From cb5a16067be0caa65b685014ed5ecb03a902de7f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:21:20 +0100 Subject: [PATCH 164/384] [turner] improve info extraction --- haruhi_dl/extractor/cnn.py | 5 +++- haruhi_dl/extractor/turner.py | 44 ++++++++++++++++++++++++++++------- 2 files changed, 39 insertions(+), 
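
The unsigned-manifest cleanup added to _extract_akamai_formats() strips Akamai stream-packaging parameters before requesting the manifest; the manifest URL below is illustrative:

    import re

    url = 'https://example-vh.akamaihd.net/i/foo/master.m3u8?b=500,1500&__a__=off'
    clean = re.sub(
        r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?', '', url).strip('?')
    assert clean == 'https://example-vh.akamaihd.net/i/foo/master.m3u8'
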
10 deletions(-) diff --git a/haruhi_dl/extractor/cnn.py b/haruhi_dl/extractor/cnn.py index 774b71055..2d950fa05 100644 --- a/haruhi_dl/extractor/cnn.py +++ b/haruhi_dl/extractor/cnn.py @@ -96,7 +96,10 @@ class CNNIE(TurnerBaseIE): config['data_src'] % path, page_title, { 'default': { 'media_src': config['media_src'], - } + }, + 'f4m': { + 'host': 'cnn-vh.akamaihd.net', + }, }) diff --git a/haruhi_dl/extractor/turner.py b/haruhi_dl/extractor/turner.py index 4a6cbfbb8..820e3cbe1 100644 --- a/haruhi_dl/extractor/turner.py +++ b/haruhi_dl/extractor/turner.py @@ -6,6 +6,7 @@ import re from .adobepass import AdobePassIE from ..compat import compat_str from ..utils import ( + fix_xml_ampersands, xpath_text, int_or_none, determine_ext, @@ -49,8 +50,13 @@ class TurnerBaseIE(AdobePassIE): self._AKAMAI_SPE_TOKEN_CACHE[secure_path] = token return video_url + '?hdnea=' + token - def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}): - video_data = self._download_xml(data_src, video_id) + def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}, fatal=False): + video_data = self._download_xml( + data_src, video_id, + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=fatal) + if not video_data: + return {} video_id = video_data.attrib['id'] title = xpath_text(video_data, 'headline', fatal=True) content_id = xpath_text(video_data, 'contentId') or video_id @@ -63,12 +69,14 @@ class TurnerBaseIE(AdobePassIE): urls = [] formats = [] + thumbnails = [] + subtitles = {} rex = re.compile( r'(?P[0-9]+)x(?P[0-9]+)(?:_(?P[0-9]+))?') # Possible formats locations: files/file, files/groupFiles/files # and maybe others for video_file in video_data.findall('.//file'): - video_url = video_file.text.strip() + video_url = url_or_none(video_file.text.strip()) if not video_url: continue ext = determine_ext(video_url) @@ -108,9 +116,28 @@ class TurnerBaseIE(AdobePassIE): continue urls.append(video_url) format_id = video_file.get('bitrate') - if ext == 'smil': + if ext in ('scc', 'srt', 'vtt'): + subtitles.setdefault('en', []).append({ + 'ext': ext, + 'url': video_url, + }) + elif ext == 'png': + thumbnails.append({ + 'id': format_id, + 'url': video_url, + }) + elif ext == 'smil': formats.extend(self._extract_smil_formats( video_url, video_id, fatal=False)) + elif re.match(r'https?://[^/]+\.akamaihd\.net/[iz]/', video_url): + formats.extend(self._extract_akamai_formats( + video_url, video_id, { + 'hds': path_data.get('f4m', {}).get('host'), + # nba.cdn.turner.com, ht.cdn.turner.com, ht2.cdn.turner.com + # ht3.cdn.turner.com, i.cdn.turner.com, s.cdn.turner.com + # ssl.cdn.turner.com + 'http': 'pmd.cdn.turner.com', + })) elif ext == 'm3u8': m3u8_formats = self._extract_m3u8_formats( video_url, video_id, 'mp4', @@ -129,7 +156,7 @@ class TurnerBaseIE(AdobePassIE): 'url': video_url, 'ext': ext, } - mobj = rex.search(format_id + video_url) + mobj = rex.search(video_url) if mobj: f.update({ 'width': int(mobj.group('width')), @@ -152,7 +179,6 @@ class TurnerBaseIE(AdobePassIE): formats.append(f) self._sort_formats(formats) - subtitles = {} for source in video_data.findall('closedCaptions/source'): for track in source.findall('track'): track_url = url_or_none(track.get('url')) @@ -168,12 +194,12 @@ class TurnerBaseIE(AdobePassIE): }.get(source.get('format')) }) - thumbnails = [{ - 'id': image.get('cut'), + thumbnails.extend({ + 'id': image.get('cut') or image.get('name'), 'url': image.text, 'width': int_or_none(image.get('width')), 'height': int_or_none(image.get('height')), - } for 
image in video_data.findall('images/image')] + } for image in video_data.findall('images/image')) is_live = xpath_text(video_data, 'isLive') == 'true' From 027f07edd3d84b66286769db847c8e8c15dd665f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:21:32 +0100 Subject: [PATCH 165/384] [nba] rewrite extractor --- haruhi_dl/extractor/extractors.py | 9 +- haruhi_dl/extractor/nba.py | 488 +++++++++++++++++++++++------- 2 files changed, 389 insertions(+), 108 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 39fa151e6..f639ade6c 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -710,7 +710,14 @@ from .nationalgeographic import ( NationalGeographicTVIE, ) from .naver import NaverIE -from .nba import NBAIE +from .nba import ( + NBAWatchEmbedIE, + NBAWatchIE, + NBAWatchCollectionIE, + NBAEmbedIE, + NBAIE, + NBAChannelIE, +) from .nbc import ( CSNNEIE, NBCIE, diff --git a/haruhi_dl/extractor/nba.py b/haruhi_dl/extractor/nba.py index be295a7a3..fbc7adaf4 100644 --- a/haruhi_dl/extractor/nba.py +++ b/haruhi_dl/extractor/nba.py @@ -5,33 +5,137 @@ import re from .turner import TurnerBaseIE from ..compat import ( - compat_urllib_parse_urlencode, - compat_urlparse, + compat_parse_qs, + compat_str, + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, ) from ..utils import ( + int_or_none, + merge_dicts, OnDemandPagedList, - remove_start, + parse_duration, + parse_iso8601, + try_get, + update_url_query, + urljoin, ) -class NBAIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P(?:[^/]+/)+(?P[^?]*?))/?(?:/index\.html)?(?:\?.*)?$' +class NBACVPBaseIE(TurnerBaseIE): + def _extract_nba_cvp_info(self, path, video_id, fatal=False): + return self._extract_cvp_info( + 'http://secure.nba.com/%s' % path, video_id, { + 'default': { + 'media_src': 'http://nba.cdn.turner.com/nba/big', + }, + 'm3u8': { + 'media_src': 'http://nbavod-f.akamaihd.net', + }, + }, fatal=fatal) + + +class NBAWatchBaseIE(NBACVPBaseIE): + _VALID_URL_BASE = r'https?://(?:(?:www\.)?nba\.com(?:/watch)?|watch\.nba\.com)/' + + def _extract_video(self, filter_key, filter_value): + video = self._download_json( + 'https://neulionscnbav2-a.akamaihd.net/solr/nbad_program/usersearch', + filter_value, query={ + 'fl': 'description,image,name,pid,releaseDate,runtime,tags,seoName', + 'q': filter_key + ':' + filter_value, + 'wt': 'json', + })['response']['docs'][0] + + video_id = str(video['pid']) + title = video['name'] + + formats = [] + m3u8_url = (self._download_json( + 'https://watch.nba.com/service/publishpoint', video_id, query={ + 'type': 'video', + 'format': 'json', + 'id': video_id, + }, headers={ + 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1', + }, fatal=False) or {}).get('path') + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats( + re.sub(r'_(?:pc|iphone)\.', '.', m3u8_url), video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + for f in m3u8_formats: + http_f = f.copy() + http_f.update({ + 'format_id': http_f['format_id'].replace('hls-', 'http-'), + 'protocol': 'http', + 'url': http_f['url'].replace('.m3u8', ''), + }) + formats.append(http_f) + + info = { + 'id': video_id, + 'title': title, + 'thumbnail': urljoin('https://nbadsdmt.akamaized.net/media/nba/nba/thumbs/', video.get('image')), + 'description': video.get('description'), + 'duration': 
int_or_none(video.get('runtime')), + 'timestamp': parse_iso8601(video.get('releaseDate')), + 'tags': video.get('tags'), + } + + seo_name = video.get('seoName') + if seo_name and re.search(r'\d{4}/\d{2}/\d{2}/', seo_name): + base_path = '' + if seo_name.startswith('teams/'): + base_path += seo_name.split('/')[1] + '/' + base_path += 'video/' + cvp_info = self._extract_nba_cvp_info( + base_path + seo_name + '.xml', video_id, False) + if cvp_info: + formats.extend(cvp_info['formats']) + info = merge_dicts(info, cvp_info) + + self._sort_formats(formats) + info['formats'] = formats + return info + + +class NBAWatchEmbedIE(NBAWatchBaseIE): + IENAME = 'nba:watch:embed' + _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'embed\?.*?\bid=(?P\d+)' + _TESTS = [{ + 'url': 'http://watch.nba.com/embed?id=659395', + 'md5': 'b7e3f9946595f4ca0a13903ce5edd120', + 'info_dict': { + 'id': '659395', + 'ext': 'mp4', + 'title': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017', + 'description': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017', + 'timestamp': 1492228800, + 'upload_date': '20170415', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video('pid', video_id) + + +class NBAWatchIE(NBAWatchBaseIE): + IE_NAME = 'nba:watch' + _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'(?:nba/)?video/(?P.+?(?=/index\.html)|(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', - 'md5': '9e7729d3010a9c71506fd1248f74e4f4', + 'md5': '9d902940d2a127af3f7f9d2f3dc79c96', 'info_dict': { - 'id': '0021200253-okc-bkn-recap', + 'id': '70946', 'ext': 'mp4', 'title': 'Thunder vs. Nets', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'duration': 181, - 'timestamp': 1354638466, + 'timestamp': 1354597200, 'upload_date': '20121204', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, @@ -39,116 +143,286 @@ class NBAIE(TurnerBaseIE): 'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', 'info_dict': { - 'id': 'channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'id': '330865', 'ext': 'mp4', 'title': 'Hawks vs. Cavaliers Game 1', 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', 'duration': 228, - 'timestamp': 1432134543, - 'upload_date': '20150520', - }, - 'expected_warnings': ['Unable to download f4m manifest'], - }, { - 'url': 'http://www.nba.com/clippers/news/doc-rivers-were-not-trading-blake', - 'info_dict': { - 'id': 'teams/clippers/2016/02/17/1455672027478-Doc_Feb16_720.mov-297324', - 'ext': 'mp4', - 'title': 'Practice: Doc Rivers - 2/16/16', - 'description': 'Head Coach Doc Rivers addresses the media following practice.', - 'upload_date': '20160216', - 'timestamp': 1455672000, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest'], - }, { - 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', - 'info_dict': { - 'id': 'timberwolves', - 'title': 'Shootaround Access - Dec. 
12 | Andrew Wiggins', - }, - 'playlist_count': 30, - 'params': { - # Download the whole playlist takes too long time - 'playlist_items': '1-30', + 'timestamp': 1432094400, + 'upload_date': '20150521', }, }, { - 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', - 'info_dict': { - 'id': 'teams/timberwolves/2014/12/12/Wigginsmp4-3462601', - 'ext': 'mp4', - 'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins', - 'description': 'Wolves rookie Andrew Wiggins addresses the media after Friday\'s shootaround.', - 'upload_date': '20141212', - 'timestamp': 1418418600, - }, - 'params': { - 'noplaylist': True, - # m3u8 download - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest'], + 'url': 'http://watch.nba.com/nba/video/channels/nba_tv/2015/06/11/YT_go_big_go_home_Game4_061115', + 'only_matching': True, + }, { + # only CVP mp4 format available + 'url': 'https://watch.nba.com/video/teams/cavaliers/2012/10/15/sloan121015mov-2249106', + 'only_matching': True, + }, { + 'url': 'https://watch.nba.com/video/top-100-dunks-from-the-2019-20-season?plsrc=nba&collection=2019-20-season-highlights', + 'only_matching': True, }] - _PAGE_SIZE = 30 + def _real_extract(self, url): + display_id = self._match_id(url) + collection_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('collection', [None])[0] + if collection_id: + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % display_id) + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % collection_id) + return self.url_result( + 'https://www.nba.com/watch/list/collection/' + collection_id, + NBAWatchCollectionIE.ie_key(), collection_id) + return self._extract_video('seoName', display_id) - def _fetch_page(self, team, video_id, page): - search_url = 'http://searchapp2.nba.com/nba-search/query.jsp?' 
+ compat_urllib_parse_urlencode({ - 'type': 'teamvideo', - 'start': page * self._PAGE_SIZE + 1, - 'npp': (page + 1) * self._PAGE_SIZE + 1, - 'sort': 'recent', - 'output': 'json', - 'site': team, - }) - results = self._download_json( - search_url, video_id, note='Download page %d of playlist data' % page)['results'][0] - for item in results: - yield self.url_result(compat_urlparse.urljoin('http://www.nba.com/', item['url'])) - def _extract_playlist(self, orig_path, video_id, webpage): - team = orig_path.split('/')[0] +class NBAWatchCollectionIE(NBAWatchBaseIE): + IE_NAME = 'nba:watch:collection' + _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'list/collection/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://watch.nba.com/list/collection/season-preview-2020', + 'info_dict': { + 'id': 'season-preview-2020', + }, + 'playlist_mincount': 43, + }] + _PAGE_SIZE = 100 - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video because of --no-playlist') - video_path = self._search_regex( - r'nbaVideoCore\.firstVideo\s*=\s*\'([^\']+)\';', webpage, 'video path') - video_url = 'http://www.nba.com/%s/video/%s' % (team, video_path) - return self.url_result(video_url) - - self.to_screen('Downloading playlist - add --no-playlist to just download video') - playlist_title = self._og_search_title(webpage, fatal=False) - entries = OnDemandPagedList( - functools.partial(self._fetch_page, team, video_id), - self._PAGE_SIZE) - - return self.playlist_result(entries, team, playlist_title) + def _fetch_page(self, collection_id, page): + page += 1 + videos = self._download_json( + 'https://content-api-prod.nba.com/public/1/endeavor/video-list/collection/' + collection_id, + collection_id, 'Downloading page %d JSON metadata' % page, query={ + 'count': self._PAGE_SIZE, + 'page': page, + })['results']['videos'] + for video in videos: + program = video.get('program') or {} + seo_name = program.get('seoName') or program.get('slug') + if not seo_name: + continue + yield { + '_type': 'url', + 'id': program.get('id'), + 'title': program.get('title') or video.get('title'), + 'url': 'https://www.nba.com/watch/video/' + seo_name, + 'thumbnail': video.get('image'), + 'description': program.get('description') or video.get('description'), + 'duration': parse_duration(program.get('runtimeHours')), + 'timestamp': parse_iso8601(video.get('releaseDate')), + } def _real_extract(self, url): - path, video_id = re.match(self._VALID_URL, url).groups() - orig_path = path - if path.startswith('nba/'): - path = path[3:] + collection_id = self._match_id(url) + entries = OnDemandPagedList( + functools.partial(self._fetch_page, collection_id), + self._PAGE_SIZE) + return self.playlist_result(entries, collection_id) - if 'video/' not in path: - webpage = self._download_webpage(url, video_id) - path = remove_start(self._search_regex(r'data-videoid="([^"]+)"', webpage, 'video id'), '/') - if path == '{{id}}': - return self._extract_playlist(orig_path, video_id, webpage) +class NBABaseIE(NBACVPBaseIE): + _VALID_URL_BASE = r'''(?x) + https?://(?:www\.)?nba\.com/ + (?P + blazers| + bucks| + bulls| + cavaliers| + celtics| + clippers| + grizzlies| + hawks| + heat| + hornets| + jazz| + kings| + knicks| + lakers| + magic| + mavericks| + nets| + nuggets| + pacers| + pelicans| + pistons| + raptors| + rockets| + sixers| + spurs| + suns| + thunder| + timberwolves| + warriors| + wizards + ) + (?:/play\#)?/''' + _CHANNEL_PATH_REGEX = r'video/channel|series' - # See prepareContentId() of pkgCvp.js - if path.startswith('video/teams'): 
- path = 'video/channels/proxy/' + path[6:] + def _embed_url_result(self, team, content_id): + return self.url_result(update_url_query( + 'https://secure.nba.com/assets/amp/include/video/iframe.html', { + 'contentId': content_id, + 'team': team, + }), NBAEmbedIE.ie_key()) - return self._extract_cvp_info( - 'http://www.nba.com/%s.xml' % path, video_id, { - 'default': { - 'media_src': 'http://nba.cdn.turner.com/nba/big', - }, - 'm3u8': { - 'media_src': 'http://nbavod-f.akamaihd.net', - }, + def _call_api(self, team, content_id, query, resource): + return self._download_json( + 'https://api.nba.net/2/%s/video,imported_video,wsc/' % team, + content_id, 'Download %s JSON metadata' % resource, + query=query, headers={ + 'accessToken': 'internal|bb88df6b4c2244e78822812cecf1ee1b', + })['response']['result'] + + def _extract_video(self, video, team, extract_all=True): + video_id = compat_str(video['nid']) + team = video['brand'] + + info = { + 'id': video_id, + 'title': video.get('title') or video.get('headline') or video['shortHeadline'], + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('published')), + } + + subtitles = {} + captions = try_get(video, lambda x: x['videoCaptions']['sidecars'], dict) or {} + for caption_url in captions.values(): + subtitles.setdefault('en', []).append({'url': caption_url}) + + formats = [] + mp4_url = video.get('mp4') + if mp4_url: + formats.append({ + 'url': mp4_url, }) + + if extract_all: + source_url = video.get('videoSource') + if source_url and not source_url.startswith('s3://') and self._is_valid_url(source_url, video_id, 'source'): + formats.append({ + 'format_id': 'source', + 'url': source_url, + 'preference': 1, + }) + + m3u8_url = video.get('m3u8') + if m3u8_url: + if '.akamaihd.net/i/' in m3u8_url: + formats.extend(self._extract_akamai_formats( + m3u8_url, video_id, {'http': 'pmd.cdn.turner.com'})) + else: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + content_xml = video.get('contentXml') + if team and content_xml: + cvp_info = self._extract_nba_cvp_info( + team + content_xml, video_id, fatal=False) + if cvp_info: + formats.extend(cvp_info['formats']) + subtitles = self._merge_subtitles(subtitles, cvp_info['subtitles']) + info = merge_dicts(info, cvp_info) + + self._sort_formats(formats) + else: + info.update(self._embed_url_result(team, video['videoId'])) + + info.update({ + 'formats': formats, + 'subtitles': subtitles, + }) + + return info + + def _real_extract(self, url): + team, display_id = re.match(self._VALID_URL, url).groups() + if '/play#/' in url: + display_id = compat_urllib_parse_unquote(display_id) + else: + webpage = self._download_webpage(url, display_id) + display_id = self._search_regex( + self._CONTENT_ID_REGEX + r'\s*:\s*"([^"]+)"', webpage, 'video id') + return self._extract_url_results(team, display_id) + + +class NBAEmbedIE(NBABaseIE): + IENAME = 'nba:embed' + _VALID_URL = r'https?://secure\.nba\.com/assets/amp/include/video/(?:topI|i)frame\.html\?.*?\bcontentId=(?P[^?#&]+)' + _TESTS = [{ + 'url': 'https://secure.nba.com/assets/amp/include/video/topIframe.html?contentId=teams/bulls/2020/12/04/3478774/1607105587854-20201204_SCHEDULE_RELEASE_FINAL_DRUPAL-3478774&team=bulls&adFree=false&profile=71&videoPlayerName=TAMPCVP&baseUrl=&videoAdsection=nba.com_mobile_web_teamsites_chicagobulls&Env=', + 'only_matching': True, + }, { + 'url': 
'https://secure.nba.com/assets/amp/include/video/iframe.html?contentId=2016/10/29/0021600027boschaplay7&adFree=false&profile=71&team=&videoPlayerName=LAMPCVP', + 'only_matching': True, + }] + + def _real_extract(self, url): + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + content_id = qs['contentId'][0] + team = qs.get('team', [None])[0] + if not team: + return self.url_result( + 'https://watch.nba.com/video/' + content_id, NBAWatchIE.ie_key()) + video = self._call_api(team, content_id, {'videoid': content_id}, 'video')[0] + return self._extract_video(video, team) + + +class NBAIE(NBABaseIE): + IENAME = 'nba' + _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?!%s)video/(?P(?:[^/]+/)*[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX + _TESTS = [{ + 'url': 'https://www.nba.com/bulls/video/teams/bulls/2020/12/04/3478774/1607105587854-20201204schedulereleasefinaldrupal-3478774', + 'info_dict': { + 'id': '45039', + 'ext': 'mp4', + 'title': 'AND WE BACK.', + 'description': 'Part 1 of our 2020-21 schedule is here! Watch our games on NBC Sports Chicago.', + 'duration': 94, + 'timestamp': 1607112000, + 'upload_date': '20201218', + }, + }, { + 'url': 'https://www.nba.com/bucks/play#/video/teams%2Fbucks%2F2020%2F12%2F17%2F64860%2F1608252863446-Op_Dream_16x9-64860', + 'only_matching': True, + }, { + 'url': 'https://www.nba.com/bucks/play#/video/wsc%2Fteams%2F2787C911AA1ACD154B5377F7577CCC7134B2A4B0', + 'only_matching': True, + }] + _CONTENT_ID_REGEX = r'videoID' + + def _extract_url_results(self, team, content_id): + return self._embed_url_result(team, content_id) + + +class NBAChannelIE(NBABaseIE): + IENAME = 'nba:channel' + _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?:%s)/(?P[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX + _TESTS = [{ + 'url': 'https://www.nba.com/blazers/video/channel/summer_league', + 'info_dict': { + 'title': 'Summer League', + }, + 'playlist_mincount': 138, + }, { + 'url': 'https://www.nba.com/bucks/play#/series/On%20This%20Date', + 'only_matching': True, + }] + _CONTENT_ID_REGEX = r'videoSubCategory' + _PAGE_SIZE = 100 + + def _fetch_page(self, team, channel, page): + results = self._call_api(team, channel, { + 'channels': channel, + 'count': self._PAGE_SIZE, + 'offset': page * self._PAGE_SIZE, + }, 'page %d' % (page + 1)) + for video in results: + yield self._extract_video(video, team, False) + + def _extract_url_results(self, team, content_id): + entries = OnDemandPagedList( + functools.partial(self._fetch_page, team, content_id), + self._PAGE_SIZE) + return self.playlist_result(entries, playlist_title=content_id) From ef03683547c9331d62333260d9137207f2bb5ae6 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:22:09 +0100 Subject: [PATCH 166/384] [kanalplay] Remove Extractor --- haruhi_dl/extractor/extractors.py | 1 - haruhi_dl/extractor/kanalplay.py | 97 ------------------------------- 2 files changed, 98 deletions(-) delete mode 100644 haruhi_dl/extractor/kanalplay.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index f639ade6c..b7a2d5eba 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -532,7 +532,6 @@ from .joj import JojIE from .jwplatform import JWPlatformIE from .kakao import KakaoIE from .kaltura import KalturaIE -from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE diff --git a/haruhi_dl/extractor/kanalplay.py b/haruhi_dl/extractor/kanalplay.py deleted file mode 100644 index 
6c3498c67..000000000 --- a/haruhi_dl/extractor/kanalplay.py +++ /dev/null @@ -1,97 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - srt_subtitles_timecode, -) - - -class KanalPlayIE(InfoExtractor): - IE_DESC = 'Kanal 5/9/11 Play' - _VALID_URL = r'https?://(?:www\.)?kanal(?P5|9|11)play\.se/(?:#!/)?(?:play/)?program/\d+/video/(?P\d+)' - _TESTS = [{ - 'url': 'http://www.kanal5play.se/#!/play/program/3060212363/video/3270012277', - 'info_dict': { - 'id': '3270012277', - 'ext': 'flv', - 'title': 'Saknar både dusch och avlopp', - 'description': 'md5:6023a95832a06059832ae93bc3c7efb7', - 'duration': 2636.36, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.kanal9play.se/#!/play/program/335032/video/246042', - 'only_matching': True, - }, { - 'url': 'http://www.kanal11play.se/#!/play/program/232835958/video/367135199', - 'only_matching': True, - }] - - def _fix_subtitles(self, subs): - return '\r\n\r\n'.join( - '%s\r\n%s --> %s\r\n%s' - % ( - num, - srt_subtitles_timecode(item['startMillis'] / 1000.0), - srt_subtitles_timecode(item['endMillis'] / 1000.0), - item['text'], - ) for num, item in enumerate(subs, 1)) - - def _get_subtitles(self, channel_id, video_id): - subs = self._download_json( - 'http://www.kanal%splay.se/api/subtitles/%s' % (channel_id, video_id), - video_id, 'Downloading subtitles JSON', fatal=False) - return {'sv': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} if subs else {} - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - channel_id = mobj.group('channel_id') - - video = self._download_json( - 'http://www.kanal%splay.se/api/getVideo?format=FLASH&videoId=%s' % (channel_id, video_id), - video_id) - - reasons_for_no_streams = video.get('reasonsForNoStreams') - if reasons_for_no_streams: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, '\n'.join(reasons_for_no_streams)), - expected=True) - - title = video['title'] - description = video.get('description') - duration = float_or_none(video.get('length'), 1000) - thumbnail = video.get('posterUrl') - - stream_base_url = video['streamBaseUrl'] - - formats = [{ - 'url': stream_base_url, - 'play_path': stream['source'], - 'ext': 'flv', - 'tbr': float_or_none(stream.get('bitrate'), 1000), - 'rtmp_real_time': True, - } for stream in video['streams']] - self._sort_formats(formats) - - subtitles = {} - if video.get('hasSubtitle'): - subtitles = self.extract_subtitles(channel_id, video_id) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } From 90988f47724514a8029b7e4bb1426159bb8bfe7d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:22:49 +0100 Subject: [PATCH 167/384] [everyonesmixtape] Remove Extractor --- haruhi_dl/extractor/everyonesmixtape.py | 77 ------------------------- haruhi_dl/extractor/extractors.py | 1 - 2 files changed, 78 deletions(-) delete mode 100644 haruhi_dl/extractor/everyonesmixtape.py diff --git a/haruhi_dl/extractor/everyonesmixtape.py b/haruhi_dl/extractor/everyonesmixtape.py deleted file mode 100644 index 84a9b750e..000000000 --- a/haruhi_dl/extractor/everyonesmixtape.py +++ /dev/null @@ -1,77 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - 
ExtractorError, - sanitized_Request, -) - - -class EveryonesMixtapeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?everyonesmixtape\.com/#/mix/(?P[0-9a-zA-Z]+)(?:/(?P[0-9]))?$' - - _TESTS = [{ - 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi/5', - 'info_dict': { - 'id': '5bfseWNmlds', - 'ext': 'mp4', - 'title': "Passion Pit - \"Sleepyhead\" (Official Music Video)", - 'uploader': 'FKR.TV', - 'uploader_id': 'frenchkissrecords', - 'description': "Music video for \"Sleepyhead\" from Passion Pit's debut EP Chunk Of Change.\nBuy on iTunes: https://itunes.apple.com/us/album/chunk-of-change-ep/id300087641\n\nDirected by The Wilderness.\n\nhttp://www.passionpitmusic.com\nhttp://www.frenchkissrecords.com", - 'upload_date': '20081015' - }, - 'params': { - 'skip_download': True, # This is simply YouTube - } - }, { - 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi', - 'info_dict': { - 'id': 'm7m0jJAbMQi', - 'title': 'Driving', - }, - 'playlist_count': 24 - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') - - pllist_url = 'http://everyonesmixtape.com/mixtape.php?a=getMixes&u=-1&linked=%s&explore=' % playlist_id - pllist_req = sanitized_Request(pllist_url) - pllist_req.add_header('X-Requested-With', 'XMLHttpRequest') - - playlist_list = self._download_json( - pllist_req, playlist_id, note='Downloading playlist metadata') - try: - playlist_no = next(playlist['id'] - for playlist in playlist_list - if playlist['code'] == playlist_id) - except StopIteration: - raise ExtractorError('Playlist id not found') - - pl_url = 'http://everyonesmixtape.com/mixtape.php?a=getMix&id=%s&userId=null&code=' % playlist_no - pl_req = sanitized_Request(pl_url) - pl_req.add_header('X-Requested-With', 'XMLHttpRequest') - playlist = self._download_json( - pl_req, playlist_id, note='Downloading playlist info') - - entries = [{ - '_type': 'url', - 'url': t['url'], - 'title': t['title'], - } for t in playlist['tracks']] - - if mobj.group('songnr'): - songnr = int(mobj.group('songnr')) - 1 - return entries[songnr] - - playlist_title = playlist['mixData']['name'] - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': playlist_title, - 'entries': entries, - } diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index b7a2d5eba..4d3b7bb50 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -344,7 +344,6 @@ from .eurozet import ( EurozetPlayerPodcastIE, EurozetPlayerMusicStreamIE, ) -from .everyonesmixtape import EveryonesMixtapeIE from .expotv import ExpoTVIE from .expressen import ExpressenIE from .extremetube import ExtremeTubeIE From b3acd855b8bf48ef1c70995c1c342cc4813fb244 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:24:05 +0100 Subject: [PATCH 168/384] [niconico] fix playlist extraction(closes #27428) --- haruhi_dl/extractor/niconico.py | 97 ++++++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 26 deletions(-) diff --git a/haruhi_dl/extractor/niconico.py b/haruhi_dl/extractor/niconico.py index eb07ca776..a85fc3d5c 100644 --- a/haruhi_dl/extractor/niconico.py +++ b/haruhi_dl/extractor/niconico.py @@ -1,20 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals -import json import datetime +import functools +import json +import math from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_urlparse, + compat_urllib_parse_urlparse, ) from ..utils import ( determine_ext, dict_get, ExtractorError, - 
int_or_none, float_or_none, + InAdvancePagedList, + int_or_none, parse_duration, parse_iso8601, remove_start, @@ -181,7 +184,7 @@ class NiconicoIE(InfoExtractor): if urlh is False: login_ok = False else: - parts = compat_urlparse.urlparse(urlh.geturl()) + parts = compat_urllib_parse_urlparse(urlh.geturl()) if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login': login_ok = False if not login_ok: @@ -292,7 +295,7 @@ class NiconicoIE(InfoExtractor): 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', video_id, 'Downloading flv info') - flv_info = compat_urlparse.parse_qs(flv_info_webpage) + flv_info = compat_parse_qs(flv_info_webpage) if 'url' not in flv_info: if 'deleted' in flv_info: raise ExtractorError('The video has been deleted.', @@ -437,34 +440,76 @@ class NiconicoIE(InfoExtractor): class NiconicoPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/mylist/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/)?mylist/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.nicovideo.jp/mylist/27411728', 'info_dict': { 'id': '27411728', 'title': 'AKB48のオールナイトニッポン', + 'description': 'md5:d89694c5ded4b6c693dea2db6e41aa08', + 'uploader': 'のっく', + 'uploader_id': '805442', }, 'playlist_mincount': 225, - } + }, { + 'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728', + 'only_matching': True, + }] + _PAGE_SIZE = 100 + + def _call_api(self, list_id, resource, query): + return self._download_json( + 'https://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id, + 'Downloading %s JSON metatdata' % resource, query=query, + headers={'X-Frontend-Id': 6})['data']['mylist'] + + def _parse_owner(self, item): + owner = item.get('owner') or {} + if owner: + return { + 'uploader': owner.get('name'), + 'uploader_id': owner.get('id'), + } + return {} + + def _fetch_page(self, list_id, page): + page += 1 + items = self._call_api(list_id, 'page %d' % page, { + 'page': page, + 'pageSize': self._PAGE_SIZE, + })['items'] + for item in items: + video = item.get('video') or {} + video_id = video.get('id') + if not video_id: + continue + count = video.get('count') or {} + get_count = lambda x: int_or_none(count.get(x)) + info = { + '_type': 'url', + 'id': video_id, + 'title': video.get('title'), + 'url': 'https://www.nicovideo.jp/watch/' + video_id, + 'description': video.get('shortDescription'), + 'duration': int_or_none(video.get('duration')), + 'view_count': get_count('view'), + 'comment_count': get_count('comment'), + 'ie_key': NiconicoIE.ie_key(), + } + info.update(self._parse_owner(video)) + yield info def _real_extract(self, url): list_id = self._match_id(url) - webpage = self._download_webpage(url, list_id) - - entries_json = self._search_regex(r'Mylist\.preload\(\d+, (\[.*\])\);', - webpage, 'entries') - entries = json.loads(entries_json) - entries = [{ - '_type': 'url', - 'ie_key': NiconicoIE.ie_key(), - 'url': ('http://www.nicovideo.jp/watch/%s' % - entry['item_data']['video_id']), - } for entry in entries] - - return { - '_type': 'playlist', - 'title': self._search_regex(r'\s+name: "(.*?)"', webpage, 'title'), - 'id': list_id, - 'entries': entries, - } + mylist = self._call_api(list_id, 'list', { + 'pageSize': 1, + }) + entries = InAdvancePagedList( + functools.partial(self._fetch_page, list_id), + math.ceil(mylist['totalItemCount'] / self._PAGE_SIZE), + self._PAGE_SIZE) + result = self.playlist_result( + entries, list_id, mylist.get('name'), mylist.get('description')) + result.update(self._parse_owner(mylist)) + return result 
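The niconico playlist rewrite above is built around InAdvancePagedList, which, unlike the OnDemandPagedList used by the NBA patches in this series, must know the number of pages up front; that is why the extractor first issues a pageSize=1 request and computes math.ceil(totalItemCount / PAGE_SIZE). Below is a minimal standalone sketch of the same pagination pattern, not the extractor itself: the endpoint, the X-Frontend-Id header and the field names are taken from the diff, while the requests-based HTTP layer is an assumed stand-in for the extractor's _download_json plumbing, and yielding plain watch-page URLs stands in for the url_result dicts the extractor builds.

import math

import requests  # assumption: plain HTTP client instead of InfoExtractor._download_json

from haruhi_dl.utils import InAdvancePagedList

PAGE_SIZE = 100
API_URL = 'https://nvapi.nicovideo.jp/v2/mylists/'


def _call_api(list_id, query):
    # same endpoint and header as the patch; requests wants a str header
    # value, whereas the extractor passes the int 6 to _download_json
    query.setdefault('pageSize', PAGE_SIZE)
    resp = requests.get(
        API_URL + list_id, params=query, headers={'X-Frontend-Id': '6'})
    return resp.json()['data']['mylist']


def fetch_page(list_id, page):
    # InAdvancePagedList passes 0-based page numbers; nvapi pages are 1-based
    for item in _call_api(list_id, {'page': page + 1})['items']:
        video = item.get('video') or {}
        if video.get('id'):
            yield 'https://www.nicovideo.jp/watch/' + video['id']


def playlist_urls(list_id):
    # a cheap pageSize=1 request returns totalItemCount, so the page count
    # is known in advance, which is exactly what InAdvancePagedList requires
    total = _call_api(list_id, {'pageSize': 1})['totalItemCount']
    pages = math.ceil(total / PAGE_SIZE)
    return InAdvancePagedList(
        lambda page: fetch_page(list_id, page), pages, PAGE_SIZE)

# usage: playlist_urls('27411728').getslice() fetches every page of the
# mylist from the test above and returns the watch-page URLs as a list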
From 3c6c586e4b0373c7877bb81a1a97d7f736ef175b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:24:11 +0100 Subject: [PATCH 169/384] [tastytrade] Remove Extractor(closes #25716) covered by GenericIE via BrighcoveNewIE --- haruhi_dl/extractor/extractors.py | 1 - haruhi_dl/extractor/tastytrade.py | 43 ------------------------------- 2 files changed, 44 deletions(-) delete mode 100644 haruhi_dl/extractor/tastytrade.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 4d3b7bb50..b1aed7e05 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1173,7 +1173,6 @@ from .tagesschau import ( TagesschauIE, ) from .tass import TassIE -from .tastytrade import TastyTradeIE from .tbs import TBSIE from .tdslifeway import TDSLifewayIE from .teachable import ( diff --git a/haruhi_dl/extractor/tastytrade.py b/haruhi_dl/extractor/tastytrade.py deleted file mode 100644 index 7fe96bd5f..000000000 --- a/haruhi_dl/extractor/tastytrade.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from .ooyala import OoyalaIE - - -class TastyTradeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/shows/[^/]+/episodes/(?P[^/?#&]+)' - - _TESTS = [{ - 'url': 'https://www.tastytrade.com/tt/shows/market-measures/episodes/correlation-in-short-volatility-06-28-2017', - 'info_dict': { - 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', - 'ext': 'mp4', - 'title': 'A History of Teaming', - 'description': 'md5:2a9033db8da81f2edffa4c99888140b3', - 'duration': 422.255, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - }, { - 'url': 'https://www.tastytrade.com/tt/shows/daily-dose/episodes/daily-dose-06-30-2017', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - ooyala_code = self._search_regex( - r'data-media-id=(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'ooyala code', group='code') - - info = self._search_json_ld(webpage, display_id, fatal=False) - info.update({ - '_type': 'url_transparent', - 'ie_key': OoyalaIE.ie_key(), - 'url': 'ooyala:%s' % ooyala_code, - 'display_id': display_id, - }) - return info From ed14efaed272789c6f5dde6fb633af4fcb1c11f8 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:24:16 +0100 Subject: [PATCH 170/384] [anvato] remove NFLTokenGenerator until a better solution is introduced that: - works with lazy_extractors - allows for 3rd party token generators --- haruhi_dl/extractor/anvato.py | 14 ++------- .../anvato_token_generator/__init__.py | 7 ----- .../anvato_token_generator/common.py | 6 ---- .../extractor/anvato_token_generator/nfl.py | 30 ------------------- 4 files changed, 3 insertions(+), 54 deletions(-) delete mode 100644 haruhi_dl/extractor/anvato_token_generator/__init__.py delete mode 100644 haruhi_dl/extractor/anvato_token_generator/common.py delete mode 100644 haruhi_dl/extractor/anvato_token_generator/nfl.py diff --git a/haruhi_dl/extractor/anvato.py b/haruhi_dl/extractor/anvato.py index 98c5e6d38..b7398563b 100644 --- a/haruhi_dl/extractor/anvato.py +++ b/haruhi_dl/extractor/anvato.py @@ -9,7 +9,6 @@ import re import time from .common import InfoExtractor -# from .anvato_token_generator import NFLTokenGenerator from ..aes import aes_encrypt from ..compat import compat_str from ..utils import ( @@ -204,10 +203,6 @@ class AnvatoIE(InfoExtractor): 'telemundo': 
'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' } - _TOKEN_GENERATORS = { - # 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator, - } - _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' _ANVP_RE = r']+\bdata-anvp\s*=\s*(["\'])(?P(?:(?!\1).)+)\1' @@ -267,12 +262,9 @@ class AnvatoIE(InfoExtractor): 'anvrid': anvrid, 'anvts': server_time, } - if access_key in self._TOKEN_GENERATORS: - api['anvstk2'] = self._TOKEN_GENERATORS[access_key].generate(self, access_key, video_id) - else: - api['anvstk'] = md5_text('%s|%s|%d|%s' % ( - access_key, anvrid, server_time, - self._ANVACK_TABLE.get(access_key, self._API_KEY))) + api['anvstk'] = md5_text('%s|%s|%d|%s' % ( + access_key, anvrid, server_time, + self._ANVACK_TABLE.get(access_key, self._API_KEY))) return self._download_json( video_data_url, video_id, transform_source=strip_jsonp, diff --git a/haruhi_dl/extractor/anvato_token_generator/__init__.py b/haruhi_dl/extractor/anvato_token_generator/__init__.py deleted file mode 100644 index 6e223db9f..000000000 --- a/haruhi_dl/extractor/anvato_token_generator/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from __future__ import unicode_literals - -from .nfl import NFLTokenGenerator - -__all__ = [ - 'NFLTokenGenerator', -] diff --git a/haruhi_dl/extractor/anvato_token_generator/common.py b/haruhi_dl/extractor/anvato_token_generator/common.py deleted file mode 100644 index b959a903b..000000000 --- a/haruhi_dl/extractor/anvato_token_generator/common.py +++ /dev/null @@ -1,6 +0,0 @@ -from __future__ import unicode_literals - - -class TokenGenerator: - def generate(self, anvack, mcp_id): - raise NotImplementedError('This method must be implemented by subclasses') diff --git a/haruhi_dl/extractor/anvato_token_generator/nfl.py b/haruhi_dl/extractor/anvato_token_generator/nfl.py deleted file mode 100644 index 97a2b245f..000000000 --- a/haruhi_dl/extractor/anvato_token_generator/nfl.py +++ /dev/null @@ -1,30 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import TokenGenerator - - -class NFLTokenGenerator(TokenGenerator): - _AUTHORIZATION = None - - def generate(ie, anvack, mcp_id): - if not NFLTokenGenerator._AUTHORIZATION: - reroute = ie._download_json( - 'https://api.nfl.com/v1/reroute', mcp_id, - data=b'grant_type=client_credentials', - headers={'X-Domain-Id': 100}) - NFLTokenGenerator._AUTHORIZATION = '%s %s' % (reroute.get('token_type') or 'Bearer', reroute['access_token']) - return ie._download_json( - 'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({ - 'query': '''{ - viewer { - mediaToken(anvack: "%s", id: %s) { - token - } - } -}''' % (anvack, mcp_id), - }).encode(), headers={ - 'Authorization': NFLTokenGenerator._AUTHORIZATION, - 'Content-Type': 'application/json', - })['data']['viewer']['mediaToken']['token'] From 4317f7c6fafa171ec1b518a348084206b0da2ade Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:26:14 +0100 Subject: [PATCH 171/384] [mewatch] Relax _VALID_URL (closes #27506) --- haruhi_dl/extractor/toggle.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/toggle.py b/haruhi_dl/extractor/toggle.py index 91b8023b8..3b9b54759 100644 --- a/haruhi_dl/extractor/toggle.py +++ b/haruhi_dl/extractor/toggle.py @@ -200,7 +200,7 @@ class ToggleIE(InfoExtractor): class MeWatchIE(InfoExtractor): IE_NAME = 'mewatch' - _VALID_URL = r'https?://(?:www\.)?mewatch\.sg/watch/[0-9a-zA-Z-]+-(?P[0-9]+)' + _VALID_URL = 
r'https?://(?:www\.)?mewatch\.sg/watch/[^/?#&]+-(?P[0-9]+)' _TESTS = [{ 'url': 'https://www.mewatch.sg/watch/Recipe-Of-Life-E1-179371', 'info_dict': { @@ -214,6 +214,12 @@ class MeWatchIE(InfoExtractor): 'params': { 'skip_download': 'm3u8 download', }, + }, { + 'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-搜密。打卡。小红点-S2-E1-176232', + 'only_matching': True, + }, { + 'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-%E6%90%9C%E5%AF%86%E3%80%82%E6%89%93%E5%8D%A1%E3%80%82%E5%B0%8F%E7%BA%A2%E7%82%B9-S2-E1-176232', + 'only_matching': True, }] def _real_extract(self, url): From fc441623a8bcbb103e68b7389a90e94d9d97ae49 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:26:18 +0100 Subject: [PATCH 172/384] [brightcove] add another method to extract policyKey --- haruhi_dl/extractor/brightcove.py | 38 +++++++++++++++++-------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/haruhi_dl/extractor/brightcove.py b/haruhi_dl/extractor/brightcove.py index ee2867ecc..2845f4df2 100644 --- a/haruhi_dl/extractor/brightcove.py +++ b/haruhi_dl/extractor/brightcove.py @@ -28,6 +28,7 @@ from ..utils import ( parse_iso8601, smuggle_url, str_or_none, + try_get, unescapeHTML, unsmuggle_url, UnsupportedError, @@ -600,24 +601,27 @@ class BrightcoveNewIE(AdobePassIE): store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x) def extract_policy_key(): - webpage = self._download_webpage( - 'http://players.brightcove.net/%s/%s_%s/index.min.js' - % (account_id, player_id, embed), video_id) - - policy_key = None - - catalog = self._search_regex( - r'catalog\(({.+?})\);', webpage, 'catalog', default=None) - if catalog: - catalog = self._parse_json( - js_to_json(catalog), video_id, fatal=False) - if catalog: - policy_key = catalog.get('policyKey') - + base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed) + config = self._download_json( + base_url + 'config.json', video_id, fatal=False) or {} + policy_key = try_get( + config, lambda x: x['video_cloud']['policy_key']) if not policy_key: - policy_key = self._search_regex( - r'policyKey\s*:\s*(["\'])(?P.+?)\1', - webpage, 'policy key', group='pk') + webpage = self._download_webpage( + base_url + 'index.min.js', video_id) + + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') + + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P.+?)\1', + webpage, 'policy key', group='pk') store_pk(policy_key) return policy_key From 437ab525e92c1c44dc23f4bae8a73a5ae0fa3079 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:26:26 +0100 Subject: [PATCH 173/384] [cbslocal] fix video extraction --- haruhi_dl/extractor/cbslocal.py | 67 +++++++++++++++++++------------ haruhi_dl/extractor/extractors.py | 5 ++- 2 files changed, 45 insertions(+), 27 deletions(-) diff --git a/haruhi_dl/extractor/cbslocal.py b/haruhi_dl/extractor/cbslocal.py index 90852a9ef..3b7e1a8b9 100644 --- a/haruhi_dl/extractor/cbslocal.py +++ b/haruhi_dl/extractor/cbslocal.py @@ -11,7 +11,47 @@ from ..utils import ( class CBSLocalIE(AnvatoIE): - _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/(?:\d+/\d+/\d+|video)/(?P[0-9a-z-]+)' + _VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/' + _VALID_URL = _VALID_URL_BASE + r'video/(?P\d+)' + + _TESTS = [{ + 'url': 
'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/', + 'info_dict': { + 'id': '3580809', + 'ext': 'mp4', + 'title': 'A Very Blue Anniversary', + 'description': 'CBS2’s Cindy Hsu has more.', + 'thumbnail': 're:^https?://.*', + 'timestamp': int, + 'upload_date': r're:^\d{8}$', + 'uploader': 'CBS', + 'subtitles': { + 'en': 'mincount:5', + }, + 'categories': [ + 'Stations\\Spoken Word\\WCBSTV', + 'Syndication\\AOL', + 'Syndication\\MSN', + 'Syndication\\NDN', + 'Syndication\\Yahoo', + 'Content\\News', + 'Content\\News\\Local News', + ], + 'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'], + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + mcp_id = self._match_id(url) + return self.url_result( + 'anvato:anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67:' + mcp_id, 'Anvato', mcp_id) + + +class CBSLocalArticleIE(AnvatoIE): + _VALID_URL = CBSLocalIE._VALID_URL_BASE + r'\d+/\d+/\d+/(?P[0-9a-z-]+)' _TESTS = [{ # Anvato backend @@ -52,31 +92,6 @@ class CBSLocalIE(AnvatoIE): # m3u8 download 'skip_download': True, }, - }, { - 'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/', - 'info_dict': { - 'id': '3580809', - 'ext': 'mp4', - 'title': 'A Very Blue Anniversary', - 'description': 'CBS2’s Cindy Hsu has more.', - 'thumbnail': 're:^https?://.*', - 'timestamp': int, - 'upload_date': r're:^\d{8}$', - 'uploader': 'CBS', - 'subtitles': { - 'en': 'mincount:5', - }, - 'categories': [ - 'Stations\\Spoken Word\\WCBSTV', - 'Syndication\\AOL', - 'Syndication\\MSN', - 'Syndication\\NDN', - 'Syndication\\Yahoo', - 'Content\\News', - 'Content\\News\\Local News', - ], - 'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'], - }, }] def _real_extract(self, url): diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index b1aed7e05..da6767164 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -173,7 +173,10 @@ from .cbc import ( CBCOlympicsIE, ) from .cbs import CBSIE -from .cbslocal import CBSLocalIE +from .cbslocal import ( + CBSLocalIE, + CBSLocalArticleIE, +) from .cbsinteractive import CBSInteractiveIE from .cbsnews import ( CBSNewsEmbedIE, From 00e2c2ddea0df6d88aa6122ff19a3e0d8e14d381 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:33:41 +0100 Subject: [PATCH 174/384] [facebook] add support for watchparty pages(closes #27507) --- haruhi_dl/extractor/facebook.py | 209 ++++++++++++++++++++------------ 1 file changed, 133 insertions(+), 76 deletions(-) diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py index d5afd0051..5dc931b86 100644 --- a/haruhi_dl/extractor/facebook.py +++ b/haruhi_dl/extractor/facebook.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re import socket @@ -8,6 +9,7 @@ from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, compat_http_client, + compat_str, compat_urllib_error, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, @@ -47,7 +49,8 @@ class FacebookIE(InfoExtractor): )\?(?:.*?)(?:v|video_id|story_fbid)=| [^/]+/videos/(?:[^/]+/)?| [^/]+/posts/| - groups/[^/]+/permalink/ + groups/[^/]+/permalink/| + watchparty/ )| facebook: ) @@ -280,8 +283,18 @@ class FacebookIE(InfoExtractor): # data.video.creation_story.attachments[].media 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275', 'only_matching': True, + }, { + 'url': 
'https://www.facebook.com/watchparty/211641140192478', + 'info_dict': { + 'id': '211641140192478', + }, + 'playlist_count': 1, + 'skip': 'Requires logging in', }] _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)' + _api_config = { + 'graphURI': '/api/graphql/' + } @staticmethod def _extract_urls(webpage, **kwargs): @@ -405,6 +418,17 @@ class FacebookIE(InfoExtractor): self._sort_formats(formats) + def extract_relay_data(_filter): + return self._parse_json(self._search_regex( + r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter, + webpage, 'replay data', default='{}'), video_id, fatal=False) or {} + + def extract_relay_prefetched_data(_filter): + replay_data = extract_relay_data(_filter) + for require in (replay_data.get('require') or []): + if require[0] == 'RelayPrefetchedStreamCache': + return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} + if not video_data: server_js_data = self._parse_json(self._search_regex([ r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX, @@ -413,87 +437,83 @@ class FacebookIE(InfoExtractor): video_data = extract_from_jsmods_instances(server_js_data) if not video_data: - graphql_data = self._parse_json(self._search_regex( - r'handleWithCustomApplyEach\([^,]+,\s*({.*?"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+".*?})\);', - webpage, 'graphql data', default='{}'), video_id, fatal=False) or {} - for require in (graphql_data.get('require') or []): - if require[0] == 'RelayPrefetchedStreamCache': - entries = [] + data = extract_relay_prefetched_data( + r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"') + if data: + entries = [] - def parse_graphql_video(video): - formats = [] - q = qualities(['sd', 'hd']) - for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]: - playable_url = video.get('playable_url' + suffix) - if not playable_url: - continue - formats.append({ - 'format_id': format_id, - 'quality': q(format_id), - 'url': playable_url, - }) - extract_dash_manifest(video, formats) - process_formats(formats) - v_id = video.get('videoId') or video.get('id') or video_id - info = { - 'id': v_id, - 'formats': formats, - 'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']), - 'uploader_id': try_get(video, lambda x: x['owner']['id']), - 'timestamp': int_or_none(video.get('publish_time')), - 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), - } - description = try_get(video, lambda x: x['savable_description']['text']) - title = video.get('name') - if title: - info.update({ - 'title': title, - 'description': description, - }) - else: - info['title'] = description or 'Facebook video #%s' % v_id - entries.append(info) + def parse_graphql_video(video): + formats = [] + q = qualities(['sd', 'hd']) + for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]: + playable_url = video.get('playable_url' + suffix) + if not playable_url: + continue + formats.append({ + 'format_id': format_id, + 'quality': q(format_id), + 'url': playable_url, + }) + extract_dash_manifest(video, formats) + process_formats(formats) + v_id = video.get('videoId') or video.get('id') or video_id + info = { + 'id': v_id, + 'formats': formats, + 'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']), + 'uploader_id': try_get(video, lambda x: x['owner']['id']), + 'timestamp': int_or_none(video.get('publish_time')), + 'duration': 
float_or_none(video.get('playable_duration_in_ms'), 1000), + } + description = try_get(video, lambda x: x['savable_description']['text']) + title = video.get('name') + if title: + info.update({ + 'title': title, + 'description': description, + }) + else: + info['title'] = description or 'Facebook video #%s' % v_id + entries.append(info) - def parse_attachment(attachment, key='media'): - media = attachment.get(key) or {} - if media.get('__typename') == 'Video': - return parse_graphql_video(media) + def parse_attachment(attachment, key='media'): + media = attachment.get(key) or {} + if media.get('__typename') == 'Video': + return parse_graphql_video(media) - data = try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} + nodes = data.get('nodes') or [] + node = data.get('node') or {} + if not nodes and node: + nodes.append(node) + for node in nodes: + story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {} + attachments = try_get(story, [ + lambda x: x['attached_story']['attachments'], + lambda x: x['attachments'] + ], list) or [] + for attachment in attachments: + attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) + ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] + for n in ns: + parse_attachment(n) + parse_attachment(attachment) - nodes = data.get('nodes') or [] - node = data.get('node') or {} - if not nodes and node: - nodes.append(node) - for node in nodes: - story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {} - attachments = try_get(story, [ - lambda x: x['attached_story']['attachments'], - lambda x: x['attachments'] - ], list) or [] - for attachment in attachments: - attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) - ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] - for n in ns: - parse_attachment(n) - parse_attachment(attachment) + edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] + for edge in edges: + parse_attachment(edge, key='node') - edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] - for edge in edges: - parse_attachment(edge, key='node') + video = data.get('video') or {} + if video: + attachments = try_get(video, [ + lambda x: x['story']['attachments'], + lambda x: x['creation_story']['attachments'] + ], list) or [] + for attachment in attachments: + parse_attachment(attachment) + if not entries: + parse_graphql_video(video) - video = data.get('video') or {} - if video: - attachments = try_get(video, [ - lambda x: x['story']['attachments'], - lambda x: x['creation_story']['attachments'] - ], list) or [] - for attachment in attachments: - parse_attachment(attachment) - if not entries: - parse_graphql_video(video) - - return self.playlist_result(entries, video_id) + return self.playlist_result(entries, video_id) if not video_data: m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*">
<div>(.*?)</div>
', webpage) @@ -504,6 +524,43 @@ class FacebookIE(InfoExtractor): elif '>You must log in to continue' in webpage: self.raise_login_required() + if not video_data and '/watchparty/' in url: + post_data = { + 'doc_id': 3731964053542869, + 'variables': json.dumps({ + 'livingRoomID': video_id, + }), + } + + prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{') + if prefetched_data: + lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict) + if lsd: + post_data[lsd['name']] = lsd['value'] + + relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,') + for define in (relay_data.get('define') or []): + if define[0] == 'RelayAPIConfigDefaults': + self._api_config = define[2] + + living_room = self._download_json( + urljoin(url, self._api_config['graphURI']), video_id, + data=urlencode_postdata(post_data))['data']['living_room'] + + entries = [] + for edge in (try_get(living_room, lambda x: x['recap']['watched_content']['edges']) or []): + video = try_get(edge, lambda x: x['node']['video']) or {} + v_id = video.get('id') + if not v_id: + continue + v_id = compat_str(v_id) + entries.append(self.url_result( + self._VIDEO_PAGE_TEMPLATE % v_id, + self.ie_key(), v_id, video.get('name'))) + + return self.playlist_result(entries, video_id) + + if not video_data: # Video info not in first request, do a secondary request using # tahoe player specific URL tahoe_data = self._download_webpage( From 6e80cb939be1b62653899d9cc34eccf38064cbf9 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:34:30 +0100 Subject: [PATCH 175/384] [streetvoice] fix extraction(closes #27455)(closes #27492) --- haruhi_dl/extractor/streetvoice.py | 95 +++++++++++++++++++++++------- 1 file changed, 73 insertions(+), 22 deletions(-) diff --git a/haruhi_dl/extractor/streetvoice.py b/haruhi_dl/extractor/streetvoice.py index 91612c7f2..f21681ae7 100644 --- a/haruhi_dl/extractor/streetvoice.py +++ b/haruhi_dl/extractor/streetvoice.py @@ -2,25 +2,40 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str -from ..utils import unified_strdate +from ..utils import ( + int_or_none, + parse_iso8601, + str_or_none, + strip_or_none, + try_get, + urljoin, +) class StreetVoiceIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P[0-9]+)' _TESTS = [{ - 'url': 'http://streetvoice.com/skippylu/songs/94440/', - 'md5': '15974627fc01a29e492c98593c2fd472', + 'url': 'https://streetvoice.com/skippylu/songs/123688/', + 'md5': '0eb535970629a5195685355f3ed60bfd', 'info_dict': { - 'id': '94440', + 'id': '123688', 'ext': 'mp3', - 'title': '輸', - 'description': 'Crispy脆樂團 - 輸', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 260, - 'upload_date': '20091018', + 'title': '流浪', + 'description': 'md5:8eb0bfcc9dcd8aa82bd6efca66e3fea6', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 270, + 'upload_date': '20100923', 'uploader': 'Crispy脆樂團', 'uploader_id': '627810', + 'uploader_url': 're:^https?://streetvoice.com/skippylu/', + 'timestamp': 1285261661, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + 'track': '流浪', + 'track_id': '123688', + 'album': '2010', } }, { 'url': 'http://tw.streetvoice.com/skippylu/songs/94440/', @@ -29,21 +44,57 @@ class StreetVoiceIE(InfoExtractor): def _real_extract(self, url): song_id = self._match_id(url) - - song = self._download_json( - 'https://streetvoice.com/api/v1/public/song/%s/' % song_id, song_id, data=b'') - + base_url = 
'https://streetvoice.com/api/v4/song/%s/' % song_id + song = self._download_json(base_url, song_id, query={ + 'fields': 'album,comments_count,created_at,id,image,length,likes_count,name,nickname,plays_count,profile,share_count,synopsis,user,username', + }) title = song['name'] - author = song['user']['nickname'] + + formats = [] + for suffix, format_id in [('hls/file', 'hls'), ('file', 'http'), ('file/original', 'original')]: + f_url = (self._download_json( + base_url + suffix + '/', song_id, + 'Downloading %s format URL' % format_id, + data=b'', fatal=False) or {}).get('file') + if not f_url: + continue + f = { + 'ext': 'mp3', + 'format_id': format_id, + 'url': f_url, + 'vcodec': 'none', + } + if format_id == 'hls': + f['protocol'] = 'm3u8_native' + abr = self._search_regex(r'\.mp3\.(\d+)k', f_url, 'bitrate', default=None) + if abr: + abr = int(abr) + f.update({ + 'abr': abr, + 'tbr': abr, + }) + formats.append(f) + + user = song.get('user') or {} + username = user.get('username') + get_count = lambda x: int_or_none(song.get(x + '_count')) return { 'id': song_id, - 'url': song['file'], + 'formats': formats, 'title': title, - 'description': '%s - %s' % (author, title), - 'thumbnail': self._proto_relative_url(song.get('image'), 'http:'), - 'duration': song.get('length'), - 'upload_date': unified_strdate(song.get('created_at')), - 'uploader': author, - 'uploader_id': compat_str(song['user']['id']), + 'description': strip_or_none(song.get('synopsis')), + 'thumbnail': song.get('image'), + 'duration': int_or_none(song.get('length')), + 'timestamp': parse_iso8601(song.get('created_at')), + 'uploader': try_get(user, lambda x: x['profile']['nickname']), + 'uploader_id': str_or_none(user.get('id')), + 'uploader_url': urljoin(url, '/%s/' % username) if username else None, + 'view_count': get_count('plays'), + 'like_count': get_count('likes'), + 'comment_count': get_count('comments'), + 'repost_count': get_count('share'), + 'track': title, + 'track_id': song_id, + 'album': try_get(song, lambda x: x['album']['name']), } From 08d63a28df92441b03e9383b31e0d7a2817d86d2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:34:41 +0100 Subject: [PATCH 176/384] [sonyliv] fix extraction(closes #25667) --- haruhi_dl/extractor/sonyliv.py | 112 +++++++++++++++++++++++++++------ 1 file changed, 92 insertions(+), 20 deletions(-) diff --git a/haruhi_dl/extractor/sonyliv.py b/haruhi_dl/extractor/sonyliv.py index 58a8c0d4d..b460b343a 100644 --- a/haruhi_dl/extractor/sonyliv.py +++ b/haruhi_dl/extractor/sonyliv.py @@ -1,40 +1,112 @@ # coding: utf-8 from __future__ import unicode_literals +import time +import uuid + from .common import InfoExtractor -from ..utils import smuggle_url +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + int_or_none, +) class SonyLIVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/(?:s(?:how|port)s/[^/]+|movies|clip|trailer|music-videos)/[^/?#&]+-(?P\d+)' _TESTS = [{ - 'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight", + 'url': 'https://www.sonyliv.com/shows/bachelors-delight-1700000113/achaari-cheese-toast-1000022678?watch=true', 'info_dict': { - 'title': "Ep. 
1 - Achaari Cheese Toast - Bachelor's Delight", - 'id': 'ref:5024612095001', + 'title': 'Bachelors Delight - Achaari Cheese Toast', + 'id': '1000022678', 'ext': 'mp4', - 'upload_date': '20170923', - 'description': 'md5:7f28509a148d5be9d0782b4d5106410d', - 'uploader_id': '5182475815001', - 'timestamp': 1506200547, + 'upload_date': '20200411', + 'description': 'md5:3957fa31d9309bf336ceb3f37ad5b7cb', + 'timestamp': 1586632091, + 'duration': 185, + 'season_number': 1, + 'episode': 'Achaari Cheese Toast', + 'episode_number': 1, + 'release_year': 2016, }, 'params': { 'skip_download': True, }, - 'add_ie': ['BrightcoveNew'], }, { - 'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)', + 'url': 'https://www.sonyliv.com/movies/tahalka-1000050121?watch=true', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/clip/jigarbaaz-1000098925', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/trailer/sandwiched-forever-1000100286?watch=true', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/sports/india-tour-of-australia-2020-21-1700000286/cricket-hls-day-3-1st-test-aus-vs-ind-19-dec-2020-1000100959?watch=true', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/music-videos/yeh-un-dinon-ki-baat-hai-1000018779', 'only_matching': True, }] + _GEO_COUNTRIES = ['IN'] + _TOKEN = None - # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5182475815001/default_default/index.html?videoId=ref:%s' + def _call_api(self, version, path, video_id): + headers = {} + if self._TOKEN: + headers['security_token'] = self._TOKEN + try: + return self._download_json( + 'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s' % (version, path), + video_id, headers=headers)['resultObj'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + message = self._parse_json( + e.cause.read().decode(), video_id)['message'] + if message == 'Geoblocked Country': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + raise ExtractorError(message) + raise + + def _real_initialize(self): + self._TOKEN = self._call_api('1.4', 'ALL/GETTOKEN', None) def _real_extract(self, url): - brightcove_id = self._match_id(url) - return self.url_result( - smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, { - 'geo_countries': ['IN'], - 'referrer': url, - }), - 'BrightcoveNew', brightcove_id) + video_id = self._match_id(url) + content = self._call_api( + '1.5', 'IN/CONTENT/VIDEOURL/VOD/' + video_id, video_id) + if content.get('isEncrypted'): + raise ExtractorError('This video is DRM protected.', expected=True) + dash_url = content['videoURL'] + headers = { + 'x-playback-session-id': '%s-%d' % (uuid.uuid4().hex, time.time() * 1000) + } + formats = self._extract_mpd_formats( + dash_url, video_id, mpd_id='dash', headers=headers, fatal=False) + formats.extend(self._extract_m3u8_formats( + dash_url.replace('.mpd', '.m3u8').replace('/DASH/', '/HLS/'), + video_id, 'mp4', m3u8_id='hls', headers=headers, fatal=False)) + for f in formats: + f.setdefault('http_headers', {}).update(headers) + self._sort_formats(formats) + + metadata = self._call_api( + '1.6', 'IN/DETAIL/' + video_id, video_id)['containers'][0]['metadata'] + title = metadata['title'] + episode = metadata.get('episodeTitle') + if episode: + title += ' - ' + episode + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': 
content.get('posterURL'), + 'description': metadata.get('longDescription') or metadata.get('shortDescription'), + 'timestamp': int_or_none(metadata.get('creationDate'), 1000), + 'duration': int_or_none(metadata.get('duration')), + 'season_number': int_or_none(metadata.get('season')), + 'episode': episode, + 'episode_number': int_or_none(metadata.get('episodeNumber')), + 'release_year': int_or_none(metadata.get('year')), + } From 0445f9de8df3c7f4c5b36e3e5901c804260cba4a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:34:50 +0100 Subject: [PATCH 177/384] [sonyliv] fix title for movies --- haruhi_dl/extractor/sonyliv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/sonyliv.py b/haruhi_dl/extractor/sonyliv.py index b460b343a..fedfceb62 100644 --- a/haruhi_dl/extractor/sonyliv.py +++ b/haruhi_dl/extractor/sonyliv.py @@ -94,7 +94,7 @@ class SonyLIVIE(InfoExtractor): '1.6', 'IN/DETAIL/' + video_id, video_id)['containers'][0]['metadata'] title = metadata['title'] episode = metadata.get('episodeTitle') - if episode: + if episode and title != episode: title += ' - ' + episode return { From f350e326ac3da25ca2a2cd5dfbc1d67ace47bc82 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:35:11 +0100 Subject: [PATCH 178/384] [9c9media] improve info extraction --- haruhi_dl/extractor/ninecninemedia.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/haruhi_dl/extractor/ninecninemedia.py b/haruhi_dl/extractor/ninecninemedia.py index 65754c5e7..a569c889e 100644 --- a/haruhi_dl/extractor/ninecninemedia.py +++ b/haruhi_dl/extractor/ninecninemedia.py @@ -5,10 +5,11 @@ import re from .common import InfoExtractor from ..utils import ( - parse_iso8601, - float_or_none, ExtractorError, + float_or_none, int_or_none, + parse_iso8601, + try_get, ) @@ -35,7 +36,7 @@ class NineCNineMediaIE(InfoExtractor): '$include': '[HasClosedCaptions]', }) - if content_package.get('Constraints', {}).get('Security', {}).get('Type'): + if try_get(content_package, lambda x: x['Constraints']['Security']['Type']): raise ExtractorError('This video is DRM protected.', expected=True) manifest_base_url = content_package_url + 'manifest.' 
@@ -52,7 +53,7 @@ class NineCNineMediaIE(InfoExtractor): self._sort_formats(formats) thumbnails = [] - for image in content.get('Images', []): + for image in (content.get('Images') or []): image_url = image.get('Url') if not image_url: continue @@ -70,7 +71,7 @@ class NineCNineMediaIE(InfoExtractor): continue container.append(e_name) - season = content.get('Season', {}) + season = content.get('Season') or {} info = { 'id': content_id, @@ -79,13 +80,14 @@ class NineCNineMediaIE(InfoExtractor): 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), 'episode_number': int_or_none(content.get('Episode')), 'season': season.get('Name'), - 'season_number': season.get('Number'), + 'season_number': int_or_none(season.get('Number')), 'season_id': season.get('Id'), - 'series': content.get('Media', {}).get('Name'), + 'series': try_get(content, lambda x: x['Media']['Name']), 'tags': tags, 'categories': categories, 'duration': float_or_none(content_package.get('Duration')), 'formats': formats, + 'thumbnails': thumbnails, } if content_package.get('HasClosedCaptions'): From 90a021a137d6999c53b6975bd233643ad3b9c53f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:35:17 +0100 Subject: [PATCH 179/384] [ctv] Add new extractor (closes #27525) --- haruhi_dl/extractor/ctv.py | 52 +++++++++++++++++++++++++++++++ haruhi_dl/extractor/extractors.py | 1 + 2 files changed, 53 insertions(+) create mode 100644 haruhi_dl/extractor/ctv.py diff --git a/haruhi_dl/extractor/ctv.py b/haruhi_dl/extractor/ctv.py new file mode 100644 index 000000000..756bcc2be --- /dev/null +++ b/haruhi_dl/extractor/ctv.py @@ -0,0 +1,52 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ctv\.ca/(?P(?:show|movie)s/[^/]+/[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.ctv.ca/shows/your-morning/wednesday-december-23-2020-s5e88', + 'info_dict': { + 'id': '2102249', + 'ext': 'flv', + 'title': 'Wednesday, December 23, 2020', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'Your Morning delivers original perspectives and unique insights into the headlines of the day.', + 'timestamp': 1608732000, + 'upload_date': '20201223', + 'series': 'Your Morning', + 'season': '2020-2021', + 'season_number': 5, + 'episode_number': 88, + 'tags': ['Your Morning'], + 'categories': ['Talk Show'], + 'duration': 7467.126, + }, + }, { + 'url': 'https://www.ctv.ca/movies/adam-sandlers-eight-crazy-nights/adam-sandlers-eight-crazy-nights', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + content = self._download_json( + 'https://www.ctv.ca/space-graphql/graphql', display_id, query={ + 'query': '''{ + resolvedPath(path: "/%s") { + lastSegment { + content { + ... 
on AxisContent { + axisId + videoPlayerDestCode + } + } + } + } +}''' % display_id, + })['data']['resolvedPath']['lastSegment']['content'] + video_id = content['axisId'] + return self.url_result( + '9c9media:%s:%s' % (content['videoPlayerDestCode'], video_id), + 'NineCNineMedia', video_id) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index da6767164..9d196a135 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -254,6 +254,7 @@ from .crunchyroll import ( ) from .cspan import CSpanIE from .ctsnews import CtsNewsIE +from .ctv import CTVIE from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE from .curiositystream import ( From 5f00c83c35b6322a9d9058573d2d2813104bcfb1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:35:57 +0100 Subject: [PATCH 180/384] [theplatform] allow passing geo bypass countries from other extractors --- haruhi_dl/extractor/theplatform.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/haruhi_dl/extractor/theplatform.py b/haruhi_dl/extractor/theplatform.py index 5b14bbf82..cdba10e40 100644 --- a/haruhi_dl/extractor/theplatform.py +++ b/haruhi_dl/extractor/theplatform.py @@ -234,6 +234,9 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) mobj = re.match(self._VALID_URL, url) provider_id = mobj.group('provider_id') From 4d81f8326715f23bf5e7f7ec03756b0f2a62f99e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:36:04 +0100 Subject: [PATCH 181/384] [sprout] Add support for Universal Kids (closes #22518) --- haruhi_dl/extractor/sprout.py | 88 ++++++++++++++++++++--------------- 1 file changed, 50 insertions(+), 38 deletions(-) diff --git a/haruhi_dl/extractor/sprout.py b/haruhi_dl/extractor/sprout.py index 8467bf49d..b1f8e05a2 100644 --- a/haruhi_dl/extractor/sprout.py +++ b/haruhi_dl/extractor/sprout.py @@ -3,50 +3,62 @@ from __future__ import unicode_literals from .adobepass import AdobePassIE from ..utils import ( - extract_attributes, - update_url_query, + int_or_none, smuggle_url, + update_url_query, ) class SproutIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?sproutonline\.com/watch/(?P[^/?#]+)' - _TEST = { - 'url': 'http://www.sproutonline.com/watch/cowboy-adventure', - 'md5': '74bf14128578d1e040c3ebc82088f45f', + _VALID_URL = r'https?://(?:www\.)?(?:sproutonline|universalkids)\.com/(?:watch|(?:[^/]+/)*videos)/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.universalkids.com/shows/remy-and-boo/season/1/videos/robot-bike-race', 'info_dict': { - 'id': '9dexnwtmh8_X', + 'id': 'bm0foJFaTKqb', 'ext': 'mp4', - 'title': 'A Cowboy Adventure', - 'description': 'Ruff-Ruff, Tweet and Dave get to be cowboys for the day at Six Cow Corral.', - 'timestamp': 1437758640, - 'upload_date': '20150724', - 'uploader': 'NBCU-SPROUT-NEW', - } - } + 'title': 'Robot Bike Race', + 'description': 'md5:436b1d97117cc437f54c383f4debc66d', + 'timestamp': 1606148940, + 'upload_date': '20201123', + 'uploader': 'NBCU-MPAT', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.sproutonline.com/watch/cowboy-adventure', + 'only_matching': True, + }, { + 'url': 'https://www.universalkids.com/watch/robot-bike-race', + 'only_matching': True, + }] + _GEO_COUNTRIES = ['US'] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = 
self._download_webpage(url, video_id) - video_component = self._search_regex( - r'(?s)(]+data-component="video"[^>]*?>)', - webpage, 'video component', default=None) - if video_component: - options = self._parse_json(extract_attributes( - video_component)['data-options'], video_id) - theplatform_url = options['video'] - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } - if options.get('protected'): - query['auth'] = self._extract_mvpd_auth(url, options['pid'], 'sprout', 'sprout') - theplatform_url = smuggle_url(update_url_query( - theplatform_url, query), {'force_smil_url': True}) - else: - iframe = self._search_regex( - r'(]+id="sproutVideoIframe"[^>]*?>)', - webpage, 'iframe') - theplatform_url = extract_attributes(iframe)['src'] - - return self.url_result(theplatform_url, 'ThePlatform') + display_id = self._match_id(url) + mpx_metadata = self._download_json( + # http://nbcuunikidsprod.apps.nbcuni.com/networks/universalkids/content/videos/ + 'https://www.universalkids.com/_api/videos/' + display_id, + display_id)['mpxMetadata'] + media_pid = mpx_metadata['mediaPid'] + theplatform_url = 'https://link.theplatform.com/s/HNK2IC/' + media_pid + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + if mpx_metadata.get('entitlement') == 'auth': + query['auth'] = self._extract_mvpd_auth(url, media_pid, 'sprout', 'sprout') + theplatform_url = smuggle_url( + update_url_query(theplatform_url, query), { + 'force_smil_url': True, + 'geo_countries': self._GEO_COUNTRIES, + }) + return { + '_type': 'url_transparent', + 'id': 'id', + 'url': theplatform_url, + 'series': mpx_metadata.get('seriesName'), + 'season_number': int_or_none(mpx_metadata.get('seasonNumber')), + 'episode_number': int_or_none(mpx_metadata.get('episodeNumber')), + 'ie_key': 'ThePlatform', + } From 8567d4488fce04c3576b48885cf294d2397cb38f Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:36:11 +0100 Subject: [PATCH 182/384] [sprout] correct typo --- haruhi_dl/extractor/sprout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/sprout.py b/haruhi_dl/extractor/sprout.py index b1f8e05a2..e243732f2 100644 --- a/haruhi_dl/extractor/sprout.py +++ b/haruhi_dl/extractor/sprout.py @@ -55,7 +55,7 @@ class SproutIE(AdobePassIE): }) return { '_type': 'url_transparent', - 'id': 'id', + 'id': media_pid, 'url': theplatform_url, 'series': mpx_metadata.get('seriesName'), 'season_number': int_or_none(mpx_metadata.get('seasonNumber')), From b88f43a813fd75c0d5a6824088d31387848e2415 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:36:17 +0100 Subject: [PATCH 183/384] [theweatherchannel] fix extraction (closes #25930)(closes #26051) --- haruhi_dl/extractor/theweatherchannel.py | 43 ++++++++++++++++++------ 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/haruhi_dl/extractor/theweatherchannel.py b/haruhi_dl/extractor/theweatherchannel.py index c34a49d03..b2a8c3797 100644 --- a/haruhi_dl/extractor/theweatherchannel.py +++ b/haruhi_dl/extractor/theweatherchannel.py @@ -1,18 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .theplatform import ThePlatformIE from ..utils import ( determine_ext, parse_duration, + parse_iso8601, ) class TheWeatherChannelIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?weather\.com/(?:[^/]+/)*video/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?weather\.com(?P(?:/(?P[a-z]{2}-[A-Z]{2}))?/(?:[^/]+/)*video/(?P[^/?#]+))' _TESTS = [{ 'url': 
'https://weather.com/series/great-outdoors/video/ice-climber-is-in-for-a-shock', - 'md5': 'ab924ac9574e79689c24c6b95e957def', + 'md5': 'c4cbe74c9c17c5676b704b950b73dd92', 'info_dict': { 'id': 'cc82397e-cc3f-4d11-9390-a785add090e8', 'ext': 'mp4', @@ -20,18 +24,33 @@ class TheWeatherChannelIE(ThePlatformIE): 'description': 'md5:55606ce1378d4c72e6545e160c9d9695', 'uploader': 'TWC - Digital (No Distro)', 'uploader_id': '6ccd5455-16bb-46f2-9c57-ff858bb9f62c', + 'upload_date': '20160720', + 'timestamp': 1469018835, } + }, { + 'url': 'https://weather.com/en-CA/international/videos/video/unidentified-object-falls-from-sky-in-india', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - drupal_settings = self._parse_json(self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings'), display_id) - video_id = drupal_settings['twc']['contexts']['node']['uuid'] - video_data = self._download_json( - 'https://dsx.weather.com/cms/v4/asset-collection/en_US/' + video_id, video_id) + asset_name, locale, display_id = re.match(self._VALID_URL, url).groups() + if not locale: + locale = 'en-US' + video_data = list(self._download_json( + 'https://weather.com/api/v1/p/redux-dal', display_id, data=json.dumps([{ + 'name': 'getCMSAssetsUrlConfig', + 'params': { + 'language': locale.replace('-', '_'), + 'query': { + 'assetName': { + '$in': asset_name, + }, + }, + } + }]).encode(), headers={ + 'Content-Type': 'application/json', + })['dal']['getCMSAssetsUrlConfig'].values())[0]['data'][0] + video_id = video_data['id'] seo_meta = video_data.get('seometa', {}) title = video_data.get('title') or seo_meta['title'] @@ -66,6 +85,8 @@ class TheWeatherChannelIE(ThePlatformIE): }) self._sort_formats(formats) + cc_url = video_data.get('cc_url') + return { 'id': video_id, 'display_id': display_id, @@ -74,6 +95,8 @@ class TheWeatherChannelIE(ThePlatformIE): 'duration': parse_duration(video_data.get('duration')), 'uploader': video_data.get('providername'), 'uploader_id': video_data.get('providerid'), + 'timestamp': parse_iso8601(video_data.get('publishdate')), + 'subtitles': {locale[:2]: [{'url': cc_url}]} if cc_url else None, 'thumbnails': thumbnails, 'formats': formats, } From 5b75c620bd148af4f4d4abe52b3c41d9e75e6d45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:36:23 +0100 Subject: [PATCH 184/384] [bongacams] Add extractor (closes #27440) --- haruhi_dl/extractor/bongacams.py | 60 +++++++++++++++++++++++++++++++ haruhi_dl/extractor/extractors.py | 1 + 2 files changed, 61 insertions(+) create mode 100644 haruhi_dl/extractor/bongacams.py diff --git a/haruhi_dl/extractor/bongacams.py b/haruhi_dl/extractor/bongacams.py new file mode 100644 index 000000000..180542fbc --- /dev/null +++ b/haruhi_dl/extractor/bongacams.py @@ -0,0 +1,60 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + try_get, + urlencode_postdata, +) + + +class BongaCamsIE(InfoExtractor): + _VALID_URL = r'https?://(?P(?:[^/]+\.)?bongacams\d*\.com)/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://de.bongacams.com/azumi-8', + 'only_matching': True, + }, { + 'url': 'https://cn.bongacams.com/azumi-8', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + channel_id = 
mobj.group('id') + + amf = self._download_json( + 'https://%s/tools/amf.php' % host, channel_id, + data=urlencode_postdata(( + ('method', 'getRoomData'), + ('args[]', channel_id), + ('args[]', 'false'), + )), headers={'X-Requested-With': 'XMLHttpRequest'}) + + server_url = amf['localData']['videoServerUrl'] + + uploader_id = try_get( + amf, lambda x: x['performerData']['username'], compat_str) or channel_id + uploader = try_get( + amf, lambda x: x['performerData']['displayName'], compat_str) + like_count = int_or_none(try_get( + amf, lambda x: x['performerData']['loversCount'])) + + formats = self._extract_m3u8_formats( + '%s/hls/stream_%s/playlist.m3u8' % (server_url, uploader_id), + channel_id, 'mp4', m3u8_id='hls', live=True) + self._sort_formats(formats) + + return { + 'id': channel_id, + 'title': self._live_title(uploader or uploader_id), + 'uploader': uploader, + 'uploader_id': uploader_id, + 'like_count': like_count, + 'age_limit': 18, + 'is_live': True, + 'formats': formats, + } diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 9d196a135..69c7d9f62 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -129,6 +129,7 @@ from .bleacherreport import ( from .blinkx import BlinkxIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE +from .bongacams import BongaCamsIE from .bostonglobe import BostonGlobeIE from .box import BoxIE from .bpb import BpbIE From 226faa552189bdb7267367fcfa692d63c589c381 Mon Sep 17 00:00:00 2001 From: JChris246 Date: Fri, 26 Feb 2021 15:36:31 +0100 Subject: [PATCH 185/384] [pornhub] Fix lq formats extraction (closes #27386) --- haruhi_dl/extractor/pornhub.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/pornhub.py b/haruhi_dl/extractor/pornhub.py index 20af84955..c9be511ea 100644 --- a/haruhi_dl/extractor/pornhub.py +++ b/haruhi_dl/extractor/pornhub.py @@ -291,15 +291,25 @@ class PornHubIE(PornHubBaseIE): video_urls.append((v_url, None)) video_urls_set.add(v_url) + def parse_quality_items(js_str): + if (url_or_none(js_str)): + return js_str + media_definitions = self._parse_json(js_str, video_id, fatal=False) + if isinstance(media_definitions, list): + for definition in media_definitions: + if not isinstance(definition, dict): + continue + add_video_url(definition.get('url')) + if not video_urls: - FORMAT_PREFIXES = ('media', 'quality') + FORMAT_PREFIXES = ('media', 'quality', 'qualityItems') js_vars = extract_js_vars( webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), default=None) if js_vars: for key, format_url in js_vars.items(): if any(key.startswith(p) for p in FORMAT_PREFIXES): - add_video_url(format_url) + add_video_url(parse_quality_items(format_url)) if not video_urls and re.search( r'<[^>]+\bid=["\']lockedPlayer', webpage): raise ExtractorError( From dc69c587bfcd6594c7a51127addd1623e4fb867a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:36:38 +0100 Subject: [PATCH 186/384] [pornhub] Fix review issues (closes #27393) --- haruhi_dl/extractor/pornhub.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/haruhi_dl/extractor/pornhub.py b/haruhi_dl/extractor/pornhub.py index c9be511ea..a66152e98 100644 --- a/haruhi_dl/extractor/pornhub.py +++ b/haruhi_dl/extractor/pornhub.py @@ -291,15 +291,13 @@ class PornHubIE(PornHubBaseIE): video_urls.append((v_url, None)) video_urls_set.add(v_url) - def 
parse_quality_items(js_str): - if (url_or_none(js_str)): - return js_str - media_definitions = self._parse_json(js_str, video_id, fatal=False) - if isinstance(media_definitions, list): - for definition in media_definitions: - if not isinstance(definition, dict): - continue - add_video_url(definition.get('url')) + def parse_quality_items(quality_items): + q_items = self._parse_json(quality_items, video_id, fatal=False) + if not isinstance(q_items, list): + return + for item in q_items: + if isinstance(item, dict): + add_video_url(item.get('url')) if not video_urls: FORMAT_PREFIXES = ('media', 'quality', 'qualityItems') @@ -308,8 +306,10 @@ class PornHubIE(PornHubBaseIE): default=None) if js_vars: for key, format_url in js_vars.items(): - if any(key.startswith(p) for p in FORMAT_PREFIXES): - add_video_url(parse_quality_items(format_url)) + if key.startswith(FORMAT_PREFIXES[-1]): + parse_quality_items(format_url) + elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]): + add_video_url(format_url) if not video_urls and re.search( r'<[^>]+\bid=["\']lockedPlayer', webpage): raise ExtractorError( From 84b7f91b289cac49d13d08c118de5bc564ac9bbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:38:12 +0100 Subject: [PATCH 187/384] [spangbang] Add support for playlist videos --- haruhi_dl/extractor/spankbang.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/spankbang.py b/haruhi_dl/extractor/spankbang.py index 61ca902ce..e3ec8602d 100644 --- a/haruhi_dl/extractor/spankbang.py +++ b/haruhi_dl/extractor/spankbang.py @@ -17,7 +17,14 @@ from ..utils import ( class SpankBangIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P[\da-z]+)/(?:video|play|embed)\b' + _VALID_URL = r'''(?x) + https?:// + (?:[^/]+\.)?spankbang\.com/ + (?: + (?P[\da-z]+)/(?:video|play|embed)\b| + [\da-z]+-(?P[\da-z]+)/playlist/[^/?#&]+ + ) + ''' _TESTS = [{ 'url': 'http://spankbang.com/3vvn/video/fantasy+solo', 'md5': '1cc433e1d6aa14bc376535b8679302f7', @@ -57,10 +64,14 @@ class SpankBangIE(InfoExtractor): }, { 'url': 'https://spankbang.com/2y3td/embed/', 'only_matching': True, + }, { + 'url': 'https://spankbang.com/2v7ik-7ecbgu/playlist/latina+booty', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('id_2') webpage = self._download_webpage( url.replace('/%s/embed' % video_id, '/%s/video' % video_id), video_id, headers={'Cookie': 'country=US'}) From 50dfe7adb8a4d1f1f85e30568da944b2db179257 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:38:20 +0100 Subject: [PATCH 188/384] [spangbang:playlist] Fix extraction (closes #24087) --- haruhi_dl/extractor/spankbang.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/haruhi_dl/extractor/spankbang.py b/haruhi_dl/extractor/spankbang.py index e3ec8602d..8a7102d0c 100644 --- a/haruhi_dl/extractor/spankbang.py +++ b/haruhi_dl/extractor/spankbang.py @@ -13,6 +13,7 @@ from ..utils import ( str_to_int, url_or_none, urlencode_postdata, + urljoin, ) @@ -166,30 +167,33 @@ class SpankBangIE(InfoExtractor): class SpankBangPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P[\da-z]+)/playlist/[^/]+' + _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P[\da-z]+)/playlist/(?P[^/]+)' _TEST = 
{ 'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties', 'info_dict': { 'id': 'ug0k', 'title': 'Big Ass Titties', }, - 'playlist_mincount': 50, + 'playlist_mincount': 40, } def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + display_id = mobj.group('display_id') webpage = self._download_webpage( url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) entries = [self.url_result( - 'https://spankbang.com/%s/video' % video_id, - ie=SpankBangIE.ie_key(), video_id=video_id) - for video_id in orderedSet(re.findall( - r']+\bhref=["\']/?([\da-z]+)/play/', webpage))] + urljoin(url, mobj.group('path')), + ie=SpankBangIE.ie_key(), video_id=mobj.group('id')) + for mobj in re.finditer( + r']+\bhref=(["\'])(?P/?[\da-z]+-(?P[\da-z]+)/playlist/%s(?:(?!\1).)*)\1' + % re.escape(display_id), webpage)] title = self._html_search_regex( - r'
<h1>([^<]+)\s+playlist</h1>', webpage, 'playlist title',
+            r'<h1>
([^<]+)\s+playlist\s*<', webpage, 'playlist title', fatal=False) return self.playlist_result(entries, playlist_id, title) From 10a6f841a7aefeb182dd5e3a5ee491cb346517e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:38:28 +0100 Subject: [PATCH 189/384] [spankbang] Remove unused import --- haruhi_dl/extractor/spankbang.py | 1 - 1 file changed, 1 deletion(-) diff --git a/haruhi_dl/extractor/spankbang.py b/haruhi_dl/extractor/spankbang.py index 8a7102d0c..37cb8c839 100644 --- a/haruhi_dl/extractor/spankbang.py +++ b/haruhi_dl/extractor/spankbang.py @@ -7,7 +7,6 @@ from ..utils import ( determine_ext, ExtractorError, merge_dicts, - orderedSet, parse_duration, parse_resolution, str_to_int, From d7c028a33ef2fa24b6dc0868b9b15c1ea6a22f84 Mon Sep 17 00:00:00 2001 From: Andrew Udvare Date: Fri, 26 Feb 2021 15:38:34 +0100 Subject: [PATCH 190/384] [instagram] Fix extraction when authenticated (closes #27422) --- haruhi_dl/extractor/instagram.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/haruhi_dl/extractor/instagram.py b/haruhi_dl/extractor/instagram.py index b061850a1..0755896de 100644 --- a/haruhi_dl/extractor/instagram.py +++ b/haruhi_dl/extractor/instagram.py @@ -137,6 +137,16 @@ class InstagramIE(InfoExtractor): (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], lambda x: x['entry_data']['PostPage'][0]['media']), dict) + if not media: + additional_data = self._parse_json( + self._search_regex(r'window\.__additionalDataLoaded\(\'[^\']+\',\s*({.+?})\);', + webpage, 'additional data', default='{}'), + video_id, fatal=False) + if additional_data: + media = try_get( + additional_data, + lambda x: x['graphql']['shortcode_media'], + dict) if media: video_url = media.get('video_url') height = int_or_none(media.get('dimensions', {}).get('height')) From 73c5dc4104d6eb6914d008aeb62acfe58ce052aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:38:42 +0100 Subject: [PATCH 191/384] [instagram] Improve extraction (closes #22880) --- haruhi_dl/extractor/instagram.py | 128 ++++++++++++++++--------------- 1 file changed, 65 insertions(+), 63 deletions(-) diff --git a/haruhi_dl/extractor/instagram.py b/haruhi_dl/extractor/instagram.py index 0755896de..82f59c349 100644 --- a/haruhi_dl/extractor/instagram.py +++ b/haruhi_dl/extractor/instagram.py @@ -122,9 +122,9 @@ class InstagramIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - (video_url, description, thumbnail, timestamp, uploader, + (media, video_url, description, thumbnail, timestamp, uploader, uploader_id, like_count, comment_count, comments, height, - width) = [None] * 11 + width) = [None] * 12 shared_data = self._parse_json( self._search_regex( @@ -137,69 +137,71 @@ class InstagramIE(InfoExtractor): (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], lambda x: x['entry_data']['PostPage'][0]['media']), dict) - if not media: - additional_data = self._parse_json( - self._search_regex(r'window\.__additionalDataLoaded\(\'[^\']+\',\s*({.+?})\);', - webpage, 'additional data', default='{}'), - video_id, fatal=False) - if additional_data: - media = try_get( - additional_data, - lambda x: x['graphql']['shortcode_media'], - dict) - if media: - video_url = media.get('video_url') - height = int_or_none(media.get('dimensions', {}).get('height')) - width = int_or_none(media.get('dimensions', {}).get('width')) - description = try_get( - media, lambda 
x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) or media.get('caption') - thumbnail = media.get('display_src') - timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) - uploader = media.get('owner', {}).get('full_name') - uploader_id = media.get('owner', {}).get('username') + # _sharedData.entry_data.PostPage is empty when authenticated (see + # https://github.com/hdl-org/haruhi-dl/pull/22880) + if not media: + additional_data = self._parse_json( + self._search_regex( + r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;', + webpage, 'additional data', default='{}'), + video_id, fatal=False) + if additional_data: + media = try_get( + additional_data, lambda x: x['graphql']['shortcode_media'], + dict) + if media: + video_url = media.get('video_url') + height = int_or_none(media.get('dimensions', {}).get('height')) + width = int_or_none(media.get('dimensions', {}).get('width')) + description = try_get( + media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], + compat_str) or media.get('caption') + thumbnail = media.get('display_src') + timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) + uploader = media.get('owner', {}).get('full_name') + uploader_id = media.get('owner', {}).get('username') - def get_count(key, kind): - return int_or_none(try_get( - media, (lambda x: x['edge_media_%s' % key]['count'], - lambda x: x['%ss' % kind]['count']))) - like_count = get_count('preview_like', 'like') - comment_count = get_count('to_comment', 'comment') + def get_count(key, kind): + return int_or_none(try_get( + media, (lambda x: x['edge_media_%s' % key]['count'], + lambda x: x['%ss' % kind]['count']))) + like_count = get_count('preview_like', 'like') + comment_count = get_count('to_comment', 'comment') - comments = [{ - 'author': comment.get('user', {}).get('username'), - 'author_id': comment.get('user', {}).get('id'), - 'id': comment.get('id'), - 'text': comment.get('text'), - 'timestamp': int_or_none(comment.get('created_at')), - } for comment in media.get( - 'comments', {}).get('nodes', []) if comment.get('text')] - if not video_url: - edges = try_get( - media, lambda x: x['edge_sidecar_to_children']['edges'], - list) or [] - if edges: - entries = [] - for edge_num, edge in enumerate(edges, start=1): - node = try_get(edge, lambda x: x['node'], dict) - if not node: - continue - node_video_url = url_or_none(node.get('video_url')) - if not node_video_url: - continue - entries.append({ - 'id': node.get('shortcode') or node['id'], - 'title': 'Video %d' % edge_num, - 'url': node_video_url, - 'thumbnail': node.get('display_url'), - 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), - 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), - 'view_count': int_or_none(node.get('video_view_count')), - }) - return self.playlist_result( - entries, video_id, - 'Post by %s' % uploader_id if uploader_id else None, - description) + comments = [{ + 'author': comment.get('user', {}).get('username'), + 'author_id': comment.get('user', {}).get('id'), + 'id': comment.get('id'), + 'text': comment.get('text'), + 'timestamp': int_or_none(comment.get('created_at')), + } for comment in media.get( + 'comments', {}).get('nodes', []) if comment.get('text')] + if not video_url: + edges = try_get( + media, lambda x: x['edge_sidecar_to_children']['edges'], + list) or [] + if edges: + entries = [] + for edge_num, edge in enumerate(edges, start=1): + node = try_get(edge, lambda x: 
x['node'], dict) + if not node: + continue + node_video_url = url_or_none(node.get('video_url')) + if not node_video_url: + continue + entries.append({ + 'id': node.get('shortcode') or node['id'], + 'title': 'Video %d' % edge_num, + 'url': node_video_url, + 'thumbnail': node.get('display_url'), + 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), + 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), + 'view_count': int_or_none(node.get('video_view_count')), + }) + return self.playlist_result( + entries, video_id, + 'Post by %s' % uploader_id if uploader_id else None, + description) if not video_url: video_url = self._og_search_video_url(webpage, secure=False) From ff12ad0ee42a4004c40320dd5bede28966b26a97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:38:48 +0100 Subject: [PATCH 192/384] [instagram] Improve thumbnail extraction --- haruhi_dl/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/instagram.py b/haruhi_dl/extractor/instagram.py index 82f59c349..0e70d9ea0 100644 --- a/haruhi_dl/extractor/instagram.py +++ b/haruhi_dl/extractor/instagram.py @@ -156,7 +156,7 @@ class InstagramIE(InfoExtractor): description = try_get( media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], compat_str) or media.get('caption') - thumbnail = media.get('display_src') + thumbnail = media.get('display_src') or media.get('display_url') timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) uploader = media.get('owner', {}).get('full_name') uploader_id = media.get('owner', {}).get('username') From c298be2ebd5d03cf1c54a785e1d59e939bd7aa73 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:38:53 +0100 Subject: [PATCH 193/384] [bbc] switch to media selector v6 closes #23232 closes #23933 closes #26303 closes #26432 closes #26821 closes #27538 --- haruhi_dl/extractor/bbc.py | 74 +++++++++++--------------------------- 1 file changed, 20 insertions(+), 54 deletions(-) diff --git a/haruhi_dl/extractor/bbc.py b/haruhi_dl/extractor/bbc.py index 7aa3a11b5..b73521043 100644 --- a/haruhi_dl/extractor/bbc.py +++ b/haruhi_dl/extractor/bbc.py @@ -49,22 +49,17 @@ class BBCCoUkIE(InfoExtractor): _LOGIN_URL = 'https://account.bbc.com/signin' _NETRC_MACHINE = 'bbc' - _MEDIASELECTOR_URLS = [ + _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s' + _MEDIA_SETS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails # with geolocation in some cases when it's even not geo restricted at all (e.g. # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable. 
- 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', + 'iptv-all', + 'pc', ] - _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection' _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist' - _NAMESPACES = ( - _MEDIASELECTION_NS, - _EMP_PLAYLIST_NS, - ) - _TESTS = [ { 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', @@ -261,8 +256,6 @@ class BBCCoUkIE(InfoExtractor): 'only_matching': True, }] - _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' - def _login(self): username, password = self._get_login_info() if username is None: @@ -307,22 +300,14 @@ class BBCCoUkIE(InfoExtractor): def _extract_items(self, playlist): return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) - def _findall_ns(self, element, xpath): - elements = [] - for ns in self._NAMESPACES: - elements.extend(element.findall(xpath % ns)) - return elements - def _extract_medias(self, media_selection): - error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS) - if error is None: - media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS) - if error is not None: - raise BBCCoUkIE.MediaSelectionError(error.get('id')) - return self._findall_ns(media_selection, './{%s}media') + error = media_selection.get('result') + if error: + raise BBCCoUkIE.MediaSelectionError(error) + return media_selection.get('media') or [] def _extract_connections(self, media): - return self._findall_ns(media, './{%s}connection') + return media.get('connection') or [] def _get_subtitles(self, media, programme_id): subtitles = {} @@ -334,13 +319,13 @@ class BBCCoUkIE(InfoExtractor): cc_url, programme_id, 'Downloading captions', fatal=False) if not isinstance(captions, compat_etree_Element): continue - lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') - subtitles[lang] = [ + subtitles['en'] = [ { 'url': connection.get('href'), 'ext': 'ttml', }, ] + break return subtitles def _raise_extractor_error(self, media_selection_error): @@ -350,10 +335,10 @@ class BBCCoUkIE(InfoExtractor): def _download_media_selector(self, programme_id): last_exception = None - for mediaselector_url in self._MEDIASELECTOR_URLS: + for media_set in self._MEDIA_SETS: try: return self._download_media_selector_url( - mediaselector_url % programme_id, programme_id) + self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id) except BBCCoUkIE.MediaSelectionError as e: if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): last_exception = e @@ -362,8 +347,8 @@ class BBCCoUkIE(InfoExtractor): self._raise_extractor_error(last_exception) def _download_media_selector_url(self, url, programme_id=None): - media_selection = self._download_xml( - url, programme_id, 'Downloading media selection XML', + media_selection = self._download_json( + url, programme_id, 'Downloading media selection JSON', expected_status=(403, 404)) return self._process_media_selector(media_selection, programme_id) @@ -377,7 +362,6 @@ class BBCCoUkIE(InfoExtractor): if kind in ('video', 'audio'): bitrate = int_or_none(media.get('bitrate')) encoding = media.get('encoding') - service = media.get('service') width = int_or_none(media.get('width')) height = int_or_none(media.get('height')) file_size = int_or_none(media.get('media_file_size')) @@ -392,8 +376,6 @@ class BBCCoUkIE(InfoExtractor): supplier = connection.get('supplier') transfer_format = connection.get('transferFormat') format_id = supplier or conn_kind or 
protocol - if service: - format_id = '%s_%s' % (service, format_id) # ASX playlist if supplier == 'asx': for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): @@ -408,20 +390,11 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( href, programme_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)) - if re.search(self._USP_RE, href): - usp_formats = self._extract_m3u8_formats( - re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href), - programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False) - for f in usp_formats: - if f.get('height') and f['height'] > 720: - continue - formats.append(f) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) else: - if not service and not supplier and bitrate: + if not supplier and bitrate: format_id += '-%d' % bitrate fmt = { 'format_id': format_id, @@ -554,7 +527,7 @@ class BBCCoUkIE(InfoExtractor): webpage = self._download_webpage(url, group_id, 'Downloading video page') error = self._search_regex( - r']+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<', + r']+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<', webpage, 'error', default=None) if error: raise ExtractorError(error, expected=True) @@ -607,16 +580,9 @@ class BBCIE(BBCCoUkIE): IE_DESC = 'BBC' _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P[^/#?]+)' - _MEDIASELECTOR_URLS = [ - # Provides HQ HLS streams but fails with geolocation in some cases when it's - # even not geo restricted at all - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', - # Provides more formats, namely direct mp4 links, but fails on some videos with - # notukerror for non UK (?) users (e.g. 
- # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) - 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s', - # Provides fewer formats, but works everywhere for everybody (hopefully) - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s', + _MEDIA_SETS = [ + 'mobile-tablet-main', + 'pc', ] _TESTS = [{ From 477e444c3bd59e43ce5cf3de7d3aae7644b9bd5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:38:59 +0100 Subject: [PATCH 194/384] [instagram] Add support for reel URLs (closes #26234, closes #26250) --- haruhi_dl/extractor/instagram.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/instagram.py b/haruhi_dl/extractor/instagram.py index 0e70d9ea0..2d24d62c8 100644 --- a/haruhi_dl/extractor/instagram.py +++ b/haruhi_dl/extractor/instagram.py @@ -22,7 +22,7 @@ from ..utils import ( class InstagramIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/(?:p|tv)/(?P[^/?#&]+))' + _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P[^/?#&]+))' _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -95,6 +95,9 @@ class InstagramIE(InfoExtractor): }, { 'url': 'https://www.instagram.com/tv/aye83DjauH/', 'only_matching': True, + }, { + 'url': 'https://www.instagram.com/reel/CDUMkliABpa/', + 'only_matching': True, }] @staticmethod From 97c34326598bbfa9d3abe02c705d8333101e57bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:39:04 +0100 Subject: [PATCH 195/384] [instagram] Fix comment count extraction --- haruhi_dl/extractor/instagram.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/instagram.py b/haruhi_dl/extractor/instagram.py index 2d24d62c8..5f917a603 100644 --- a/haruhi_dl/extractor/instagram.py +++ b/haruhi_dl/extractor/instagram.py @@ -164,12 +164,18 @@ class InstagramIE(InfoExtractor): uploader = media.get('owner', {}).get('full_name') uploader_id = media.get('owner', {}).get('username') - def get_count(key, kind): - return int_or_none(try_get( - media, (lambda x: x['edge_media_%s' % key]['count'], - lambda x: x['%ss' % kind]['count']))) + def get_count(keys, kind): + if not isinstance(keys, (list, tuple)): + keys = [keys] + for key in keys: + count = int_or_none(try_get( + media, (lambda x: x['edge_media_%s' % key]['count'], + lambda x: x['%ss' % kind]['count']))) + if count is not None: + return count like_count = get_count('preview_like', 'like') - comment_count = get_count('to_comment', 'comment') + comment_count = get_count( + ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment') comments = [{ 'author': comment.get('user', {}).get('username'), From 1d9552c2367b9a22c03b8ab3eb98269a6ea0fbc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:39:10 +0100 Subject: [PATCH 196/384] [instagram] Fix test --- haruhi_dl/extractor/instagram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/instagram.py b/haruhi_dl/extractor/instagram.py index 5f917a603..27ea97f56 100644 --- a/haruhi_dl/extractor/instagram.py +++ b/haruhi_dl/extractor/instagram.py @@ -35,7 +35,7 @@ class InstagramIE(InfoExtractor): 'timestamp': 1371748545, 'upload_date': '20130620', 'uploader_id': 'naomipq', - 
'uploader': 'Naomi Leonor Phan-Quang', + 'uploader': 'B E A U T Y F O R A S H E S', 'like_count': int, 'comment_count': int, 'comments': list, From 92bd8a446ef226acf58be120ad9dce95e71e96e1 Mon Sep 17 00:00:00 2001 From: Laura Liberda Date: Fri, 26 Feb 2021 15:45:57 +0100 Subject: [PATCH 197/384] VHX embeds https://github.com/ytdl-org/youtube-dl/issues/27546 --- haruhi_dl/extractor/generic.py | 17 ++++++++++++++++- haruhi_dl/extractor/vimeo.py | 7 +++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index 240de66da..b67f066eb 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -67,7 +67,10 @@ from .tube8 import Tube8IE from .mofosex import MofosexEmbedIE from .spankwire import SpankwireIE from .youporn import YouPornIE -from .vimeo import VimeoIE +from .vimeo import ( + VimeoIE, + VHXEmbedIE, +) from .dailymotion import DailymotionIE from .dailymail import DailyMailIE from .onionstudios import OnionStudiosIE @@ -2247,6 +2250,17 @@ class GenericIE(InfoExtractor): # 'force_generic_extractor': True, # }, # } + { + # VHX Embed + 'url': 'https://demo.vhx.tv/category-c/videos/file-example-mp4-480-1-5mg-copy', + 'info_dict': { + 'id': '858208', + 'ext': 'mp4', + 'title': 'Untitled', + 'uploader_id': 'user80538407', + 'uploader': 'OTT Videos', + }, + }, ] def report_following_redirect(self, new_url): @@ -2661,6 +2675,7 @@ class GenericIE(InfoExtractor): SVTIE, XLinkIE, LibsynIE, + VHXEmbedIE, ): try: ie_key = embie.ie_key() diff --git a/haruhi_dl/extractor/vimeo.py b/haruhi_dl/extractor/vimeo.py index e8a4547cd..773296173 100644 --- a/haruhi_dl/extractor/vimeo.py +++ b/haruhi_dl/extractor/vimeo.py @@ -1125,6 +1125,12 @@ class VHXEmbedIE(VimeoBaseInfoExtractor): IE_NAME = 'vhx:embed' _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P\d+)' + @staticmethod + def _extract_urls(webpage, **kw): + mobjs = re.finditer( + r']+src="(https?://embed\.vhx\.tv/videos/\d+[^"]*)"', webpage) + return [unescapeHTML(mobj.group(1)) for mobj in mobjs] + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -1133,5 +1139,6 @@ class VHXEmbedIE(VimeoBaseInfoExtractor): 'ott data'), video_id, js_to_json)['config_url'] config = self._download_json(config_url, video_id) info = self._parse_config(config, video_id) + info['id'] = video_id self._vimeo_sort_formats(info['formats']) return info From 2d3b82a754df55ca35d12c20a2b631630588c421 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:46:29 +0100 Subject: [PATCH 198/384] [amcnetworks] improve auth only video detection(closes #27548) --- haruhi_dl/extractor/amcnetworks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/amcnetworks.py b/haruhi_dl/extractor/amcnetworks.py index 12b6de0bf..b8027bbca 100644 --- a/haruhi_dl/extractor/amcnetworks.py +++ b/haruhi_dl/extractor/amcnetworks.py @@ -80,7 +80,8 @@ class AMCNetworksIE(ThePlatformIE): title = theplatform_metadata['title'] rating = try_get( theplatform_metadata, lambda x: x['ratings'][0]['rating']) - if properties.get('videoCategory') == 'TVE-Auth': + video_category = properties.get('videoCategory') + if video_category and video_category.endswith('-Auth'): resource = self._get_mvpd_resource( requestor_id, title, video_id, rating) query['auth'] = self._extract_mvpd_auth( From c4445c3311a3bfa1c8ba61fa7679cadc23a77f5f Mon Sep 17 00:00:00 2001 From: Sergey M Date: Fri, 26 Feb 2021 15:46:39 +0100 Subject: 
[PATCH 199/384] [youtube] Update invidious.snopyta.org (#22667) Co-authored-by: sofutru <54445344+sofutru@users.noreply.github.com> --- haruhi_dl/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/youtube.py b/haruhi_dl/extractor/youtube.py index dd58b2407..f80c82f85 100644 --- a/haruhi_dl/extractor/youtube.py +++ b/haruhi_dl/extractor/youtube.py @@ -289,7 +289,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances (?:(?:www|dev)\.)?invidio\.us/| (?:(?:www|no)\.)?invidiou\.sh/| - (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/| + (?:(?:www|fi)\.)?invidious\.snopyta\.org/| (?:www\.)?invidious\.kabi\.tk/| (?:www\.)?invidious\.13ad\.de/| (?:www\.)?invidious\.mastodon\.host/| From 7490ed64b4dda6740a5d14872f53b5ea905b67ef Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:46:46 +0100 Subject: [PATCH 200/384] [telecinco] fix extraction --- haruhi_dl/extractor/telecinco.py | 77 +++++++++----------------------- 1 file changed, 20 insertions(+), 57 deletions(-) diff --git a/haruhi_dl/extractor/telecinco.py b/haruhi_dl/extractor/telecinco.py index 9ba3da341..eecd6a5c9 100644 --- a/haruhi_dl/extractor/telecinco.py +++ b/haruhi_dl/extractor/telecinco.py @@ -5,14 +5,11 @@ import json import re from .common import InfoExtractor -from .ooyala import OoyalaIE from ..utils import ( clean_html, - determine_ext, int_or_none, str_or_none, try_get, - urljoin, ) @@ -28,7 +25,7 @@ class TelecincoIE(InfoExtractor): 'description': 'md5:716caf5601e25c3c5ab6605b1ae71529', }, 'playlist': [{ - 'md5': 'adb28c37238b675dad0f042292f209a7', + 'md5': '7ee56d665cfd241c0e6d80fd175068b0', 'info_dict': { 'id': 'JEA5ijCnF6p5W08A1rNKn7', 'ext': 'mp4', @@ -38,7 +35,7 @@ class TelecincoIE(InfoExtractor): }] }, { 'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html', - 'md5': '9468140ebc300fbb8b9d65dc6e5c4b43', + 'md5': 'c86fe0d99e3bdb46b7950d38bf6ef12a', 'info_dict': { 'id': 'jn24Od1zGLG4XUZcnUnZB6', 'ext': 'mp4', @@ -48,7 +45,7 @@ class TelecincoIE(InfoExtractor): }, }, { 'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html', - 'md5': 'ae2dc6b7b50b2392076a51c0f70e01f6', + 'md5': 'eddb50291df704ce23c74821b995bcac', 'info_dict': { 'id': 'aywerkD2Sv1vGNqq9b85Q2', 'ext': 'mp4', @@ -90,58 +87,24 @@ class TelecincoIE(InfoExtractor): def _parse_content(self, content, url): video_id = content['dataMediaId'] - if content.get('dataCmsId') == 'ooyala': - return self.url_result( - 'ooyala:%s' % video_id, OoyalaIE.ie_key(), video_id) - config_url = urljoin(url, content['dataConfig']) config = self._download_json( - config_url, video_id, 'Downloading config JSON') + content['dataConfig'], video_id, 'Downloading config JSON') title = config['info']['title'] - - def mmc_url(mmc_type): - return re.sub( - r'/(?:flash|html5)\.json', '/%s.json' % mmc_type, - config['services']['mmc']) - - duration = None - formats = [] - for mmc_type in ('flash', 'html5'): - mmc = self._download_json( - mmc_url(mmc_type), video_id, - 'Downloading %s mmc JSON' % mmc_type, fatal=False) - if not mmc: - continue - if not duration: - duration = int_or_none(mmc.get('duration')) - for location in mmc['locations']: - gat = self._proto_relative_url(location.get('gat'), 'http:') - gcp = location.get('gcp') - ogn = location.get('ogn') - if None in (gat, gcp, ogn): - continue - token_data = { - 'gcp': 
gcp, - 'ogn': ogn, - 'sta': 0, - } - media = self._download_json( - gat, video_id, data=json.dumps(token_data).encode('utf-8'), - headers={ - 'Content-Type': 'application/json;charset=utf-8', - 'Referer': url, - }, fatal=False) or {} - stream = media.get('stream') or media.get('file') - if not stream: - continue - ext = determine_ext(stream) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', - video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) + services = config['services'] + caronte = self._download_json(services['caronte'], video_id) + stream = caronte['dls'][0]['stream'] + headers = self.geo_verification_headers() + headers.update({ + 'Content-Type': 'application/json;charset=UTF-8', + 'Origin': re.match(r'https?://[^/]+', url).group(0), + }) + cdn = self._download_json( + caronte['cerbero'], video_id, data=json.dumps({ + 'bbx': caronte['bbx'], + 'gbx': self._download_json(services['gbx'], video_id)['gbx'], + }).encode(), headers=headers)['tokens']['1']['cdn'] + formats = self._extract_m3u8_formats( + stream + '?' + cdn, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) return { @@ -149,7 +112,7 @@ class TelecincoIE(InfoExtractor): 'title': title, 'formats': formats, 'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'), - 'duration': duration, + 'duration': int_or_none(content.get('dataDuration')), } def _real_extract(self, url): From 217918987a1687071abfd5f3d4fc2b917fcd5a33 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:46:52 +0100 Subject: [PATCH 201/384] [mitele] fix free video extraction(#24624)(closes #25827)(closes #26757) --- haruhi_dl/extractor/mitele.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/haruhi_dl/extractor/mitele.py b/haruhi_dl/extractor/mitele.py index ad9da9612..b5937233b 100644 --- a/haruhi_dl/extractor/mitele.py +++ b/haruhi_dl/extractor/mitele.py @@ -1,15 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor +from .telecinco import TelecincoIE from ..utils import ( int_or_none, parse_iso8601, - smuggle_url, ) -class MiTeleIE(InfoExtractor): +class MiTeleIE(TelecincoIE): IE_DESC = 'mitele.es' _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P[^/]+)/player' @@ -31,7 +30,6 @@ class MiTeleIE(InfoExtractor): 'timestamp': 1471209401, 'upload_date': '20160814', }, - 'add_ie': ['Ooyala'], }, { # no explicit title 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player', @@ -54,7 +52,6 @@ class MiTeleIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': ['Ooyala'], }, { 'url': 'http://www.mitele.es/series-online/la-que-se-avecina/57aac5c1c915da951a8b45ed/player', 'only_matching': True, @@ -70,16 +67,11 @@ class MiTeleIE(InfoExtractor): r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=\s*({.+})', webpage, 'Pre Player'), display_id)['prePlayer'] title = pre_player['title'] - video = pre_player['video'] - video_id = video['dataMediaId'] + video_info = self._parse_content(pre_player['video'], url) content = pre_player.get('content') or {} info = content.get('info') or {} - return { - '_type': 'url_transparent', - # for some reason only HLS is supported - 'url': smuggle_url('ooyala:' + video_id, {'supportedformats': 'm3u8,dash'}), - 'id': video_id, + 
---
 haruhi_dl/extractor/mitele.py | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/haruhi_dl/extractor/mitele.py b/haruhi_dl/extractor/mitele.py
index ad9da9612..b5937233b 100644
--- a/haruhi_dl/extractor/mitele.py
+++ b/haruhi_dl/extractor/mitele.py
@@ -1,15 +1,14 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from .common import InfoExtractor
+from .telecinco import TelecincoIE
 from ..utils import (
     int_or_none,
     parse_iso8601,
-    smuggle_url,
 )
 
 
-class MiTeleIE(InfoExtractor):
+class MiTeleIE(TelecincoIE):
     IE_DESC = 'mitele.es'
     _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P<display_id>[^/]+)/player'
 
@@ -31,7 +30,6 @@ class MiTeleIE(InfoExtractor):
             'timestamp': 1471209401,
             'upload_date': '20160814',
         },
-        'add_ie': ['Ooyala'],
     }, {
         # no explicit title
         'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player',
@@ -54,7 +52,6 @@ class MiTeleIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
-        'add_ie': ['Ooyala'],
     }, {
         'url': 'http://www.mitele.es/series-online/la-que-se-avecina/57aac5c1c915da951a8b45ed/player',
         'only_matching': True,
@@ -70,16 +67,11 @@ class MiTeleIE(InfoExtractor):
                 r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=\s*({.+})',
                 webpage, 'Pre Player'), display_id)['prePlayer']
         title = pre_player['title']
-        video = pre_player['video']
-        video_id = video['dataMediaId']
+        video_info = self._parse_content(pre_player['video'], url)
         content = pre_player.get('content') or {}
         info = content.get('info') or {}
 
-        return {
-            '_type': 'url_transparent',
-            # for some reason only HLS is supported
-            'url': smuggle_url('ooyala:' + video_id, {'supportedformats': 'm3u8,dash'}),
-            'id': video_id,
+        video_info.update({
             'title': title,
             'description': info.get('synopsis'),
             'series': content.get('title'),
@@ -87,7 +79,7 @@ class MiTeleIE(InfoExtractor):
             'episode': content.get('subtitle'),
             'episode_number': int_or_none(info.get('episode_number')),
             'duration': int_or_none(info.get('duration')),
-            'thumbnail': video.get('dataPoster'),
             'age_limit': int_or_none(info.get('rating')),
             'timestamp': parse_iso8601(pre_player.get('publishedTime')),
-        }
+        })
+        return video_info

From 2a0a9bac02fa495bbcd064c75699742f0318fda4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergey=20M=E2=80=A4?= <dstftw@gmail.com>
Date: Fri, 26 Feb 2021 15:46:56 +0100
Subject: [PATCH 202/384] [teachable] Improve embed detection (closes #26923)

---
 haruhi_dl/extractor/teachable.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/teachable.py b/haruhi_dl/extractor/teachable.py
index df305e38a..3a337afd8 100644
--- a/haruhi_dl/extractor/teachable.py
+++ b/haruhi_dl/extractor/teachable.py
@@ -140,7 +140,7 @@ class TeachableIE(TeachableBaseIE):
     @staticmethod
     def _is_teachable(webpage):
         return 'teachableTracker.linker:autoLink' in webpage and re.search(
-            r'<link[^>]+href=["\']https?://process\.fs\.teachablecdn\.com',
+            r'<link[^>]+href=["\']https?://(?:process\.fs|assets)\.teachablecdn\.com',
             webpage)
 
     @staticmethod

From 7b1f0173c12ba2d420f99f310cc05962cd62a644 Mon Sep 17 00:00:00 2001
From: JamKage
Date: Fri, 26 Feb 2021 15:47:34 +0100
Subject: [PATCH 203/384] [go] Added support for FXNetworks (#26826)

Co-authored-by: James Kirrage

closes #13972
closes #22467
closes #23754
---
 haruhi_dl/extractor/extractors.py |  1 -
 haruhi_dl/extractor/fxnetworks.py | 77 ------------------------------
 haruhi_dl/extractor/go.py         | 21 ++++++++-
 3 files changed, 19 insertions(+), 80 deletions(-)
 delete mode 100644 haruhi_dl/extractor/fxnetworks.py

diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index 69c7d9f62..1a5cee636 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -424,7 +424,6 @@ from .funkwhale import (
     FunkwhaleRadioSHIE,
 )
 from .fusion import FusionIE
-from .fxnetworks import FXNetworksIE
 from .gaia import GaiaIE
 from .gameinformer import GameInformerIE
 from .gamespot import GameSpotIE
diff --git a/haruhi_dl/extractor/fxnetworks.py b/haruhi_dl/extractor/fxnetworks.py
deleted file mode 100644
index 00e67426b..000000000
--- a/haruhi_dl/extractor/fxnetworks.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .adobepass import AdobePassIE
-from ..utils import (
-    extract_attributes,
-    int_or_none,
-    parse_age_limit,
-    smuggle_url,
-    update_url_query,
-)
-
-
-class FXNetworksIE(AdobePassIE):
-    _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P<id>\d+)'
-    _TESTS = [{
-        'url': 'http://www.fxnetworks.com/video/1032565827847',
-        'md5': '8d99b97b4aa7a202f55b6ed47ea7e703',
-        'info_dict': {
-            'id': 'dRzwHC_MMqIv',
-            'ext': 'mp4',
-            'title': 'First Look: Better Things - Season 2',
-            'description': 'Because real life is like a fart. Watch this FIRST LOOK to see what inspired the new season of Better Things.',
-            'age_limit': 14,
-            'uploader': 'NEWA-FNG-FX',
-            'upload_date': '20170825',
-            'timestamp': 1503686274,
-            'episode_number': 0,
-            'season_number': 2,
-            'series': 'Better Things',
-        },
-        'add_ie': ['ThePlatform'],
-    }, {
-        'url': 'http://www.simpsonsworld.com/video/716094019682',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        if 'The content you are trying to access is not available in your region.' in webpage:
-            self.raise_geo_restricted()
-        video_data = extract_attributes(self._search_regex(
-            r'(<a.*?rel="https?://link\.theplatform\.com/.*?>)', webpage, 'video data'))
-        player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None)
-        release_url = video_data['rel']
-        title = video_data['data-title']
-        rating = video_data.get('data-rating')
-        query = {
-            'mbr': 'true',
-        }
-        if player_type == 'movies':
-            query.update({
-                'manifest': 'm3u',
-            })
-        else:
-            query.update({
-                'switch': 'http',
-            })
-        if video_data.get('data-req-auth') == '1':
-            resource = self._get_mvpd_resource(
-                video_data['data-channel'], title,
-                video_data.get('data-guid'), rating)
-            query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource)
-
-        return {
-            '_type': 'url_transparent',
-            'id': video_id,
-            'title': title,
-            'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}),
-            'series': video_data.get('data-show-title'),
-            'episode_number': int_or_none(video_data.get('data-episode')),
-            'season_number': int_or_none(video_data.get('data-season')),
-            'thumbnail': video_data.get('data-large-thumb'),
-            'age_limit': parse_age_limit(rating),
-            'ie_key': 'ThePlatform',
-        }
diff --git a/haruhi_dl/extractor/go.py b/haruhi_dl/extractor/go.py
index 03cfba91f..0d731e90a 100644
--- a/haruhi_dl/extractor/go.py
+++ b/haruhi_dl/extractor/go.py
@@ -38,13 +38,17 @@ class GoIE(AdobePassIE):
         'disneynow': {
             'brand': '011',
             'resource_id': 'Disney',
-        }
+        },
+        'fxnow.fxnetworks': {
+            'brand': '025',
+            'requestor_id': 'dtci',
+        },
     }
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
                             (?:(?P<sub_domain>%s)\.)?go|
-                            (?P<sub_domain_2>abc|freeform|disneynow)
+                            (?P<sub_domain_2>abc|freeform|disneynow|fxnow\.fxnetworks)
                         )\.com/
                         (?:
                             (?:[^/]+/)*(?P<id>[Vv][Dd][Kk][Aa]\w+)|
@@ -99,6 +103,19 @@ class GoIE(AdobePassIE):
             # m3u8 download
             'skip_download': True,
         },
+    }, {
+        'url': 'https://fxnow.fxnetworks.com/shows/better-things/video/vdka12782841',
+        'info_dict': {
+            'id': 'VDKA12782841',
+            'ext': 'mp4',
+            'title': 'First Look: Better Things - Season 2',
+            'description': 'md5:fa73584a95761c605d9d54904e35b407',
+        },
+        'params': {
+            'geo_bypass_ip_block': '3.244.239.0/24',
+            # m3u8 download
+            'skip_download': True,
+        },
     }, {
         'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding',
         'only_matching': True,

From d5bf4b0fea8b3e9047cc733ded9ce5da685cff31 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:49:07 +0100
Subject: [PATCH 204/384] [toggle] add support for live.mewatch.sg (closes
 #27555)

---
 haruhi_dl/extractor/toggle.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/toggle.py b/haruhi_dl/extractor/toggle.py
index 3b9b54759..270c84daa 100644
--- a/haruhi_dl/extractor/toggle.py
+++ b/haruhi_dl/extractor/toggle.py
@@ -200,7 +200,7 @@ class ToggleIE(InfoExtractor):
 
 class MeWatchIE(InfoExtractor):
     IE_NAME = 'mewatch'
-    _VALID_URL = r'https?://(?:www\.)?mewatch\.sg/watch/[^/]+-(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:(?:www|live)\.)?mewatch\.sg/watch/[^/]+-(?P<id>[0-9]+)'
     _TESTS = [{
         'url': 'https://www.mewatch.sg/watch/Recipe-Of-Life-E1-179371',
         'info_dict': {
@@ -220,6 +220,9 @@ class MeWatchIE(InfoExtractor):
     }, {
         'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-%E6%90%9C%E5%AF%86%E3%80%82%E6%89%93%E5%8D%A1%E3%80%82%E5%B0%8F%E7%BA%A2%E7%82%B9-S2-E1-176232',
         'only_matching': True,
+    }, {
+        'url': 'https://live.mewatch.sg/watch/Recipe-Of-Life-E41-189759',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):

From 68335e76a7423943c0b8e96ba1282112a2f79672 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:49:14 +0100
Subject: [PATCH 205/384] [zype] Add support for uplynk videos

---
 haruhi_dl/extractor/zype.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/haruhi_dl/extractor/zype.py b/haruhi_dl/extractor/zype.py
index f336ebdb9..60dc6cb24 100644
--- a/haruhi_dl/extractor/zype.py
+++ b/haruhi_dl/extractor/zype.py
@@ -85,7 +85,13 @@ class ZypeIE(InfoExtractor):
         else:
             m3u8_url = self._search_regex(
                 r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1',
-                body, 'm3u8 url', group='url')
+                body, 'm3u8 url', group='url', default=None)
+            if not m3u8_url:
+                source = self._parse_json(self._search_regex(
+                    r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body,
+                    'source'), video_id, js_to_json)
+                if source.get('integration') == 'verizon-media':
+                    m3u8_url = 'https://content.uplynk.com/%s.m3u8' % source['id']
         formats = self._extract_m3u8_formats(
             m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
         text_tracks = self._search_regex(

From afa77db7313f4a185c9ca213fef16f08363491e3 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:49:21 +0100
Subject: [PATCH 206/384] [piksel] improve format extraction
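
Formats now come from Piksel's program/asset_file web services. The
call pattern that the new _call_api() helper wraps, as a rough sketch
(endpoint and query fields taken from the diff below; requests is used
only for illustration):

    import requests

    def call_api(app_token, resource, query):
        # e.g. resource='program', query={'v': display_id}
        url = ('http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5'
               % (resource, app_token))
        return requests.get(url, params=query).json()['response']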
+ (?: + olympusattelecom| + vibebyvista + )| + (?:api|player)\.multicastmedia| + (?:api-ovp|player)\.piksel + )\.com| + (?: + mz-edge\.stream\.co| + movie-s\.nhk\.or + )\.jp| + vidego\.baltimorecity\.gov + )/v/(?:refid/(?P[^/]+)/prefid/)?(?P[\w-]+)''' _TESTS = [ { 'url': 'http://player.piksel.com/v/ums2867l', @@ -56,46 +73,41 @@ class PikselIE(InfoExtractor): if mobj: return mobj.group('url') + def _call_api(self, app_token, resource, display_id, query, fatal=True): + response = (self._download_json( + 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token), + display_id, query=query, fatal=fatal) or {}).get('response') + failure = try_get(response, lambda x: x['failure']['reason']) + if failure: + if fatal: + raise ExtractorError(failure, expected=True) + self.report_warning(failure) + return response + def _real_extract(self, url): - display_id = self._match_id(url) + ref_id, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'data-de-program-uuid=[\'"]([a-z0-9]+)', - webpage, 'program uuid', default=display_id) app_token = self._search_regex([ r'clientAPI\s*:\s*"([^"]+)"', r'data-de-api-key\s*=\s*"([^"]+)"' ], webpage, 'app token') - response = self._download_json( - 'http://player.piksel.com/ws/ws_program/api/%s/mode/json/apiv/5' % app_token, - video_id, query={ - 'v': video_id - })['response'] - failure = response.get('failure') - if failure: - raise ExtractorError(response['failure']['reason'], expected=True) - video_data = response['WsProgramResponse']['program']['asset'] + query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id} + program = self._call_api( + app_token, 'program', display_id, query)['WsProgramResponse']['program'] + video_id = program['uuid'] + video_data = program['asset'] title = video_data['title'] + asset_type = dict_get(video_data, ['assetType', 'asset_type']) formats = [] - m3u8_url = dict_get(video_data, [ - 'm3u8iPadURL', - 'ipadM3u8Url', - 'm3u8AndroidURL', - 'm3u8iPhoneURL', - 'iphoneM3u8Url']) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - asset_type = dict_get(video_data, ['assetType', 'asset_type']) - for asset_file in video_data.get('assetFiles', []): + def process_asset_file(asset_file): + if not asset_file: + return # TODO: extract rtmp formats http_url = asset_file.get('http_url') if not http_url: - continue + return tbr = None vbr = int_or_none(asset_file.get('videoBitrate'), 1024) abr = int_or_none(asset_file.get('audioBitrate'), 1024) @@ -118,6 +130,43 @@ class PikselIE(InfoExtractor): 'filesize': int_or_none(asset_file.get('filesize')), 'tbr': tbr, }) + + def process_asset_files(asset_files): + for asset_file in (asset_files or []): + process_asset_file(asset_file) + + process_asset_files(video_data.get('assetFiles')) + process_asset_file(video_data.get('referenceFile')) + if not formats: + asset_id = video_data.get('assetid') or program.get('assetid') + if asset_id: + process_asset_files(try_get(self._call_api( + app_token, 'asset_file', display_id, { + 'assetid': asset_id, + }, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) + + m3u8_url = dict_get(video_data, [ + 'm3u8iPadURL', + 'ipadM3u8Url', + 'm3u8AndroidURL', + 'm3u8iPhoneURL', + 'iphoneM3u8Url']) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + smil_url = 
+        smil_url = dict_get(video_data, ['httpSmil', 'hdSmil', 'rtmpSmil'])
+        if smil_url:
+            transform_source = None
+            if ref_id == 'nhkworld':
+                # TODO: figure out if this is something to be fixed in urljoin,
+                # _parse_smil_formats or keep it here
+                transform_source = lambda x: x.replace('src="/', 'src="').replace('/media"', '/media/"')
+            formats.extend(self._extract_smil_formats(
+                re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id,
+                transform_source=transform_source, fatal=False))
+
         self._sort_formats(formats)
 
         subtitles = {}

From f3474e105d447d5a30315bf81b17d01544b3e2b1 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:49:26 +0100
Subject: [PATCH 207/384] [brightcove] remove sonyliv specific code

---
 haruhi_dl/extractor/brightcove.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/haruhi_dl/extractor/brightcove.py b/haruhi_dl/extractor/brightcove.py
index 2845f4df2..675ede04c 100644
--- a/haruhi_dl/extractor/brightcove.py
+++ b/haruhi_dl/extractor/brightcove.py
@@ -534,14 +534,6 @@ class BrightcoveNewIE(AdobePassIE):
                     'format_id': build_format_id('rtmp'),
                 })
             formats.append(f)
-        if not formats:
-            # for sonyliv.com DRM protected videos
-            s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl')
-            if s3_source_url:
-                formats.append({
-                    'url': s3_source_url,
-                    'format_id': 'source',
-                })
 
         errors = json_data.get('errors')
         if not formats and errors:

From 838ac10bc721c4f5597dc4bbd0b9d5bd6c4aa5d0 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:49:32 +0100
Subject: [PATCH 208/384] [aparat] Fix extraction

closes #22285
closes #22611
closes #23348
closes #24354
closes #24591
closes #24904
closes #25418
closes #26070
closes #26350
closes #26738
closes #27563
---
 haruhi_dl/extractor/aparat.py | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/haruhi_dl/extractor/aparat.py b/haruhi_dl/extractor/aparat.py
index 883dcee7a..a9527e785 100644
--- a/haruhi_dl/extractor/aparat.py
+++ b/haruhi_dl/extractor/aparat.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
+    get_element_by_id,
     int_or_none,
     merge_dicts,
     mimetype2ext,
@@ -39,23 +40,15 @@ class AparatIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id, fatal=False)
 
         if not webpage:
-            # Note: There is an easier-to-parse configuration at
-            # http://www.aparat.com/video/video/config/videohash/%video_id
-            # but the URL in there does not work
             webpage = self._download_webpage(
                 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id,
                 video_id)
 
-        options = self._parse_json(
-            self._search_regex(
-                r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1\s*\)',
-                webpage, 'options', group='value'),
-            video_id)
-
-        player = options['plugins']['sabaPlayerPlugin']
+        options = self._parse_json(self._search_regex(
+            r'options\s*=\s*({.+?})\s*;', webpage, 'options'), video_id)
 
         formats = []
-        for sources in player['multiSRC']:
+        for sources in (options.get('multiSRC') or []):
             for item in sources:
                 if not isinstance(item, dict):
                     continue
@@ -85,11 +78,12 @@ class AparatIE(InfoExtractor):
         info = self._search_json_ld(webpage, video_id, default={})
 
         if not info.get('title'):
-            info['title'] = player['title']
+            info['title'] = get_element_by_id('videoTitle', webpage) or \
+                self._html_search_meta(['og:title', 'twitter:title', 'DC.Title', 'title'], webpage, fatal=True)
 
         return merge_dicts(info, {
             'id': video_id,
             'thumbnail': url_or_none(options.get('poster')),
-            'duration': int_or_none(player.get('duration')),
+            'duration': int_or_none(options.get('duration')),
             'formats': formats,
         })

From 21f2e0a12eef8a67380458d59cff405daa84387f Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:49:42 +0100
Subject: [PATCH 209/384] [brightcove] raise ExtractorError for DRM protected
 videos (closes #23467) (closes #27568)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
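
Sources protected by a DRM system (a "key_systems" object or a WVM
container) are skipped and counted; if every source was skipped,
extraction now fails early instead of returning an empty format list.
The check, condensed from the diff below:

    num_drm_sources = 0
    for source in sources:
        if source.get('container') == 'WVM' or source.get('key_systems'):
            num_drm_sources += 1
            continue
        # ... build formats for the remaining sources ...
    if not formats and sources and num_drm_sources == len(sources):
        raise ExtractorError('This video is DRM protected.', expected=True)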
---
 haruhi_dl/extractor/brightcove.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/haruhi_dl/extractor/brightcove.py b/haruhi_dl/extractor/brightcove.py
index 675ede04c..0a2ec3879 100644
--- a/haruhi_dl/extractor/brightcove.py
+++ b/haruhi_dl/extractor/brightcove.py
@@ -471,13 +471,18 @@ class BrightcoveNewIE(AdobePassIE):
     def _parse_brightcove_metadata(self, json_data, video_id, headers={}):
         title = json_data['name'].strip()
 
+        num_drm_sources = 0
         formats = []
-        for source in json_data.get('sources', []):
+        sources = json_data.get('sources') or []
+        for source in sources:
             container = source.get('container')
             ext = mimetype2ext(source.get('type'))
             src = source.get('src')
             # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
-            if ext == 'ism' or container == 'WVM' or source.get('key_systems'):
+            if container == 'WVM' or source.get('key_systems'):
+                num_drm_sources += 1
+                continue
+            elif ext == 'ism':
                 continue
             elif ext == 'm3u8' or container == 'M2TS':
                 if not src:
@@ -535,11 +540,14 @@ class BrightcoveNewIE(AdobePassIE):
                 })
                 formats.append(f)
 
-        errors = json_data.get('errors')
-        if not formats and errors:
-            error = errors[0]
-            raise ExtractorError(
-                error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
+        if not formats:
+            errors = json_data.get('errors')
+            if errors:
+                error = errors[0]
+                raise ExtractorError(
+                    error.get('message') or error.get('error_subcode') or error['error_code'], expected=True)
+            if sources and num_drm_sources == len(sources):
+                raise ExtractorError('This video is DRM protected.', expected=True)
 
         self._sort_formats(formats)
 

From db69be3ccc39bb023f124fd4ba9d29d9c31569dd Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:49:47 +0100
Subject: [PATCH 210/384] [tenplay] fix format extraction (closes #26653)
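
Brightcove is bypassed: a HEAD request to the Fastly selector URL
redirects straight to the HLS manifest, and a geo block shows up in the
redirect target itself. Condensed sketch of the new flow (from the diff
below):

    redirect = self._request_webpage(
        HEADRequest(self._FASTLY_URL_TEMPL % brightcove_id), brightcove_id)
    m3u8_url = redirect.geturl()
    if '10play-not-in-oz' in m3u8_url:
        self.raise_geo_restricted(countries=['AU'])
    formats = self._extract_m3u8_formats(m3u8_url, brightcove_id, 'mp4')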
---
 haruhi_dl/extractor/tenplay.py | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/haruhi_dl/extractor/tenplay.py b/haruhi_dl/extractor/tenplay.py
index af325fea8..cd30d57f4 100644
--- a/haruhi_dl/extractor/tenplay.py
+++ b/haruhi_dl/extractor/tenplay.py
@@ -3,9 +3,10 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
+    HEADRequest,
     parse_age_limit,
     parse_iso8601,
-    smuggle_url,
+    # smuggle_url,
 )
 
 
@@ -24,14 +25,16 @@ class TenPlayIE(InfoExtractor):
             'uploader_id': '2199827728001',
         },
         'params': {
-            'format': 'bestvideo',
+            # 'format': 'bestvideo',
             'skip_download': True,
         }
     }, {
        'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc',
        'only_matching': True,
    }]
-    BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s'
+    # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s'
+    _GEO_BYPASS = False
+    _FASTLY_URL_TEMPL = 'https://10-selector.global.ssl.fastly.net/s/kYEXFC/media/%s?mbr=true&manifest=m3u&format=redirect'
 
     def _real_extract(self, url):
         content_id = self._match_id(url)
@@ -40,19 +43,28 @@ class TenPlayIE(InfoExtractor):
         video = data.get('video') or {}
         metadata = data.get('metaData') or {}
         brightcove_id = video.get('videoId') or metadata['showContentVideoId']
-        brightcove_url = smuggle_url(
-            self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
-            {'geo_countries': ['AU']})
+        # brightcove_url = smuggle_url(
+        #     self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+        #     {'geo_countries': ['AU']})
+        m3u8_url = self._request_webpage(HEADRequest(
+            self._FASTLY_URL_TEMPL % brightcove_id), brightcove_id).geturl()
+        if '10play-not-in-oz' in m3u8_url:
+            self.raise_geo_restricted(countries=['AU'])
+        formats = self._extract_m3u8_formats(m3u8_url, brightcove_id, 'mp4')
+        self._sort_formats(formats)
 
         return {
-            '_type': 'url_transparent',
-            'url': brightcove_url,
-            'id': content_id,
-            'title': video.get('title') or metadata.get('pageContentName') or metadata.get('showContentName'),
+            # '_type': 'url_transparent',
+            # 'url': brightcove_url,
+            'formats': formats,
+            'id': brightcove_id,
+            'title': video.get('title') or metadata.get('pageContentName') or metadata['showContentName'],
             'description': video.get('description'),
             'age_limit': parse_age_limit(video.get('showRatingClassification') or metadata.get('showProgramClassification')),
             'series': metadata.get('showName'),
             'season': metadata.get('showContentSeason'),
             'timestamp': parse_iso8601(metadata.get('contentPublishDate') or metadata.get('pageContentPublishDate')),
-            'ie_key': 'BrightcoveNew',
+            'thumbnail': video.get('poster'),
+            'uploader_id': '2199827728001',
+            # 'ie_key': 'BrightcoveNew',
         }

From 1dbf12006f535d5176539d80d7103450a63d7669 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:49:54 +0100
Subject: [PATCH 211/384] [sevenplus] detect API errors
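
A 403 from the playback endpoint carries a JSON body with an
error_code; it is now surfaced as an expected ExtractorError instead of
a generic HTTP error. The pattern, roughly (condensed from the diff
below):

    try:
        media = self._download_json(playback_url, episode_id, query=query)['media']
    except ExtractorError as e:
        if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
            raise ExtractorError(self._parse_json(
                e.cause.read().decode(), episode_id)[0]['error_code'],
                expected=True)
        raise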
---
 haruhi_dl/extractor/sevenplus.py | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/haruhi_dl/extractor/sevenplus.py b/haruhi_dl/extractor/sevenplus.py
index 84568ac69..240afc18f 100644
--- a/haruhi_dl/extractor/sevenplus.py
+++ b/haruhi_dl/extractor/sevenplus.py
@@ -4,8 +4,12 @@ from __future__ import unicode_literals
 import re
 
 from .brightcove import BrightcoveNewIE
-from ..compat import compat_str
+from ..compat import (
+    compat_HTTPError,
+    compat_str,
+)
 from ..utils import (
+    ExtractorError,
     try_get,
     update_url_query,
 )
@@ -41,16 +45,22 @@ class SevenPlusIE(BrightcoveNewIE):
     def _real_extract(self, url):
         path, episode_id = re.match(self._VALID_URL, url).groups()
 
-        media = self._download_json(
-            'https://videoservice.swm.digital/playback', episode_id, query={
-                'appId': '7plus',
-                'deviceType': 'web',
-                'platformType': 'web',
-                'accountId': 5303576322001,
-                'referenceId': 'ref:' + episode_id,
-                'deliveryId': 'csai',
-                'videoType': 'vod',
-            })['media']
+        try:
+            media = self._download_json(
+                'https://videoservice.swm.digital/playback', episode_id, query={
+                    'appId': '7plus',
+                    'deviceType': 'web',
+                    'platformType': 'web',
+                    'accountId': 5303576322001,
+                    'referenceId': 'ref:' + episode_id,
+                    'deliveryId': 'csai',
+                    'videoType': 'vod',
+                })['media']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                raise ExtractorError(self._parse_json(
+                    e.cause.read().decode(), episode_id)[0]['error_code'], expected=True)
+            raise
 
         for source in media.get('sources', {}):
             src = source.get('src')

From 50162a3580d87a7f90aae53fea8494711aaddd6b Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:49:59 +0100
Subject: [PATCH 212/384] [uktvplay] match new video URLs (closes #17909)

---
 haruhi_dl/extractor/uktvplay.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/haruhi_dl/extractor/uktvplay.py b/haruhi_dl/extractor/uktvplay.py
index 2137502a1..f28fd514d 100644
--- a/haruhi_dl/extractor/uktvplay.py
+++ b/haruhi_dl/extractor/uktvplay.py
@@ -5,10 +5,9 @@ from .common import InfoExtractor
 
 
 class UKTVPlayIE(InfoExtractor):
-    _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/.+?\?.*?\bvideo=(?P<id>\d+)'
-    _TEST = {
+    _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P<id>\d+)'
+    _TESTS = [{
         'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001',
-        'md5': '',
         'info_dict': {
             'id': '2117008346001',
             'ext': 'mp4',
@@ -23,7 +22,11 @@ class UKTVPlayIE(InfoExtractor):
             'skip_download': True,
         },
         'expected_warnings': ['Failed to download MPD manifest']
-    }
+    }, {
+        'url': 'https://uktvplay.uktv.co.uk/shows/africa/watch-online/5983349675001',
+        'only_matching': True,
+    }]
+    # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/1242911124001/OrCyvJ2gyL_default/index.html?videoId=%s'
     BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911124001/H1xnMOqP_default/index.html?videoId=%s'
 
     def _real_extract(self, url):

From f7bef2772cfeec3954de444aef6651f918087967 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:50:35 +0100
Subject: [PATCH 213/384] [aenetworks] add support for biography.com (closes
 #3863)
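
The per-site logic is folded into a shared _extract_aetn_info(), which
queries the feeds API with either a canonical-path or an id filter:

    result = self._download_json(
        'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand,
        filter_value,
        query={'filter[%s]' % filter_key: filter_value})['results'][0]

biography.com pages embed a <phoenix-iframe> pointing at a /player/{id}
URL, which the new HistoryPlayerIE resolves for both history.com and
biography.com.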
---
 haruhi_dl/extractor/aenetworks.py | 97 ++++++++++++++++++++++---------
 haruhi_dl/extractor/extractors.py |  2 +
 2 files changed, 73 insertions(+), 26 deletions(-)

diff --git a/haruhi_dl/extractor/aenetworks.py b/haruhi_dl/extractor/aenetworks.py
index 3d0cf1208..237012978 100644
--- a/haruhi_dl/extractor/aenetworks.py
+++ b/haruhi_dl/extractor/aenetworks.py
@@ -6,6 +6,7 @@ import re
 from .theplatform import ThePlatformIE
 from ..utils import (
     ExtractorError,
+    GeoRestrictedError,
     int_or_none,
     update_url_query,
     urlencode_postdata,
@@ -28,6 +29,7 @@ class AENetworksBaseIE(ThePlatformIE):
         'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'),
         'fyi.tv': ('FYI', 'fyi'),
         'historyvault.com': (None, 'historyvault'),
+        'biography.com': (None, 'biography'),
     }
 
     def _extract_aen_smil(self, smil_url, video_id, auth=None):
@@ -54,6 +56,8 @@ class AENetworksBaseIE(ThePlatformIE):
                 tp_formats, tp_subtitles = self._extract_theplatform_smil(
                     m_url, video_id, 'Downloading %s SMIL data' % (q.get('switch') or q['assetTypes']))
             except ExtractorError as e:
+                if isinstance(e, GeoRestrictedError):
+                    raise
                 last_e = e
                 continue
             formats.extend(tp_formats)
@@ -67,6 +71,34 @@ class AENetworksBaseIE(ThePlatformIE):
             'subtitles': subtitles,
         }
 
+    def _extract_aetn_info(self, domain, filter_key, filter_value, url):
+        requestor_id, brand = self._DOMAIN_MAP[domain]
+        result = self._download_json(
+            'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand,
+            filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0]
+        title = result['title']
+        video_id = result['id']
+        media_url = result['publicUrl']
+        theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
+            r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)
+        info = self._parse_theplatform_metadata(theplatform_metadata)
+        auth = None
+        if theplatform_metadata.get('AETN$isBehindWall'):
+            resource = self._get_mvpd_resource(
+                requestor_id, theplatform_metadata['title'],
+                theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'),
+                theplatform_metadata['ratings'][0]['rating'])
+            auth = self._extract_mvpd_auth(
+                url, video_id, requestor_id, resource)
+        info.update(self._extract_aen_smil(media_url, video_id, auth))
+        info.update({
+            'title': title,
+            'series': result.get('seriesName'),
+            'season_number': int_or_none(result.get('tvSeasonNumber')),
+            'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')),
+        })
+        return info
+
 
 class AENetworksIE(AENetworksBaseIE):
     IE_NAME = 'aenetworks'
@@ -139,32 +171,7 @@ class AENetworksIE(AENetworksBaseIE):
 
     def _real_extract(self, url):
         domain, canonical = re.match(self._VALID_URL, url).groups()
-        requestor_id, brand = self._DOMAIN_MAP[domain]
-        result = self._download_json(
-            'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand,
-            canonical, query={'filter[canonical]': '/' + canonical})['results'][0]
-        title = result['title']
-        video_id = result['id']
-        media_url = result['publicUrl']
-        theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
-            r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)
-        info = self._parse_theplatform_metadata(theplatform_metadata)
-        auth = None
-        if theplatform_metadata.get('AETN$isBehindWall'):
-            resource = self._get_mvpd_resource(
-                requestor_id, theplatform_metadata['title'],
-                theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'),
-                theplatform_metadata['ratings'][0]['rating'])
-            auth = self._extract_mvpd_auth(
-                url, video_id, requestor_id, resource)
-        info.update(self._extract_aen_smil(media_url, video_id, auth))
-        info.update({
-            'title': title,
-            'series': result.get('seriesName'),
-            'season_number': int_or_none(result.get('tvSeasonNumber')),
-            'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')),
-        })
-        return info
+        return self._extract_aetn_info(domain, 'canonical', '/' + canonical, url)
 
 
 class AENetworksListBaseIE(AENetworksBaseIE):
@@ -294,3 +301,41 @@ class HistoryTopicIE(AENetworksBaseIE):
         return self.url_result(
             'http://www.history.com/videos/' + display_id,
             AENetworksIE.ie_key())
+
+
+class HistoryPlayerIE(AENetworksBaseIE):
+    IE_NAME = 'history:player'
+    _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|biography)\.com)/player/(?P<id>\d+)'
+
+    def _real_extract(self, url):
+        domain, video_id = re.match(self._VALID_URL, url).groups()
+        return self._extract_aetn_info(domain, 'id', video_id, url)
+
+
+class BiographyIE(AENetworksBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?biography\.com/video/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://www.biography.com/video/vincent-van-gogh-full-episode-2075049808',
+        'info_dict': {
+            'id': '30322987',
+            'ext': 'mp4',
+            'title': 'Vincent Van Gogh - Full Episode',
+            'description': 'A full biography about the most influential 20th century painter, Vincent Van Gogh.',
+            'timestamp': 1311970571,
+            'upload_date': '20110729',
+            'uploader': 'AENE-NEW',
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
+        'add_ie': ['ThePlatform'],
+    }]
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+        webpage = self._download_webpage(url, display_id)
+        player_url = self._search_regex(
+            r'<phoenix-iframe[^>]+src="(%s)' % HistoryPlayerIE._VALID_URL,
+            webpage, 'player URL')
+        return self.url_result(player_url, HistoryPlayerIE.ie_key())
diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index 1a5cee636..1f8366076 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -33,6 +33,8 @@ from .aenetworks import (
     AENetworksCollectionIE,
     AENetworksShowIE,
     HistoryTopicIE,
+    HistoryPlayerIE,
+    BiographyIE,
 )
 from .afreecatv import AfreecaTVIE
 from .agora import (

From 3ca3074dc3a54e736fd82fcb717acdaac7a2c563 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:51:52 +0100
Subject: [PATCH 214/384] [aenetworks] fix HistoryPlayerIE tests

---
 haruhi_dl/extractor/aenetworks.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/haruhi_dl/extractor/aenetworks.py b/haruhi_dl/extractor/aenetworks.py
index 237012978..8e4963131 100644
--- a/haruhi_dl/extractor/aenetworks.py
+++ b/haruhi_dl/extractor/aenetworks.py
@@ -306,6 +306,7 @@ class HistoryTopicIE(AENetworksBaseIE):
 class HistoryPlayerIE(AENetworksBaseIE):
     IE_NAME = 'history:player'
     _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|biography)\.com)/player/(?P<id>\d+)'
+    _TESTS = []
 
     def _real_extract(self, url):
         domain, video_id = re.match(self._VALID_URL, url).groups()

From f1931b8ba87e01a690ca236bcb59298f265b8b6d Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:51:59 +0100
Subject: [PATCH 215/384] [nbc] fix NBCSports VPlayer URL extraction (closes
 #16640)

---
 haruhi_dl/extractor/nbc.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/haruhi_dl/extractor/nbc.py b/haruhi_dl/extractor/nbc.py
index ea5f5a315..9695a9616 100644
--- a/haruhi_dl/extractor/nbc.py
+++ b/haruhi_dl/extractor/nbc.py
@@ -158,7 +158,8 @@ class NBCIE(AdobePassIE):
 
 
 class NBCSportsVPlayerIE(InfoExtractor):
-    _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)'
+    _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/'
+    _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)'
 
     _TESTS = [{
         'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI',
@@ -174,12 +175,15 @@ class NBCSportsVPlayerIE(InfoExtractor):
     }, {
         'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z',
         'only_matching': True,
+    }, {
+        'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true',
+        'only_matching': True,
     }]
 
     @staticmethod
     def _extract_url(webpage):
         iframe_m = re.search(
-            r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage)
+            r'<(?:iframe[^>]+|div[^>]+data-(?:mpx-)?)src="(?P<url>%s[^"]+)"' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage)
         if iframe_m:
             return iframe_m.group('url')
 
@@ -192,21 +196,29 @@ class NBCSportsVPlayerIE(InfoExtractor):
 
 
 class NBCSportsIE(InfoExtractor):
-    # Does not include https because its certificate is invalid
-    _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)'
+    _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?!vplayer/)(?:[^/]+/)+(?P<id>[0-9a-z-]+)'
 
-    _TEST = {
+    _TESTS = [{
+        # iframe src
         'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke',
         'info_dict': {
             'id': 'PHJSaFWbrTY9',
-            'ext': 'flv',
+            'ext': 'mp4',
             'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke',
             'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113',
             'uploader': 'NBCU-SPORTS',
             'upload_date': '20150330',
             'timestamp': 1427726529,
         }
-    }
+    }, {
+        # data-mpx-src
+        'url': 'https://www.nbcsports.com/philadelphia/philadelphia-phillies/bruce-bochy-hector-neris-hes-idiot',
+        'only_matching': True,
+    }, {
+        # data-src
+        'url': 'https://www.nbcsports.com/boston/video/report-card-pats-secondary-no-match-josh-allen',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)

From c7d0af171ff8591f022d589e7fa8222d0a0b4a2a Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:52:06 +0100
Subject: [PATCH 216/384] [nbc] Remove CSNNE extractor

---
 haruhi_dl/extractor/extractors.py |  1 -
 haruhi_dl/extractor/nbc.py        | 27 ---------------------------
 2 files changed, 28 deletions(-)

diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index 1f8366076..c8e99253e 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -723,7 +723,6 @@ from .nba import (
     NBAChannelIE,
 )
 from .nbc import (
-    CSNNEIE,
     NBCIE,
     NBCNewsIE,
     NBCOlympicsIE,
diff --git a/haruhi_dl/extractor/nbc.py b/haruhi_dl/extractor/nbc.py
index 9695a9616..0d77648c2 100644
--- a/haruhi_dl/extractor/nbc.py
+++ b/haruhi_dl/extractor/nbc.py
@@ -286,33 +286,6 @@ class NBCSportsStreamIE(AdobePassIE):
         }
 
 
-class CSNNEIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?csnne\.com/video/(?P<id>[0-9a-z-]+)'
-
-    _TEST = {
-        'url': 'http://www.csnne.com/video/snc-evening-update-wright-named-red-sox-no-5-starter',
-        'info_dict': {
-            'id': 'yvBLLUgQ8WU0',
-            'ext': 'mp4',
-            'title': 'SNC evening update: Wright named Red Sox\' No. 5 starter.',
-            'description': 'md5:1753cfee40d9352b19b4c9b3e589b9e3',
-            'timestamp': 1459369979,
-            'upload_date': '20160330',
-            'uploader': 'NBCU-SPORTS',
-        }
-    }
-
-    def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-        return {
-            '_type': 'url_transparent',
-            'ie_key': 'ThePlatform',
-            'url': self._html_search_meta('twitter:player:stream', webpage),
-            'display_id': display_id,
-        }
-
-
 class NBCNewsIE(ThePlatformIE):
     _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)'
 

From 10af8572d432e3dde30b3f02310c46755ee7d213 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:53:16 +0100
Subject: [PATCH 217/384] [YoutubeDL] Allow format filtering using audio
 language (#16209)
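
This adds "language" to the string fields usable in format filters, so
the audio language reported by an extractor can be selected on, e.g.:

    haruhi-dl -f 'bestaudio[language=en]' URL
    haruhi-dl -f '(bestvideo+bestaudio/best)[language^=de]' URL

(^= is the existing "starts with" string operator; without a trailing
"?" the filter only matches formats whose language field is set.)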
---
 haruhi_dl/HaruhiDL.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/haruhi_dl/HaruhiDL.py b/haruhi_dl/HaruhiDL.py
index 813d32d76..7320403d2 100755
--- a/haruhi_dl/HaruhiDL.py
+++ b/haruhi_dl/HaruhiDL.py
@@ -1087,7 +1087,7 @@ class HaruhiDL(object):
             '*=': lambda attr, value: value in attr,
         }
         str_operator_rex = re.compile(r'''(?x)
-            \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
+            \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id|language)
             \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?
             \s*(?P<value>[a-zA-Z0-9._-]+)
             \s*$

From 2c4b3dd8649151e1409d5af6adb05b26eed6679a Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:53:27 +0100
Subject: [PATCH 218/384] [utils] accept only supported protocols in
 url_or_none
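
url_or_none() now whitelists the schemes the downloaders actually
support instead of accepting any scheme-like prefix. Expected behaviour
per the new regex and tests below:

    >>> url_or_none('rtmpte://foo.de')
    'rtmpte://foo.de'
    >>> url_or_none('s3://foo.de')   # unsupported scheme -> None
    >>> url_or_none('//foo.de')      # protocol-relative still allowed
    '//foo.de'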
---
 haruhi_dl/utils.py | 2 +-
 test/test_utils.py | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/haruhi_dl/utils.py b/haruhi_dl/utils.py
index 2bba1b04c..62b59bcdb 100644
--- a/haruhi_dl/utils.py
+++ b/haruhi_dl/utils.py
@@ -3642,7 +3642,7 @@ def url_or_none(url):
     if not url or not isinstance(url, compat_str):
         return None
     url = url.strip()
-    return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None
+    return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
 
 
 def parse_duration(s):
diff --git a/test/test_utils.py b/test/test_utils.py
index fcb86d92a..dc3dde0c4 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -554,6 +554,11 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(url_or_none('http$://foo.de'), None)
         self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de')
         self.assertEqual(url_or_none('//foo.de'), '//foo.de')
+        self.assertEqual(url_or_none('s3://foo.de'), None)
+        self.assertEqual(url_or_none('rtmpte://foo.de'), 'rtmpte://foo.de')
+        self.assertEqual(url_or_none('mms://foo.de'), 'mms://foo.de')
+        self.assertEqual(url_or_none('rtspu://foo.de'), 'rtspu://foo.de')
+        self.assertEqual(url_or_none('ftps://foo.de'), 'ftps://foo.de')
 
     def test_parse_age_limit(self):
         self.assertEqual(parse_age_limit(None), None)

From a2f4d6ec07bed6bc52ede04d51dbfc03725ab21e Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:53:33 +0100
Subject: [PATCH 219/384] [yandexvideo] fix extraction (closes #25000)

---
 haruhi_dl/extractor/yandexvideo.py | 116 +++++++++++++++++++----------
 1 file changed, 76 insertions(+), 40 deletions(-)

diff --git a/haruhi_dl/extractor/yandexvideo.py b/haruhi_dl/extractor/yandexvideo.py
index 46529be05..36d01cc8e 100644
--- a/haruhi_dl/extractor/yandexvideo.py
+++ b/haruhi_dl/extractor/yandexvideo.py
@@ -13,26 +13,30 @@ class YandexVideoIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
-                            yandex\.ru(?:/portal/(?:video|efir))?/?\?.*?stream_id=|
+                            yandex\.ru(?:/(?:portal/(?:video|efir)|efir))?/?\?.*?stream_id=|
                             frontend\.vh\.yandex\.ru/player/
                         )
-                        (?P<id>[\da-f]+)
+                        (?P<id>(?:[\da-f]{32}|[\w-]{12}))
                     '''
     _TESTS = [{
-        'url': 'https://yandex.ru/portal/video?stream_id=4dbb262b4fe5cf15a215de4f34eee34d',
-        'md5': '33955d7ae052f15853dc41f35f17581c',
+        'url': 'https://yandex.ru/portal/video?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374',
+        'md5': 'e02a05bfaf0d9615ef07ae3a10f4faf4',
         'info_dict': {
-            'id': '4dbb262b4fe5cf15a215de4f34eee34d',
+            'id': '4dbb36ec4e0526d58f9f2dc8f0ecf374',
             'ext': 'mp4',
-            'title': 'В Нью-Йорке баржи и теплоход оторвались от причала и расплылись по Гудзону',
-            'description': '',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'timestamp': 0,
-            'duration': 30,
+            'title': 'Русский Вудсток - главный рок-фест в истории СССР / вДудь',
+            'description': 'md5:7d6b8d4bc4a3b9a56499916c1ea5b5fa',
+            'thumbnail': r're:^https?://',
+            'timestamp': 1549972939,
+            'duration': 5575,
             'age_limit': 18,
+            'upload_date': '20190212',
+            'view_count': int,
+            'like_count': int,
+            'dislike_count': int,
         },
     }, {
-        'url': 'https://yandex.ru/portal/efir?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374&from=morda',
+        'url': 'https://yandex.ru/portal/efir?stream_id=4dbb262b4fe5cf15a215de4f34eee34d&from=morda',
         'only_matching': True,
     }, {
         'url': 'https://yandex.ru/?stream_id=4dbb262b4fe5cf15a215de4f34eee34d',
@@ -52,53 +56,85 @@ class YandexVideoIE(InfoExtractor):
         # DASH with DRM
         'url': 'https://yandex.ru/portal/video?from=morda&stream_id=485a92d94518d73a9d0ff778e13505f8',
         'only_matching': True,
+    }, {
+        'url': 'https://yandex.ru/efir?stream_active=watching&stream_id=v7a2dZ-v5mSI&from_block=efir_newtab',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
         content = self._download_json(
-            'https://frontend.vh.yandex.ru/v22/player/%s.json' % video_id,
-            video_id, query={
-                'stream_options': 'hires',
-                'disable_trackings': 1,
-            })['content']
+            # 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id,
+            # video_id, query={
+            #     'stream_options': 'hires',
+            #     'disable_trackings': 1,
+            # })['content']
+            'https://frontend.vh.yandex.ru/graphql', video_id, data=b'''{
+  player(content_id: "%s") {
+    computed_title
+    content_url
+    description
+    dislikes
+    duration
+    likes
+    program_title
+    release_date
+    release_date_ut
+    release_year
+    restriction_age
+    season
+    start_time
+    streams
+    thumbnail
+    title
+    views_count
+  }
+}''' % video_id.encode())['player']['content']['content']
 
-        content_url = url_or_none(content.get('content_url')) or url_or_none(
-            content['streams'][0]['url'])
-        title = content.get('title') or content.get('computed_title')
+        title = content.get('title') or content['computed_title']
 
-        ext = determine_ext(content_url)
-
-        if ext == 'm3u8':
-            formats = self._extract_m3u8_formats(
-                content_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                m3u8_id='hls')
-        elif ext == 'mpd':
-            formats = self._extract_mpd_formats(
-                content_url, video_id, mpd_id='dash')
-        else:
-            formats = [{'url': content_url}]
+        formats = []
+        streams = content.get('streams') or []
+        streams.append({'url': content.get('content_url')})
+        for stream in streams:
+            content_url = url_or_none(stream.get('url'))
+            if not content_url:
+                continue
+            ext = determine_ext(content_url)
+            if ext == 'ismc':
+                continue
+            elif ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    content_url, video_id, 'mp4',
+                    'm3u8_native', m3u8_id='hls', fatal=False))
+            elif ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    content_url, video_id, mpd_id='dash', fatal=False))
+            else:
+                formats.append({'url': content_url})
 
         self._sort_formats(formats)
 
-        description = content.get('description')
-        thumbnail = content.get('thumbnail')
         timestamp = (int_or_none(content.get('release_date'))
                      or int_or_none(content.get('release_date_ut'))
                      or int_or_none(content.get('start_time')))
-        duration = int_or_none(content.get('duration'))
-        series = content.get('program_title')
-        age_limit = int_or_none(content.get('restriction_age'))
+        season = content.get('season') or {}
 
         return {
             'id': video_id,
             'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
+            'description': content.get('description'),
+            'thumbnail': content.get('thumbnail'),
             'timestamp': timestamp,
-            'duration': duration,
-            'series': series,
-            'age_limit': age_limit,
+            'duration': int_or_none(content.get('duration')),
+            'series': content.get('program_title'),
+            'age_limit': int_or_none(content.get('restriction_age')),
+            'view_count': int_or_none(content.get('views_count')),
+            'like_count': int_or_none(content.get('likes')),
+            'dislike_count': int_or_none(content.get('dislikes')),
+            'season_number': int_or_none(season.get('season_number')),
+            'season_id': season.get('id'),
+            'release_year': int_or_none(content.get('release_year')),
             'formats': formats,
         }
From 95b5454a31a56604423c08a59290ba4daad1f0fe Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:53:41 +0100
Subject: [PATCH 220/384] [yandexvideo] use old api call as fallback

---
 haruhi_dl/extractor/yandexvideo.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/haruhi_dl/extractor/yandexvideo.py b/haruhi_dl/extractor/yandexvideo.py
index 36d01cc8e..ab8c84c93 100644
--- a/haruhi_dl/extractor/yandexvideo.py
+++ b/haruhi_dl/extractor/yandexvideo.py
@@ -5,6 +5,7 @@ from .common import InfoExtractor
 from ..utils import (
     determine_ext,
     int_or_none,
+    try_get,
     url_or_none,
 )
 
@@ -64,12 +65,7 @@ class YandexVideoIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        content = self._download_json(
-            # 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id,
-            # video_id, query={
-            #     'stream_options': 'hires',
-            #     'disable_trackings': 1,
-            # })['content']
+        player = try_get((self._download_json(
             'https://frontend.vh.yandex.ru/graphql', video_id, data=b'''{
   player(content_id: "%s") {
     computed_title
@@ -87,7 +83,15 @@ class YandexVideoIE(InfoExtractor):
     title
     views_count
   }
-}''' % video_id.encode())['player']['content']['content']
+}''' % video_id.encode(), fatal=False)), lambda x: x['player']['content'])
+        if not player or player.get('error'):
+            player = self._download_json(
+                'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id,
+                video_id, query={
+                    'stream_options': 'hires',
+                    'disable_trackings': 1,
+                })
+        content = player['content']
 
         title = content.get('title') or content['computed_title']

From 355b6d9ab6ec671678fc69acd58df191796c47bc Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:53:46 +0100
Subject: [PATCH 221/384] [yandexdisk] fix extraction (closes #17861) (closes
 #27131)

---
 haruhi_dl/extractor/yandexdisk.py | 138 ++++++++++++++++++------------
 1 file changed, 81 insertions(+), 57 deletions(-)

diff --git a/haruhi_dl/extractor/yandexdisk.py b/haruhi_dl/extractor/yandexdisk.py
index e8f6ae10f..21f37c192 100644
--- a/haruhi_dl/extractor/yandexdisk.py
+++ b/haruhi_dl/extractor/yandexdisk.py
@@ -1,19 +1,40 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import json
+
 from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import compat_HTTPError
 from ..utils import (
     determine_ext,
+    ExtractorError,
     float_or_none,
     int_or_none,
-    try_get,
-    urlencode_postdata,
+    mimetype2ext,
+    parse_iso8601,
+    urljoin,
 )
 
 
 class YandexDiskIE(InfoExtractor):
-    _VALID_URL = r'https?://yadi\.sk/[di]/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'''(?x)https?://
+        (?:
+            (?:www\.)?yadi\.sk|
+            disk\.yandex\.
+ (?: + az| + by| + co(?:m(?:\.(?:am|ge|tr))?|\.il)| + ee| + fr| + k[gz]| + l[tv]| + md| + t[jm]| + u[az]| + ru + ) + )/(?:[di]/|public.*?\bhash=)(?P[^/?#&]+)''' _TESTS = [{ 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', @@ -25,94 +46,97 @@ class YandexDiskIE(InfoExtractor): 'duration': 168.6, 'uploader': 'y.botova', 'uploader_id': '300043621', + 'timestamp': 1421396809, + 'upload_date': '20150116', 'view_count': int, }, }, { 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce', 'only_matching': True, + }, { + 'url': 'https://yadi.sk/public?hash=5DZ296JK9GWCLp02f6jrObjnctjRxMs8L6%2B%2FuhNqk38%3D', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - status = self._download_webpage( - 'https://disk.yandex.com/auth/status', video_id, query={ - 'urlOrigin': url, - 'source': 'public', - 'md5': 'false', - }) + try: + resource = self._download_json( + 'https://cloud-api.yandex.net/v1/disk/public/resources', + video_id, query={'public_key': url}) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + error_description = self._parse_json( + e.cause.read().decode(), video_id)['description'] + raise ExtractorError(error_description, expected=True) + raise - sk = self._search_regex( - r'(["\'])sk(?:External)?\1\s*:\s*(["\'])(?P(?:(?!\2).)+)\2', - status, 'sk', group='value') + title = resource['name'] + public_url = resource.get('public_url') + if public_url: + video_id = self._match_id(public_url) - webpage = self._download_webpage(url, video_id) + self._set_cookie('yadi.sk', 'yandexuid', '0') - models = self._parse_json( - self._search_regex( - r']+id=["\']models-client[^>]+>\s*(\[.+?\])\s* Date: Fri, 26 Feb 2021 15:53:51 +0100 Subject: [PATCH 222/384] [yandexdisk] extract info from webpage the public API does not return metadata when download limit is reached --- haruhi_dl/extractor/yandexdisk.py | 89 ++++++++++++++++--------------- 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/haruhi_dl/extractor/yandexdisk.py b/haruhi_dl/extractor/yandexdisk.py index 21f37c192..6fcd8ee7e 100644 --- a/haruhi_dl/extractor/yandexdisk.py +++ b/haruhi_dl/extractor/yandexdisk.py @@ -2,24 +2,23 @@ from __future__ import unicode_literals import json +import re from .common import InfoExtractor -from ..compat import compat_HTTPError from ..utils import ( determine_ext, - ExtractorError, float_or_none, int_or_none, mimetype2ext, - parse_iso8601, + try_get, urljoin, ) class YandexDiskIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// - (?: - (?:www\.)?yadi\.sk| + (?P + yadi\.sk| disk\.yandex\. 
---
 haruhi_dl/extractor/yandexdisk.py | 89 ++++++++++++++++---------------
 1 file changed, 47 insertions(+), 42 deletions(-)

diff --git a/haruhi_dl/extractor/yandexdisk.py b/haruhi_dl/extractor/yandexdisk.py
index 21f37c192..6fcd8ee7e 100644
--- a/haruhi_dl/extractor/yandexdisk.py
+++ b/haruhi_dl/extractor/yandexdisk.py
@@ -2,24 +2,23 @@
 from __future__ import unicode_literals
 
 import json
+import re
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
 from ..utils import (
     determine_ext,
-    ExtractorError,
     float_or_none,
     int_or_none,
     mimetype2ext,
-    parse_iso8601,
+    try_get,
     urljoin,
 )
 
 
 class YandexDiskIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://
-        (?:
-            (?:www\.)?yadi\.sk|
+        (?P<domain>
+            yadi\.sk|
             disk\.yandex\.
                 (?:
                     az|
                     by|
                     co(?:m(?:\.(?:am|ge|tr))?|\.il)|
                     ee|
                     fr|
                     k[gz]|
                     l[tv]|
                     md|
                     t[jm]|
                     u[az]|
                     ru
                 )
         )/(?:[di]/|public.*?\bhash=)(?P<id>[^/?#&]+)'''
 
     _TESTS = [{
         'url': 'https://yadi.sk/i/VdOeDou8eZs6Y',
-        'md5': '33955d7ae052f15853dc41f35f17581c',
+        'md5': 'a4a8d52958c8fddcf9845935070402ae',
         'info_dict': {
             'id': 'VdOeDou8eZs6Y',
             'ext': 'mp4',
             'title': '4.mp4',
             'duration': 168.6,
             'uploader': 'y.botova',
             'uploader_id': '300043621',
-            'timestamp': 1421396809,
-            'upload_date': '20150116',
             'view_count': int,
         },
+        'expected_warnings': ['Unable to download JSON metadata'],
     }, {
         'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce',
         'only_matching': True,
@@ -59,51 +57,58 @@ class YandexDiskIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        domain, video_id = re.match(self._VALID_URL, url).groups()
 
-        try:
-            resource = self._download_json(
-                'https://cloud-api.yandex.net/v1/disk/public/resources',
-                video_id, query={'public_key': url})
-        except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
-                error_description = self._parse_json(
-                    e.cause.read().decode(), video_id)['description']
-                raise ExtractorError(error_description, expected=True)
-            raise
+        webpage = self._download_webpage(url, video_id)
+        store = self._parse_json(self._search_regex(
+            r'<script[^>]+id="store-prefetch"[^>]*>\s*({.+?})\s*</script>',
+            webpage, 'store'), video_id)
+        resource = store['resources'][store['rootResourceId']]
 
         title = resource['name']
-        public_url = resource.get('public_url')
+        meta = resource.get('meta') or {}
+
+        public_url = meta.get('short_url')
         if public_url:
             video_id = self._match_id(public_url)
 
-        self._set_cookie('yadi.sk', 'yandexuid', '0')
+        source_url = (self._download_json(
+            'https://cloud-api.yandex.net/v1/disk/public/resources/download',
+            video_id, query={'public_key': url}, fatal=False) or {}).get('href')
+        video_streams = resource.get('videoStreams') or {}
+        video_hash = resource.get('hash') or url
+        environment = store.get('environment') or {}
+        sk = environment.get('sk')
+        yandexuid = environment.get('yandexuid')
+        if sk and yandexuid and not (source_url and video_streams):
+            self._set_cookie(domain, 'yandexuid', yandexuid)
 
-        def call_api(action):
-            return (self._download_json(
-                urljoin(url, '/public/api/') + action, video_id, data=json.dumps({
-                    'hash': url,
-                    # obtain sk if needed from call_api('check-auth') while
-                    # the yandexuid cookie is set and sending an empty JSON object
-                    'sk': 'ya6b52f8c6b12abe91a66d22d3a31084b'
-                }).encode(), headers={
-                    'Content-Type': 'text/plain',
-                }, fatal=False) or {}).get('data') or {}
+            def call_api(action):
+                return (self._download_json(
+                    urljoin(url, '/public/api/') + action, video_id, data=json.dumps({
+                        'hash': video_hash,
+                        'sk': sk,
+                    }).encode(), headers={
+                        'Content-Type': 'text/plain',
+                    }, fatal=False) or {}).get('data') or {}
+            if not source_url:
+                # TODO: figure out how to detect if download limit has
+                # been reached and then avoid unnecessary source format
+                # extraction requests
+                source_url = call_api('download-url').get('url')
+            if not video_streams:
+                video_streams = call_api('get-video-streams')
 
         formats = []
-        source_url = resource.get('file')
-        if not source_url:
-            source_url = call_api('download-url').get('url')
         if source_url:
             formats.append({
                 'url': source_url,
                 'format_id': 'source',
-                'ext': determine_ext(title, mimetype2ext(resource.get('mime_type')) or 'mp4'),
+                'ext': determine_ext(title, meta.get('ext') or mimetype2ext(meta.get('mime_type')) or 'mp4'),
                 'quality': 1,
-                'filesize': int_or_none(resource.get('size'))
+                'filesize': int_or_none(meta.get('size'))
             })
 
-        video_streams = call_api('get-video-streams')
         for video in (video_streams.get('videos') or []):
             format_url = video.get('url')
             if not format_url:
@@ -128,15 +133,15 @@ class YandexDiskIE(InfoExtractor):
             })
         self._sort_formats(formats)
 
-        owner = resource.get('owner') or {}
+        uid = resource.get('uid')
+        display_name = try_get(store, lambda x: x['users'][uid]['displayName'])
 
         return {
             'id': video_id,
             'title': title,
             'duration': float_or_none(video_streams.get('duration'), 1000),
-            'uploader': owner.get('display_name'),
-            'uploader_id': owner.get('uid'),
-            'view_count': int_or_none(resource.get('views_count')),
-            'timestamp': parse_iso8601(resource.get('created')),
+            'uploader': display_name,
+            'uploader_id': uid,
+            'view_count': int_or_none(meta.get('views_counter')),
             'formats': formats,
         }

From 0165049a526dc14f4963d751d8b5ae814de4b0d5 Mon Sep 17 00:00:00 2001
From: nixxo
Date: Fri, 26 Feb 2021 15:53:57 +0100
Subject: [PATCH 223/384] [vvvvid] add playlists support (#27574)

closes #18130
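
Season/episode listings come from the same ondemand API, so the
endpoint path becomes a parameter of the shared _download_info()
helper. Roughly, for a show page (condensed from the diff below):

    seasons = self._download_info(show_id, '/seasons/', show_title)
    for season in seasons:
        for episode in season.get('episodes') or []:
            video_url = '/'.join([base_url,
                                  str(episode['season_id']),
                                  str(episode['video_id'])])
            # each URL is re-dispatched to VVVVIDIE as a playlist entry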
---
 haruhi_dl/extractor/extractors.py |  5 ++-
 haruhi_dl/extractor/vvvvid.py     | 65 ++++++++++++++++++++++++++++---
 2 files changed, 63 insertions(+), 7 deletions(-)

diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index c8e99253e..2b81187ca 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -1483,7 +1483,10 @@ from .vshare import VShareIE
 from .medialaan import MedialaanIE
 from .vube import VubeIE
 from .vuclip import VuClipIE
-from .vvvvid import VVVVIDIE
+from .vvvvid import (
+    VVVVIDIE,
+    VVVVIDShowIE,
+)
 from .vyborymos import VyboryMosIE
 from .vzaar import VzaarIE
 from .wakanim import WakanimIE
diff --git a/haruhi_dl/extractor/vvvvid.py b/haruhi_dl/extractor/vvvvid.py
index 6906cd2ab..5b8ea3665 100644
--- a/haruhi_dl/extractor/vvvvid.py
+++ b/haruhi_dl/extractor/vvvvid.py
@@ -12,7 +12,8 @@ from ..utils import (
 
 
 class VVVVIDIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/(?P<show_id>\d+)/[^/]+/(?P<season_id>\d+)/(?P<id>[0-9]+)'
+    _VALID_URL_BASE = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/'
+    _VALID_URL = r'%s(?P<show_id>\d+)/[^/]+/(?P<season_id>\d+)/(?P<id>[0-9]+)' % _VALID_URL_BASE
     _TESTS = [{
         # video_type == 'video/vvvvid'
         'url': 'https://www.vvvvid.it/#!show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048/ping-pong',
@@ -45,20 +46,26 @@ class VVVVIDIE(InfoExtractor):
             'https://www.vvvvid.it/user/login',
             None, headers=self.geo_verification_headers())['data']['conn_id']
 
-    def _real_extract(self, url):
-        show_id, season_id, video_id = re.match(self._VALID_URL, url).groups()
+    def _download_info(self, show_id, path, video_id, fatal=True):
         response = self._download_json(
-            'https://www.vvvvid.it/vvvvid/ondemand/%s/season/%s' % (show_id, season_id),
+            'https://www.vvvvid.it/vvvvid/ondemand/%s%s' % (show_id, path),
             video_id, headers=self.geo_verification_headers(), query={
                 'conn_id': self._conn_id,
-            })
+            }, fatal=fatal)
         if response['result'] == 'error':
             raise ExtractorError('%s said: %s' % (
                 self.IE_NAME, response['message']), expected=True)
+        return response['data']
+
+    def _real_extract(self, url):
+        show_id, season_id, video_id = re.match(self._VALID_URL, url).groups()
+
+        response = self._download_info(
+            show_id, '/season/%s' % season_id, video_id)
 
         vid = int(video_id)
         video_data = list(filter(
-            lambda episode: episode.get('video_id') == vid, response['data']))[0]
+            lambda episode: episode.get('video_id') == vid, response))[0]
         formats = []
 
         # vvvvid embed_info decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js
@@ -156,3 +163,49 @@ class VVVVIDIE(InfoExtractor):
             'view_count': int_or_none(video_data.get('views')),
             'like_count': int_or_none(video_data.get('video_likes')),
         }
+
+
+class VVVVIDShowIE(VVVVIDIE):
+    _VALID_URL = r'(?P<base_url>%s(?P<id>\d+)/(?P<show_title>[^/]+))/?(?:$|[\?&].*$)?$' % VVVVIDIE._VALID_URL_BASE
+    _TESTS = [{
+        'url': 'https://www.vvvvid.it/show/156/psyco-pass',
+        'info_dict': {
+            'id': '156',
+            'title': 'Psycho-Pass',
+            'description': 'md5:94d572c0bd85894b193b8aebc9a3a806',
+        },
+        'playlist_count': 46,
+    }]
+
+    def _real_extract(self, url):
+        base_url, show_id, show_title = re.match(self._VALID_URL, url).groups()
+
+        response = self._download_info(
+            show_id, '/seasons/', show_title)
+
+        show_infos = self._download_info(
+            show_id, '/info/', show_title, fatal=False)
+
+        entries = []
+        for season in response:
+            episodes = season.get('episodes') or []
+            for episode in episodes:
+                season_id = str_or_none(episode.get('season_id'))
+                video_id = str_or_none(episode.get('video_id'))
+                if not (season_id and video_id):
+                    continue
+
+                video_url = '/'.join([base_url, season_id, video_id])
+
+                entries.append({
+                    '_type': 'url_transparent',
+                    'ie_key': VVVVIDIE.ie_key(),
+                    'url': video_url,
+                    'title': episode.get('title'),
+                    'thumbnail': episode.get('thumbnail'),
+                    'description': episode.get('description'),
+                    'season_number': int_or_none(episode.get('season_number')),
+                    'episode_number': int_or_none(episode.get('number')),
+                })
+        return self.playlist_result(
+            entries, show_id, show_infos.get('title'), show_infos.get('description'))

From 9a6885f335f798cb467b3da2cafac5a134df1173 Mon Sep 17 00:00:00 2001
From: Remita Amine <remitamine@gmail.com>
Date: Fri, 26 Feb 2021 15:54:01 +0100
Subject: [PATCH 224/384] [vvvvid] improve info extraction

---
 haruhi_dl/extractor/vvvvid.py | 78 +++++++++++++++++++++------------
 1 file changed, 52 insertions(+), 26 deletions(-)

diff --git a/haruhi_dl/extractor/vvvvid.py b/haruhi_dl/extractor/vvvvid.py
index 5b8ea3665..014a67e53 100644
--- a/haruhi_dl/extractor/vvvvid.py
+++ b/haruhi_dl/extractor/vvvvid.py
@@ -22,6 +22,16 @@ class VVVVIDIE(InfoExtractor):
             'id': '489048',
             'ext': 'mp4',
             'title': 'Ping Pong',
+            'duration': 239,
di Dario Moccia', + 'season_id': '437', + 'season_number': 1, + 'episode': 'Ping Pong', + 'episode_number': 1, + 'episode_id': '3334', + 'view_count': int, + 'like_count': int, + 'repost_count': int, }, 'params': { 'skip_download': True, @@ -38,6 +48,9 @@ class VVVVIDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', + 'only_matching': True }] _conn_id = None @@ -48,24 +61,34 @@ class VVVVIDIE(InfoExtractor): def _download_info(self, show_id, path, video_id, fatal=True): response = self._download_json( - 'https://www.vvvvid.it/vvvvid/ondemand/%s%s' % (show_id, path), + 'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path), video_id, headers=self.geo_verification_headers(), query={ 'conn_id': self._conn_id, }, fatal=fatal) - if response['result'] == 'error': + if not (response or fatal): + return + if response.get('result') == 'error': raise ExtractorError('%s said: %s' % ( self.IE_NAME, response['message']), expected=True) return response['data'] + def _extract_common_video_info(self, video_data): + return { + 'thumbnail': video_data.get('thumbnail'), + 'episode_number': int_or_none(video_data.get('number')), + 'episode_id': str_or_none(video_data.get('id')), + } + def _real_extract(self, url): show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() response = self._download_info( - show_id, '/season/%s' % season_id, video_id) + show_id, 'season/%s' % season_id, video_id) vid = int(video_id) video_data = list(filter( lambda episode: episode.get('video_id') == vid, response))[0] + title = video_data['title'] formats = [] # vvvvid embed_info decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js @@ -148,25 +171,25 @@ class VVVVIDIE(InfoExtractor): 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) self._sort_formats(formats) - return { + info = self._extract_common_video_info(video_data) + info.update({ 'id': video_id, - 'title': video_data['title'], + 'title': title, 'formats': formats, - 'thumbnail': video_data.get('thumbnail'), 'duration': int_or_none(video_data.get('length')), 'series': video_data.get('show_title'), 'season_id': season_id, 'season_number': video_data.get('season_number'), - 'episode_id': str_or_none(video_data.get('id')), - 'episode_number': int_or_none(video_data.get('number')), - 'episode_title': video_data['title'], + 'episode': title, 'view_count': int_or_none(video_data.get('views')), 'like_count': int_or_none(video_data.get('video_likes')), - } + 'repost_count': int_or_none(video_data.get('video_shares')), + }) + return info class VVVVIDShowIE(VVVVIDIE): - _VALID_URL = r'(?P%s(?P\d+)/(?P[^/]+))/?(?:$|[\?&].*$)?$' % VVVVIDIE._VALID_URL_BASE + _VALID_URL = r'(?P%s(?P\d+)(?:/(?P[^/?&#]+))?)/?(?:[?#&]|$)' % VVVVIDIE._VALID_URL_BASE _TESTS = [{ 'url': 'https://www.vvvvid.it/show/156/psyco-pass', 'info_dict': { @@ -175,37 +198,40 @@ class VVVVIDShowIE(VVVVIDIE): 'description': 'md5:94d572c0bd85894b193b8aebc9a3a806', }, 'playlist_count': 46, + }, { + 'url': 'https://www.vvvvid.it/show/156', + 'only_matching': True, }] def _real_extract(self, url): base_url, show_id, show_title = re.match(self._VALID_URL, url).groups() - response = self._download_info( - show_id, '/seasons/', show_title) + seasons = self._download_info( + show_id, 'seasons/', show_title) - show_infos = self._download_info( - show_id, '/info/', show_title, fatal=False) + show_info = self._download_info( + show_id, 
'info/', show_title, fatal=False) entries = [] - for season in response: + for season in (seasons or []): + season_number = int_or_none(season.get('number')) episodes = season.get('episodes') or [] for episode in episodes: season_id = str_or_none(episode.get('season_id')) video_id = str_or_none(episode.get('video_id')) if not (season_id and video_id): continue - - video_url = '/'.join([base_url, season_id, video_id]) - - entries.append({ - '_type': 'url_transparent', + info = self._extract_common_video_info(episode) + info.update({ + '_type': 'url', 'ie_key': VVVVIDIE.ie_key(), - 'url': video_url, + 'url': '/'.join([base_url, season_id, video_id]), 'title': episode.get('title'), - 'thumbnail': episode.get('thumbnail'), 'description': episode.get('description'), - 'season_number': int_or_none(episode.get('season_number')), - 'episode_number': int_or_none(episode.get('number')), + 'season_number': season_number, + 'season_id': season_id, }) + entries.append(info) + return self.playlist_result( - entries, show_id, show_infos.get('title'), show_infos.get('description')) + entries, show_id, show_info.get('title'), show_info.get('description')) From e4f3383802a321ff5f1e44e259e7878ae4fc9fa1 Mon Sep 17 00:00:00 2001 From: ozburo Date: Fri, 26 Feb 2021 15:54:08 +0100 Subject: [PATCH 225/384] [redditr] Extract all thumbnails --- haruhi_dl/extractor/reddit.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/reddit.py b/haruhi_dl/extractor/reddit.py index 3b2abb262..2d1a1fd99 100644 --- a/haruhi_dl/extractor/reddit.py +++ b/haruhi_dl/extractor/reddit.py @@ -9,6 +9,7 @@ from ..utils import ( float_or_none, try_get, url_or_none, + unescapeHTML, ) @@ -118,11 +119,23 @@ class RedditRIE(InfoExtractor): else: age_limit = None + thumbnails = [] + images = try_get( + data, lambda x: x['preview']['images'][0]['resolutions']) or [] + for image in images: + url = url_or_none(unescapeHTML(image['url'])) + if url is not None: + thumbnails.append({ + 'url': url, + 'width': int_or_none(image['width']), + 'height': int_or_none(image['height']), + }) + return { '_type': 'url_transparent', 'url': video_url, 'title': data.get('title'), - 'thumbnail': url_or_none(data.get('thumbnail')), + 'thumbnails': thumbnails, 'timestamp': float_or_none(data.get('created_utc')), 'uploader': data.get('author'), 'duration': int_or_none(try_get( From 52fd0e8bb8e3c6b07b00f0e44d9961f59c15616a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 15:54:14 +0100 Subject: [PATCH 226/384] [redditr] Fix review issues and extract source thumbnail (closes #27503) --- haruhi_dl/extractor/reddit.py | 36 +++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/haruhi_dl/extractor/reddit.py b/haruhi_dl/extractor/reddit.py index 2d1a1fd99..222fa0172 100644 --- a/haruhi_dl/extractor/reddit.py +++ b/haruhi_dl/extractor/reddit.py @@ -8,8 +8,8 @@ from ..utils import ( int_or_none, float_or_none, try_get, - url_or_none, unescapeHTML, + url_or_none, ) @@ -57,7 +57,8 @@ class RedditRIE(InfoExtractor): 'id': 'zv89llsvexdz', 'ext': 'mp4', 'title': 'That small heart attack.', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:4', 'timestamp': 1501941939, 'upload_date': '20170805', 'uploader': 'Antw87', @@ -120,16 +121,27 @@ class RedditRIE(InfoExtractor): age_limit = None thumbnails = [] - images = try_get( - data, lambda x: 
x['preview']['images'][0]['resolutions']) or [] - for image in images: - url = url_or_none(unescapeHTML(image['url'])) - if url is not None: - thumbnails.append({ - 'url': url, - 'width': int_or_none(image['width']), - 'height': int_or_none(image['height']), - }) + + def add_thumbnail(src): + if not isinstance(src, dict): + return + thumbnail_url = url_or_none(src.get('url')) + if not thumbnail_url: + return + thumbnails.append({ + 'url': unescapeHTML(thumbnail_url), + 'width': int_or_none(src.get('width')), + 'height': int_or_none(src.get('height')), + }) + + for image in try_get(data, lambda x: x['preview']['images']) or []: + if not isinstance(image, dict): + continue + add_thumbnail(image.get('source')) + resolutions = image.get('resolutions') + if isinstance(resolutions, list): + for resolution in resolutions: + add_thumbnail(resolution) return { '_type': 'url_transparent', From 0ade73d5629c2ea85fa9c2fc44b3ec72aae27d80 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:54:32 +0100 Subject: [PATCH 227/384] [yandexvideo] fix extraction for Python 3.4 --- haruhi_dl/extractor/yandexvideo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/yandexvideo.py b/haruhi_dl/extractor/yandexvideo.py index ab8c84c93..6a166ec9b 100644 --- a/haruhi_dl/extractor/yandexvideo.py +++ b/haruhi_dl/extractor/yandexvideo.py @@ -66,7 +66,7 @@ class YandexVideoIE(InfoExtractor): video_id = self._match_id(url) player = try_get((self._download_json( - 'https://frontend.vh.yandex.ru/graphql', video_id, data=b'''{ + 'https://frontend.vh.yandex.ru/graphql', video_id, data=('''{ player(content_id: "%s") { computed_title content_url @@ -86,7 +86,7 @@ class YandexVideoIE(InfoExtractor): title views_count } -}''' % video_id.encode(), fatal=False)), lambda x: x['player']['content']) +}''' % video_id).encode(), fatal=False)), lambda x: x['player']['content']) if not player or player.get('error'): player = self._download_json( 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, From 2aafa2f712d4d1eb447bb44356ed9a6e6d63ac4c Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 15:54:41 +0100 Subject: [PATCH 228/384] =?UTF-8?q?[vvvvid]=20skip=20unplayable=20episodes?= =?UTF-8?q?=20and=20extract=20akamai=20formats(closes=20#=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …27599) --- haruhi_dl/extractor/vvvvid.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/vvvvid.py b/haruhi_dl/extractor/vvvvid.py index 014a67e53..145805492 100644 --- a/haruhi_dl/extractor/vvvvid.py +++ b/haruhi_dl/extractor/vvvvid.py @@ -152,7 +152,6 @@ class VVVVIDIE(InfoExtractor): embed_code = ds(embed_code) video_type = video_data.get('video_type') if video_type in ('video/rcs', 'video/kenc'): - embed_code = re.sub(r'https?://([^/]+)/z/', r'https://\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8') if video_type == 'video/kenc': kenc = self._download_json( 'https://www.vvvvid.it/kenc', video_id, query={ @@ -163,9 +162,7 @@ class VVVVIDIE(InfoExtractor): kenc_message = kenc.get('message') if kenc_message: embed_code += '?' 
+ ds(kenc_message)
-            formats.extend(self._extract_m3u8_formats(
-                embed_code, video_id, 'mp4',
-                m3u8_id='hls', fatal=False))
+                formats.extend(self._extract_akamai_formats(embed_code, video_id))
             else:
                 formats.extend(self._extract_wowza_formats(
                     'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id))
@@ -217,6 +214,8 @@ class VVVVIDShowIE(VVVVIDIE):
             season_number = int_or_none(season.get('number'))
             episodes = season.get('episodes') or []
             for episode in episodes:
+                if episode.get('playable') is False:
+                    continue
                 season_id = str_or_none(episode.get('season_id'))
                 video_id = str_or_none(episode.get('video_id'))
                 if not (season_id and video_id):

From fc156473d92c90f0dad50a6cc779ac47718abf58 Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 15:54:49 +0100
Subject: [PATCH 229/384] =?UTF-8?q?[sky]=20add=20support=20for=20Sports=20?=
 =?UTF-8?q?News=20articles=20and=20Brightcove=20videos(close=E2=80=A6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…s #13054)
---
 haruhi_dl/extractor/extractors.py |  1 +
 haruhi_dl/extractor/sky.py        | 99 ++++++++++++++++++++++++-------
 2 files changed, 78 insertions(+), 22 deletions(-)

diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index 2b81187ca..0d92c6d0c 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -1097,6 +1097,7 @@ from .skynewsarabia import (
 from .sky import (
     SkyNewsIE,
     SkySportsIE,
+    SkySportsNewsIE,
 )
 from .slideshare import SlideshareIE
 from .slideslive import SlidesLiveIE
diff --git a/haruhi_dl/extractor/sky.py b/haruhi_dl/extractor/sky.py
index 681691004..ff2c977a0 100644
--- a/haruhi_dl/extractor/sky.py
+++ b/haruhi_dl/extractor/sky.py
@@ -1,6 +1,8 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
 from ..utils import (
     extract_attributes,
@@ -11,36 +13,59 @@ from ..utils import (
 
 
 class SkyBaseIE(InfoExtractor):
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        video_data = extract_attributes(self._search_regex(
-            r'(]+>)',
-            webpage, 'video data'))
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
+    _SDC_EL_REGEX = r'(?s)(<div[^>]+data-(?:component-name|fn)="sdc-(?:articl|sit)e-video"[^>]*>)'
 
-        video_url = 'ooyala:%s' % video_data['data-video-id']
-        if video_data.get('data-token-required') == 'true':
-            token_fetch_options = self._parse_json(video_data.get(
-                'data-token-fetch-options', '{}'), video_id, fatal=False) or {}
-            token_fetch_url = token_fetch_options.get('url')
-            if token_fetch_url:
-                embed_token = self._download_webpage(urljoin(
-                    url, token_fetch_url), video_id, fatal=False)
-                if embed_token:
-                    video_url = smuggle_url(
-                        video_url, {'embed_token': embed_token.strip('"')})
+    def _process_ooyala_element(self, webpage, sdc_el, url):
+        sdc = extract_attributes(sdc_el)
+        provider = sdc.get('data-provider')
+        if provider == 'ooyala':
+            video_id = sdc['data-sdc-video-id']
+            video_url = 'ooyala:%s' % video_id
+            ie_key = 'Ooyala'
+            ooyala_el = self._search_regex(
+                r'(<div[^>]+class="[^"]*\bsdc-article-video__media-ooyala\b[^"]*"[^>]+data-video-id="%s"[^>]*>)' % video_id,
+                webpage, 'video data', fatal=False)
+            if ooyala_el:
+                ooyala_attrs = extract_attributes(ooyala_el) or {}
+                if ooyala_attrs.get('data-token-required') == 'true':
+                    token_fetch_url = (self._parse_json(ooyala_attrs.get(
+                        'data-token-fetch-options', '{}'),
+
video_id, fatal=False) or {}).get('url') + if token_fetch_url: + embed_token = self._download_json(urljoin( + url, token_fetch_url), video_id, fatal=False) + if embed_token: + video_url = smuggle_url( + video_url, {'embed_token': embed_token}) + elif provider == 'brightcove': + video_id = sdc['data-video-id'] + account_id = sdc.get('data-account-id') or '6058004172001' + player_id = sdc.get('data-player-id') or 'RC9PQUaJ6' + video_url = self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id) + ie_key = 'BrightcoveNew' return { '_type': 'url_transparent', 'id': video_id, 'url': video_url, + 'ie_key': ie_key, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + info = self._process_ooyala_element(webpage, self._search_regex( + self._SDC_EL_REGEX, webpage, 'sdc element'), url) + info.update({ 'title': self._og_search_title(webpage), 'description': strip_or_none(self._og_search_description(webpage)), - 'ie_key': 'Ooyala', - } + }) + return info class SkySportsIE(SkyBaseIE): + IE_NAME = 'sky:sports' _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/([^/]+/)*(?P[0-9]+)' _TESTS = [{ 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', @@ -62,15 +87,45 @@ class SkySportsIE(SkyBaseIE): class SkyNewsIE(SkyBaseIE): + IE_NAME = 'sky:news' _VALID_URL = r'https?://news\.sky\.com/video/[0-9a-z-]+-(?P[0-9]+)' _TEST = { 'url': 'https://news.sky.com/video/russian-plane-inspected-after-deadly-fire-11712962', - 'md5': 'd6327e581473cea9976a3236ded370cd', + 'md5': '411e8893fd216c75eaf7e4c65d364115', 'info_dict': { - 'id': '1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM', + 'id': 'ref:1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM', 'ext': 'mp4', 'title': 'Russian plane inspected after deadly fire', 'description': 'The Russian Investigative Committee has released video of the wreckage of a passenger plane which caught fire near Moscow.', + 'uploader_id': '6058004172001', + 'timestamp': 1567112345, + 'upload_date': '20190829', }, - 'add_ie': ['Ooyala'], + 'add_ie': ['BrightcoveNew'], } + + +class SkySportsNewsIE(SkyBaseIE): + IE_NAME = 'sky:sports:news' + _VALID_URL = r'https?://(?:www\.)?skysports\.com/([^/]+/)*news/\d+/(?P\d+)' + _TEST = { + 'url': 'http://www.skysports.com/golf/news/12176/10871916/dustin-johnson-ready-to-conquer-players-championship-at-tpc-sawgrass', + 'info_dict': { + 'id': '10871916', + 'title': 'Dustin Johnson ready to conquer Players Championship at TPC Sawgrass', + 'description': 'Dustin Johnson is confident he can continue his dominant form in 2017 by adding the Players Championship to his list of victories.', + }, + 'playlist_count': 2, + } + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + + entries = [] + for sdc_el in re.findall(self._SDC_EL_REGEX, webpage): + entries.append(self._process_ooyala_element(webpage, sdc_el, url)) + + return self.playlist_result( + entries, article_id, self._og_search_title(webpage), + self._html_search_meta(['og:description', 'description'], webpage)) From a13444f11773850692b4cf9042e353682a3eab53 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:01:11 +0100 Subject: [PATCH 230/384] [arcpublishing] Add new extractor closes #2298 closes #9340 closes #17200 --- haruhi_dl/extractor/arcpublishing.py | 173 ++++++++++++++++++++++++++ haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/generic.py | 16 +++ haruhi_dl/extractor/washingtonpost.py | 101 +++------------ 4 files 
changed, 207 insertions(+), 84 deletions(-) create mode 100644 haruhi_dl/extractor/arcpublishing.py diff --git a/haruhi_dl/extractor/arcpublishing.py b/haruhi_dl/extractor/arcpublishing.py new file mode 100644 index 000000000..d1fb1a054 --- /dev/null +++ b/haruhi_dl/extractor/arcpublishing.py @@ -0,0 +1,173 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + parse_iso8601, + try_get, +) + + +class ArcPublishingIE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' + _VALID_URL = r'arcpublishing:(?P[a-z]+):(?P%s)' % _UUID_REGEX + _TESTS = [{ + # https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/ + 'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab', + 'only_matching': True, + }, { + # https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/ + 'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1', + 'only_matching': True, + }, { + # https://www.actionnewsjax.com/video/live-stream/ + 'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a', + 'only_matching': True, + }, { + # https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/ + 'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3', + 'only_matching': True, + }, { + # https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/ + 'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe', + 'only_matching': True, + }, { + # https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/ + 'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e', + 'only_matching': True, + }, { + # https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/ + 'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143', + 'only_matching': True, + }, { + # https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/ + 'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055', + 'only_matching': True, + }, { + # https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/ + 'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d', + 'only_matching': True, + }, { + # https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/ + 'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7', + 'only_matching': True, + }, { + # https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/ + 'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b', + 'only_matching': True, + }, { + # https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html + 'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685', + 'only_matching': True, + }] + _POWA_DEFAULTS = [ + (['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'), + ([ + 'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo', + 'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom', + 'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek', + ], 'video-api-cdn.%s.arcpublishing.com/api'), + ] 
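+    # Illustrative note, not taken from any upstream Arc Publishing
+    # documentation: an org in the second group above, e.g. 'tgam', has its
+    # metadata fetched from video-api-cdn.tgam.arcpublishing.com/api, while
+    # an org matching no group falls back to
+    # '%s-prod-cdn.video-api.arcpublishing.com/api' % org
+    # (with 'wapo' special-cased to 'washpost') in _real_extract below.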
+ + def _extract_urls(webpage): + entries = [] + # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview + for powa_el in re.findall(r'(]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage): + powa = extract_attributes(powa_el) or {} + org = powa.get('data-org') + uuid = powa.get('data-uuid') + if org and uuid: + entries.append('arcpublishing:%s:%s' % (org, uuid)) + return entries + + def _real_extract(self, url): + org, uuid = re.match(self._VALID_URL, url).groups() + for orgs, tmpl in self._POWA_DEFAULTS: + if org in orgs: + base_api_tmpl = tmpl + break + else: + base_api_tmpl = '%s-prod-cdn.video-api.arcpublishing.com/api' + if org == 'wapo': + org = 'washpost' + video = self._download_json( + 'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org), + uuid, query={'uuid': uuid})[0] + title = video['headlines']['basic'] + is_live = video.get('status') == 'live' + + urls = [] + formats = [] + for s in video.get('streams', []): + s_url = s.get('url') + if not s_url or s_url in urls: + continue + urls.append(s_url) + stream_type = s.get('stream_type') + if stream_type == 'smil': + smil_formats = self._extract_smil_formats( + s_url, uuid, fatal=False) + for f in smil_formats: + if f['url'].endswith('/cfx/st'): + f['app'] = 'cfx/st' + if not f['play_path'].startswith('mp4:'): + f['play_path'] = 'mp4:' + f['play_path'] + if isinstance(f['tbr'], float): + f['vbr'] = f['tbr'] * 1000 + del f['tbr'] + f['format_id'] = 'rtmp-%d' % f['vbr'] + formats.extend(smil_formats) + elif stream_type in ('ts', 'hls'): + m3u8_formats = self._extract_m3u8_formats( + s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False) + if all([f.get('acodec') == 'none' for f in m3u8_formats]): + continue + for f in m3u8_formats: + if f.get('acodec') == 'none': + f['preference'] = -40 + elif f.get('vcodec') == 'none': + f['preference'] = -50 + height = f.get('height') + if not height: + continue + vbr = self._search_regex( + r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None) + if vbr: + f['vbr'] = int(vbr) + formats.extend(m3u8_formats) + else: + vbr = int_or_none(s.get('bitrate')) + formats.append({ + 'format_id': '%s-%d' % (stream_type, vbr) if vbr else stream_type, + 'vbr': vbr, + 'width': int_or_none(s.get('width')), + 'height': int_or_none(s.get('height')), + 'filesize': int_or_none(s.get('filesize')), + 'url': s_url, + 'preference': -1, + }) + self._sort_formats( + formats, ('preference', 'width', 'height', 'vbr', 'filesize', 'tbr', 'ext', 'format_id')) + + subtitles = {} + for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []): + subtitle_url = subtitle.get('url') + if subtitle_url: + subtitles.setdefault('en', []).append({'url': subtitle_url}) + + return { + 'id': uuid, + 'title': self._live_title(title) if is_live else title, + 'thumbnail': try_get(video, lambda x: x['promo_image']['url']), + 'description': try_get(video, lambda x: x['subheadlines']['basic']), + 'formats': formats, + 'duration': int_or_none(video.get('duration'), 100), + 'timestamp': parse_iso8601(video.get('created_date')), + 'subtitles': subtitles, + 'is_live': is_live, + } diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 0d92c6d0c..2a680d4bf 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -66,6 +66,7 @@ from .appletrailers import ( AppleTrailersSectionIE, ) from .archiveorg import ArchiveOrgIE +from .arcpublishing import ArcPublishingIE from .arkena 
import ArkenaIE from .ard import ( ARDBetaMediathekIE, diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index b67f066eb..beb6ad2ad 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -133,6 +133,7 @@ from .rtlnl import RtlNlIE from .xnews import XLinkIE from .libsyn import LibsynIE from .pulsembed import PulsEmbedIE +from .arcpublishing import ArcPublishingIE class GenericIE(InfoExtractor): @@ -2261,6 +2262,20 @@ class GenericIE(InfoExtractor): 'uploader': 'OTT Videos', }, }, + { + # ArcPublishing PoWa video player + 'url': 'https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/', + 'md5': 'b03b2fac8680e1e5a7cc81a5c27e71b3', + 'info_dict': { + 'id': '8c99cb6e-b29c-4bc9-9173-7bf9979225ab', + 'ext': 'mp4', + 'title': 'Senate candidates wave to voters on Anchorage streets', + 'description': 'md5:91f51a6511f090617353dc720318b20e', + 'timestamp': 1604378735, + 'upload_date': '20201103', + 'duration': 1581, + }, + }, ] def report_following_redirect(self, new_url): @@ -2676,6 +2691,7 @@ class GenericIE(InfoExtractor): XLinkIE, LibsynIE, VHXEmbedIE, + ArcPublishingIE, ): try: ie_key = embie.ie_key() diff --git a/haruhi_dl/extractor/washingtonpost.py b/haruhi_dl/extractor/washingtonpost.py index 329907465..7924d80fc 100644 --- a/haruhi_dl/extractor/washingtonpost.py +++ b/haruhi_dl/extractor/washingtonpost.py @@ -4,17 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - int_or_none, - strip_jsonp, -) class WashingtonPostIE(InfoExtractor): IE_NAME = 'washingtonpost' - _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/(?:video|posttv)/(?:[^/]+/)*)(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' - _TEST = { + _TESTS = [{ 'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d', 'md5': '6f537e1334b714eb15f9563bd4b9cdfa', 'info_dict': { @@ -23,10 +19,15 @@ class WashingtonPostIE(InfoExtractor): 'title': 'Egypt finds belongings, debris from plane crash', 'description': 'md5:a17ceee432f215a5371388c1f680bd86', 'upload_date': '20160520', - 'uploader': 'Reuters', - 'timestamp': 1463778452, + 'timestamp': 1463775187, }, - } + }, { + 'url': 'https://www.washingtonpost.com/video/world/egypt-finds-belongings-debris-from-plane-crash/2016/05/20/480ba4ee-1ec7-11e6-82c2-a7dcb313287d_video.html', + 'only_matching': True, + }, { + 'url': 'https://www.washingtonpost.com/posttv/world/iraq-to-track-down-antiquities-after-islamic-state-museum-rampage/2015/02/28/7c57e916-bf86-11e4-9dfb-03366e719af8_video.html', + 'only_matching': True, + }] @classmethod def _extract_urls(cls, webpage, **kwargs): @@ -35,73 +36,8 @@ class WashingtonPostIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % video_id, - video_id, transform_source=strip_jsonp)[0]['contentConfig'] - title = video_data['title'] - - urls = [] - formats = [] - for s in video_data.get('streams', []): - s_url = s.get('url') - if not s_url or s_url in urls: - continue - urls.append(s_url) - video_type = s.get('type') - 
if video_type == 'smil': - continue - elif video_type in ('ts', 'hls') and ('_master.m3u8' in s_url or '_mobile.m3u8' in s_url): - m3u8_formats = self._extract_m3u8_formats( - s_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - for m3u8_format in m3u8_formats: - width = m3u8_format.get('width') - if not width: - continue - vbr = self._search_regex( - r'%d_%d_(\d+)' % (width, m3u8_format['height']), m3u8_format['url'], 'vbr', default=None) - if vbr: - m3u8_format.update({ - 'vbr': int_or_none(vbr), - }) - formats.extend(m3u8_formats) - else: - width = int_or_none(s.get('width')) - vbr = int_or_none(s.get('bitrate')) - has_width = width != 0 - formats.append({ - 'format_id': ( - '%s-%d-%d' % (video_type, width, vbr) - if width - else video_type), - 'vbr': vbr if has_width else None, - 'width': width, - 'height': int_or_none(s.get('height')), - 'acodec': s.get('audioCodec'), - 'vcodec': s.get('videoCodec') if has_width else 'none', - 'filesize': int_or_none(s.get('fileSize')), - 'url': s_url, - 'ext': 'mp4', - 'protocol': 'm3u8_native' if video_type in ('ts', 'hls') else None, - }) - source_media_url = video_data.get('sourceMediaURL') - if source_media_url: - formats.append({ - 'format_id': 'source_media', - 'url': source_media_url, - }) - self._sort_formats( - formats, ('width', 'height', 'vbr', 'filesize', 'tbr', 'format_id')) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('blurb'), - 'uploader': video_data.get('credits', {}).get('source'), - 'formats': formats, - 'duration': int_or_none(video_data.get('videoDuration'), 100), - 'timestamp': int_or_none( - video_data.get('dateConfig', {}).get('dateFirstPublished'), 1000), - } + return self.url_result( + 'arcpublishing:wapo:' + video_id, 'ArcPublishing', video_id) class WashingtonPostArticleIE(InfoExtractor): @@ -121,9 +57,8 @@ class WashingtonPostArticleIE(InfoExtractor): 'title': 'Breaking Points: The Paper Mine', 'duration': 1290, 'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.', - 'uploader': 'The Washington Post', - 'timestamp': 1395527908, - 'upload_date': '20140322', + 'timestamp': 1395440416, + 'upload_date': '20140321', }, }, { 'md5': '1fff6a689d8770966df78c8cb6c8c17c', @@ -133,9 +68,8 @@ class WashingtonPostArticleIE(InfoExtractor): 'title': 'The town bureaucracy sustains', 'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. 
We set out to find out what it\'s like to do paperwork 230 feet underground.', 'duration': 2220, - 'timestamp': 1395528005, - 'upload_date': '20140322', - 'uploader': 'The Washington Post', + 'timestamp': 1395441819, + 'upload_date': '20140321', }, }], }, { @@ -151,8 +85,7 @@ class WashingtonPostArticleIE(InfoExtractor): 'ext': 'mp4', 'description': 'Washington Post transportation reporter Ashley Halsey III explains why a plane\'s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.', 'upload_date': '20141230', - 'uploader': 'The Washington Post', - 'timestamp': 1419974765, + 'timestamp': 1419972442, 'title': 'Why black boxes don’t transmit data in real time', } }] From 56a45e91d22b936c33f29d73e42e925eda3026f5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:01:45 +0100 Subject: [PATCH 231/384] [arcpublishing] add missing staticmethod decorator --- haruhi_dl/extractor/arcpublishing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/haruhi_dl/extractor/arcpublishing.py b/haruhi_dl/extractor/arcpublishing.py index d1fb1a054..ca6a6c4d8 100644 --- a/haruhi_dl/extractor/arcpublishing.py +++ b/haruhi_dl/extractor/arcpublishing.py @@ -73,6 +73,7 @@ class ArcPublishingIE(InfoExtractor): ], 'video-api-cdn.%s.arcpublishing.com/api'), ] + @staticmethod def _extract_urls(webpage): entries = [] # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview From 5ccde7fdb33c7afdf37884d32ac36fe43f5c5a16 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:01:52 +0100 Subject: [PATCH 232/384] [acast] fix extraction(closes #21444)(closes #27612)(closes #27613) --- haruhi_dl/extractor/acast.py | 116 ++++++++++++++++------------------- 1 file changed, 53 insertions(+), 63 deletions(-) diff --git a/haruhi_dl/extractor/acast.py b/haruhi_dl/extractor/acast.py index b17c792d2..60378db1b 100644 --- a/haruhi_dl/extractor/acast.py +++ b/haruhi_dl/extractor/acast.py @@ -2,21 +2,47 @@ from __future__ import unicode_literals import re -import functools from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( clean_html, - float_or_none, int_or_none, - try_get, - unified_timestamp, - OnDemandPagedList, + parse_iso8601, ) -class ACastIE(InfoExtractor): +class ACastBaseIE(InfoExtractor): + def _extract_episode(self, episode, show_info): + title = episode['title'] + info = { + 'id': episode['id'], + 'display_id': episode.get('episodeUrl'), + 'url': episode['url'], + 'title': title, + 'description': clean_html(episode.get('description') or episode.get('summary')), + 'thumbnail': episode.get('image'), + 'timestamp': parse_iso8601(episode.get('publishDate')), + 'duration': int_or_none(episode.get('duration')), + 'filesize': int_or_none(episode.get('contentLength')), + 'season_number': int_or_none(episode.get('season')), + 'episode': title, + 'episode_number': int_or_none(episode.get('episode')), + } + info.update(show_info) + return info + + def _extract_show_info(self, show): + return { + 'creator': show.get('author'), + 'series': show.get('title'), + } + + def _call_api(self, path, video_id, query=None): + return self._download_json( + 'https://feeder.acast.com/api/v1/shows/' + path, video_id, query=query) + + +class ACastIE(ACastBaseIE): IE_NAME = 'acast' _VALID_URL = r'''(?x) https?:// @@ -28,15 +54,15 @@ class ACastIE(InfoExtractor): ''' _TESTS = [{ 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': '16d936099ec5ca2d5869e3a813ee8dc4', + 
'md5': 'f5598f3ad1e4776fed12ec1407153e4b', 'info_dict': { 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', 'ext': 'mp3', 'title': '2. Raggarmordet - Röster ur det förflutna', - 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', + 'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67', 'timestamp': 1477346700, 'upload_date': '20161024', - 'duration': 2766.602563, + 'duration': 2766, 'creator': 'Anton Berg & Martin Johnson', 'series': 'Spår', 'episode': '2. Raggarmordet - Röster ur det förflutna', @@ -45,7 +71,7 @@ class ACastIE(InfoExtractor): 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', 'only_matching': True, }, { - 'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22', + 'url': 'https://play.acast.com/s/rattegangspodden/s04e09styckmordetihelenelund-del2-2', 'only_matching': True, }, { 'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9', @@ -54,40 +80,14 @@ class ACastIE(InfoExtractor): def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() - s = self._download_json( - 'https://feeder.acast.com/api/v1/shows/%s/episodes/%s' % (channel, display_id), - display_id) - media_url = s['url'] - if re.search(r'[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}', display_id): - episode_url = s.get('episodeUrl') - if episode_url: - display_id = episode_url - else: - channel, display_id = re.match(self._VALID_URL, s['link']).groups() - cast_data = self._download_json( - 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), - display_id)['result'] - e = cast_data['episode'] - title = e.get('name') or s['title'] - return { - 'id': compat_str(e['id']), - 'display_id': display_id, - 'url': media_url, - 'title': title, - 'description': e.get('summary') or clean_html(e.get('description') or s.get('description')), - 'thumbnail': e.get('image'), - 'timestamp': unified_timestamp(e.get('publishingDate') or s.get('publishDate')), - 'duration': float_or_none(e.get('duration') or s.get('duration')), - 'filesize': int_or_none(e.get('contentLength')), - 'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str), - 'series': try_get(cast_data, lambda x: x['show']['name'], compat_str), - 'season_number': int_or_none(e.get('seasonNumber')), - 'episode': title, - 'episode_number': int_or_none(e.get('episodeNumber')), - } + episode = self._call_api( + '%s/episodes/%s' % (channel, display_id), + display_id, {'showInfo': 'true'}) + return self._extract_episode( + episode, self._extract_show_info(episode.get('show') or {})) -class ACastChannelIE(InfoExtractor): +class ACastChannelIE(ACastBaseIE): IE_NAME = 'acast:channel' _VALID_URL = r'''(?x) https?:// @@ -102,34 +102,24 @@ class ACastChannelIE(InfoExtractor): 'info_dict': { 'id': '4efc5294-5385-4847-98bd-519799ce5786', 'title': 'Today in Focus', - 'description': 'md5:9ba5564de5ce897faeb12963f4537a64', + 'description': 'md5:c09ce28c91002ce4ffce71d6504abaae', }, - 'playlist_mincount': 35, + 'playlist_mincount': 200, }, { 'url': 'http://play.acast.com/s/ft-banking-weekly', 'only_matching': True, }] - _API_BASE_URL = 'https://play.acast.com/api/' - _PAGE_SIZE = 10 @classmethod def suitable(cls, url): return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url) - def _fetch_page(self, channel_slug, page): - casts = self._download_json( - self._API_BASE_URL + 'channels/%s/acasts?page=%s' % (channel_slug, page), - channel_slug, note='Download page %d of channel data' % page) - for cast in casts: - 
yield self.url_result( - 'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']), - 'ACast', cast['id']) - def _real_extract(self, url): - channel_slug = self._match_id(url) - channel_data = self._download_json( - self._API_BASE_URL + 'channels/%s' % channel_slug, channel_slug) - entries = OnDemandPagedList(functools.partial( - self._fetch_page, channel_slug), self._PAGE_SIZE) - return self.playlist_result(entries, compat_str( - channel_data['id']), channel_data['name'], channel_data.get('description')) + show_slug = self._match_id(url) + show = self._call_api(show_slug, show_slug) + show_info = self._extract_show_info(show) + entries = [] + for episode in (show.get('episodes') or []): + entries.append(self._extract_episode(episode, show_info)) + return self.playlist_result( + entries, show.get('id'), show.get('title'), show.get('description')) From 51535e0624cb22b61fadfbed528d1bccca4e16cf Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:02:04 +0100 Subject: [PATCH 233/384] [stitcher] fix extraction(closes #20811)(closes #27606) --- haruhi_dl/extractor/stitcher.py | 60 ++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/haruhi_dl/extractor/stitcher.py b/haruhi_dl/extractor/stitcher.py index 97d1ff681..b8b5711b1 100644 --- a/haruhi_dl/extractor/stitcher.py +++ b/haruhi_dl/extractor/stitcher.py @@ -4,25 +4,28 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, + clean_html, + ExtractorError, int_or_none, - js_to_json, - unescapeHTML, + str_or_none, + try_get, ) class StitcherIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P[^/#?&]+?)-)?(?P\d+)(?:[/#?&]|$)' + _VALID_URL = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/(?:[^/]+/)+e(?:pisode)?/(?:(?P[^/#?&]+?)-)?(?P\d+)(?:[/#?&]|$)' _TESTS = [{ 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', - 'md5': '391dd4e021e6edeb7b8e68fbf2e9e940', + 'md5': 'e9635098e0da10b21a0e2b85585530f6', 'info_dict': { 'id': '40789481', 'ext': 'mp3', 'title': 'Machine Learning Mastery and Cancer Clusters', - 'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3', + 'description': 'md5:547adb4081864be114ae3831b4c2b42f', 'duration': 1604, 'thumbnail': r're:^https?://.*\.jpg', + 'upload_date': '20180126', + 'timestamp': 1516989316, }, }, { 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', @@ -38,6 +41,7 @@ class StitcherIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Page Not Found', }, { # escaped title 'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true', @@ -45,37 +49,39 @@ class StitcherIE(InfoExtractor): }, { 'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true', 'only_matching': True, + }, { + 'url': 'https://www.stitcher.com/show/threedom/episode/circles-on-a-stick-200212584', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - audio_id = mobj.group('id') - display_id = mobj.group('display_id') or audio_id + display_id, audio_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, display_id) + resp = self._download_json( + 'https://api.prod.stitcher.com/episode/' + audio_id, + display_id or audio_id) + episode = try_get(resp, lambda x: x['data']['episodes'][0], dict) + if not episode: + raise 
ExtractorError(resp['errors'][0]['message'], expected=True) - episode = self._parse_json( - js_to_json(self._search_regex( - r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')), - display_id)['config']['episode'] + title = episode['title'].strip() + audio_url = episode['audio_url'] - title = unescapeHTML(episode['title']) - formats = [{ - 'url': episode[episode_key], - 'ext': determine_ext(episode[episode_key]) or 'mp3', - 'vcodec': 'none', - } for episode_key in ('episodeURL',) if episode.get(episode_key)] - description = self._search_regex( - r'Episode Info:\s*([^<]+)<', webpage, 'description', fatal=False) - duration = int_or_none(episode.get('duration')) - thumbnail = episode.get('episodeImage') + thumbnail = None + show_id = episode.get('show_id') + if show_id and episode.get('classic_id') != -1: + thumbnail = 'https://stitcher-classic.imgix.net/feedimages/%s.jpg' % show_id return { 'id': audio_id, 'display_id': display_id, 'title': title, - 'description': description, - 'duration': duration, + 'description': clean_html(episode.get('html_description') or episode.get('description')), + 'duration': int_or_none(episode.get('duration')), 'thumbnail': thumbnail, - 'formats': formats, + 'url': audio_url, + 'vcodec': 'none', + 'timestamp': int_or_none(episode.get('date_created')), + 'season_number': int_or_none(episode.get('season')), + 'season_id': str_or_none(episode.get('season_id')), } From 417963200c033afb95a9fca54eb6b313a6e284a4 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:02:17 +0100 Subject: [PATCH 234/384] [vvvvid] fix season metadata extraction(#18130) --- haruhi_dl/extractor/vvvvid.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/haruhi_dl/extractor/vvvvid.py b/haruhi_dl/extractor/vvvvid.py index 145805492..f4cae7fe9 100644 --- a/haruhi_dl/extractor/vvvvid.py +++ b/haruhi_dl/extractor/vvvvid.py @@ -25,7 +25,6 @@ class VVVVIDIE(InfoExtractor): 'duration': 239, 'series': '"Perché dovrei guardarlo?" 
di Dario Moccia', 'season_id': '437', - 'season_number': 1, 'episode': 'Ping Pong', 'episode_number': 1, 'episode_id': '3334', @@ -75,7 +74,6 @@ class VVVVIDIE(InfoExtractor): def _extract_common_video_info(self, video_data): return { 'thumbnail': video_data.get('thumbnail'), - 'episode_number': int_or_none(video_data.get('number')), 'episode_id': str_or_none(video_data.get('id')), } @@ -145,6 +143,17 @@ class VVVVIDIE(InfoExtractor): return d + info = {} + + def metadata_from_url(r_url): + if not info and r_url: + mobj = re.search(r'_(?:S(\d+))?Ep(\d+)', r_url) + if mobj: + info['episode_number'] = int(mobj.group(2)) + season_number = mobj.group(1) + if season_number: + info['season_number'] = int(season_number) + for quality in ('_sd', ''): embed_code = video_data.get('embed_info' + quality) if not embed_code: @@ -166,9 +175,12 @@ class VVVVIDIE(InfoExtractor): else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) + metadata_from_url(embed_code) + self._sort_formats(formats) - info = self._extract_common_video_info(video_data) + metadata_from_url(video_data.get('thumbnail')) + info.update(self._extract_common_video_info(video_data)) info.update({ 'id': video_id, 'title': title, @@ -176,7 +188,6 @@ class VVVVIDIE(InfoExtractor): 'duration': int_or_none(video_data.get('length')), 'series': video_data.get('show_title'), 'season_id': season_id, - 'season_number': video_data.get('season_number'), 'episode': title, 'view_count': int_or_none(video_data.get('views')), 'like_count': int_or_none(video_data.get('video_likes')), @@ -211,7 +222,6 @@ class VVVVIDShowIE(VVVVIDIE): entries = [] for season in (seasons or []): - season_number = int_or_none(season.get('number')) episodes = season.get('episodes') or [] for episode in episodes: if episode.get('playable') is False: @@ -227,7 +237,6 @@ class VVVVIDShowIE(VVVVIDIE): 'url': '/'.join([base_url, season_id, video_id]), 'title': episode.get('title'), 'description': episode.get('description'), - 'season_number': season_number, 'season_id': season_id, }) entries.append(info) From 973258396df1e68d8fffd85007f012f6988e9597 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:02:33 +0100 Subject: [PATCH 235/384] [nrktv] Switch to playback endpoint mediaelement endpoint is no longer in use. 
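
In practice the change swaps the single mediaelement lookup for two JSON
requests against the playback API. A minimal standalone sketch of the new
flow (the host, paths, preferredCdn values and response fields are taken from
the diff below; everything else, including the use of plain urllib instead of
the extractor's _download_json helper, is illustrative):

    import json
    import urllib.request

    def call_playback_api(item, video_id, query=''):
        url = 'http://psapi.nrk.no/playback/%s/%s%s' % (item, video_id, query)
        return json.load(urllib.request.urlopen(url))

    # known values for preferredCdn: akamai, iponly, minicdn and telenor
    manifest = call_playback_api('manifest', 'MDDP12000117',
                                 '?preferredCdn=akamai')
    metadata = call_playback_api('metadata', 'MDDP12000117')

    # formats come from manifest['playable']['assets'], titles and
    # description from metadata['preplay']
    print(manifest['playable']['assets'][0]['url'])
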
--- haruhi_dl/extractor/nrk.py | 273 ++++++++----------------------------- 1 file changed, 57 insertions(+), 216 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index b545f291b..871e4845c 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -6,15 +6,11 @@ import random import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, -) +from ..compat import compat_str from ..utils import ( determine_ext, ExtractorError, int_or_none, - parse_age_limit, parse_duration, try_get, urljoin, @@ -63,60 +59,8 @@ class NRKBaseIE(InfoExtractor): return self._download_json( urljoin('http://psapi.nrk.no/', path), video_id, note or 'Downloading %s JSON' % item, - fatal=fatal, query=query) - - -class NRKIE(NRKBaseIE): - _VALID_URL = r'''(?x) - (?: - nrk:| - https?:// - (?: - (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| - v8[-.]psapi\.nrk\.no/mediaelement/ - ) - ) - (?P[^?\#&]+) - ''' - - _TESTS = [{ - # video - 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': 'f46be075326e23ad0e524edfcb06aeb6', - 'info_dict': { - 'id': '150533', - 'ext': 'mp4', - 'title': 'Dompap og andre fugler i Piip-Show', - 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', - 'duration': 262, - } - }, { - # audio - 'url': 'http://www.nrk.no/video/PS*154915', - # MD5 is unstable - 'info_dict': { - 'id': '154915', - 'ext': 'mp4', - 'title': 'Slik høres internett ut når du er blind', - 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', - 'duration': 20, - } - }, { - 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', - 'only_matching': True, - }, { - 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', - 'only_matching': True, - }, { - 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', - 'only_matching': True, - }, { - 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', - 'only_matching': True, - }] + fatal=fatal, query=query, + headers={'Accept-Encoding': 'gzip, deflate, br'}) def _extract_from_playback(self, video_id): path_templ = 'playback/%s/' + video_id @@ -178,6 +122,59 @@ class NRKIE(NRKBaseIE): 'formats': formats, } + +class NRKIE(NRKBaseIE): + _VALID_URL = r'''(?x) + (?: + nrk:| + https?:// + (?: + (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| + v8[-.]psapi\.nrk\.no/mediaelement/ + ) + ) + (?P[^?\#&]+) + ''' + + _TESTS = [{ + # video + 'url': 'http://www.nrk.no/video/PS*150533', + 'md5': 'f46be075326e23ad0e524edfcb06aeb6', + 'info_dict': { + 'id': '150533', + 'ext': 'mp4', + 'title': 'Dompap og andre fugler i Piip-Show', + 'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f', + 'duration': 262, + } + }, { + # audio + 'url': 'http://www.nrk.no/video/PS*154915', + # MD5 is unstable + 'info_dict': { + 'id': '154915', + 'ext': 'mp4', + 'title': 'Slik høres internett ut når du er blind', + 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', + 'duration': 20, + } + }, { + 'url': 'nrk:ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'nrk:clip/7707d5a3-ebe7-434a-87d5-a3ebe7a34a70', + 'only_matching': True, + }, { + 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', + 'only_matching': True, + }, { + 'url': 
'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', + 'only_matching': True, + }] + def _real_extract(self, url): video_id = self._match_id(url) return self._extract_from_playback(video_id) @@ -187,7 +184,6 @@ class NRKTVIE(NRKBaseIE): IE_DESC = 'NRK TV and NRK Radio' _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE - _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') _TESTS = [{ 'url': 'https://tv.nrk.no/program/MDDP12000117', 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', @@ -290,164 +286,9 @@ class NRKTVIE(NRKBaseIE): 'only_matching': True, }] - _api_host = None - - def _extract_from_mediaelement(self, video_id): - api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS - - for api_host in api_hosts: - data = self._download_json( - 'http://%s/mediaelement/%s' % (api_host, video_id), - video_id, 'Downloading mediaelement JSON', - fatal=api_host == api_hosts[-1]) - if not data: - continue - self._api_host = api_host - break - - title = data.get('fullTitle') or data.get('mainTitle') or data['title'] - video_id = data.get('id') or video_id - - urls = [] - entries = [] - - conviva = data.get('convivaStatistics') or {} - live = (data.get('mediaElementType') == 'Live' - or data.get('isLive') is True or conviva.get('isLive')) - - def make_title(t): - return self._live_title(t) if live else t - - media_assets = data.get('mediaAssets') - if media_assets and isinstance(media_assets, list): - def video_id_and_title(idx): - return ((video_id, title) if len(media_assets) == 1 - else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) - for num, asset in enumerate(media_assets, 1): - asset_url = asset.get('url') - if not asset_url or asset_url in urls: - continue - urls.append(asset_url) - formats = self._extract_nrk_formats(asset_url, video_id) - if not formats: - continue - self._sort_formats(formats) - - entry_id, entry_title = video_id_and_title(num) - duration = parse_duration(asset.get('duration')) - subtitles = {} - for subtitle in ('webVtt', 'timedText'): - subtitle_url = asset.get('%sSubtitlesUrl' % subtitle) - if subtitle_url: - subtitles.setdefault('no', []).append({ - 'url': compat_urllib_parse_unquote(subtitle_url) - }) - entries.append({ - 'id': asset.get('carrierId') or entry_id, - 'title': make_title(entry_title), - 'duration': duration, - 'subtitles': subtitles, - 'formats': formats, - 'is_live': live, - }) - - if not entries: - media_url = data.get('mediaUrl') - if media_url and media_url not in urls: - formats = self._extract_nrk_formats(media_url, video_id) - if formats: - self._sort_formats(formats) - duration = parse_duration(data.get('duration')) - entries = [{ - 'id': video_id, - 'title': make_title(title), - 'duration': duration, - 'formats': formats, - 'is_live': live, - }] - - if not entries: - self._raise_error(data) - - series = conviva.get('seriesName') or data.get('seriesTitle') - episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') - - season_number = None - episode_number = None - if data.get('mediaElementType') == 'Episode': - _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \ - data.get('relativeOriginUrl', '') - EPISODENUM_RE = [ - r'/s(?P\d{,2})e(?P\d{,2})\.', - r'/sesong-(?P\d{,2})/episode-(?P\d{,2})', - ] - season_number = int_or_none(self._search_regex( - EPISODENUM_RE, _season_episode, 'season number', - default=None, group='season')) - episode_number = 
int_or_none(self._search_regex( - EPISODENUM_RE, _season_episode, 'episode number', - default=None, group='episode')) - - thumbnails = None - images = data.get('images') - if images and isinstance(images, dict): - web_images = images.get('webImages') - if isinstance(web_images, list): - thumbnails = [{ - 'url': image['imageUrl'], - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - } for image in web_images if image.get('imageUrl')] - - description = data.get('description') - category = data.get('mediaAnalytics', {}).get('category') - - common_info = { - 'description': description, - 'series': series, - 'episode': episode, - 'season_number': season_number, - 'episode_number': episode_number, - 'categories': [category] if category else None, - 'age_limit': parse_age_limit(data.get('legalAge')), - 'thumbnails': thumbnails, - } - - vcodec = 'none' if data.get('mediaType') == 'Audio' else None - - for entry in entries: - entry.update(common_info) - for f in entry['formats']: - f['vcodec'] = vcodec - - points = data.get('shortIndexPoints') - if isinstance(points, list): - chapters = [] - for next_num, point in enumerate(points, start=1): - if not isinstance(point, dict): - continue - start_time = parse_duration(point.get('startPoint')) - if start_time is None: - continue - end_time = parse_duration( - data.get('duration') - if next_num == len(points) - else points[next_num].get('startPoint')) - if end_time is None: - continue - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': point.get('title'), - }) - if chapters and len(entries) == 1: - entries[0]['chapters'] = chapters - - return self.playlist_result(entries, video_id, title, description) - def _real_extract(self, url): video_id = self._match_id(url) - return self._extract_from_mediaelement(video_id) + return self._extract_from_playback(video_id) class NRKTVEpisodeIE(InfoExtractor): From 18be494898ce0befd9da4ab722306b8c93ddc4d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:04:02 +0100 Subject: [PATCH 236/384] [nrk] Improve extraction (closes #27634, closes #27635) + Add support for mp3 formats * Generalize and delegate all item extractors to nrk, beware ie key breakages + Add support for podcasts + Generalize nrk shortcut form to support all kind of ids --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/nrk.py | 248 ++++++++++++++++++++---------- 2 files changed, 172 insertions(+), 77 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 2a680d4bf..086c7d42a 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -823,6 +823,7 @@ from .nrk import ( NRKSkoleIE, NRKTVIE, NRKTVDirekteIE, + NRKRadioPodkastIE, NRKTVEpisodeIE, NRKTVEpisodesIE, NRKTVSeasonIE, diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 871e4845c..9621522d4 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -62,66 +62,6 @@ class NRKBaseIE(InfoExtractor): fatal=fatal, query=query, headers={'Accept-Encoding': 'gzip, deflate, br'}) - def _extract_from_playback(self, video_id): - path_templ = 'playback/%s/' + video_id - - def call_playback_api(item, query=None): - return self._call_api(path_templ % item, video_id, item, query=query) - # known values for preferredCdn: akamai, iponly, minicdn and telenor - manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) - - if 
manifest.get('playability') == 'nonPlayable': - self._raise_error(manifest['nonPlayable']) - - playable = manifest['playable'] - - formats = [] - for asset in playable['assets']: - if not isinstance(asset, dict): - continue - if asset.get('encrypted'): - continue - format_url = url_or_none(asset.get('url')) - if not format_url: - continue - if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_nrk_formats(format_url, video_id)) - self._sort_formats(formats) - - data = call_playback_api('metadata') - - preplay = data['preplay'] - titles = preplay['titles'] - title = titles['title'] - alt_title = titles.get('subtitle') - - description = preplay.get('description') - duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) - - thumbnails = [] - for image in try_get( - preplay, lambda x: x['poster']['images'], list) or []: - if not isinstance(image, dict): - continue - image_url = url_or_none(image.get('url')) - if not image_url: - continue - thumbnails.append({ - 'url': image_url, - 'width': int_or_none(image.get('pixelWidth')), - 'height': int_or_none(image.get('pixelHeight')), - }) - - return { - 'id': video_id, - 'title': title, - 'alt_title': alt_title, - 'description': description, - 'duration': duration, - 'thumbnails': thumbnails, - 'formats': formats, - } - class NRKIE(NRKBaseIE): _VALID_URL = r'''(?x) @@ -173,14 +113,97 @@ class NRKIE(NRKBaseIE): }, { 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', 'only_matching': True, + }, { + # podcast + 'url': 'nrk:l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + # clip + 'url': 'nrk:150533', + 'only_matching': True, + }, { + # episode + 'url': 'nrk:MDDP12000117', + 'only_matching': True, + }, { + # direkte + 'url': 'nrk:nrk1', + 'only_matching': True, }] + def _extract_from_playback(self, video_id): + path_templ = 'playback/%s/' + video_id + + def call_playback_api(item, query=None): + return self._call_api(path_templ % item, video_id, item, query=query) + # known values for preferredCdn: akamai, iponly, minicdn and telenor + manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) + + if manifest.get('playability') == 'nonPlayable': + self._raise_error(manifest['nonPlayable']) + + playable = manifest['playable'] + + formats = [] + for asset in playable['assets']: + if not isinstance(asset, dict): + continue + if asset.get('encrypted'): + continue + format_url = url_or_none(asset.get('url')) + if not format_url: + continue + asset_format = (asset.get('format') or '').lower() + if asset_format == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_nrk_formats(format_url, video_id)) + elif asset_format == 'mp3': + formats.append({ + 'url': format_url, + 'format_id': asset_format, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + data = call_playback_api('metadata') + + preplay = data['preplay'] + titles = preplay['titles'] + title = titles['title'] + alt_title = titles.get('subtitle') + + description = preplay.get('description') + duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) + + thumbnails = [] + for image in try_get( + preplay, lambda x: x['poster']['images'], list) or []: + if not isinstance(image, dict): + continue + image_url = url_or_none(image.get('url')) + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('pixelWidth')), + 
'height': int_or_none(image.get('pixelHeight')), + }) + + return { + 'id': video_id, + 'title': title, + 'alt_title': alt_title, + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'formats': formats, + } + def _real_extract(self, url): video_id = self._match_id(url) return self._extract_from_playback(video_id) -class NRKTVIE(NRKBaseIE): +class NRKTVIE(InfoExtractor): IE_DESC = 'NRK TV and NRK Radio' _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE @@ -288,7 +311,8 @@ class NRKTVIE(NRKBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - return self._extract_from_playback(video_id) + return self.url_result( + 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) class NRKTVEpisodeIE(InfoExtractor): @@ -359,8 +383,6 @@ class NRKTVSerieBaseIE(NRKBaseIE): nrk_id = episode.get('prfId') or episode.get('episodeId') if not nrk_id or not isinstance(nrk_id, compat_str): continue - if not re.match(NRKTVIE._EPISODE_RE, nrk_id): - continue entries.append(self.url_result( 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) return entries @@ -372,6 +394,10 @@ class NRKTVSerieBaseIE(NRKBaseIE): if embedded.get(asset_key): return asset_key + @staticmethod + def _catalog_name(serie_kind): + return 'podcast' if serie_kind in ('podcast', 'podkast') else 'series' + def _entries(self, data, display_id): for page_num in itertools.count(1): embedded = data.get('_embedded') or data @@ -405,7 +431,16 @@ class NRKTVSerieBaseIE(NRKBaseIE): class NRKTVSeasonIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?Ptv|radio)\.nrk\.no/serie/(?P[^/]+)/(?:sesong/)?(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + (?Ptv|radio)\.nrk\.no/ + (?Pserie|pod[ck]ast)/ + (?P[^/]+)/ + (?: + (?:sesong/)?(?P\d+)| + sesong/(?P[^/?#&]+) + ) + ''' _TESTS = [{ 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', 'info_dict': { @@ -441,19 +476,34 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): # 180 entries, single page 'url': 'https://tv.nrk.no/serie/spangas/sesong/1', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/diagnose-kverulant', + 'info_dict': { + 'id': 'hele_historien/diagnose-kverulant', + 'title': 'Diagnose kverulant', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://radio.nrk.no/podkast/loerdagsraadet/sesong/202101', + 'only_matching': True, }] @classmethod def suitable(cls, url): - return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) + return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) or NRKRadioPodkastIE.suitable(url) else super(NRKTVSeasonIE, cls).suitable(url)) def _real_extract(self, url): - domain, serie, season_id = re.match(self._VALID_URL, url).groups() + mobj = re.match(self._VALID_URL, url) + domain = mobj.group('domain') + serie_kind = mobj.group('serie_kind') + serie = mobj.group('serie') + season_id = mobj.group('id') or mobj.group('id_2') display_id = '%s/%s' % (serie, season_id) data = self._call_api( - '%s/catalog/series/%s/seasons/%s' % (domain, serie, season_id), + '%s/catalog/%s/%s/seasons/%s' + % (domain, self._catalog_name(serie_kind), serie, season_id), display_id, 'season', query={'pageSize': 50}) title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id @@ -463,7 +513,7 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE): class NRKTVSeriesIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?P(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/serie/(?P[^/]+)' + _VALID_URL = 
r'https?://(?P(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/(?Pserie|pod[ck]ast)/(?P[^/]+)' _TESTS = [{ # new layout, instalments 'url': 'https://tv.nrk.no/serie/groenn-glede', @@ -523,23 +573,33 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): }, { 'url': 'https://nrksuper.no/serie/labyrint', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers', + 'info_dict': { + 'id': 'ulrikkes_univers', + }, + 'playlist_mincount': 10, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/nrkno-poddkast-26588-134079-05042018030000', + 'only_matching': True, }] @classmethod def suitable(cls, url): return ( False if any(ie.suitable(url) - for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE)) + for ie in (NRKTVIE, NRKTVEpisodeIE, NRKRadioPodkastIE, NRKTVSeasonIE)) else super(NRKTVSeriesIE, cls).suitable(url)) def _real_extract(self, url): - site, series_id = re.match(self._VALID_URL, url).groups() + site, serie_kind, series_id = re.match(self._VALID_URL, url).groups() is_radio = site == 'radio.nrk' domain = 'radio' if is_radio else 'tv' size_prefix = 'p' if is_radio else 'embeddedInstalmentsP' series = self._call_api( - '%s/catalog/series/%s' % (domain, series_id), + '%s/catalog/%s/%s' + % (domain, self._catalog_name(serie_kind), series_id), series_id, 'serie', query={size_prefix + 'ageSize': 50}) titles = try_get(series, [ lambda x: x['titles'], @@ -554,12 +614,14 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): embedded_seasons = embedded.get('seasons') or [] if len(linked_seasons) > len(embedded_seasons): for season in linked_seasons: - season_name = season.get('name') - if season_name and isinstance(season_name, compat_str): + season_url = urljoin(url, season.get('href')) + if not season_url: + season_name = season.get('name') + if season_name and isinstance(season_name, compat_str): + season_url = 'https://%s.nrk.no/serie/%s/sesong/%s' % (domain, series_id, season_name) + if season_url: entries.append(self.url_result( - 'https://%s.nrk.no/serie/%s/sesong/%s' - % (domain, series_id, season_name), - ie=NRKTVSeasonIE.ie_key(), + season_url, ie=NRKTVSeasonIE.ie_key(), video_title=season.get('title'))) else: for season in embedded_seasons: @@ -584,6 +646,38 @@ class NRKTVDirekteIE(NRKTVIE): }] +class NRKRadioPodkastIE(InfoExtractor): + _VALID_URL = r'https?://radio\.nrk\.no/pod[ck]ast/(?:[^/]+/)+(?Pl_[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + _TESTS = [{ + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'md5': '8d40dab61cea8ab0114e090b029a0565', + 'info_dict': { + 'id': 'MUHH48000314AA', + 'ext': 'mp4', + 'title': '20 spørsmål 23.05.2014', + 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', + 'duration': 1741, + 'series': '20 spørsmål', + 'episode': '23.05.2014', + }, + }, { + 'url': 'https://radio.nrk.no/podcast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/sesong/1/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/bortfoert-i-bergen/l_774d1a2c-7aa7-4965-8d1a-2c7aa7d9652c', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) + + class NRKPlaylistBaseIE(InfoExtractor): def _extract_description(self, webpage): pass From e1145c77fd95b4df2c3c9e77bd6c4584838897eb Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:04:08 +0100 Subject: [PATCH 237/384] [nrk] Add more shortcut tests --- haruhi_dl/extractor/nrk.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 9621522d4..61a7c9aad 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -117,18 +117,30 @@ class NRKIE(NRKBaseIE): # podcast 'url': 'nrk:l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', 'only_matching': True, + }, { + 'url': 'nrk:podcast/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, }, { # clip 'url': 'nrk:150533', 'only_matching': True, }, { - # episode + 'url': 'nrk:clip/150533', + 'only_matching': True, + }, { + # program 'url': 'nrk:MDDP12000117', 'only_matching': True, + }, { + 'url': 'nrk:program/ENRK10100318', + 'only_matching': True, }, { # direkte 'url': 'nrk:nrk1', 'only_matching': True, + }, { + 'url': 'nrk:channel/nrk1', + 'only_matching': True, }] def _extract_from_playback(self, video_id): From 634ebea93d108d522cfb6816b6552b44e6b878f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:04:15 +0100 Subject: [PATCH 238/384] [nrk] Improve video id extraction --- haruhi_dl/extractor/nrk.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 61a7c9aad..5f12b0d9e 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -151,6 +151,8 @@ class NRKIE(NRKBaseIE): # known values for preferredCdn: akamai, iponly, minicdn and telenor manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) + video_id = try_get(manifest, lambda x: x['id'], compat_str) or video_id + if manifest.get('playability') == 'nonPlayable': self._raise_error(manifest['nonPlayable']) @@ -211,7 +213,7 @@ class NRKIE(NRKBaseIE): } def _real_extract(self, url): - video_id = self._match_id(url) + video_id = self._match_id(url).split('/')[-1] return self._extract_from_playback(video_id) From d9673551d7f3cff0fe8f5f4fa1b101279d475897 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:04:34 +0100 Subject: [PATCH 239/384] [nrk] Inline _extract_from_playback --- haruhi_dl/extractor/nrk.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 5f12b0d9e..520206534 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -143,7 +143,9 @@ class NRKIE(NRKBaseIE): 'only_matching': True, }] - def _extract_from_playback(self, video_id): + def _real_extract(self, url): + video_id = self._match_id(url).split('/')[-1] + path_templ = 'playback/%s/' + video_id def call_playback_api(item, query=None): @@ -212,10 +214,6 @@ class NRKIE(NRKBaseIE): 'formats': formats, } - def _real_extract(self, url): - video_id = self._match_id(url).split('/')[-1] - return self._extract_from_playback(video_id) - class NRKTVIE(InfoExtractor): IE_DESC = 'NRK TV and NRK Radio' From eff203d3aec5451693a95f21f575c540bc4d164d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:04:38 +0100 Subject: [PATCH 240/384] [nrk] Fix age limit extraction --- haruhi_dl/extractor/nrk.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 
520206534..d023de7f7 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -204,6 +204,9 @@ class NRKIE(NRKBaseIE): 'height': int_or_none(image.get('pixelHeight')), }) + age_limit = int_or_none(try_get( + data, lambda x: x['legalAge']['body']['rating']['code'])) + return { 'id': video_id, 'title': title, @@ -211,6 +214,7 @@ class NRKIE(NRKBaseIE): 'description': description, 'duration': duration, 'thumbnails': thumbnails, + 'age_limit': age_limit, 'formats': formats, } From db48c8dbfe36e43a33041b9738d51b4fa803ab77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:04:44 +0100 Subject: [PATCH 241/384] [nrk] Extract subtitles --- haruhi_dl/extractor/nrk.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index d023de7f7..bd96d9d14 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -12,6 +12,7 @@ from ..utils import ( ExtractorError, int_or_none, parse_duration, + str_or_none, try_get, urljoin, url_or_none, @@ -204,6 +205,21 @@ class NRKIE(NRKBaseIE): 'height': int_or_none(image.get('pixelHeight')), }) + subtitles = {} + for sub in try_get(playable, lambda x: x['subtitles'], list) or []: + if not isinstance(sub, dict): + continue + sub_url = url_or_none(sub.get('webVtt')) + if not sub_url: + continue + sub_key = str_or_none(sub.get('language')) or 'nb' + sub_type = str_or_none(sub.get('type')) + if sub_type: + sub_key += '-%s' % sub_type + subtitles.setdefault(sub_key, []).append({ + 'url': sub_url, + }) + age_limit = int_or_none(try_get( data, lambda x: x['legalAge']['body']['rating']['code'])) @@ -216,6 +232,7 @@ class NRKIE(NRKBaseIE): 'thumbnails': thumbnails, 'age_limit': age_limit, 'formats': formats, + 'subtitles': subtitles, } From aa829b6cd3a5f6690c195e69a15020ed872bbfe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:04:49 +0100 Subject: [PATCH 242/384] [nrk] Improve series metadata extraction --- haruhi_dl/extractor/nrk.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index bd96d9d14..20a5d7673 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -223,7 +223,9 @@ class NRKIE(NRKBaseIE): age_limit = int_or_none(try_get( data, lambda x: x['legalAge']['body']['rating']['code'])) - return { + is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' + + info = { 'id': video_id, 'title': title, 'alt_title': alt_title, @@ -235,6 +237,27 @@ class NRKIE(NRKBaseIE): 'subtitles': subtitles, } + if is_series: + series = title + if alt_title: + title += ' - %s' % alt_title + season_number = int_or_none(self._search_regex( + r'Sesong\s+(\d+)', description or '', 'season number', + default=None)) + episode = alt_title if is_series else None + episode_number = int_or_none(self._search_regex( + r'(\d+)\.\s+episode', episode or '', 'episode number', + default=None)) + info.update({ + 'title': title, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + }) + + return info + class NRKTVIE(InfoExtractor): IE_DESC = 'NRK TV and NRK Radio' From c00a4d81ca304960bfcab53261f2506a27e551a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:05:07 +0100 Subject: [PATCH 243/384] 
[nrktv] Fix tests --- haruhi_dl/extractor/nrk.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 20a5d7673..4fb7df959 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -267,7 +267,7 @@ class NRKTVIE(InfoExtractor): 'url': 'https://tv.nrk.no/program/MDDP12000117', 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', 'info_dict': { - 'id': 'MDDP12000117AA', + 'id': 'MDDP12000117', 'ext': 'mp4', 'title': 'Alarm Trolltunga', 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', @@ -278,24 +278,25 @@ class NRKTVIE(InfoExtractor): 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'md5': '8d40dab61cea8ab0114e090b029a0565', 'info_dict': { - 'id': 'MUHH48000314AA', + 'id': 'MUHH48000314', 'ext': 'mp4', - 'title': '20 spørsmål 23.05.2014', + 'title': '20 spørsmål - 23. mai 2014', + 'alt_title': '23. mai 2014', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'duration': 1741, 'series': '20 spørsmål', - 'episode': '23.05.2014', + 'episode': '23. mai 2014', }, }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', 'info_dict': { - 'id': 'MDFP15000514CA', + 'id': 'MDFP15000514', 'ext': 'mp4', - 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', + 'title': 'Kunnskapskanalen - Grunnlovsjubiléet - Stor ståhei for ingenting', 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', 'duration': 4605.08, 'series': 'Kunnskapskanalen', - 'episode': '24.05.2014', + 'episode': 'Grunnlovsjubiléet - Stor ståhei for ingenting', }, 'params': { 'skip_download': True, @@ -304,7 +305,7 @@ class NRKTVIE(InfoExtractor): # single playlist video 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', 'info_dict': { - 'id': 'MSPO40010515AH', + 'id': 'MSPO40010515', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', @@ -317,22 +318,23 @@ class NRKTVIE(InfoExtractor): }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', 'info_dict': { - 'id': 'MSPO40010515AH', + 'id': 'MSPO40010515', 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', }, 'expected_warnings': ['Failed to download m3u8 information'], + 'skip': 'Ikke tilgjengelig utenfor Norge', }, { 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', 'info_dict': { - 'id': 'KMTE50001317AA', + 'id': 'KMTE50001317', 'ext': 'mp4', - 'title': 'Anno 13:30', + 'title': 'Anno - 13. episode', 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', 'duration': 2340, 'series': 'Anno', - 'episode': '13:30', + 'episode': '13. 
episode', 'season_number': 3, 'episode_number': 13, }, @@ -342,7 +344,7 @@ class NRKTVIE(InfoExtractor): }, { 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', 'info_dict': { - 'id': 'MUHH46000317AA', + 'id': 'MUHH46000317', 'ext': 'mp4', 'title': 'Nytt på Nytt 27.01.2017', 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', From 57a63ed4a12df04093fc94a8e8544f4f34f8c4b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:05:12 +0100 Subject: [PATCH 244/384] [nrk] Improve episode and season number extraction --- haruhi_dl/extractor/nrk.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 4fb7df959..48387420c 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -246,7 +246,9 @@ class NRKIE(NRKBaseIE): default=None)) episode = alt_title if is_series else None episode_number = int_or_none(self._search_regex( - r'(\d+)\.\s+episode', episode or '', 'episode number', + r'^(\d+)\.', episode or '', 'episode number', + default=None)) or int_or_none(self._search_regex( + r'\((\d+)\s*:\s*\d+\)', description or '', 'episode number', default=None)) info.update({ 'title': title, @@ -374,19 +376,19 @@ class NRKTVIE(InfoExtractor): class NRKTVEpisodeIE(InfoExtractor): - _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+/sesong/\d+/episode/\d+)' + _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+/sesong/(?P\d+)/episode/(?P\d+))' _TESTS = [{ 'url': 'https://tv.nrk.no/serie/hellums-kro/sesong/1/episode/2', 'info_dict': { - 'id': 'MUHH36005220BA', + 'id': 'MUHH36005220', 'ext': 'mp4', - 'title': 'Kro, krig og kjærlighet 2:6', - 'description': 'md5:b32a7dc0b1ed27c8064f58b97bda4350', - 'duration': 1563, + 'title': 'Hellums kro - 2. Kro, krig og kjærlighet', + 'description': 'md5:ad92ddffc04cea8ce14b415deef81787', + 'duration': 1563.92, 'series': 'Hellums kro', - 'season_number': 1, + # 'season_number': 1, 'episode_number': 2, - 'episode': '2:6', + 'episode': '2. Kro, krig og kjærlighet', 'age_limit': 6, }, 'params': { @@ -395,15 +397,15 @@ class NRKTVEpisodeIE(InfoExtractor): }, { 'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8', 'info_dict': { - 'id': 'MSUI14000816AA', + 'id': 'MSUI14000816', 'ext': 'mp4', - 'title': 'Backstage 8:30', + 'title': 'Backstage - 8. episode', 'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4', 'duration': 1320, 'series': 'Backstage', 'season_number': 1, 'episode_number': 8, - 'episode': '8:30', + 'episode': '8. 
episode', }, 'params': { 'skip_download': True, @@ -412,7 +414,7 @@ class NRKTVEpisodeIE(InfoExtractor): }] def _real_extract(self, url): - display_id = self._match_id(url) + display_id, season_number, episode_number = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) @@ -424,10 +426,12 @@ class NRKTVEpisodeIE(InfoExtractor): assert re.match(NRKTVIE._EPISODE_RE, nrk_id) info.update({ - '_type': 'url_transparent', + '_type': 'url', 'id': nrk_id, 'url': 'nrk:%s' % nrk_id, 'ie_key': NRKIE.ie_key(), + 'season_number': int(season_number), + 'episode_number': int(episode_number), }) return info From 785078cb0880e52b055e49167a1de13725956068 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:05:21 +0100 Subject: [PATCH 245/384] [nrk] PEP 8 --- haruhi_dl/extractor/nrk.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 48387420c..2873d7938 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -247,9 +247,11 @@ class NRKIE(NRKBaseIE): episode = alt_title if is_series else None episode_number = int_or_none(self._search_regex( r'^(\d+)\.', episode or '', 'episode number', - default=None)) or int_or_none(self._search_regex( - r'\((\d+)\s*:\s*\d+\)', description or '', 'episode number', default=None)) + if not episode_number: + episode_number = int_or_none(self._search_regex( + r'\((\d+)\s*:\s*\d+\)', description or '', + 'episode number', default=None)) info.update({ 'title': title, 'series': series, From b51ed7b039f87362fa087ab61a1f19c7816d5ae1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:05:26 +0100 Subject: [PATCH 246/384] [nrk] Improve series metadata extraction (closes #27473) --- haruhi_dl/extractor/nrk.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 2873d7938..5d33355e7 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -238,16 +238,29 @@ class NRKIE(NRKBaseIE): } if is_series: - series = title + series = season_id = season_number = episode = episode_number = None + programs = self._call_api( + 'programs/%s' % video_id, video_id, 'programs', fatal=False) + if programs and isinstance(programs, dict): + series = str_or_none(programs.get('seriesTitle')) + season_id = str_or_none(programs.get('seasonId')) + season_number = int_or_none(programs.get('seasonNumber')) + episode = str_or_none(programs.get('episodeTitle')) + episode_number = int_or_none(programs.get('episodeNumber')) + if not series: + series = title if alt_title: title += ' - %s' % alt_title - season_number = int_or_none(self._search_regex( - r'Sesong\s+(\d+)', description or '', 'season number', - default=None)) - episode = alt_title if is_series else None - episode_number = int_or_none(self._search_regex( - r'^(\d+)\.', episode or '', 'episode number', - default=None)) + if not season_number: + season_number = int_or_none(self._search_regex( + r'Sesong\s+(\d+)', description or '', 'season number', + default=None)) + if not episode: + episode = alt_title if is_series else None + if not episode_number: + episode_number = int_or_none(self._search_regex( + r'^(\d+)\.', episode or '', 'episode number', + default=None)) if not episode_number: episode_number = int_or_none(self._search_regex( r'\((\d+)\s*:\s*\d+\)', 
description or '', @@ -255,6 +268,7 @@ class NRKIE(NRKBaseIE): info.update({ 'title': title, 'series': series, + 'season_id': season_id, 'season_number': season_number, 'episode': episode, 'episode_number': episode_number, @@ -388,7 +402,7 @@ class NRKTVEpisodeIE(InfoExtractor): 'description': 'md5:ad92ddffc04cea8ce14b415deef81787', 'duration': 1563.92, 'series': 'Hellums kro', - # 'season_number': 1, + 'season_number': 1, 'episode_number': 2, 'episode': '2. Kro, krig og kjærlighet', 'age_limit': 6, From 8e538fc605286405544d7f5f8b8c7bee4c555167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:05:31 +0100 Subject: [PATCH 247/384] [nrk] Fix age limit extraction --- haruhi_dl/extractor/nrk.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 5d33355e7..69178e157 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -220,8 +220,15 @@ class NRKIE(NRKBaseIE): 'url': sub_url, }) - age_limit = int_or_none(try_get( - data, lambda x: x['legalAge']['body']['rating']['code'])) + legal_age = try_get( + data, lambda x: x['legalAge']['body']['rating']['code'], compat_str) + # https://en.wikipedia.org/wiki/Norwegian_Media_Authority + if legal_age == 'A': + age_limit = 0 + elif legal_age.isdigit(): + age_limit = int_or_none(legal_age) + else: + age_limit = None is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' @@ -304,6 +311,7 @@ class NRKTVIE(InfoExtractor): 'duration': 1741, 'series': '20 spørsmål', 'episode': '23. mai 2014', + 'age_limit': 0, }, }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', @@ -315,6 +323,7 @@ class NRKTVIE(InfoExtractor): 'duration': 4605.08, 'series': 'Kunnskapskanalen', 'episode': 'Grunnlovsjubiléet - Stor ståhei for ingenting', + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -327,6 +336,7 @@ class NRKTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -340,6 +350,7 @@ class NRKTVIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'age_limit': 0, }, 'expected_warnings': ['Failed to download m3u8 information'], 'skip': 'Ikke tilgjengelig utenfor Norge', @@ -355,6 +366,7 @@ class NRKTVIE(InfoExtractor): 'episode': '13. episode', 'season_number': 3, 'episode_number': 13, + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -369,6 +381,7 @@ class NRKTVIE(InfoExtractor): 'duration': 1796, 'series': 'Nytt på nytt', 'episode': '27.01.2017', + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -422,6 +435,7 @@ class NRKTVEpisodeIE(InfoExtractor): 'season_number': 1, 'episode_number': 8, 'episode': '8. 
episode', + 'age_limit': 0, }, 'params': { 'skip_download': True, From 8406b57ac6d2b1cbebae7517d57dafb1bdcb352e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:05:39 +0100 Subject: [PATCH 248/384] [stv] improve episode id extraction(closes #23083) --- haruhi_dl/extractor/stv.py | 42 +++++++++++++++++++++++++++++++------- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/haruhi_dl/extractor/stv.py b/haruhi_dl/extractor/stv.py index bae8b71f4..539220a94 100644 --- a/haruhi_dl/extractor/stv.py +++ b/haruhi_dl/extractor/stv.py @@ -8,13 +8,17 @@ from ..utils import ( compat_str, float_or_none, int_or_none, + smuggle_url, + str_or_none, + try_get, ) class STVPlayerIE(InfoExtractor): IE_NAME = 'stv:player' _VALID_URL = r'https?://player\.stv\.tv/(?Pepisode|video)/(?P[a-z0-9]{4})' - _TEST = { + _TESTS = [{ + # shortform 'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/', 'md5': '5adf9439c31d554f8be0707c7abe7e0a', 'info_dict': { @@ -27,7 +31,11 @@ class STVPlayerIE(InfoExtractor): 'uploader_id': '1486976045', }, 'skip': 'this resource is unavailable outside of the UK', - } + }, { + # episodes + 'url': 'https://player.stv.tv/episode/4125/jennifer-saunders-memory-lane', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s' _PTYPE_MAP = { 'episode': 'episodes', @@ -36,11 +44,31 @@ class STVPlayerIE(InfoExtractor): def _real_extract(self, url): ptype, video_id = re.match(self._VALID_URL, url).groups() - resp = self._download_json( - 'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], video_id), - video_id) - result = resp['results'] + webpage = self._download_webpage(url, video_id, fatal=False) or '' + props = (self._parse_json(self._search_regex( + r']+id="__NEXT_DATA__"[^>]*>({.+?})', + webpage, 'next data', default='{}'), video_id, + fatal=False) or {}).get('props') or {} + player_api_cache = try_get( + props, lambda x: x['initialReduxState']['playerApiCache']) or {} + + api_path, resp = None, {} + for k, v in player_api_cache.items(): + if k.startswith('/episodes/') or k.startswith('/shortform/'): + api_path, resp = k, v + break + else: + episode_id = str_or_none(try_get( + props, lambda x: x['pageProps']['episodeId'])) + api_path = '/%s/%s' % (self._PTYPE_MAP[ptype], episode_id or video_id) + + result = resp.get('results') + if not result: + resp = self._download_json( + 'https://player.api.stv.tv/v1' + api_path, video_id) + result = resp['results'] + video = result['video'] video_id = compat_str(video['id']) @@ -57,7 +85,7 @@ class STVPlayerIE(InfoExtractor): return { '_type': 'url_transparent', 'id': video_id, - 'url': self.BRIGHTCOVE_URL_TEMPLATE % video_id, + 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['GB']}), 'description': result.get('summary'), 'duration': float_or_none(video.get('length'), 1000), 'subtitles': subtitles, From 3f43c99d4ad44e4f48ee03f76de3e5d7eb03413a Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:05:44 +0100 Subject: [PATCH 249/384] =?UTF-8?q?[stitcher]=20Add=20support=20for=20show?= =?UTF-8?q?s=20and=20show=20metadata=20extraction(closes=20=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …#20510) --- haruhi_dl/extractor/extractors.py | 5 +- haruhi_dl/extractor/stitcher.py | 120 ++++++++++++++++++++++-------- 2 files changed, 92 insertions(+), 33 deletions(-) diff --git 
a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index 086c7d42a..2722c0501 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -1137,7 +1137,10 @@ from .spike import (
     BellatorIE,
     ParamountNetworkIE,
 )
-from .stitcher import StitcherIE
+from .stitcher import (
+    StitcherIE,
+    StitcherShowIE,
+)
 from .sport5 import Sport5IE
 from .sportbox import SportBoxIE
 from .sportdeutschland import SportDeutschlandIE
diff --git a/haruhi_dl/extractor/stitcher.py b/haruhi_dl/extractor/stitcher.py
index b8b5711b1..3dd0d3b5f 100644
--- a/haruhi_dl/extractor/stitcher.py
+++ b/haruhi_dl/extractor/stitcher.py
@@ -1,19 +1,60 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
+from ..compat import compat_str
 from ..utils import (
     clean_html,
     ExtractorError,
     int_or_none,
     str_or_none,
     try_get,
+    url_or_none,
 )
 
 
-class StitcherIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/(?:[^/]+/)+e(?:pisode)?/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)'
+class StitcherBaseIE(InfoExtractor):
+    _VALID_URL_BASE = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/'
+
+    def _call_api(self, path, video_id, query):
+        resp = self._download_json(
+            'https://api.prod.stitcher.com/' + path,
+            video_id, query=query)
+        error_message = try_get(resp, lambda x: x['errors'][0]['message'])
+        if error_message:
+            raise ExtractorError(error_message, expected=True)
+        return resp['data']
+
+    def _extract_description(self, data):
+        return clean_html(data.get('html_description') or data.get('description'))
+
+    def _extract_audio_url(self, episode):
+        return url_or_none(episode.get('audio_url') or episode.get('guid'))
+
+    def _extract_show_info(self, show):
+        return {
+            'thumbnail': show.get('image_base_url'),
+            'series': show.get('title'),
+        }
+
+    def _extract_episode(self, episode, audio_url, show_info):
+        info = {
+            'id': compat_str(episode['id']),
+            'display_id': episode.get('slug'),
+            'title': episode['title'].strip(),
+            'description': self._extract_description(episode),
+            'duration': int_or_none(episode.get('duration')),
+            'url': audio_url,
+            'vcodec': 'none',
+            'timestamp': int_or_none(episode.get('date_published')),
+            'season_number': int_or_none(episode.get('season')),
+            'season_id': str_or_none(episode.get('season_id')),
+        }
+        info.update(show_info)
+        return info
+
+
+class StitcherIE(StitcherBaseIE):
+    _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?:[^/]+/)+e(?:pisode)?/(?:[^/#?&]+-)?(?P<id>\d+)'
     _TESTS = [{
         'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true',
         'md5': 'e9635098e0da10b21a0e2b85585530f6',
@@ -24,8 +65,9 @@ class StitcherIE(InfoExtractor):
             'description': 'md5:547adb4081864be114ae3831b4c2b42f',
             'duration': 1604,
             'thumbnail': r're:^https?://.*\.jpg',
-            'upload_date': '20180126',
-            'timestamp': 1516989316,
+            'upload_date': '20151008',
+            'timestamp': 1444285800,
+            'series': 'Talking Machines',
         },
     }, {
         'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true',
@@ -55,33 +97,47 @@ class StitcherIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        display_id, audio_id = re.match(self._VALID_URL, url).groups()
+        audio_id = self._match_id(url)
+        data = self._call_api(
+            'shows/episodes', audio_id, {'episode_ids': audio_id})
+        episode = data['episodes'][0]
+        audio_url = self._extract_audio_url(episode)
+        if not audio_url:
+            self.raise_login_required()
+        show = try_get(data, lambda x: x['shows'][0], dict) or {}
+
return self._extract_episode( + episode, audio_url, self._extract_show_info(show)) - resp = self._download_json( - 'https://api.prod.stitcher.com/episode/' + audio_id, - display_id or audio_id) - episode = try_get(resp, lambda x: x['data']['episodes'][0], dict) - if not episode: - raise ExtractorError(resp['errors'][0]['message'], expected=True) - title = episode['title'].strip() - audio_url = episode['audio_url'] +class StitcherShowIE(StitcherBaseIE): + _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?P[^/#?&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'http://www.stitcher.com/podcast/the-talking-machines', + 'info_dict': { + 'id': 'the-talking-machines', + 'title': 'Talking Machines', + 'description': 'md5:831f0995e40f26c10231af39cf1ebf0b', + }, + 'playlist_mincount': 106, + }, { + 'url': 'https://www.stitcher.com/show/the-talking-machines', + 'only_matching': True, + }] - thumbnail = None - show_id = episode.get('show_id') - if show_id and episode.get('classic_id') != -1: - thumbnail = 'https://stitcher-classic.imgix.net/feedimages/%s.jpg' % show_id + def _real_extract(self, url): + show_slug = self._match_id(url) + data = self._call_api( + 'search/show/%s/allEpisodes' % show_slug, show_slug, {'count': 10000}) + show = try_get(data, lambda x: x['shows'][0], dict) or {} + show_info = self._extract_show_info(show) - return { - 'id': audio_id, - 'display_id': display_id, - 'title': title, - 'description': clean_html(episode.get('html_description') or episode.get('description')), - 'duration': int_or_none(episode.get('duration')), - 'thumbnail': thumbnail, - 'url': audio_url, - 'vcodec': 'none', - 'timestamp': int_or_none(episode.get('date_created')), - 'season_number': int_or_none(episode.get('season')), - 'season_id': str_or_none(episode.get('season_id')), - } + entries = [] + for episode in (data.get('episodes') or []): + audio_url = self._extract_audio_url(episode) + if not audio_url: + continue + entries.append(self._extract_episode(episode, audio_url, show_info)) + + return self.playlist_result( + entries, show_slug, show.get('title'), + self._extract_description(show)) From 28c4062a5894da0e236352880d741cfc52a9f4b5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:05:50 +0100 Subject: [PATCH 250/384] =?UTF-8?q?[twitter]=20try=20to=20use=20a=20Generi?= =?UTF-8?q?c=20fallback=20for=20unknown=20twitter=20cards(clo=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ses #25982) --- haruhi_dl/extractor/twitter.py | 52 ++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/haruhi_dl/extractor/twitter.py b/haruhi_dl/extractor/twitter.py index 4284487db..a35e1686c 100644 --- a/haruhi_dl/extractor/twitter.py +++ b/haruhi_dl/extractor/twitter.py @@ -251,10 +251,10 @@ class TwitterIE(TwitterBaseIE): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'simon vetugo - BEAT PROD: @suhmeduh #Damndaniel', + 'title': 'simon vertugo - BEAT PROD: @suhmeduh #Damndaniel', 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'simon vetugo', + 'uploader': 'simon vertugo', 'uploader_id': 'simonvertugo', 'duration': 30.0, 'timestamp': 1455777459, @@ -312,6 +312,7 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1492000653, 'upload_date': '20170412', }, + 'skip': 'Account suspended', }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', 'info_dict': { @@ -380,6 +381,14 @@ class 
TwitterIE(TwitterBaseIE): # promo_video_website card 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', 'only_matching': True, + }, { + # promo_video_convo card + 'url': 'https://twitter.com/poco_dandy/status/1047395834013384704', + 'only_matching': True, + }, { + # appplayer card + 'url': 'https://twitter.com/poco_dandy/status/1150646424461176832', + 'only_matching': True, }] def _real_extract(self, url): @@ -462,7 +471,25 @@ class TwitterIE(TwitterBaseIE): return try_get(o, lambda x: x[x['type'].lower() + '_value']) card_name = card['name'].split(':')[-1] - if card_name in ('amplify', 'promo_video_website'): + if card_name == 'player': + info.update({ + '_type': 'url', + 'url': get_binding_value('player_url'), + }) + elif card_name == 'periscope_broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('url') or get_binding_value('player_url'), + 'ie_key': PeriscopeIE.ie_key(), + }) + elif card_name == 'broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('broadcast_url'), + 'ie_key': TwitterBroadcastIE.ie_key(), + }) + # amplify, promo_video_website, promo_video_convo, appplayer, ... + else: is_amplify = card_name == 'amplify' vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) @@ -488,25 +515,6 @@ class TwitterIE(TwitterBaseIE): 'duration': int_or_none(get_binding_value( 'content_duration_seconds')), }) - elif card_name == 'player': - info.update({ - '_type': 'url', - 'url': get_binding_value('player_url'), - }) - elif card_name == 'periscope_broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('url') or get_binding_value('player_url'), - 'ie_key': PeriscopeIE.ie_key(), - }) - elif card_name == 'broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('broadcast_url'), - 'ie_key': TwitterBroadcastIE.ie_key(), - }) - else: - raise ExtractorError('Unsupported Twitter Card.') else: expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url']) if not expanded_url: From 3f2bf67bc908cb1b11ccdfc29b69c00f6bde1cf5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:05:55 +0100 Subject: [PATCH 251/384] [twitter] Add support for summary card(closes #25121) --- haruhi_dl/extractor/twitter.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/haruhi_dl/extractor/twitter.py b/haruhi_dl/extractor/twitter.py index a35e1686c..1190d721e 100644 --- a/haruhi_dl/extractor/twitter.py +++ b/haruhi_dl/extractor/twitter.py @@ -488,6 +488,11 @@ class TwitterIE(TwitterBaseIE): 'url': get_binding_value('broadcast_url'), 'ie_key': TwitterBroadcastIE.ie_key(), }) + elif card_name == 'summary': + info.update({ + '_type': 'url', + 'url': get_binding_value('card_url'), + }) # amplify, promo_video_website, promo_video_convo, appplayer, ... 
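+        # A rough sketch of the binding_values shape these branches consume,
+        # assuming the payload layout implied by get_binding_value() above
+        # (field names below are illustrative, not a confirmed API contract):
+        #
+        #     card = {
+        #         'name': 'summary',
+        #         'binding_values': {
+        #             'card_url': {'type': 'STRING', 'string_value': 'https://t.co/...'},
+        #         },
+        #     }
+        #
+        # get_binding_value('card_url') would then return 'https://t.co/...',
+        # and a missing key yields None, since try_get swallows the KeyError.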
else: is_amplify = card_name == 'amplify' From a22e2b59b4d4ce1af561668352252554c7d7ae91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:06:00 +0100 Subject: [PATCH 252/384] [nrktv] Add subtitles test --- haruhi_dl/extractor/nrk.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 69178e157..cafb85616 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -298,6 +298,14 @@ class NRKTVIE(InfoExtractor): 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', 'duration': 2223.44, 'age_limit': 6, + 'subtitles': { + 'nb-nor': [{ + 'ext': 'vtt', + }], + 'nb-ttv': [{ + 'ext': 'vtt', + }] + }, }, }, { 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', From e98e8454c5298961453be56d1c7f8a66a991f2a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:06:04 +0100 Subject: [PATCH 253/384] [xfileshare] Add support for aparat.cam (closes #27651) --- haruhi_dl/extractor/xfileshare.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/haruhi_dl/extractor/xfileshare.py b/haruhi_dl/extractor/xfileshare.py index 20f7013f3..783358fe9 100644 --- a/haruhi_dl/extractor/xfileshare.py +++ b/haruhi_dl/extractor/xfileshare.py @@ -45,6 +45,7 @@ def aa_decode(aa_code): class XFileShareIE(InfoExtractor): _SITES = ( + (r'aparat\.cam', 'Aparat'), (r'clipwatching\.com', 'ClipWatching'), (r'gounlimited\.to', 'GoUnlimited'), (r'govid\.me', 'GoVid'), @@ -78,6 +79,9 @@ class XFileShareIE(InfoExtractor): 'title': 'sample', 'thumbnail': r're:http://.*\.jpg', }, + }, { + 'url': 'https://aparat.cam/n4d6dh0wvlpr', + 'only_matching': True, }] @staticmethod From 017215032a927504449ccf7c578a472139570e89 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:06:12 +0100 Subject: [PATCH 254/384] [utils] add a function to clean podcast URLs --- haruhi_dl/utils.py | 17 +++++++++++++++++ test/test_utils.py | 5 +++++ 2 files changed, 22 insertions(+) diff --git a/haruhi_dl/utils.py b/haruhi_dl/utils.py index 62b59bcdb..d35033b7e 100644 --- a/haruhi_dl/utils.py +++ b/haruhi_dl/utils.py @@ -5708,3 +5708,20 @@ def random_birthday(year_field, month_field, day_field): month_field: str(random_date.month), day_field: str(random_date.day), } + + +def clean_podcast_url(url): + return re.sub(r'''(?x) + (?: + (?: + chtbl\.com/track| + media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/ + play\.podtrac\.com + )/[^/]+| + (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure + flex\.acast\.com| + pd(?: + cn\.co| # https://podcorn.com/analytics-prefix/ + st\.fm # https://podsights.com/docs/ + )/e + )/''', '', url) diff --git a/test/test_utils.py b/test/test_utils.py index dc3dde0c4..d052a23de 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -21,6 +21,7 @@ from haruhi_dl.utils import ( encode_base_n, caesar, clean_html, + clean_podcast_url, date_from_str, DateRange, detect_exe_version, @@ -1470,6 +1471,10 @@ Line 1 self.assertEqual(get_elements_by_attribute('class', 'foo', html), []) self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), []) + def test_clean_podcast_url(self): + self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 
'https://traffic.megaphone.fm/HSW7835899191.mp3') + self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3') + if __name__ == '__main__': unittest.main() From 1e653be1d0bb4efbf7011204da9060b5ad1b0ad4 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:06:16 +0100 Subject: [PATCH 255/384] [stitcher] clean podcast URLs --- haruhi_dl/extractor/stitcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/stitcher.py b/haruhi_dl/extractor/stitcher.py index 3dd0d3b5f..822782507 100644 --- a/haruhi_dl/extractor/stitcher.py +++ b/haruhi_dl/extractor/stitcher.py @@ -4,6 +4,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( clean_html, + clean_podcast_url, ExtractorError, int_or_none, str_or_none, @@ -43,7 +44,7 @@ class StitcherBaseIE(InfoExtractor): 'title': episode['title'].strip(), 'description': self._extract_description(episode), 'duration': int_or_none(episode.get('duration')), - 'url': audio_url, + 'url': clean_podcast_url(audio_url), 'vcodec': 'none', 'timestamp': int_or_none(episode.get('date_published')), 'season_number': int_or_none(episode.get('season')), From 626d26e13accd0d7424b935e33e31c45b92cc851 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:06:22 +0100 Subject: [PATCH 256/384] [acast] clean podcast URLs --- haruhi_dl/extractor/acast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/acast.py b/haruhi_dl/extractor/acast.py index 60378db1b..b9355a2c8 100644 --- a/haruhi_dl/extractor/acast.py +++ b/haruhi_dl/extractor/acast.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + clean_podcast_url, int_or_none, parse_iso8601, ) @@ -17,7 +18,7 @@ class ACastBaseIE(InfoExtractor): info = { 'id': episode['id'], 'display_id': episode.get('episodeUrl'), - 'url': episode['url'], + 'url': clean_podcast_url(episode['url']), 'title': title, 'description': clean_html(episode.get('description') or episode.get('summary')), 'thumbnail': episode.get('image'), From e52adb5328a6b67d4fc889969d07cb40fa75472b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:06:26 +0100 Subject: [PATCH 257/384] [iheart] Add new extractor for iHeartRadio(#27037) --- haruhi_dl/extractor/extractors.py | 4 ++ haruhi_dl/extractor/iheart.py | 97 +++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 haruhi_dl/extractor/iheart.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 2722c0501..265556a21 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -493,6 +493,10 @@ from .ign import ( OneUPIE, PCMagIE, ) +from .iheart import ( + IHeartRadioIE, + IHeartRadioPodcastIE, +) from .imdb import ( ImdbIE, ImdbListIE diff --git a/haruhi_dl/extractor/iheart.py b/haruhi_dl/extractor/iheart.py new file mode 100644 index 000000000..6710baeb4 --- /dev/null +++ b/haruhi_dl/extractor/iheart.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + str_or_none, +) + + +class 
IHeartRadioBaseIE(InfoExtractor):
+    def _call_api(self, path, video_id, fatal=True, query=None):
+        return self._download_json(
+            'https://api.iheart.com/api/v3/podcast/' + path,
+            video_id, fatal=fatal, query=query)
+
+    def _extract_episode(self, episode):
+        return {
+            'thumbnail': episode.get('imageUrl'),
+            'description': episode.get('description'),
+            'timestamp': int_or_none(episode.get('startDate'), 1000),
+            'duration': int_or_none(episode.get('duration')),
+        }
+
+
+class IHeartRadioIE(IHeartRadioBaseIE):
+    IE_NAME = 'iheartradio'
+    _VALID_URL = r'(?:https?://(?:www\.)?iheart\.com/podcast/[^/]+/episode/(?P<display_id>[^/?&#]+)-|iheartradio:)(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true',
+        'md5': 'c8609c92c8688dcb69d8541042b8abca',
+        'info_dict': {
+            'id': '70346499',
+            'ext': 'mp3',
+            'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus',
+            'description': 'md5:66480b2d25ec93a5f60c0faa3275ce5c',
+            'timestamp': 1597741200,
+            'upload_date': '20200818',
+        }
+    }
+
+    def _real_extract(self, url):
+        episode_id = self._match_id(url)
+        episode = self._call_api(
+            'episodes/' + episode_id, episode_id)['episode']
+        info = self._extract_episode(episode)
+        print(episode['mediaUrl'])
+        info.update({
+            'id': episode_id,
+            'title': episode['title'],
+            'url': clean_podcast_url(episode['mediaUrl']),
+        })
+        return info
+
+
+class IHeartRadioPodcastIE(IHeartRadioBaseIE):
+    IE_NAME = 'iheartradio:podcast'
+    _VALID_URL = r'https?://(?:www\.)?iheart(?:podcastnetwork)?\.com/podcast/[^/?&#]+-(?P<id>\d+)/?(?:[?#&]|$)'
+    _TESTS = [{
+        'url': 'https://www.iheart.com/podcast/1119-it-could-happen-here-30717896/',
+        'info_dict': {
+            'id': '30717896',
+            'title': 'It Could Happen Here',
+            'description': 'md5:5842117412a967eb0b01f8088eb663e2',
+        },
+        'playlist_mincount': 11,
+    }, {
+        'url': 'https://www.iheartpodcastnetwork.com/podcast/105-stuff-you-should-know-26940277',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        podcast_id = self._match_id(url)
+        path = 'podcasts/' + podcast_id
+        episodes = self._call_api(
+            path + '/episodes', podcast_id, query={'limit': 1000000000})['data']
+
+        entries = []
+        for episode in episodes:
+            episode_id = str_or_none(episode.get('id'))
+            if not episode_id:
+                continue
+            info = self._extract_episode(episode)
+            info.update({
+                '_type': 'url',
+                'id': episode_id,
+                'title': episode.get('title'),
+                'url': 'iheartradio:' + episode_id,
+                'ie_key': IHeartRadioIE.ie_key(),
+            })
+            entries.append(info)
+
+        podcast = self._call_api(path, podcast_id, False) or {}
+
+        return self.playlist_result(
+            entries, podcast_id, podcast.get('title'), podcast.get('description'))

From 1b1752a1b5806055971fc2cf903eb3e2f888b81a Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 16:08:15 +0100
Subject: [PATCH 258/384] [googlepodcasts] Add new extractor

---
 haruhi_dl/extractor/extractors.py     |  4 ++
 haruhi_dl/extractor/googlepodcasts.py | 88 +++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100644 haruhi_dl/extractor/googlepodcasts.py

diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index 265556a21..96b039096 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -448,6 +448,10 @@ from .godtube import GodTubeIE
 from .golem import GolemIE
 from .googledrive import GoogleDriveIE
 from .googleplus import GooglePlusIE
+from .googlepodcasts import (
+    GooglePodcastsIE,
+
GooglePodcastsFeedIE, +) from .googlesearch import GoogleSearchIE from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE diff --git a/haruhi_dl/extractor/googlepodcasts.py b/haruhi_dl/extractor/googlepodcasts.py new file mode 100644 index 000000000..31ad79907 --- /dev/null +++ b/haruhi_dl/extractor/googlepodcasts.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + try_get, + urlencode_postdata, +) + + +class GooglePodcastsBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://podcasts\.google\.com/feed/' + + def _batch_execute(self, func_id, video_id, params): + return json.loads(self._download_json( + 'https://podcasts.google.com/_/PodcastsUi/data/batchexecute', + video_id, data=urlencode_postdata({ + 'f.req': json.dumps([[[func_id, json.dumps(params), None, '1']]]), + }), transform_source=lambda x: self._search_regex(r'(?s)(\[.+\])', x, 'data'))[0][2]) + + def _extract_episode(self, episode): + return { + 'id': episode[4][3], + 'title': episode[8], + 'url': clean_podcast_url(episode[13]), + 'thumbnail': episode[2], + 'description': episode[9], + 'creator': try_get(episode, lambda x: x[14]), + 'timestamp': int_or_none(episode[11]), + 'duration': int_or_none(episode[12]), + 'series': episode[1], + } + + +class GooglePodcastsIE(GooglePodcastsBaseIE): + IE_NAME = 'google:podcasts' + _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P[^/]+)/episode/(?P[^/?&#]+)' + _TEST = { + 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA/episode/MzBlNWRlN2UtOWE4Yy00ODcwLTk2M2MtM2JlMmUyNmViOTRh', + 'md5': 'fa56b2ee8bd0703e27e42d4b104c4766', + 'info_dict': { + 'id': '30e5de7e-9a8c-4870-963c-3be2e26eb94a', + 'ext': 'mp3', + 'title': 'WWDTM New Year 2021', + 'description': 'We say goodbye to 2020 with Christine Baranksi, Doug Jones, Jonna Mendez, and Kellee Edwards.', + 'upload_date': '20210102', + 'timestamp': 1609606800, + 'duration': 2901, + 'series': "Wait Wait... Don't Tell Me!", + } + } + + def _real_extract(self, url): + b64_feed_url, b64_guid = re.match(self._VALID_URL, url).groups() + episode = self._batch_execute( + 'oNjqVe', b64_guid, [b64_feed_url, b64_guid])[1] + return self._extract_episode(episode) + + +class GooglePodcastsFeedIE(GooglePodcastsBaseIE): + IE_NAME = 'google:podcasts:feed' + _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P[^/?&#]+)/?(?:[?#&]|$)' + _TEST = { + 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA', + 'info_dict': { + 'title': "Wait Wait... Don't Tell Me!", + 'description': "NPR's weekly current events quiz. 
Have a laugh and test your news knowledge while figuring out what's real and what we've made up.", + }, + 'playlist_mincount': 20, + } + + def _real_extract(self, url): + b64_feed_url = self._match_id(url) + data = self._batch_execute('ncqJEe', b64_feed_url, [b64_feed_url]) + + entries = [] + for episode in (try_get(data, lambda x: x[1][0]) or []): + entries.append(self._extract_episode(episode)) + + feed = try_get(data, lambda x: x[3]) or [] + return self.playlist_result( + entries, playlist_title=try_get(feed, lambda x: x[0]), + playlist_description=try_get(feed, lambda x: x[2])) From 607b324dfff8cb513e224e61520d711df282ae5d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:08:20 +0100 Subject: [PATCH 259/384] [applepodcasts] Add new extractor(#25918) --- haruhi_dl/extractor/applepodcasts.py | 61 ++++++++++++++++++++++++++++ haruhi_dl/extractor/extractors.py | 1 + 2 files changed, 62 insertions(+) create mode 100644 haruhi_dl/extractor/applepodcasts.py diff --git a/haruhi_dl/extractor/applepodcasts.py b/haruhi_dl/extractor/applepodcasts.py new file mode 100644 index 000000000..95758fece --- /dev/null +++ b/haruhi_dl/extractor/applepodcasts.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + parse_iso8601, + try_get, +) + + +class ApplePodcastsIE(InfoExtractor): + _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P\d+)' + _TESTS = [{ + 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', + 'md5': 'df02e6acb11c10e844946a39e7222b08', + 'info_dict': { + 'id': '1000482637777', + 'ext': 'mp3', + 'title': '207 - Whitney Webb Returns', + 'description': 'md5:13a73bade02d2e43737751e3987e1399', + 'upload_date': '20200705', + 'timestamp': 1593921600, + 'duration': 6425, + 'series': 'The Tim Dillon Show', + } + }, { + 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', + 'only_matching': True, + }, { + 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777', + 'only_matching': True, + }, { + 'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777', + 'only_matching': True, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + webpage = self._download_webpage(url, episode_id) + ember_data = self._parse_json(self._search_regex( + r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', + webpage, 'ember data'), episode_id) + episode = ember_data['data']['attributes'] + description = episode.get('description') or {} + + series = None + for inc in (ember_data.get('included') or []): + if inc.get('type') == 'media/podcast': + series = try_get(inc, lambda x: x['attributes']['name']) + + return { + 'id': episode_id, + 'title': episode['name'], + 'url': clean_podcast_url(episode['assetUrl']), + 'description': description.get('standard') or description.get('short'), + 'timestamp': parse_iso8601(episode.get('releaseDateTime')), + 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), + 'series': series, + } diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 96b039096..2b2bd0b7c 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -65,6 +65,7 @@ from .appletrailers import ( AppleTrailersIE, AppleTrailersSectionIE, ) +from .applepodcasts import ApplePodcastsIE from .archiveorg import 
ArchiveOrgIE from .arcpublishing import ArcPublishingIE from .arkena import ArkenaIE From 1dc43fd3fc6d741bad0ee412b0ff0149d418e0ad Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:08:27 +0100 Subject: [PATCH 260/384] [googleplus] Remove Extractor(closes #4955)(closes #7400) --- haruhi_dl/extractor/extractors.py | 1 - haruhi_dl/extractor/googleplus.py | 73 ------------------------------- 2 files changed, 74 deletions(-) delete mode 100644 haruhi_dl/extractor/googleplus.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 2b2bd0b7c..2717c6e45 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -448,7 +448,6 @@ from .go import GoIE from .godtube import GodTubeIE from .golem import GolemIE from .googledrive import GoogleDriveIE -from .googleplus import GooglePlusIE from .googlepodcasts import ( GooglePodcastsIE, GooglePodcastsFeedIE, diff --git a/haruhi_dl/extractor/googleplus.py b/haruhi_dl/extractor/googleplus.py deleted file mode 100644 index 6b927bb44..000000000 --- a/haruhi_dl/extractor/googleplus.py +++ /dev/null @@ -1,73 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import codecs - -from .common import InfoExtractor -from ..utils import unified_strdate - - -class GooglePlusIE(InfoExtractor): - IE_DESC = 'Google Plus' - _VALID_URL = r'https?://plus\.google\.com/(?:[^/]+/)*?posts/(?P\w+)' - IE_NAME = 'plus.google' - _TEST = { - 'url': 'https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH', - 'info_dict': { - 'id': 'ZButuJc6CtH', - 'ext': 'flv', - 'title': '嘆きの天使 降臨', - 'upload_date': '20120613', - 'uploader': '井上ヨシマサ', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - # Step 1, Retrieve post webpage to extract further information - webpage = self._download_webpage(url, video_id, 'Downloading entry webpage') - - title = self._og_search_description(webpage).splitlines()[0] - upload_date = unified_strdate(self._html_search_regex( - r'''(?x) - ([0-9]{4}-[0-9]{2}-[0-9]{2})''', - webpage, 'upload date', fatal=False, flags=re.VERBOSE)) - uploader = self._html_search_regex( - r'rel="author".*?>(.*?)', webpage, 'uploader', fatal=False) - - # Step 2, Simulate clicking the image box to launch video - DOMAIN = 'https://plus.google.com/' - video_page = self._search_regex( - r' Date: Fri, 26 Feb 2021 16:08:34 +0100 Subject: [PATCH 261/384] [iheart] remove print statement --- haruhi_dl/extractor/iheart.py | 1 - 1 file changed, 1 deletion(-) diff --git a/haruhi_dl/extractor/iheart.py b/haruhi_dl/extractor/iheart.py index 6710baeb4..7a7295ff4 100644 --- a/haruhi_dl/extractor/iheart.py +++ b/haruhi_dl/extractor/iheart.py @@ -45,7 +45,6 @@ class IHeartRadioIE(IHeartRadioBaseIE): episode = self._call_api( 'episodes/' + episode_id, episode_id)['episode'] info = self._extract_episode(episode) - print(episode['mediaUrl']) info.update({ 'id': episode_id, 'title': episode['title'], From 67ff5da6ea2c9ec7dc57bd43d2f345f4e038f751 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:08:39 +0100 Subject: [PATCH 262/384] [iheart] clean HTML tags from episode description --- haruhi_dl/extractor/iheart.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/iheart.py b/haruhi_dl/extractor/iheart.py index 7a7295ff4..266c67a76 100644 --- a/haruhi_dl/extractor/iheart.py +++ b/haruhi_dl/extractor/iheart.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from 
..utils import ( + clean_html, clean_podcast_url, int_or_none, str_or_none, @@ -18,7 +19,7 @@ class IHeartRadioBaseIE(InfoExtractor): def _extract_episode(self, episode): return { 'thumbnail': episode.get('imageUrl'), - 'description': episode.get('description'), + 'description': clean_html(episode.get('description')), 'timestamp': int_or_none(episode.get('startDate'), 1000), 'duration': int_or_none(episode.get('duration')), } From 25dff12eb19d0bc4851057c970c6ebbf995637d4 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:08:44 +0100 Subject: [PATCH 263/384] [nrk] fix extraction for videos without a legalAge rating --- haruhi_dl/extractor/nrk.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index cafb85616..40dee2162 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -223,12 +223,12 @@ class NRKIE(NRKBaseIE): legal_age = try_get( data, lambda x: x['legalAge']['body']['rating']['code'], compat_str) # https://en.wikipedia.org/wiki/Norwegian_Media_Authority - if legal_age == 'A': - age_limit = 0 - elif legal_age.isdigit(): - age_limit = int_or_none(legal_age) - else: - age_limit = None + age_limit = None + if legal_age: + if legal_age == 'A': + age_limit = 0 + elif legal_age.isdigit(): + age_limit = int_or_none(legal_age) is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' From 055e9eb904a78069da659711520e46ba1b730d9a Mon Sep 17 00:00:00 2001 From: Yurii H Date: Fri, 26 Feb 2021 16:08:50 +0100 Subject: [PATCH 264/384] [iheart] Update test description value (#27037) the description has no HTML tags now. --- haruhi_dl/extractor/iheart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/iheart.py b/haruhi_dl/extractor/iheart.py index 266c67a76..b54c05eeb 100644 --- a/haruhi_dl/extractor/iheart.py +++ b/haruhi_dl/extractor/iheart.py @@ -35,7 +35,7 @@ class IHeartRadioIE(IHeartRadioBaseIE): 'id': '70346499', 'ext': 'mp3', 'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus', - 'description': 'md5:66480b2d25ec93a5f60c0faa3275ce5c', + 'description': 'md5:96cc7297b3a5a9ebae28643801c96fae', 'timestamp': 1597741200, 'upload_date': '20200818', } From 903c90bd4ccead58f6247d2f2fb06ca16148168e Mon Sep 17 00:00:00 2001 From: Kevin O'Connor Date: Fri, 26 Feb 2021 16:08:56 +0100 Subject: [PATCH 265/384] [downloader/hls] Disable decryption in tests (#27660) Tests truncate the download to 10241 bytes, which is not divisible by 16 and cannot be decrypted. Tests don't really care about the decrypted content, just that the data they retrieved is the expected data. Therefore, it's fine to just return the encrypted data to tests. 
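A note on why the truncation breaks decryption: AES-128 in CBC mode only operates on whole 16-byte blocks, and the test harness stops the download at 10241 bytes (10241 % 16 != 0), so there is no valid ciphertext to decrypt in the first place. A minimal sketch of the guard this patch introduces, using pycryptodome's AES the same way the downloader does (the function and its arguments are simplified stand-ins for the downloader's real state):

    from Crypto.Cipher import AES

    def maybe_decrypt_fragment(frag_content, key, iv, test=False):
        # Test downloads are truncated to 10241 bytes, which is not a multiple
        # of the 16-byte AES block size, so decryption would fail; tests only
        # assert on the bytes retrieved, so return the ciphertext unchanged.
        if test:
            return frag_content
        return AES.new(key, AES.MODE_CBC, iv).decrypt(frag_content)

Skipping the step in tests is safe precisely because the test assertions never look at the decrypted payload, only at how many bytes were fetched.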
See: #27621 and #27620 --- haruhi_dl/downloader/hls.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/downloader/hls.py b/haruhi_dl/downloader/hls.py index 56c84e113..3aa58e1c0 100644 --- a/haruhi_dl/downloader/hls.py +++ b/haruhi_dl/downloader/hls.py @@ -172,8 +172,12 @@ class HlsFD(FragmentFD): iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) decrypt_info['KEY'] = decrypt_info.get('KEY') or self.hdl.urlopen( self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read() - frag_content = AES.new( - decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) + # Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block + # size (see https://github.com/hdl-org/haruhi-dl/pull/27660). Tests only care that the correct data downloaded, + # not what it decrypts to. + if not test: + frag_content = AES.new( + decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) self._append_fragment(ctx, frag_content) # We only download the first fragment during the test if test: From a72df1d2492115e432be97363f6cb2d9f9021414 Mon Sep 17 00:00:00 2001 From: cladmi Date: Fri, 26 Feb 2021 16:09:15 +0100 Subject: [PATCH 266/384] [motherless] Fix recent videos upload date extraction (closes #27661) Less than a week old videos use a '20h ago' or '1d ago' format. I kept the support for 'Ago' with uppercase start at is was already in the code. --- haruhi_dl/extractor/motherless.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/motherless.py b/haruhi_dl/extractor/motherless.py index b1615b4d8..6cc36b308 100644 --- a/haruhi_dl/extractor/motherless.py +++ b/haruhi_dl/extractor/motherless.py @@ -85,18 +85,27 @@ class MotherlessIE(InfoExtractor): or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( - (r'>(\d+)\s+Views<', r'Views\s+([^<]+)<'), + (r'>([\d,.]+)\s+Views<', # 1,234,567 Views + r'Views\s+([^<]+)<'), webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( - (r'>(\d+)\s+Favorites<', r'Favorited\s+([^<]+)<'), + (r'>([\d,.]+)\s+Favorites<', # 1,234 Favorites + r'Favorited\s+([^<]+)<'), webpage, 'like count', fatal=False)) upload_date = self._html_search_regex( (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', + r'class=["\']count[^>]+>(\d+[hd])\s+[aA]go<', # 20h/1d ago r'Uploaded\s+([^<]+)<'), webpage, 'upload date') - if 'Ago' in upload_date: - days = int(re.search(r'([0-9]+)', upload_date).group(1)) - upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') + relative = re.match(r'(\d+)([hd])$', upload_date) + if relative: + delta = int(relative.group(1)) + unit = relative.group(2) + if unit == 'h': + delta_t = datetime.timedelta(hours=delta) + else: # unit == 'd' + delta_t = datetime.timedelta(days=delta) + upload_date = (datetime.datetime.now() - delta_t).strftime('%Y%m%d') else: upload_date = unified_strdate(upload_date) From e94762a1a79d81b580f5c9b69b36c079e4fe9bda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:09:21 +0100 Subject: [PATCH 267/384] =?UTF-8?q?[motherless]=20Fix=20review=20issues=20?= =?UTF-8?q?and=20improve=20extraction=20(closes=20#26495,=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 
8bit

… closes #27450)
---
 haruhi_dl/extractor/motherless.py | 52 ++++++++++++++++++++-----------
 1 file changed, 34 insertions(+), 18 deletions(-)

diff --git a/haruhi_dl/extractor/motherless.py b/haruhi_dl/extractor/motherless.py
index 6cc36b308..ef1e081f2 100644
--- a/haruhi_dl/extractor/motherless.py
+++ b/haruhi_dl/extractor/motherless.py
@@ -61,6 +61,23 @@ class MotherlessIE(InfoExtractor):
         # no keywords
         'url': 'http://motherless.com/8B4BBC1',
         'only_matching': True,
+    }, {
+        # see https://motherless.com/videos/recent for recent videos with
+        # uploaded date in "ago" format
+        'url': 'https://motherless.com/3C3E2CF',
+        'info_dict': {
+            'id': '3C3E2CF',
+            'ext': 'mp4',
+            'title': 'a/ Hot Teens',
+            'categories': list,
+            'upload_date': '20210104',
+            'uploader_id': 'yonbiw',
+            'thumbnail': r're:https?://.*\.jpg',
+            'age_limit': 18,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }]

     def _real_extract(self, url):
@@ -85,29 +102,28 @@ class MotherlessIE(InfoExtractor):
             or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
         age_limit = self._rta_search(webpage)
         view_count = str_to_int(self._html_search_regex(
-            (r'>([\d,.]+)\s+Views<',  # 1,234,567 Views
-             r'<strong>Views</strong>\s+([^<]+)<'),
+            (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
             webpage, 'view count', fatal=False))
         like_count = str_to_int(self._html_search_regex(
-            (r'>([\d,.]+)\s+Favorites<',  # 1,234 Favorites
+            (r'>([\d,.]+)\s+Favorites<',
              r'<strong>Favorited</strong>\s+([^<]+)<'),
             webpage, 'like count', fatal=False))
-        upload_date = self._html_search_regex(
-            (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<',
-             r'class=["\']count[^>]+>(\d+[hd])\s+[aA]go<',  # 20h/1d ago
-             r'<strong>Uploaded</strong>\s+([^<]+)<'), webpage, 'upload date')
-        relative = re.match(r'(\d+)([hd])$', upload_date)
-        if relative:
-            delta = int(relative.group(1))
-            unit = relative.group(2)
-            if unit == 'h':
-                delta_t = datetime.timedelta(hours=delta)
-            else:  # unit == 'd'
-                delta_t = datetime.timedelta(days=delta)
-            upload_date = (datetime.datetime.now() - delta_t).strftime('%Y%m%d')
-        else:
-            upload_date = unified_strdate(upload_date)
+        upload_date = unified_strdate(self._search_regex(
+            r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage,
+            'upload date', default=None))
+        if not upload_date:
+            uploaded_ago = self._search_regex(
+                r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago',
+                default=None)
+            if uploaded_ago:
+                delta = int(uploaded_ago[:-1])
+                _AGO_UNITS = {
+                    'h': 'hours',
+                    'd': 'days',
+                }
+                kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
+                upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')

         comment_count = webpage.count('class="media-comment-contents"')
         uploader_id = self._html_search_regex(

From 7f4e988520f71abf757e7781c04548febe67ff4e Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Fri, 26 Feb 2021 16:09:33 +0100
Subject: [PATCH 268/384] [dplay] Add support for Discovery+ domains (closes
 #27680)

---
 haruhi_dl/extractor/dplay.py | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/haruhi_dl/extractor/dplay.py b/haruhi_dl/extractor/dplay.py
index a7b9db568..47501dbe6 100644
--- a/haruhi_dl/extractor/dplay.py
+++ b/haruhi_dl/extractor/dplay.py
@@ -17,7 +17,12 @@ class DPlayIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://
         (?P<domain>
-            (?:www\.)?(?P<host>dplay\.(?P<country>dk|fi|jp|se|no))|
+            (?:www\.)?(?P<host>d
+                (?:
+                    play\.(?P<country>dk|fi|jp|se|no)|
+                    iscoveryplus\.(?P<plus_country>dk|es|fi|it|se|no)
+                )
+            )|
             (?P<subdomain_country>es|it)\.dplay\.com
         )/[^/]+/(?P<id>[^/]+/[^/?#&]+)'''

@@ -126,6 +131,24 @@ class
DPlayIE(InfoExtractor): }, { 'url': 'https://www.dplay.jp/video/gold-rush/24086', 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.no/videoer/i-kongens-klr/sesong-1-episode-7', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.it/videos/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.es/videos/la-fiebre-del-oro/temporada-8-episodio-1', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.fi/videot/shifting-gears-with-aaron-kaufman/episode-16', + 'only_matching': True, }] def _get_disco_api_info(self, url, display_id, disco_host, realm, country): @@ -241,7 +264,7 @@ class DPlayIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') domain = mobj.group('domain').lstrip('www.') - country = mobj.group('country') or mobj.group('subdomain_country') - host = 'disco-api.' + domain if domain.startswith('dplay.') else 'eu2-prod.disco-api.com' + country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country') + host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' return self._get_disco_api_info( url, display_id, host, 'dplay' + country, country) From eb001126da5540c22be16037b01efec4ca58b954 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:09:38 +0100 Subject: [PATCH 269/384] [ketnet] fix extraction(closes #27662) --- haruhi_dl/extractor/ketnet.py | 119 ++++++++++++++-------------------- 1 file changed, 49 insertions(+), 70 deletions(-) diff --git a/haruhi_dl/extractor/ketnet.py b/haruhi_dl/extractor/ketnet.py index 93a98e1e0..e0599d02f 100644 --- a/haruhi_dl/extractor/ketnet.py +++ b/haruhi_dl/extractor/ketnet.py @@ -2,92 +2,71 @@ from __future__ import unicode_literals from .canvas import CanvasIE from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote +from ..utils import ( + int_or_none, + parse_iso8601, +) class KetnetIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ - 'url': 'https://www.ketnet.be/kijken/zomerse-filmpjes', - 'md5': '6bdeb65998930251bbd1c510750edba9', + 'url': 'https://www.ketnet.be/kijken/n/nachtwacht/3/nachtwacht-s3a1-de-greystook', + 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', 'info_dict': { - 'id': 'zomerse-filmpjes', + 'id': 'pbs-pub-aef8b526-115e-4006-aa24-e59ff6c6ef6f$vid-ddb815bf-c8e7-467b-8879-6bad7a32cebd', 'ext': 'mp4', - 'title': 'Gluur mee op de filmset en op Pennenzakkenrock', - 'description': 'Gluur mee met Ghost Rockers op de filmset', + 'title': 'Nachtwacht - Reeks 3: Aflevering 1', + 'description': 'De Nachtwacht krijgt te maken met een parasiet', 'thumbnail': r're:^https?://.*\.jpg$', - } - }, { - # mzid in playerConfig instead of sources - 'url': 'https://www.ketnet.be/kijken/nachtwacht/de-greystook', - 'md5': '90139b746a0a9bd7bb631283f6e2a64e', - 'info_dict': { - 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'ext': 'flv', - 'title': 'Nachtwacht: De Greystook', - 'description': 
'md5:1db3f5dc4c7109c821261e7512975be7', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.03, + 'duration': 1468.02, + 'timestamp': 1609225200, + 'upload_date': '20201229', + 'series': 'Nachtwacht', + 'season': 'Reeks 3', + 'episode': 'De Greystook', + 'episode_number': 1, }, 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], }, { - 'url': 'https://www.ketnet.be/kijken/karrewiet/uitzending-8-september-2016', - 'only_matching': True, - }, { - 'url': 'https://www.ketnet.be/achter-de-schermen/sien-repeteert-voor-stars-for-life', - 'only_matching': True, - }, { - # mzsource, geo restricted to Belgium - 'url': 'https://www.ketnet.be/kijken/nachtwacht/de-bermadoe', + 'url': 'https://www.ketnet.be/themas/karrewiet/jaaroverzicht-20200/karrewiet-het-jaar-van-black-mamba', 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + video = self._download_json( + 'https://senior-bff.ketnet.be/graphql', display_id, query={ + 'query': '''{ + video(id: "content/ketnet/nl/%s.model.json") { + description + episodeNr + imageUrl + mediaReference + programTitle + publicationDate + seasonTitle + subtitleVideodetail + titleVideodetail + } +}''' % display_id, + })['data']['video'] - config = self._parse_json( - self._search_regex( - r'(?s)playerConfig\s*=\s*({.+?})\s*;', webpage, - 'player config'), - video_id) - - mzid = config.get('mzid') - if mzid: - return self.url_result( - 'https://mediazone.vrt.be/api/v1/ketnet/assets/%s' % mzid, - CanvasIE.ie_key(), video_id=mzid) - - title = config['title'] - - formats = [] - for source_key in ('', 'mz'): - source = config.get('%ssource' % source_key) - if not isinstance(source, dict): - continue - for format_id, format_url in source.items(): - if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id=format_id, - fatal=False)) - elif format_id == 'hds': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_id, fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) + mz_id = compat_urllib_parse_unquote(video['mediaReference']) return { - 'id': video_id, - 'title': title, - 'description': config.get('description'), - 'thumbnail': config.get('image'), - 'series': config.get('program'), - 'episode': config.get('episode'), - 'formats': formats, + '_type': 'url_transparent', + 'id': mz_id, + 'title': video['titleVideodetail'], + 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/' + mz_id, + 'thumbnail': video.get('imageUrl'), + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('publicationDate')), + 'series': video.get('programTitle'), + 'season': video.get('seasonTitle'), + 'episode': video.get('subtitleVideodetail'), + 'episode_number': int_or_none(video.get('episodeNr')), + 'ie_key': CanvasIE.ie_key(), } From 24f5760134a8ed9e7ebb5d7266761a08f5accf67 Mon Sep 17 00:00:00 2001 From: nixxo Date: Fri, 26 Feb 2021 16:09:42 +0100 Subject: [PATCH 270/384] [rai] Detect ContentItem in iframe (closes #12652) (#27673) Co-authored-by: Sergey M. 
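The change below makes the existing ContentItem scan also accept pages that only reference the item through an embedded iframe's src attribute. A standalone sketch of that style of match (the sample markup and the UUID pattern are illustrative, not lifted verbatim from rai.py):

    import re

    _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
    webpage = '<iframe src="/dl/portali/ContentItem-1ad6dc64-444a-42a4-9bea-e5419ad2f5fd.html"></iframe>'

    # Match an iframe whose src carries a ContentItem UUID, quoting-style agnostic.
    content_item_id = re.search(
        r'<iframe[^>]+\bsrc=(["\'])(?:(?!\1).)*\bContentItem-(?P<id>%s)' % _UUID_RE,
        webpage).group('id')
    print(content_item_id)  # 1ad6dc64-444a-42a4-9bea-e5419ad2f5fd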
--- haruhi_dl/extractor/rai.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/rai.py b/haruhi_dl/extractor/rai.py index ecb628f14..0a68d16b0 100644 --- a/haruhi_dl/extractor/rai.py +++ b/haruhi_dl/extractor/rai.py @@ -326,6 +326,19 @@ class RaiIE(RaiBaseIE): 'params': { 'skip_download': True, }, + }, { + # ContentItem in iframe (see #12652) + 'url': 'http://www.presadiretta.rai.it/dl/portali/site/puntata/ContentItem-3ed19d13-26c2-46ff-a551-b10828262f1b.html', + 'info_dict': { + 'id': '1ad6dc64-444a-42a4-9bea-e5419ad2f5fd', + 'ext': 'mp4', + 'title': 'Partiti acchiappavoti - Presa diretta del 13/09/2015', + 'description': 'md5:d291b03407ec505f95f27970c0b025f4', + 'upload_date': '20150913', + }, + 'params': { + 'skip_download': True, + }, }, { # Direct MMS URL 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', @@ -403,7 +416,8 @@ class RaiIE(RaiBaseIE): r'''(?x) (?: (?:initEdizione|drawMediaRaiTV)\(| - <(?:[^>]+\bdata-id|var\s+uniquename)= + <(?:[^>]+\bdata-id|var\s+uniquename)=| + ]+\bsrc= ) (["\']) (?:(?!\1).)*\bContentItem-(?P%s) From 948dc5834d9cd88a3ebbe54fc4cf18fa4d734590 Mon Sep 17 00:00:00 2001 From: 23rd <23rd@vivaldi.net> Date: Fri, 26 Feb 2021 16:09:46 +0100 Subject: [PATCH 271/384] [twitch] Switch access token to GraphQL and refactor. --- haruhi_dl/extractor/twitch.py | 114 ++++++++++++++++++++-------------- 1 file changed, 68 insertions(+), 46 deletions(-) diff --git a/haruhi_dl/extractor/twitch.py b/haruhi_dl/extractor/twitch.py index ab6654432..50dcb93ef 100644 --- a/haruhi_dl/extractor/twitch.py +++ b/haruhi_dl/extractor/twitch.py @@ -160,7 +160,64 @@ class TwitchBaseIE(InfoExtractor): return compat_str(self._parse_json(token, channel_name)['channel_id']) -class TwitchVodIE(TwitchBaseIE): +class TwitchGraphQLBaseIE(TwitchBaseIE): + _PAGE_LIMIT = 100 + + _OPERATION_HASHES = { + 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14', + 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb', + 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777', + 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84', + 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e', + 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', + 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', + } + + def _download_base_gql(self, video_id, ops, note, fatal=True): + return self._download_json( + 'https://gql.twitch.tv/gql', video_id, note, + data=json.dumps(ops).encode(), + headers={ + 'Content-Type': 'text/plain;charset=UTF-8', + 'Client-ID': self._CLIENT_ID, + }, fatal=fatal) + + def _download_gql(self, video_id, ops, note, fatal=True): + for op in ops: + op['extensions'] = { + 'persistedQuery': { + 'version': 1, + 'sha256Hash': self._OPERATION_HASHES[op['operationName']], + } + } + return self._download_base_gql(video_id, ops, note) + + def _download_access_token_gql(self, video_id, item_type=None): + if item_type == 'vod': + method = 'videoPlaybackAccessToken' + param_name = 'id' + else: + method = 'streamPlaybackAccessToken' + param_name = 'channelName' + ops = { + 'query': '''{ + %s( + %s: "%s", + params: { + platform: "web", + playerBackend: "mediaplayer", + playerType: "site" + }) { + value + signature + } + }''' % (method, 
param_name, video_id), + } + note = 'Downloading access token GraphQL' + return self._download_base_gql(video_id, ops, note)['data'][method] + + +class TwitchVodIE(TwitchGraphQLBaseIE): IE_NAME = 'twitch:vod' _VALID_URL = r'''(?x) https?:// @@ -276,9 +333,7 @@ class TwitchVodIE(TwitchBaseIE): vod_id = self._match_id(url) info = self._download_info(vod_id) - access_token = self._call_api( - 'api/vods/%s/access_token' % vod_id, vod_id, - 'Downloading %s access token' % self._ITEM_TYPE) + access_token = self._download_access_token_gql(vod_id, self._ITEM_TYPE) formats = self._extract_m3u8_formats( '%s/vod/%s.m3u8?%s' % ( @@ -289,8 +344,8 @@ class TwitchVodIE(TwitchBaseIE): 'allow_spectre': 'true', 'player': 'twitchweb', 'playlist_include_framerate': 'true', - 'nauth': access_token['token'], - 'nauthsig': access_token['sig'], + 'nauth': access_token['value'], + 'nauthsig': access_token['signature'], })), vod_id, 'mp4', entry_protocol='m3u8_native') @@ -333,36 +388,6 @@ def _make_video_result(node): } -class TwitchGraphQLBaseIE(TwitchBaseIE): - _PAGE_LIMIT = 100 - - _OPERATION_HASHES = { - 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14', - 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb', - 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777', - 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84', - 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e', - 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', - 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', - } - - def _download_gql(self, video_id, ops, note, fatal=True): - for op in ops: - op['extensions'] = { - 'persistedQuery': { - 'version': 1, - 'sha256Hash': self._OPERATION_HASHES[op['operationName']], - } - } - return self._download_json( - 'https://gql.twitch.tv/gql', video_id, note, - data=json.dumps(ops).encode(), - headers={ - 'Content-Type': 'text/plain;charset=UTF-8', - 'Client-ID': self._CLIENT_ID, - }, fatal=fatal) - - class TwitchCollectionIE(TwitchGraphQLBaseIE): _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/collections/(?P[^/]+)' @@ -814,8 +839,8 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): if not stream: raise ExtractorError('%s is offline' % channel_name, expected=True) - access_token = self._download_access_token(channel_name) - token = access_token['token'] + access_token = self._download_access_token_gql(channel_name) + token = access_token['value'] stream_id = stream.get('id') or channel_name query = { @@ -826,7 +851,7 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): 'player': 'twitchweb', 'playlist_include_framerate': 'true', 'segment_preference': '4', - 'sig': access_token['sig'].encode('utf-8'), + 'sig': access_token['signature'].encode('utf-8'), 'token': token.encode('utf-8'), } formats = self._extract_m3u8_formats( @@ -866,7 +891,7 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): } -class TwitchClipsIE(TwitchBaseIE): +class TwitchClipsIE(TwitchGraphQLBaseIE): IE_NAME = 'twitch:clips' _VALID_URL = r'''(?x) https?:// @@ -912,8 +937,8 @@ class TwitchClipsIE(TwitchBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - clip = self._download_json( - 'https://gql.twitch.tv/gql', video_id, data=json.dumps({ + clip = self._download_base_gql( + video_id, { 'query': '''{ clip(slug: "%s") { broadcaster { @@ -937,10 +962,7 @@ class 
TwitchClipsIE(TwitchBaseIE): } viewCount } -}''' % video_id, - }).encode(), headers={ - 'Client-ID': self._CLIENT_ID, - })['data']['clip'] +}''' % video_id}, 'Downloading clip GraphQL')['data']['clip'] if not clip: raise ExtractorError( From 189885594b9c1221165b09cd91ae6b0084b4a698 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:09:51 +0100 Subject: [PATCH 272/384] =?UTF-8?q?[twitch]=20Improve=20access=20token=20e?= =?UTF-8?q?xtraction=20and=20remove=20unused=20code=20(clos=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …es #27646) --- haruhi_dl/extractor/twitch.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/haruhi_dl/extractor/twitch.py b/haruhi_dl/extractor/twitch.py index 50dcb93ef..a939ea24e 100644 --- a/haruhi_dl/extractor/twitch.py +++ b/haruhi_dl/extractor/twitch.py @@ -192,29 +192,27 @@ class TwitchGraphQLBaseIE(TwitchBaseIE): } return self._download_base_gql(video_id, ops, note) - def _download_access_token_gql(self, video_id, item_type=None): - if item_type == 'vod': - method = 'videoPlaybackAccessToken' - param_name = 'id' - else: - method = 'streamPlaybackAccessToken' - param_name = 'channelName' + def _download_access_token_gql(self, video_id, token_kind, param_name): + method = '%sPlaybackAccessToken' % token_kind ops = { 'query': '''{ %s( %s: "%s", - params: { - platform: "web", - playerBackend: "mediaplayer", - playerType: "site" - }) { + params: { + platform: "web", + playerBackend: "mediaplayer", + playerType: "site" + } + ) + { value signature } }''' % (method, param_name, video_id), } - note = 'Downloading access token GraphQL' - return self._download_base_gql(video_id, ops, note)['data'][method] + return self._download_base_gql( + video_id, ops, + 'Downloading %s access token GraphQL' % token_kind)['data'][method] class TwitchVodIE(TwitchGraphQLBaseIE): @@ -227,8 +225,6 @@ class TwitchVodIE(TwitchGraphQLBaseIE): ) (?P\d+) ''' - _ITEM_TYPE = 'vod' - _ITEM_SHORTCUT = 'v' _TESTS = [{ 'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s', @@ -333,7 +329,7 @@ class TwitchVodIE(TwitchGraphQLBaseIE): vod_id = self._match_id(url) info = self._download_info(vod_id) - access_token = self._download_access_token_gql(vod_id, self._ITEM_TYPE) + access_token = self._download_access_token_gql(vod_id, 'video', 'id') formats = self._extract_m3u8_formats( '%s/vod/%s.m3u8?%s' % ( @@ -839,7 +835,8 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): if not stream: raise ExtractorError('%s is offline' % channel_name, expected=True) - access_token = self._download_access_token_gql(channel_name) + access_token = self._download_access_token_gql( + channel_name, 'stream', 'channelName') token = access_token['value'] stream_id = stream.get('id') or channel_name From ff330d972761562ba7dbc97231942c55f2dd6dad Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:09:56 +0100 Subject: [PATCH 273/384] [canvas] Fix VRT NU extraction(closes #26957)(closes #27053) --- haruhi_dl/extractor/canvas.py | 120 ++++++++++++---------------------- 1 file changed, 43 insertions(+), 77 deletions(-) diff --git a/haruhi_dl/extractor/canvas.py b/haruhi_dl/extractor/canvas.py index 8667a0d04..65d65d52e 100644 --- a/haruhi_dl/extractor/canvas.py +++ b/haruhi_dl/extractor/canvas.py @@ -7,12 +7,12 @@ from .common import InfoExtractor from .gigya import GigyaBaseIE from ..compat import compat_HTTPError from ..utils import ( + 
extract_attributes, ExtractorError, strip_or_none, float_or_none, int_or_none, merge_dicts, - parse_iso8601, str_or_none, url_or_none, ) @@ -37,6 +37,7 @@ class CanvasIE(InfoExtractor): 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 'only_matching': True, }] + _GEO_BYPASS = False _HLS_ENTRY_PROTOCOLS_MAP = { 'HLS': 'm3u8_native', 'HLS_AES': 'm3u8', @@ -47,29 +48,34 @@ class CanvasIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) site_id, video_id = mobj.group('site_id'), mobj.group('id') - # Old API endpoint, serves more formats but may fail for some videos - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), video_id, 'Downloading asset JSON', - 'Unable to download asset JSON', fatal=False) + data = None + if site_id != 'vrtvideo': + # Old API endpoint, serves more formats but may fail for some videos + data = self._download_json( + 'https://mediazone.vrt.be/api/v1/%s/assets/%s' + % (site_id, video_id), video_id, 'Downloading asset JSON', + 'Unable to download asset JSON', fatal=False) # New API endpoint if not data: + headers = self.geo_verification_headers() + headers.update({'Content-Type': 'application/json'}) token = self._download_json( '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', data=b'', - headers={'Content-Type': 'application/json'})['vrtPlayerToken'] + 'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] data = self._download_json( '%s/videos/%s' % (self._REST_API_BASE, video_id), - video_id, 'Downloading video JSON', fatal=False, query={ + video_id, 'Downloading video JSON', query={ 'vrtPlayerToken': token, 'client': '%s@PROD' % site_id, }, expected_status=400) - message = data.get('message') - if message and not data.get('title'): - if data.get('code') == 'AUTHENTICATION_REQUIRED': - self.raise_login_required(message) - raise ExtractorError(message, expected=True) + if not data.get('title'): + code = data.get('code') + if code == 'AUTHENTICATION_REQUIRED': + self.raise_login_required() + elif code == 'INVALID_LOCATION': + self.raise_geo_restricted(countries=['BE']) + raise ExtractorError(data.get('message') or code, expected=True) title = data['title'] description = data.get('description') @@ -208,17 +214,21 @@ class VrtNUIE(GigyaBaseIE): _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?Pvrtnu)/(?:[^/]+/)*(?P[^/?#&]+)' _TESTS = [{ # Available via old API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/', + 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', 'info_dict': { - 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', + 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', 'ext': 'mp4', - 'title': 'De zwarte weduwe', - 'description': 'md5:db1227b0f318c849ba5eab1fef895ee4', + 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)', + 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7', 'duration': 1457.04, 'thumbnail': r're:^https?://.*\.jpg$', - 'season': 'Season 1', - 'season_number': 1, + 'series': 'Postbus X', + 'season': 'Seizoen 1989', + 'season_number': 1989, + 'episode': 'De zwarte weduwe', 'episode_number': 1, + 'timestamp': 1595822400, + 'upload_date': '20200727', }, 'skip': 'This video is only available for registered users', 'params': { @@ -300,69 +310,25 @@ class VrtNUIE(GigyaBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - webpage, urlh = 
self._download_webpage_handle(url, display_id)
+        webpage = self._download_webpage(url, display_id)
+
+        attrs = extract_attributes(self._search_regex(
+            r'(<nui-media[^>]+>)', webpage, 'media element'))
+        video_id = attrs['videoid']
+        publication_id = attrs.get('publicationid')
+        if publication_id:
+            video_id = publication_id + '$' + video_id
+
+        page = (self._parse_json(self._search_regex(
+            r'digitalData\s*=\s*({.+?});', webpage, 'digital data',
+            default='{}'), video_id, fatal=False) or {}).get('page') or {}

         info = self._search_json_ld(webpage, display_id, default={})
-
-        # title is optional here since it may be extracted by extractor
-        # that is delegated from here
-        title = strip_or_none(self._html_search_regex(
-            r'(?ms)<h1 class="content__heading">(.+?)</h1>',
-            webpage, 'title', default=None))
-
-        description = self._html_search_regex(
-            r'(?ms)<div class="content__description">(.+?)</div>',
-            webpage, 'description', default=None)
-
-        season = self._html_search_regex(
-            [r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s*
-                    <span>seizoen\ (.+?)</span>\s*
-                </div>''',
-             r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>'],
-            webpage, 'season', default=None)
-
-        season_number = int_or_none(season)
-
-        episode_number = int_or_none(self._html_search_regex(
-            r'''(?xms)<div\ class="content__episode">\s*
-                    <abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span>
-                </div>''',
-            webpage, 'episode_number', default=None))
-
-        release_date = parse_iso8601(self._html_search_regex(
-            r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"',
-            webpage, 'release_date', default=None))
-
-        # If there's a ? or a # in the URL, remove them and everything after
-        clean_url = urlh.geturl().split('?')[0].split('#')[0].strip('/')
-        securevideo_url = clean_url + '.mssecurevideo.json'
-
-        try:
-            video = self._download_json(securevideo_url, display_id)
-        except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
-                self.raise_login_required()
-            raise
-
-        # We are dealing with a '../<show>.relevant' URL
-        redirect_url = video.get('url')
-        if redirect_url:
-            return self.url_result(self._proto_relative_url(redirect_url, 'https:'))
-
-        # There is only one entry, but with an unknown key, we just get the
-        # first one
-        video_id = list(video.values())[0].get('videoid')
-
         return merge_dicts(info, {
             '_type': 'url_transparent',
-            'url': 'https://mediazone.vrt.be/api/v1/vrtnu/assets/%s' % video_id,
+            'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
             'ie_key': CanvasIE.ie_key(),
             'id': video_id,
             'display_id': display_id,
-            'title': title,
-            'description': description,
-            'season': season,
-            'season_number': season_number,
-            'episode_number': episode_number,
-            'release_date': release_date,
+            'season_number': int_or_none(page.get('episode_season')),
         })
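The key change in the VRT NU rewrite above is the authentication flow: instead of the old mssecurevideo JSON, the extractor now goes through the generic player-token endpoint that CanvasIE already uses. A bare-bones sketch of that two-step exchange (the REST base URL mirrors the _REST_API_BASE constant defined elsewhere in canvas.py and is quoted from memory, so treat it as an assumption):

    import json
    from urllib.parse import urlencode
    from urllib.request import Request, urlopen

    # Assumed value of the extractor's _REST_API_BASE (not shown in this excerpt).
    API = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1'

    def fetch_video_json(video_id, client='vrtvideo@PROD'):
        # Step 1: an empty POST yields a short-lived vrtPlayerToken.
        token = json.loads(urlopen(Request(
            API + '/tokens', data=b'',
            headers={'Content-Type': 'application/json'})).read())['vrtPlayerToken']
        # Step 2: exchange the token for the video's format descriptions.
        query = urlencode({'vrtPlayerToken': token, 'client': client})
        return json.loads(urlopen('%s/videos/%s?%s' % (API, video_id, query)).read())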
diff --git a/haruhi_dl/extractor/ard.py b/haruhi_dl/extractor/ard.py
--- a/haruhi_dl/extractor/ard.py
+++ b/haruhi_dl/extractor/ard.py
         title = self._html_search_regex(
             [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
              r'<meta name="dcterms\.title" content="(.*?)"/>',
              r'<h4 class="headline">(.*?)</h4>',
              r'<title[^>]*>(.*?)</title>'],
             webpage, 'title')
-        description = self._html_search_meta(
+        description = self._og_search_description(webpage, default=None) or self._html_search_meta(
             'dcterms.abstract', webpage, 'description', default=None)
         if description is None:
             description = self._html_search_meta(
@@ -249,18 +249,18 @@ class ARDMediathekIE(ARDMediathekBaseIE):


 class ARDIE(InfoExtractor):
-    _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
+    _VALID_URL = r'(?P<mainurl>https?://(?:www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-(?:video-?)?(?P<id>[0-9]+))\.html'
     _TESTS = [{
-        # available till 14.02.2019
-        'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html',
-        'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49',
+        # available till 7.01.2022
+        'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html',
+        'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1',
         'info_dict': {
-            'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video',
-            'id': '102',
+            'display_id': 'maischberger-die-woche',
+            'id': '100',
             'ext': 'mp4',
-            'duration': 4435.0,
-            'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?',
-            'upload_date': '20180214',
+            'duration': 3687.0,
+            'title': 'maischberger. die woche vom 7. Januar 2021',
+            'upload_date': '20210107',
             'thumbnail': r're:^https?://.*\.jpg$',
         },
     }, {
@@ -315,17 +315,17 @@ class ARDIE(InfoExtractor):


 class ARDBetaMediathekIE(ARDMediathekBaseIE):
     _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?:player|live|video)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
     _TESTS = [{
-        'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
-        'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f',
+        'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/',
+        'md5': 'a1dc75a39c61601b980648f7c9f9f71d',
         'info_dict': {
             'display_id': 'die-robuste-roswita',
-            'id': '70153354',
+            'id': '78566716',
             'title': 'Die robuste Roswita',
-            'description': r're:^Der Mord.*trüber ist als die Ilm.',
+            'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita',
             'duration': 5316,
-            'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard',
-            'timestamp': 1577047500,
-            'upload_date': '20191222',
+            'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard',
+            'timestamp': 1596658200,
+            'upload_date': '20200805',
             'ext': 'mp4',
         },
     }, {

From 4630d90a5a93b8829962362b6fd95d51e16c687e Mon Sep 17 00:00:00 2001
From: Tatsh
Date: Fri, 26 Feb 2021 16:13:47 +0100
Subject: [PATCH 307/384] [Minds] Add new extractor (#17934)

---
 haruhi_dl/extractor/extractors.py |   5 +
 haruhi_dl/extractor/minds.py      | 164 ++++++++++++++++++++++++++++++
 2 files changed, 169 insertions(+)
 create mode 100644 haruhi_dl/extractor/minds.py

diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index a5c5a5508..3bd95ed8a 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -682,6 +682,11 @@ from .microsoftvirtualacademy import (
     MicrosoftVirtualAcademyIE,
     MicrosoftVirtualAcademyCourseIE,
 )
+from .minds import (
+    MindsIE,
+    MindsActivityIE,
+    MindsChannelIE,
+)
 from .ministrygrid import MinistryGridIE
 from .minoto import MinotoIE
 from .miomio import MioMioIE
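The new extractor that follows talks to Minds' JSON API, which rejects requests unless the session's XSRF-TOKEN cookie is echoed back in an X-XSRF-TOKEN header. A self-contained sketch of that cookie-to-header handshake (the endpoint path and token handling are illustrative; the extractor itself reads the cookie from its own cookie jar):

    import json
    from urllib.request import Request, urlopen

    def minds_api_get(path, xsrf_token=''):
        # The API checks that the CSRF header matches the CSRF cookie, so the
        # same value is sent in both places.
        req = Request('https://www.minds.com/api/' + path, headers={
            'Referer': 'https://www.minds.com/',
            'Cookie': 'XSRF-TOKEN=' + xsrf_token,
            'X-XSRF-TOKEN': xsrf_token,
        })
        return json.loads(urlopen(req).read().decode('utf-8'))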
diff --git a/haruhi_dl/extractor/minds.py b/haruhi_dl/extractor/minds.py new file mode 100644 index 000000000..4523d0938 --- /dev/null +++ b/haruhi_dl/extractor/minds.py @@ -0,0 +1,164 @@ +# coding: utf-8 +from __future__ import unicode_literals +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import (int_or_none, sanitized_Request, str_or_none, + unified_strdate) + + +class MindsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?minds\.com/media/(?P[0-9]+)' + _TEST = { + 'url': 'https://www.minds.com/media/100000000000086822', + 'md5': '215a658184a419764852239d4970b045', + 'info_dict': { + 'id': '100000000000086822', + 'ext': 'mp4', + 'title': 'Minds intro sequence', + 'thumbnail': 'https://cdn-cinemr.minds.com/cinemr_com/334128440657580032/thumbnail-00001.png', + 'uploader_id': '100000000000000341', + 'description': '', + 'upload_date': '20130524', + 'timestamp': 1369404826, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_api_url = 'https://www.minds.com/api/v1/media/%s' % video_id + token = self._get_cookies(url).get('XSRF-TOKEN') + headers = { + 'authority': 'www.minds.com', + 'referer': url, + 'x-xsrf-token': token.value if token else '', + } + data = self._download_json(video_api_url, video_id, headers=headers, + query={'children': 'false'}) + formats = [] + owner = data.get('ownerObj', {}) + + transcodes = data.get('transcodes', {}) + # These keys are the width so keep the highest width last + keys = sorted(transcodes.keys()) + + for format_id in keys: + is_numeric = re.match('^[0-9]+\.mp4', format_id) + video_url = transcodes[format_id] + info = { + 'url': video_url, + 'format_id': format_id, + 'http_headers': headers, + } + if is_numeric: + info['width'] = int(format_id.split('.')[0]) + formats.append(info) + + uploader_id = str_or_none(owner.get('guid') or + data.get('owner_guid') or + owner.get('legacy_guid') or + owner.get('owner_guid')) + description = str_or_none(data.get('description')) + if description: + description = description.strip() + uploader_url = age_limit = thumbnail = None + + if owner.get('username'): + uploader_url = 'https://www.minds.com/%s' % owner.get('username') + if data.get('mature') is True: + age_limit = 18 + + thumbnail_api_url = data.get('thumbnail_src') + if thumbnail_api_url: + req = sanitized_Request(thumbnail_api_url) + req.get_method = lambda: 'HEAD' + res = self._request_webpage(req, video_id) + if res.headers.get('content-type', '').startswith('image/'): + thumbnail = getattr(res, 'url', None) + tags = data.get('tags', '').strip() + if isinstance(tags, compat_str) and tags: + tags = [x.strip() for x in tags.split(',')] + else: + tags = None + category = data.get('category') + if isinstance(category, compat_str) and category: + category = [category] + else: + category = None + + return { + 'id': video_id, + 'title': data['title'], + 'formats': formats, + 'description': description, + 'license': str_or_none(data.get('license')), + 'creator': str_or_none(owner.get('name') or owner.get('username')), + 'release_date': unified_strdate(data.get('time_created')), + 'timestamp': int_or_none(data.get('time_created')), + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + 'view_count': int_or_none(data.get('play:count')), + 'like_count': int_or_none(data.get('thumbs:up:count')), + 'dislike_count': int_or_none(data.get('thumbs:down:count')), + 'average_rating': int_or_none(data.get('rating')), + 'age_limit': 
age_limit, + 'categories': [str_or_none(data.get('category'))], + 'tags': tags, + # As of 20181020 the API is returning `false` for this value both + # at top level and within the entity.comments:count path. The only + # other way to get this is to fetch all comments and count. + 'comment_count': int_or_none(data.get('comments:count')), + 'thumbnail': thumbnail, + } + + +class MindsActivityIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?minds\.com/newsfeed/(?P[0-9]+)' + + def _real_extract(self, url): + guid = self._match_id(url) + api_url = 'https://www.minds.com/api/v1/newsfeed/single/%s' % guid + token = self._get_cookies(url).get('XSRF-TOKEN') + headers = { + 'authority': 'www.minds.com', + 'referer': url, + 'x-xsrf-token': token.value if token else '', + } + data = self._download_json(api_url, guid, headers=headers) + return self.url_result('https://www.minds.com/media/%s' % data['activity']['entity_guid']) + + +class MindsChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?minds\.com/(?!newsfeed|media|api)(?P[^/]+)' + + def _real_extract(self, url): + channel_name = self._match_id(url) + api_url = 'https://www.minds.com/api/v1/channel/%s' % channel_name + token = self._get_cookies(url).get('XSRF-TOKEN') + headers = { + 'authority': 'www.minds.com', + 'referer': url, + 'x-xsrf-token': token.value if token else '', + } + data = self._download_json(api_url, channel_name, headers=headers) + channel = data.get('channel', {}) + params = {'limit': 12, 'offset': ''} + api_url = 'https://www.minds.com/api/v1/newsfeed/personal/%s' % channel['guid'] + entries = [] + while True: + data = self._download_json(api_url, channel['guid'], + headers=headers, query=params) + activity = data.get('activity', []) + if len(activity) == 0 or not data.get('load-next'): + break + for info in activity: + if info.get('custom_type') != 'video': + continue + entries.append(self.url_result('https://www.minds.com/media/%s' % info['entity_guid'])) + params['offset'] = data['load-next'] + return self.playlist_result(entries, + playlist_title='%s activity' % channel_name) From 6bec24872b2c879ac97a23af23f24b0cb873e5a2 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:13:55 +0100 Subject: [PATCH 308/384] [minds] improve extraction --- haruhi_dl/extractor/extractors.py | 2 +- haruhi_dl/extractor/minds.py | 288 +++++++++++++++++------------- 2 files changed, 161 insertions(+), 129 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 3bd95ed8a..d61f4f247 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -684,8 +684,8 @@ from .microsoftvirtualacademy import ( ) from .minds import ( MindsIE, - MindsActivityIE, MindsChannelIE, + MindsGroupIE, ) from .ministrygrid import MinistryGridIE from .minoto import MinotoIE diff --git a/haruhi_dl/extractor/minds.py b/haruhi_dl/extractor/minds.py index 4523d0938..8e9f0f825 100644 --- a/haruhi_dl/extractor/minds.py +++ b/haruhi_dl/extractor/minds.py @@ -1,164 +1,196 @@ # coding: utf-8 from __future__ import unicode_literals -import re from .common import InfoExtractor from ..compat import compat_str -from ..utils import (int_or_none, sanitized_Request, str_or_none, - unified_strdate) +from ..utils import ( + clean_html, + int_or_none, + str_or_none, + strip_or_none, +) -class MindsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?minds\.com/media/(?P[0-9]+)' - _TEST = { +class MindsBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?minds\.com/' + + 
def _call_api(self, path, video_id, resource, query=None): + api_url = 'https://www.minds.com/api/' + path + token = self._get_cookies(api_url).get('XSRF-TOKEN') + return self._download_json( + api_url, video_id, 'Downloading %s JSON metadata' % resource, headers={ + 'Referer': 'https://www.minds.com/', + 'X-XSRF-TOKEN': token.value if token else '', + }, query=query) + + +class MindsIE(MindsBaseIE): + IE_NAME = 'minds' + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?:media|newsfeed|archive/view)/(?P[0-9]+)' + _TESTS = [{ 'url': 'https://www.minds.com/media/100000000000086822', 'md5': '215a658184a419764852239d4970b045', 'info_dict': { 'id': '100000000000086822', 'ext': 'mp4', 'title': 'Minds intro sequence', - 'thumbnail': 'https://cdn-cinemr.minds.com/cinemr_com/334128440657580032/thumbnail-00001.png', - 'uploader_id': '100000000000000341', - 'description': '', + 'thumbnail': r're:https?://.+\.png', + 'uploader_id': 'ottman', 'upload_date': '20130524', 'timestamp': 1369404826, + 'uploader': 'Bill Ottman', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': ['animation'], + 'comment_count': int, + 'license': 'attribution-cc', }, - 'params': { - 'skip_download': True, + }, { + # entity.type == 'activity' and empty title + 'url': 'https://www.minds.com/newsfeed/798025111988506624', + 'md5': 'b2733a74af78d7fd3f541c4cbbaa5950', + 'info_dict': { + 'id': '798022190320226304', + 'ext': 'mp4', + 'title': '798022190320226304', + 'uploader': 'ColinFlaherty', + 'upload_date': '20180111', + 'timestamp': 1515639316, + 'uploader_id': 'ColinFlaherty', }, - } + }, { + 'url': 'https://www.minds.com/archive/view/715172106794442752', + 'only_matching': True, + }, { + # youtube perma_url + 'url': 'https://www.minds.com/newsfeed/1197131838022602752', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - video_api_url = 'https://www.minds.com/api/v1/media/%s' % video_id - token = self._get_cookies(url).get('XSRF-TOKEN') - headers = { - 'authority': 'www.minds.com', - 'referer': url, - 'x-xsrf-token': token.value if token else '', - } - data = self._download_json(video_api_url, video_id, headers=headers, - query={'children': 'false'}) + entity_id = self._match_id(url) + entity = self._call_api( + 'v1/entities/entity/' + entity_id, entity_id, 'entity')['entity'] + if entity.get('type') == 'activity': + if entity.get('custom_type') == 'video': + video_id = entity['entity_guid'] + else: + return self.url_result(entity['perma_url']) + else: + assert(entity['subtype'] == 'video') + video_id = entity_id + # 1080p and webm formats available only on the sources array + video = self._call_api( + 'v2/media/video/' + video_id, video_id, 'video') + formats = [] - owner = data.get('ownerObj', {}) + for source in (video.get('sources') or []): + src = source.get('src') + if not src: + continue + formats.append({ + 'format_id': source.get('label'), + 'height': int_or_none(source.get('size')), + 'url': src, + }) + self._sort_formats(formats) - transcodes = data.get('transcodes', {}) - # These keys are the width so keep the highest width last - keys = sorted(transcodes.keys()) + entity = video.get('entity') or entity + owner = entity.get('ownerObj') or {} + uploader_id = owner.get('username') - for format_id in keys: - is_numeric = re.match('^[0-9]+\.mp4', format_id) - video_url = transcodes[format_id] - info = { - 'url': video_url, - 'format_id': format_id, - 'http_headers': headers, - } - if is_numeric: - info['width'] = int(format_id.split('.')[0]) - 
formats.append(info) + tags = entity.get('tags') + if tags and isinstance(tags, compat_str): + tags = [tags] - uploader_id = str_or_none(owner.get('guid') or - data.get('owner_guid') or - owner.get('legacy_guid') or - owner.get('owner_guid')) - description = str_or_none(data.get('description')) - if description: - description = description.strip() - uploader_url = age_limit = thumbnail = None - - if owner.get('username'): - uploader_url = 'https://www.minds.com/%s' % owner.get('username') - if data.get('mature') is True: - age_limit = 18 - - thumbnail_api_url = data.get('thumbnail_src') - if thumbnail_api_url: - req = sanitized_Request(thumbnail_api_url) - req.get_method = lambda: 'HEAD' - res = self._request_webpage(req, video_id) - if res.headers.get('content-type', '').startswith('image/'): - thumbnail = getattr(res, 'url', None) - tags = data.get('tags', '').strip() - if isinstance(tags, compat_str) and tags: - tags = [x.strip() for x in tags.split(',')] - else: - tags = None - category = data.get('category') - if isinstance(category, compat_str) and category: - category = [category] - else: - category = None + thumbnail = None + poster = video.get('poster') or entity.get('thumbnail_src') + if poster: + urlh = self._request_webpage(poster, video_id, fatal=False) + if urlh: + thumbnail = urlh.geturl() return { 'id': video_id, - 'title': data['title'], + 'title': entity.get('title') or video_id, 'formats': formats, - 'description': description, - 'license': str_or_none(data.get('license')), - 'creator': str_or_none(owner.get('name') or owner.get('username')), - 'release_date': unified_strdate(data.get('time_created')), - 'timestamp': int_or_none(data.get('time_created')), + 'description': clean_html(entity.get('description')) or None, + 'license': str_or_none(entity.get('license')), + 'timestamp': int_or_none(entity.get('time_created')), + 'uploader': strip_or_none(owner.get('name')), 'uploader_id': uploader_id, - 'uploader_url': uploader_url, - 'view_count': int_or_none(data.get('play:count')), - 'like_count': int_or_none(data.get('thumbs:up:count')), - 'dislike_count': int_or_none(data.get('thumbs:down:count')), - 'average_rating': int_or_none(data.get('rating')), - 'age_limit': age_limit, - 'categories': [str_or_none(data.get('category'))], + 'uploader_url': 'https://www.minds.com/' + uploader_id if uploader_id else None, + 'view_count': int_or_none(entity.get('play:count')), + 'like_count': int_or_none(entity.get('thumbs:up:count')), + 'dislike_count': int_or_none(entity.get('thumbs:down:count')), 'tags': tags, - # As of 20181020 the API is returning `false` for this value both - # at top level and within the entity.comments:count path. The only - # other way to get this is to fetch all comments and count. 
- 'comment_count': int_or_none(data.get('comments:count')), + 'comment_count': int_or_none(entity.get('comments:count')), 'thumbnail': thumbnail, } -class MindsActivityIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?minds\.com/newsfeed/(?P[0-9]+)' +class MindsFeedBaseIE(MindsBaseIE): + _PAGE_SIZE = 150 - def _real_extract(self, url): - guid = self._match_id(url) - api_url = 'https://www.minds.com/api/v1/newsfeed/single/%s' % guid - token = self._get_cookies(url).get('XSRF-TOKEN') - headers = { - 'authority': 'www.minds.com', - 'referer': url, - 'x-xsrf-token': token.value if token else '', - } - data = self._download_json(api_url, guid, headers=headers) - return self.url_result('https://www.minds.com/media/%s' % data['activity']['entity_guid']) - - -class MindsChannelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?minds\.com/(?!newsfeed|media|api)(?P[^/]+)' - - def _real_extract(self, url): - channel_name = self._match_id(url) - api_url = 'https://www.minds.com/api/v1/channel/%s' % channel_name - token = self._get_cookies(url).get('XSRF-TOKEN') - headers = { - 'authority': 'www.minds.com', - 'referer': url, - 'x-xsrf-token': token.value if token else '', - } - data = self._download_json(api_url, channel_name, headers=headers) - channel = data.get('channel', {}) - params = {'limit': 12, 'offset': ''} - api_url = 'https://www.minds.com/api/v1/newsfeed/personal/%s' % channel['guid'] - entries = [] + def _entries(self, feed_id): + query = {'limit': self._PAGE_SIZE, 'sync': 1} + i = 1 while True: - data = self._download_json(api_url, channel['guid'], - headers=headers, query=params) - activity = data.get('activity', []) - if len(activity) == 0 or not data.get('load-next'): - break - for info in activity: - if info.get('custom_type') != 'video': + data = self._call_api( + 'v2/feeds/container/%s/videos' % feed_id, + feed_id, 'page %s' % i, query) + entities = data.get('entities') or [] + for entity in entities: + guid = entity.get('guid') + if not guid: continue - entries.append(self.url_result('https://www.minds.com/media/%s' % info['entity_guid'])) - params['offset'] = data['load-next'] - return self.playlist_result(entries, - playlist_title='%s activity' % channel_name) + yield self.url_result( + 'https://www.minds.com/newsfeed/' + guid, + MindsIE.ie_key(), guid) + query['from_timestamp'] = data['load-next'] + if not (query['from_timestamp'] and len(entities) == self._PAGE_SIZE): + break + i += 1 + + def _real_extract(self, url): + feed_id = self._match_id(url) + feed = self._call_api( + 'v1/%s/%s' % (self._FEED_PATH, feed_id), + feed_id, self._FEED_TYPE)[self._FEED_TYPE] + + return self.playlist_result( + self._entries(feed['guid']), feed_id, + strip_or_none(feed.get('name')), + feed.get('briefdescription')) + + +class MindsChannelIE(MindsFeedBaseIE): + _FEED_TYPE = 'channel' + IE_NAME = 'minds:' + _FEED_TYPE + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?!(?:newsfeed|media|api|archive|groups)/)(?P[^/?&#]+)' + _FEED_PATH = 'channel' + _TEST = { + 'url': 'https://www.minds.com/ottman', + 'info_dict': { + 'id': 'ottman', + 'title': 'Bill Ottman', + 'description': 'Co-creator & CEO @minds', + }, + 'playlist_mincount': 54, + } + + +class MindsGroupIE(MindsFeedBaseIE): + _FEED_TYPE = 'group' + IE_NAME = 'minds:' + _FEED_TYPE + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'groups/profile/(?P[0-9]+)' + _FEED_PATH = 'groups/group' + _TEST = { + 'url': 'https://www.minds.com/groups/profile/785582576369672204/feed/videos', + 'info_dict': { + 'id': '785582576369672204', + 'title': 
'Cooking Videos', + }, + 'playlist_mincount': 1, + } From 4d3655d3d9879f02acabf6667a3946a4563f2d08 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:13:59 +0100 Subject: [PATCH 309/384] [aljazeera] fix extraction(closes #20911)(closes #27779) --- haruhi_dl/extractor/aljazeera.py | 41 +++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/haruhi_dl/extractor/aljazeera.py b/haruhi_dl/extractor/aljazeera.py index c68be3134..c4f915a3c 100644 --- a/haruhi_dl/extractor/aljazeera.py +++ b/haruhi_dl/extractor/aljazeera.py @@ -1,13 +1,16 @@ from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?Pprogram/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P[^/?&#]+)' _TESTS = [{ - 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', + 'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance', 'info_dict': { 'id': '3792260579001', 'ext': 'mp4', @@ -20,14 +23,34 @@ class AlJazeeraIE(InfoExtractor): 'add_ie': ['BrightcoveNew'], 'skip': 'Not accessible from Travis CI server', }, { - 'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html', + 'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off', + 'only_matching': True, + }, { + 'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' def _real_extract(self, url): - program_name = self._match_id(url) - webpage = self._download_webpage(url, program_name) - brightcove_id = self._search_regex( - r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id') - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + post_type, name = re.match(self._VALID_URL, url).groups() + post_type = { + 'features': 'post', + 'program': 'episode', + 'videos': 'video', + }[post_type.split('/')[0]] + video = self._download_json( + 'https://www.aljazeera.com/graphql', name, query={ + 'operationName': 'SingleArticleQuery', + 'variables': json.dumps({ + 'name': name, + 'postType': post_type, + }), + }, headers={ + 'wp-site': 'aje', + })['data']['article']['video'] + video_id = video['id'] + account_id = video.get('accountId') or '665003303001' + player_id = video.get('playerId') or 'BkeSH5BDb' + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), + 'BrightcoveNew', video_id) From 300dfe0df27111a8d4db4de90030a3066a67ce56 Mon Sep 17 00:00:00 2001 From: Brian Marks Date: Fri, 26 Feb 2021 16:14:08 +0100 Subject: [PATCH 310/384] =?UTF-8?q?[americastestkitchen]=20Improve=20metad?= =?UTF-8?q?ata=20extraction=20for=20ATK=20episodes=20(#=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …27860) --- haruhi_dl/extractor/americastestkitchen.py | 30 ++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/americastestkitchen.py b/haruhi_dl/extractor/americastestkitchen.py index e20f00fc3..7d2c375c4 100644 
--- a/haruhi_dl/extractor/americastestkitchen.py +++ b/haruhi_dl/extractor/americastestkitchen.py @@ -6,8 +6,10 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + int_or_none, try_get, unified_strdate, + unified_timestamp, ) @@ -22,8 +24,8 @@ class AmericasTestKitchenIE(InfoExtractor): 'ext': 'mp4', 'description': 'md5:64e606bfee910627efc4b5f050de92b3', 'thumbnail': r're:^https?://', - 'timestamp': 1523664000, - 'upload_date': '20180414', + 'timestamp': 1523318400, + 'upload_date': '20180410', 'release_date': '20180410', 'series': "America's Test Kitchen", 'season_number': 18, @@ -33,6 +35,27 @@ class AmericasTestKitchenIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # Metadata parsing behaves differently for newer episodes (705) as opposed to older episodes (582 above) + 'url': 'https://www.americastestkitchen.com/episode/705-simple-chicken-dinner', + 'md5': '06451608c57651e985a498e69cec17e5', + 'info_dict': { + 'id': '5fbe8c61bda2010001c6763b', + 'title': 'Simple Chicken Dinner', + 'ext': 'mp4', + 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7', + 'thumbnail': r're:^https?://', + 'timestamp': 1610755200, + 'upload_date': '20210116', + 'release_date': '20210116', + 'series': "America's Test Kitchen", + 'season_number': 21, + 'episode': 'Simple Chicken Dinner', + 'episode_number': 3, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, @@ -60,7 +83,10 @@ class AmericasTestKitchenIE(InfoExtractor): 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'], 'ie_key': 'Zype', 'description': clean_html(video.get('description')), + 'timestamp': unified_timestamp(video.get('publishDate')), 'release_date': unified_strdate(video.get('publishDate')), + 'episode_number': int_or_none(episode.get('number')), + 'season_number': int_or_none(episode.get('season')), 'series': try_get(episode, lambda x: x['show']['title']), 'episode': episode.get('title'), } From 5a5b791576c035e9af39c707f3fdc52331c7dc92 Mon Sep 17 00:00:00 2001 From: DrWursterich <31037782+DrWursterich@users.noreply.github.com> Date: Fri, 26 Feb 2021 16:14:15 +0100 Subject: [PATCH 311/384] [9gag] Fix Extraction (#23022) --- haruhi_dl/extractor/ninegag.py | 200 ++++++++++++++++++++------------- 1 file changed, 122 insertions(+), 78 deletions(-) diff --git a/haruhi_dl/extractor/ninegag.py b/haruhi_dl/extractor/ninegag.py index dc6a27d36..3753bc0a2 100644 --- a/haruhi_dl/extractor/ninegag.py +++ b/haruhi_dl/extractor/ninegag.py @@ -3,102 +3,146 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import str_to_int +from ..utils import ( + determine_ext, + url_or_none, + int_or_none, + float_or_none, + ExtractorError +) class NineGagIE(InfoExtractor): IE_NAME = '9gag' - _VALID_URL = r'https?://(?:www\.)?9gag(?:\.com/tv|\.tv)/(?:p|embed)/(?P[a-zA-Z0-9]+)(?:/(?P[^?#/]+))?' + _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome', + 'url': 'https://9gag.com/gag/an5Qz5b', 'info_dict': { - 'id': 'kXzwOKyGlSA', - 'ext': 'mp4', - 'description': 'This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. 
(Thanks, Dino!)', - 'title': '\"People Are Awesome 2013\" Is Absolutely Awesome', - 'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA', - 'uploader': 'CompilationChannel', - 'upload_date': '20131110', - 'view_count': int, - }, - 'add_ie': ['Youtube'], + 'id': 'an5Qz5b', + 'ext': 'webm', + 'title': 'Dogs playing tetherball', + 'upload_date': '20191108', + 'timestamp': 1573243994, + 'age_limit': 0, + 'categories': [ + 'Wholesome' + ], + 'tags': [ + 'Dog' + ] + } }, { - 'url': 'http://9gag.com/tv/p/aKolP3', + 'url': 'https://9gag.com/gag/ae5Ag7B', 'info_dict': { - 'id': 'aKolP3', - 'ext': 'mp4', - 'title': 'This Guy Travelled 11 countries In 44 days Just To Make This Amazing Video', - 'description': "I just saw more in 1 minute than I've seen in 1 year. This guy's video is epic!!", - 'uploader_id': 'rickmereki', - 'uploader': 'Rick Mereki', - 'upload_date': '20110803', - 'view_count': int, - }, - 'add_ie': ['Vimeo'], - }, { - 'url': 'http://9gag.com/tv/p/KklwM', - 'only_matching': True, - }, { - 'url': 'http://9gag.tv/p/Kk2X5', - 'only_matching': True, - }, { - 'url': 'http://9gag.com/tv/embed/a5Dmvl', - 'only_matching': True, + 'id': 'ae5Ag7B', + 'ext': 'webm', + 'title': 'Capybara Agility Training', + 'upload_date': '20191108', + 'timestamp': 1573237208, + 'age_limit': 0, + 'categories': [ + 'Awesome' + ], + 'tags': [ + 'Weimaraner', + 'American Pit Bull Terrier' + ] + } }] - _EXTERNAL_VIDEO_PROVIDER = { - '1': { - 'url': '%s', - 'ie_key': 'Youtube', - }, - '2': { - 'url': 'http://player.vimeo.com/video/%s', - 'ie_key': 'Vimeo', - }, - '3': { - 'url': 'http://instagram.com/p/%s', - 'ie_key': 'Instagram', - }, - '4': { - 'url': 'http://vine.co/v/%s', - 'ie_key': 'Vine', - }, + _EXTERNAL_VIDEO_PROVIDERS = { + 'Youtube': 'https://youtube.com/watch?v=%s' } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + rawJsonData = self._search_regex( + r'window._config\s*=\s*JSON.parse\(["\']({.+?})["\']\);', + webpage, + 'data') + rawJsonData = rawJsonData.replace('\\"', '"').replace('\\\\/', '/') + data = self._parse_json(rawJsonData, video_id)['data']['post'] - webpage = self._download_webpage(url, display_id) + if data['type'] == 'Video': + vid = data['video']['id'] + ie_key = data['video']['source'].capitalize() + return { + '_type': 'url_transparent', + 'url': self._EXTERNAL_VIDEO_PROVIDERS[ie_key] % vid, + 'ie_key': ie_key, + 'id': vid, + 'duration': data['video'].get('duration'), + 'start_time': data['video'].get('startTs') + } - post_view = self._parse_json( - self._search_regex( - r'var\s+postView\s*=\s*new\s+app\.PostView\({\s*post:\s*({.+?})\s*,\s*posts:\s*prefetchedCurrentPost', - webpage, 'post view'), - display_id) + if data['type'] == 'EmbedVideo': + vid = data['video']['id'] + ie_key = data['video']['source'].capitalize() + return { + '_type': 'url_transparent', + 'url': data['video']['embedUrl'], + #'ie_key': vid, + 'start_time': data['video'].get('startTs') + } - ie_key = None - source_url = post_view.get('sourceUrl') - if not source_url: - external_video_id = post_view['videoExternalId'] - external_video_provider = post_view['videoExternalProvider'] - source_url = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['url'] % external_video_id - ie_key = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['ie_key'] - title = post_view['title'] - description = post_view.get('description') - view_count = 
str_to_int(post_view.get('externalView')) - thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w') + if data['type'] != 'Animated': + raise ExtractorError( + 'The given url does not contain a video', + expected=True) + + duration = None + formats = [] + thumbnails = [] + for key in data['images']: + image = data['images'][key] + if 'duration' in image and duration is None: + duration = int_or_none(image['duration']) + url = url_or_none(image.get('url')) + if url == None: + continue + ext = determine_ext(url) + if ext == 'jpg' or ext == 'png': + thumbnail = { + 'url': url, + 'width': float_or_none(image.get('width')), + 'height': float_or_none(image.get('height')) + } + thumbnails.append(thumbnail) + elif ext == 'webm' or ext == 'mp4': + formats.append({ + 'format_id': re.sub(r'.*_([^\.]+).(.*)', r'\1_\2', url), + 'ext': ext, + 'url': url, + 'width': float_or_none(image.get('width')), + 'height': float_or_none(image.get('height')) + }) + section = None + postSection = data.get('postSection') + if postSection != None and 'name' in postSection: + section = re.sub(r'\\[^\\]{5}', '', postSection['name']) + age_limit = int_or_none(data.get('nsfw')) + if age_limit != None: + age_limit = age_limit * 18 + tags = None + if 'tags' in data: + tags = [] + for tag in data.get('tags') or []: + tags.append(tag.get('key')) return { - '_type': 'url_transparent', - 'url': source_url, - 'ie_key': ie_key, 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'view_count': view_count, - 'thumbnail': thumbnail, + 'title': data['title'], + 'timestamp': int_or_none(data.get('creationTs')), + 'duration': duration, + 'formats': formats, + 'thumbnails': thumbnails, + 'like_count': int_or_none(data.get('upVoteCount')), + 'dislike_count': int_or_none(data.get('downVoteCount')), + 'comment_count': int_or_none(data.get('commentsCount')), + 'age_limit': age_limit, + 'categories': [section], + 'tags': tags, + 'is_live': False } From b99e3e93f383ce35e559a85adee20ea8ad9b15a4 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:14:21 +0100 Subject: [PATCH 312/384] [ninegag] improve extraction --- haruhi_dl/extractor/ninegag.py | 189 +++++++++++++++------------------ 1 file changed, 83 insertions(+), 106 deletions(-) diff --git a/haruhi_dl/extractor/ninegag.py b/haruhi_dl/extractor/ninegag.py index 3753bc0a2..440f865bc 100644 --- a/haruhi_dl/extractor/ninegag.py +++ b/haruhi_dl/extractor/ninegag.py @@ -1,148 +1,125 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( determine_ext, - url_or_none, + ExtractorError, int_or_none, - float_or_none, - ExtractorError + try_get, + url_or_none, ) class NineGagIE(InfoExtractor): IE_NAME = '9gag' - _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[^/?&#]+)' - _TESTS = [{ - 'url': 'https://9gag.com/gag/an5Qz5b', - 'info_dict': { - 'id': 'an5Qz5b', - 'ext': 'webm', - 'title': 'Dogs playing tetherball', - 'upload_date': '20191108', - 'timestamp': 1573243994, - 'age_limit': 0, - 'categories': [ - 'Wholesome' - ], - 'tags': [ - 'Dog' - ] - } - }, { + _TEST = { 'url': 'https://9gag.com/gag/ae5Ag7B', 'info_dict': { 'id': 'ae5Ag7B', - 'ext': 'webm', + 'ext': 'mp4', 'title': 'Capybara Agility Training', 'upload_date': '20191108', 'timestamp': 1573237208, - 'age_limit': 0, - 'categories': [ - 'Awesome' - ], - 'tags': [ - 'Weimaraner', - 'American Pit Bull 
Terrier' - ] + 'categories': ['Awesome'], + 'tags': ['Weimaraner', 'American Pit Bull Terrier'], + 'duration': 44, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, } - }] - - _EXTERNAL_VIDEO_PROVIDERS = { - 'Youtube': 'https://youtube.com/watch?v=%s' } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - rawJsonData = self._search_regex( - r'window._config\s*=\s*JSON.parse\(["\']({.+?})["\']\);', - webpage, - 'data') - rawJsonData = rawJsonData.replace('\\"', '"').replace('\\\\/', '/') - data = self._parse_json(rawJsonData, video_id)['data']['post'] + post_id = self._match_id(url) + post = self._download_json( + 'https://9gag.com/v1/post', post_id, query={ + 'id': post_id + })['data']['post'] - if data['type'] == 'Video': - vid = data['video']['id'] - ie_key = data['video']['source'].capitalize() - return { - '_type': 'url_transparent', - 'url': self._EXTERNAL_VIDEO_PROVIDERS[ie_key] % vid, - 'ie_key': ie_key, - 'id': vid, - 'duration': data['video'].get('duration'), - 'start_time': data['video'].get('startTs') - } - - if data['type'] == 'EmbedVideo': - vid = data['video']['id'] - ie_key = data['video']['source'].capitalize() - return { - '_type': 'url_transparent', - 'url': data['video']['embedUrl'], - #'ie_key': vid, - 'start_time': data['video'].get('startTs') - } - - if data['type'] != 'Animated': + if post.get('type') != 'Animated': raise ExtractorError( 'The given url does not contain a video', expected=True) + title = post['title'] + duration = None formats = [] thumbnails = [] - for key in data['images']: - image = data['images'][key] - if 'duration' in image and duration is None: - duration = int_or_none(image['duration']) - url = url_or_none(image.get('url')) - if url == None: + for key, image in (post.get('images') or {}).items(): + image_url = url_or_none(image.get('url')) + if not image_url: continue - ext = determine_ext(url) - if ext == 'jpg' or ext == 'png': - thumbnail = { - 'url': url, - 'width': float_or_none(image.get('width')), - 'height': float_or_none(image.get('height')) - } - thumbnails.append(thumbnail) - elif ext == 'webm' or ext == 'mp4': - formats.append({ - 'format_id': re.sub(r'.*_([^\.]+).(.*)', r'\1_\2', url), + ext = determine_ext(image_url) + image_id = key.strip('image') + common = { + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } + if ext in ('jpg', 'png'): + webp_url = image.get('webpUrl') + if webp_url: + t = common.copy() + t.update({ + 'id': image_id + '-webp', + 'url': webp_url, + }) + thumbnails.append(t) + common.update({ + 'id': image_id, 'ext': ext, - 'url': url, - 'width': float_or_none(image.get('width')), - 'height': float_or_none(image.get('height')) }) - section = None - postSection = data.get('postSection') - if postSection != None and 'name' in postSection: - section = re.sub(r'\\[^\\]{5}', '', postSection['name']) - age_limit = int_or_none(data.get('nsfw')) - if age_limit != None: - age_limit = age_limit * 18 + thumbnails.append(common) + elif ext in ('webm', 'mp4'): + if not duration: + duration = int_or_none(image.get('duration')) + common['acodec'] = 'none' if image.get('hasAudio') == 0 else None + for vcodec in ('vp8', 'vp9', 'h265'): + c_url = image.get(vcodec + 'Url') + if not c_url: + continue + c_f = common.copy() + c_f.update({ + 'format_id': image_id + '-' + vcodec, + 'url': c_url, + 'vcodec': vcodec, + }) + formats.append(c_f) + common.update({ + 'ext': ext, + 'format_id': 
image_id, + }) + formats.append(common) + self._sort_formats(formats) + + section = try_get(post, lambda x: x['postSection']['name']) + tags = None - if 'tags' in data: + post_tags = post.get('tags') + if post_tags: tags = [] - for tag in data.get('tags') or []: - tags.append(tag.get('key')) + for tag in post_tags: + tag_key = tag.get('key') + if not tag_key: + continue + tags.append(tag_key) + + get_count = lambda x: int_or_none(post.get(x + 'Count')) return { - 'id': video_id, - 'title': data['title'], - 'timestamp': int_or_none(data.get('creationTs')), + 'id': post_id, + 'title': title, + 'timestamp': int_or_none(post.get('creationTs')), 'duration': duration, 'formats': formats, 'thumbnails': thumbnails, - 'like_count': int_or_none(data.get('upVoteCount')), - 'dislike_count': int_or_none(data.get('downVoteCount')), - 'comment_count': int_or_none(data.get('commentsCount')), - 'age_limit': age_limit, - 'categories': [section], + 'like_count': get_count('upVote'), + 'dislike_count': get_count('downVote'), + 'comment_count': get_count('comments'), + 'age_limit': 18 if post.get('nsfw') == 1 else None, + 'categories': [section] if section else None, 'tags': tags, - 'is_live': False } From 6295bb4307ee377c43de731fa25d807ec5c2b178 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:14:27 +0100 Subject: [PATCH 313/384] [yahoo] fix single video extraction --- haruhi_dl/extractor/yahoo.py | 80 ++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/haruhi_dl/extractor/yahoo.py b/haruhi_dl/extractor/yahoo.py index e4615376c..a17b10d6e 100644 --- a/haruhi_dl/extractor/yahoo.py +++ b/haruhi_dl/extractor/yahoo.py @@ -177,46 +177,9 @@ class YahooIE(InfoExtractor): 'only_matching': True, }] - def _real_extract(self, url): - url, country, display_id = re.match(self._VALID_URL, url).groups() - if not country: - country = 'us' - else: - country = country.split('-')[0] - api_base = 'https://%s.yahoo.com/_td/api/resource/' % country - - for i, uuid in enumerate(['url=' + url, 'ymedia-alias=' + display_id]): - content = self._download_json( - api_base + 'content;getDetailView=true;uuids=["%s"]' % uuid, - display_id, 'Downloading content JSON metadata', fatal=i == 1) - if content: - item = content['items'][0] - break - - if item.get('type') != 'video': - entries = [] - - cover = item.get('cover') or {} - if cover.get('type') == 'yvideo': - cover_url = cover.get('url') - if cover_url: - entries.append(self.url_result( - cover_url, 'Yahoo', cover.get('uuid'))) - - for e in item.get('body', []): - if e.get('type') == 'videoIframe': - iframe_url = e.get('url') - if not iframe_url: - continue - entries.append(self.url_result(iframe_url)) - - return self.playlist_result( - entries, item.get('uuid'), - item.get('title'), item.get('summary')) - - video_id = item['uuid'] + def _extract_yahoo_video(self, video_id, country): video = self._download_json( - api_base + 'VideoService.videos;view=full;video_ids=["%s"]' % video_id, + 'https://%s.yahoo.com/_td/api/resource/VideoService.videos;view=full;video_ids=["%s"]' % (country, video_id), video_id, 'Downloading video JSON metadata')[0] title = video['title'] @@ -298,7 +261,6 @@ class YahooIE(InfoExtractor): 'id': video_id, 'title': self._live_title(title) if is_live else title, 'formats': formats, - 'display_id': display_id, 'thumbnails': thumbnails, 'description': clean_html(video.get('description')), 'timestamp': parse_iso8601(video.get('publish_time')), @@ -311,6 +273,44 @@ class YahooIE(InfoExtractor): 
'episode_number': int_or_none(series_info.get('episode_number')), } + def _real_extract(self, url): + url, country, display_id = re.match(self._VALID_URL, url).groups() + if not country: + country = 'us' + else: + country = country.split('-')[0] + + item = self._download_json( + 'https://%s.yahoo.com/caas/content/article' % country, display_id, + 'Downloading content JSON metadata', query={ + 'url': url + })['items'][0]['data']['partnerData'] + + if item.get('type') != 'video': + entries = [] + + cover = item.get('cover') or {} + if cover.get('type') == 'yvideo': + cover_url = cover.get('url') + if cover_url: + entries.append(self.url_result( + cover_url, 'Yahoo', cover.get('uuid'))) + + for e in (item.get('body') or []): + if e.get('type') == 'videoIframe': + iframe_url = e.get('url') + if not iframe_url: + continue + entries.append(self.url_result(iframe_url)) + + return self.playlist_result( + entries, item.get('uuid'), + item.get('title'), item.get('summary')) + + info = self._extract_yahoo_video(item['uuid'], country) + info['display_id'] = display_id + return info + class YahooSearchIE(SearchInfoExtractor): IE_DESC = 'Yahoo screen search' From 6e51fd65a87f4c9c2bfe568d3d1188683fe8c2a5 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:14:37 +0100 Subject: [PATCH 314/384] [aol] add support for yahoo videos(closes #26650) --- haruhi_dl/extractor/aol.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/haruhi_dl/extractor/aol.py b/haruhi_dl/extractor/aol.py index e87994a6a..f6ecb8438 100644 --- a/haruhi_dl/extractor/aol.py +++ b/haruhi_dl/extractor/aol.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .yahoo import YahooIE from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, @@ -15,9 +15,9 @@ from ..utils import ( ) -class AolIE(InfoExtractor): +class AolIE(YahooIE): IE_NAME = 'aol.com' - _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P[0-9a-f]+)' + _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' _TESTS = [{ # video with 5min ID @@ -76,10 +76,16 @@ class AolIE(InfoExtractor): }, { 'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/', 'only_matching': True, + }, { + # Yahoo video + 'url': 'https://www.aol.com/video/play/991e6700-ac02-11ea-99ff-357400036f61/24bbc846-3e30-3c46-915e-fe8ccd7fcc46/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + if '-' in video_id: + return self._extract_yahoo_video(video_id, 'us') response = self._download_json( 'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id, From d381066e1d4297109a3d196cd8724fea836244aa Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:15:17 +0100 Subject: [PATCH 315/384] [trovo] Add new extractor(closes #26125) --- haruhi_dl/extractor/extractors.py | 4 + haruhi_dl/extractor/trovo.py | 193 ++++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+) create mode 100644 haruhi_dl/extractor/trovo.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index d61f4f247..1fe6132d3 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1287,6 +1287,10 @@ from .transistorfm import ( TransistorFMShareIE, ) from .trilulilu import TriluliluIE +from .trovo import ( 
+ TrovoIE, + TrovoVodIE, +) from .trunews import TruNewsIE from .trutv import TruTVIE from .tubafm import ( diff --git a/haruhi_dl/extractor/trovo.py b/haruhi_dl/extractor/trovo.py new file mode 100644 index 000000000..43745213d --- /dev/null +++ b/haruhi_dl/extractor/trovo.py @@ -0,0 +1,193 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + try_get, +) + + +class TrovoBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?trovo\.live/' + + def _extract_streamer_info(self, data): + streamer_info = data.get('streamerInfo') or {} + username = streamer_info.get('userName') + return { + 'uploader': streamer_info.get('nickName'), + 'uploader_id': str_or_none(streamer_info.get('uid')), + 'uploader_url': 'https://trovo.live/' + username if username else None, + } + + +class TrovoIE(TrovoBaseIE): + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?!(?:clip|video)/)(?P[^/?&#]+)' + + def _real_extract(self, url): + username = self._match_id(url) + live_info = self._download_json( + 'https://gql.trovo.live/', username, query={ + 'query': '''{ + getLiveInfo(params: {userName: "%s"}) { + isLive + programInfo { + coverUrl + id + streamInfo { + desc + playUrl + } + title + } + streamerInfo { + nickName + uid + userName + } + } +}''' % username, + })['data']['getLiveInfo'] + if live_info.get('isLive') == 0: + raise ExtractorError('%s is offline' % username, expected=True) + program_info = live_info['programInfo'] + program_id = program_info['id'] + title = self._live_title(program_info['title']) + + formats = [] + for stream_info in (program_info.get('streamInfo') or []): + play_url = stream_info.get('playUrl') + if not play_url: + continue + format_id = stream_info.get('desc') + formats.append({ + 'format_id': format_id, + 'height': int_or_none(format_id[:-1]) if format_id else None, + 'url': play_url, + }) + self._sort_formats(formats) + + info = { + 'id': program_id, + 'title': title, + 'formats': formats, + 'thumbnail': program_info.get('coverUrl'), + 'is_live': True, + } + info.update(self._extract_streamer_info(live_info)) + return info + + +class TrovoVodIE(TrovoBaseIE): + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?:clip|video)/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://trovo.live/video/ltv-100095501_100095501_1609596043', + 'info_dict': { + 'id': 'ltv-100095501_100095501_1609596043', + 'ext': 'mp4', + 'title': 'Spontaner 12 Stunden Stream! 
- Ok Boomer!', + 'uploader': 'Exsl', + 'timestamp': 1609640305, + 'upload_date': '20210103', + 'uploader_id': '100095501', + 'duration': 43977, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'comments': 'mincount:8', + 'categories': ['Grand Theft Auto V'], + }, + }, { + 'url': 'https://trovo.live/clip/lc-5285890810184026005', + 'only_matching': True, + }] + + def _real_extract(self, url): + vid = self._match_id(url) + resp = self._download_json( + 'https://gql.trovo.live/', vid, data=json.dumps([{ + 'query': '''{ + batchGetVodDetailInfo(params: {vids: ["%s"]}) { + VodDetailInfos + } +}''' % vid, + }, { + 'query': '''{ + getCommentList(params: {appInfo: {postID: "%s"}, pageSize: 1000000000, preview: {}}) { + commentList { + author { + nickName + uid + } + commentID + content + createdAt + parentID + } + } +}''' % vid, + }]).encode(), headers={ + 'Content-Type': 'application/json', + }) + vod_detail_info = resp[0]['data']['batchGetVodDetailInfo']['VodDetailInfos'][vid] + vod_info = vod_detail_info['vodInfo'] + title = vod_info['title'] + + language = vod_info.get('languageName') + formats = [] + for play_info in (vod_info.get('playInfos') or []): + play_url = play_info.get('playUrl') + if not play_url: + continue + format_id = play_info.get('desc') + formats.append({ + 'ext': 'mp4', + 'filesize': int_or_none(play_info.get('fileSize')), + 'format_id': format_id, + 'height': int_or_none(format_id[:-1]) if format_id else None, + 'language': language, + 'protocol': 'm3u8_native', + 'tbr': int_or_none(play_info.get('bitrate')), + 'url': play_url, + }) + self._sort_formats(formats) + + category = vod_info.get('categoryName') + get_count = lambda x: int_or_none(vod_info.get(x + 'Num')) + + comment_list = try_get(resp, lambda x: x[1]['data']['getCommentList']['commentList'], list) or [] + comments = [] + for comment in comment_list: + content = comment.get('content') + if not content: + continue + author = comment.get('author') or {} + parent = comment.get('parentID') + comments.append({ + 'author': author.get('nickName'), + 'author_id': str_or_none(author.get('uid')), + 'id': str_or_none(comment.get('commentID')), + 'text': content, + 'timestamp': int_or_none(comment.get('createdAt')), + 'parent': 'root' if parent == 0 else str_or_none(parent), + }) + + info = { + 'id': vid, + 'title': title, + 'formats': formats, + 'thumbnail': vod_info.get('coverUrl'), + 'timestamp': int_or_none(vod_info.get('publishTs')), + 'duration': int_or_none(vod_info.get('duration')), + 'view_count': get_count('watch'), + 'like_count': get_count('like'), + 'comment_count': get_count('comment'), + 'comments': comments, + 'categories': [category] if category else None, + } + info.update(self._extract_streamer_info(vod_detail_info)) + return info From 9b58478829760eddf9ce3434540abb3f8bd37919 Mon Sep 17 00:00:00 2001 From: Brian Marks Date: Fri, 26 Feb 2021 16:16:38 +0100 Subject: [PATCH 316/384] =?UTF-8?q?[americastestkitchen]=20Add=20support?= =?UTF-8?q?=20for=20downloading=20entire=20seasons=20(#27=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …861) --- haruhi_dl/extractor/americastestkitchen.py | 67 ++++++++++++++++++++++ haruhi_dl/extractor/extractors.py | 5 +- 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/americastestkitchen.py b/haruhi_dl/extractor/americastestkitchen.py index 7d2c375c4..35d3220c1 100644 --- a/haruhi_dl/extractor/americastestkitchen.py +++ b/haruhi_dl/extractor/americastestkitchen.py @@ -1,6 
+1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor @@ -90,3 +91,69 @@ class AmericasTestKitchenIE(InfoExtractor): 'series': try_get(episode, lambda x: x['show']['title']), 'episode': episode.get('title'), } + + +class AmericasTestKitchenSeasonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|cookscountry)\.com/episodes/browse/season_(?P\d+)' + _TESTS = [{ + # ATK Season + 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', + 'info_dict': { + 'id': 'season-1', + 'title': 'Season 1', + }, + 'playlist_count': 13, + }, { + # Cooks Country Season + 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'info_dict': { + 'id': 'season-12', + 'title': 'Season 12', + }, + 'playlist_count': 13, + }, { + # Multi-digit season + 'url': 'https://www.americastestkitchen.com/episodes/browse/season_20', + 'only_matching': True, + }] + + def _real_extract(self, url): + show_name, season = re.match(self._VALID_URL, url).groups() + + slug = 'atk' if show_name == 'americastestkitchen' else 'cco' + + filters = [ + 'search_season_list:Season %s' % season, + 'search_document_klass:episode', + 'search_show_slug:%s' % slug, + ] + + season_search = self._download_json( + 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_atk_season_desc_production', + season, headers={ + 'Origin': 'https://www.%s.com' % show_name, + 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', + 'X-Algolia-Application-Id': 'Y1FNZXUI30', + }, query={ + 'facetFilters': json.dumps(filters), + 'attributesToRetrieve': 'search_url', + 'attributesToHighlight': '', + # ATK and CCO generally have less than 26 episodes per season + 'hitsPerPage': '100', + }) + + entries = [ + self.url_result( + 'https://www.%s.com%s' % (show_name, episode['search_url']), + 'AmericasTestKitchen', + try_get(episode, lambda e: e['objectID'].split('_')[-1])) + for episode in season_search['hits'] + if 'search_url' in episode and episode['search_url'] + ] + + return { + '_type': 'playlist', + 'id': 'season-%s' % season, + 'title': 'Season %s' % season, + 'entries': sorted(entries, key=lambda e: e.get('id')), + } diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 1fe6132d3..87d75e071 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -49,7 +49,10 @@ from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE from .amara import AmaraIE from .amcnetworks import AMCNetworksIE -from .americastestkitchen import AmericasTestKitchenIE +from .americastestkitchen import ( + AmericasTestKitchenIE, + AmericasTestKitchenSeasonIE, +) from .animeondemand import AnimeOnDemandIE from .anvato import AnvatoIE from .aol import AolIE From 64c8ca8464fc1e304b2bc05ccc6360255e7a93c1 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:16:44 +0100 Subject: [PATCH 317/384] [americastestkitchen] improve season extraction --- haruhi_dl/extractor/americastestkitchen.py | 62 +++++++++++----------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/haruhi_dl/extractor/americastestkitchen.py b/haruhi_dl/extractor/americastestkitchen.py index 35d3220c1..be960c0f9 100644 --- a/haruhi_dl/extractor/americastestkitchen.py +++ b/haruhi_dl/extractor/americastestkitchen.py @@ -99,7 +99,7 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): # ATK Season 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', 'info_dict': { - 'id': 'season-1', 
+ 'id': 'season_1', 'title': 'Season 1', }, 'playlist_count': 13, @@ -107,53 +107,53 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): # Cooks Country Season 'url': 'https://www.cookscountry.com/episodes/browse/season_12', 'info_dict': { - 'id': 'season-12', + 'id': 'season_12', 'title': 'Season 12', }, 'playlist_count': 13, - }, { - # Multi-digit season - 'url': 'https://www.americastestkitchen.com/episodes/browse/season_20', - 'only_matching': True, }] def _real_extract(self, url): - show_name, season = re.match(self._VALID_URL, url).groups() + show_name, season_number = re.match(self._VALID_URL, url).groups() + season_number = int(season_number) slug = 'atk' if show_name == 'americastestkitchen' else 'cco' - filters = [ - 'search_season_list:Season %s' % season, - 'search_document_klass:episode', - 'search_show_slug:%s' % slug, - ] + season = 'Season %d' % season_number season_search = self._download_json( - 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_atk_season_desc_production', + 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, season, headers={ 'Origin': 'https://www.%s.com' % show_name, 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', 'X-Algolia-Application-Id': 'Y1FNZXUI30', }, query={ - 'facetFilters': json.dumps(filters), - 'attributesToRetrieve': 'search_url', + 'facetFilters': json.dumps([ + 'search_season_list:' + season, + 'search_document_klass:episode', + 'search_show_slug:' + slug, + ]), + 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, 'attributesToHighlight': '', - # ATK and CCO generally have less than 26 episodes per season - 'hitsPerPage': '100', + 'hitsPerPage': 1000, }) - entries = [ - self.url_result( - 'https://www.%s.com%s' % (show_name, episode['search_url']), - 'AmericasTestKitchen', - try_get(episode, lambda e: e['objectID'].split('_')[-1])) - for episode in season_search['hits'] - if 'search_url' in episode and episode['search_url'] - ] + def entries(): + for episode in (season_search.get('hits') or []): + search_url = episode.get('search_url') + if not search_url: + continue + yield { + '_type': 'url', + 'url': 'https://www.%s.com%s' % (show_name, search_url), + 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), + 'title': episode.get('title'), + 'description': episode.get('description'), + 'timestamp': unified_timestamp(episode.get('search_document_date')), + 'season_number': season_number, + 'episode_number': int_or_none(episode.get('search_%s_episode_number' % slug)), + 'ie_key': AmericasTestKitchenIE.ie_key(), + } - return { - '_type': 'playlist', - 'id': 'season-%s' % season, - 'title': 'Season %s' % season, - 'entries': sorted(entries, key=lambda e: e.get('id')), - } + return self.playlist_result( + entries(), 'season_%d' % season_number, season) From e5ac4c2a67ca9e6ac74c5aeed05b1d47ad3d4e14 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:16:55 +0100 Subject: [PATCH 318/384] [wat] fix format extraction(closes #27901) --- haruhi_dl/extractor/wat.py | 55 +++++++++----------------------------- 1 file changed, 13 insertions(+), 42 deletions(-) diff --git a/haruhi_dl/extractor/wat.py b/haruhi_dl/extractor/wat.py index 8ef3e0906..7214bfebf 100644 --- a/haruhi_dl/extractor/wat.py +++ b/haruhi_dl/extractor/wat.py @@ -1,12 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - 
ExtractorError, unified_strdate, HEADRequest, int_or_none, @@ -97,46 +94,20 @@ class WatIE(InfoExtractor): return red_url return None - def remove_bitrate_limit(manifest_url): - return re.sub(r'(?:max|min)_bitrate=\d+&?', '', manifest_url) - formats = [] - try: - alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')] - manifest_urls = self._download_json( - 'http://www.wat.tv/get/webhtml/' + video_id, video_id) - m3u8_url = manifest_urls.get('hls') - if m3u8_url: - m3u8_url = remove_bitrate_limit(m3u8_url) - for m3u8_alt_url in alt_urls(m3u8_url): - formats.extend(self._extract_m3u8_formats( - m3u8_alt_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - mpd_url = manifest_urls.get('mpd') - if mpd_url: - mpd_url = remove_bitrate_limit(mpd_url) - for mpd_alt_url in alt_urls(mpd_url): - formats.extend(self._extract_mpd_formats( - mpd_alt_url, video_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) - except ExtractorError: - abr = 64 - for vbr, width, height in self._FORMATS: - tbr = vbr + abr - format_id = 'http-%s' % tbr - fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr) - if self._is_valid_url(fmt_url, video_id, format_id): - formats.append({ - 'format_id': format_id, - 'url': fmt_url, - 'vbr': vbr, - 'abr': abr, - 'width': width, - 'height': height, - }) + manifest_urls = self._download_json( + 'http://www.wat.tv/get/webhtml/' + video_id, video_id) + m3u8_url = manifest_urls.get('hls') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + mpd_url = manifest_urls.get('mpd') + if mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), + video_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4') upload_date = unified_strdate(date_diffusion) if date_diffusion else None From 8976512791ab258ef4f2c88ad4e6f92966687468 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:16:59 +0100 Subject: [PATCH 319/384] [wat] remove unused variable --- haruhi_dl/extractor/wat.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/haruhi_dl/extractor/wat.py b/haruhi_dl/extractor/wat.py index 7214bfebf..f6940b371 100644 --- a/haruhi_dl/extractor/wat.py +++ b/haruhi_dl/extractor/wat.py @@ -43,15 +43,6 @@ class WatIE(InfoExtractor): }, ] - _FORMATS = ( - (200, 416, 234), - (400, 480, 270), - (600, 640, 360), - (1200, 640, 360), - (1800, 960, 540), - (2500, 1280, 720), - ) - def _real_extract(self, url): video_id = self._match_id(url) video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36)) From 48680ac382f1a87b152503f932af0225be187b9d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:17:05 +0100 Subject: [PATCH 320/384] [comedycentral] fix extraction(closes #27905) --- haruhi_dl/extractor/comedycentral.py | 141 +++++---------------------- haruhi_dl/extractor/extractors.py | 3 - haruhi_dl/extractor/mtv.py | 23 +++-- haruhi_dl/extractor/spike.py | 15 +-- 4 files changed, 37 insertions(+), 145 deletions(-) diff --git a/haruhi_dl/extractor/comedycentral.py 
b/haruhi_dl/extractor/comedycentral.py index d08b909a6..1bfa912be 100644 --- a/haruhi_dl/extractor/comedycentral.py +++ b/haruhi_dl/extractor/comedycentral.py @@ -1,142 +1,51 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor -from .common import InfoExtractor class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ - (video-clips|episodes|cc-studios|video-collections|shows(?=/[^/]+/(?!full-episodes))) - /(?P.*)''' + _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?)/(?P<id>[0-9a-z]{6})' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TESTS = [{ - 'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', - 'md5': 'c4f48e9eda1b16dd10add0744344b6d8', + 'url': 'http://www.cc.com/video-clips/5ke9v2/the-daily-show-with-trevor-noah-doc-rivers-and-steve-ballmer---the-nba-player-strike', + 'md5': 'b8acb347177c680ff18a292aa2166f80', 'info_dict': { - 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', + 'id': '89ccc86e-1b02-4f83-b0c9-1d9592ecd025', 'ext': 'mp4', - 'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother', - 'description': 'After a certain point, breastfeeding becomes c**kblocking.', - 'timestamp': 1376798400, - 'upload_date': '20130818', + 'title': 'The Daily Show with Trevor Noah|August 28, 2020|25|25149|Doc Rivers and Steve Ballmer - The NBA Player Strike', + 'description': 'md5:5334307c433892b85f4f5e5ac9ef7498', + 'timestamp': 1598670000, + 'upload_date': '20200829', }, }, { - 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview', + 'url': 'http://www.cc.com/episodes/pnzzci/drawn-together--american-idol--parody-clip-show-season-3-ep-314', 'only_matching': True, - }] - - -class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ - (?:full-episodes|shows(?=/[^/]+/full-episodes)) - /(?P<id>[^?]+)''' - _FEED_URL = 'http://comedycentral.com/feeds/mrss/' - - _TESTS = [{ - 'url': 'http://www.cc.com/full-episodes/pv391a/the-daily-show-with-trevor-noah-november-28--2016---ryan-speedo-green-season-22-ep-22028', - 'info_dict': { - 'description': 'Donald Trump is accused of exploiting his president-elect status for personal gain, Cuban leader Fidel Castro dies, and Ryan Speedo Green discusses "Sing for Your Life."', - 'title': 'November 28, 2016 - Ryan Speedo Green', - }, - 'playlist_count': 4, }, { - 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1') - videos_info = self._get_videos_info(mgid) - return videos_info - - -class ToshIE(MTVServicesInfoExtractor): - IE_DESC = 'Tosh.0' - _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)' - _FEED_URL = 'http://tosh.cc.com/feeds/mrss' - - _TESTS = [{ - 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', - 'info_dict': { - 'description': 'Tosh asked fans to share their summer plans.', - 'title': 'Twitter Users Share Summer Plans', - }, - 'playlist': [{ - 'md5': 'f269e88114c1805bb6d7653fecea9e06', - 'info_dict': { - 'id': '90498ec2-ed00-11e0-aca6-0026b9414f30', - 'ext': 'mp4', - 'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer 
Plans', - 'description': 'Tosh asked fans to share their summer plans.', - 'thumbnail': r're:^https?://.*\.jpg', - # It's really reported to be published on year 2077 - 'upload_date': '20770610', - 'timestamp': 3390510600, - 'subtitles': { - 'en': 'mincount:3', - }, - }, - }] - }, { - 'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp', + 'url': 'https://www.cc.com/video/k3sdvm/the-daily-show-with-jon-stewart-exclusive-the-fourth-estate', 'only_matching': True, }] class ComedyCentralTVIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/folgen/(?P<id>[0-9a-z]{6})' _TESTS = [{ - 'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4', + 'url': 'https://www.comedycentral.tv/folgen/pxdpec/josh-investigates-klimawandel-staffel-1-ep-1', 'info_dict': { - 'id': 'local_playlist-f99b626bdfe13568579a', - 'ext': 'flv', - 'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1', + 'id': '15907dc3-ec3c-11e8-a442-0e40cf2fc285', + 'ext': 'mp4', + 'title': 'Josh Investigates', + 'description': 'Steht uns das Ende der Welt bevor?', }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - 'url': 'http://www.comedycentral.tv/shows/1074-workaholics', - 'only_matching': True, - }, { - 'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus', - 'only_matching': True, }] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' + _GEO_COUNTRIES = ['DE'] - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - mrss_url = self._search_regex( - r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'mrss url', group='url') - - return self._get_videos_info_from_url(mrss_url, video_id) - - -class ComedyCentralShortnameIE(InfoExtractor): - _VALID_URL = r'^:(?P<id>tds|thedailyshow|theopposition)$' - _TESTS = [{ - 'url': ':tds', - 'only_matching': True, - }, { - 'url': ':thedailyshow', - 'only_matching': True, - }, { - 'url': ':theopposition', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - shortcut_map = { - 'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'theopposition': 'http://www.cc.com/shows/the-opposition-with-jordan-klepper/full-episodes', + def _get_feed_query(self, uri): + return { + 'accountOverride': 'intl.mtvi.com', + 'arcEp': 'web.cc.tv', + 'ep': 'b9032c3a', + 'imageEp': 'web.cc.tv', + 'mgid': uri, } - return self.url_result(shortcut_map[video_id]) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 87d75e071..4ab45f479 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -245,11 +245,8 @@ from .cnn import ( ) from .coub import CoubIE from .comedycentral import ( - ComedyCentralFullEpisodesIE, ComedyCentralIE, - ComedyCentralShortnameIE, ComedyCentralTVIE, - ToshIE, ) from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .commonprotocols import ( diff --git a/haruhi_dl/extractor/mtv.py b/haruhi_dl/extractor/mtv.py index df1034fc5..f5e30d22d 100644 --- a/haruhi_dl/extractor/mtv.py +++ b/haruhi_dl/extractor/mtv.py @@ -253,6 +253,10 @@ class MTVServicesInfoExtractor(InfoExtractor): return try_get(feed, lambda x: x['result']['data']['id'], compat_str) + 
@staticmethod + def _extract_child_with_type(parent, t): + return next(c for c in parent['children'] if c.get('type') == t) + def _extract_mgid(self, webpage): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf @@ -278,6 +282,13 @@ class MTVServicesInfoExtractor(InfoExtractor): if not mgid: mgid = self._extract_triforce_mgid(webpage) + if not mgid: + data = self._parse_json(self._search_regex( + r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) + main_container = self._extract_child_with_type(data, 'MainContainer') + video_player = self._extract_child_with_type(main_container, 'VideoPlayer') + mgid = video_player['props']['media']['video']['config']['uri'] + return mgid def _real_extract(self, url): @@ -349,18 +360,6 @@ class MTVIE(MTVServicesInfoExtractor): 'only_matching': True, }] - @staticmethod - def extract_child_with_type(parent, t): - children = parent['children'] - return next(c for c in children if c.get('type') == t) - - def _extract_mgid(self, webpage): - data = self._parse_json(self._search_regex( - r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) - main_container = self.extract_child_with_type(data, 'MainContainer') - video_player = self.extract_child_with_type(main_container, 'VideoPlayer') - return video_player['props']['media']['video']['config']['uri'] - class MTVJapanIE(MTVServicesInfoExtractor): IE_NAME = 'mtvjapan' diff --git a/haruhi_dl/extractor/spike.py b/haruhi_dl/extractor/spike.py index 4c5e3f7c2..5805f3d44 100644 --- a/haruhi_dl/extractor/spike.py +++ b/haruhi_dl/extractor/spike.py @@ -20,9 +20,6 @@ class BellatorIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.bellator.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] - def _extract_mgid(self, webpage): - return self._extract_triforce_mgid(webpage) - class ParamountNetworkIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' @@ -46,16 +43,6 @@ class ParamountNetworkIE(MTVServicesInfoExtractor): def _get_feed_query(self, uri): return { 'arcEp': 'paramountnetwork.com', + 'imageEp': 'paramountnetwork.com', 'mgid': uri, } - - def _extract_mgid(self, webpage): - root_data = self._parse_json(self._search_regex( - r'window\.__DATA__\s*=\s*({.+})', - webpage, 'data'), None) - - def find_sub_data(data, data_type): - return next(c for c in data['children'] if c.get('type') == data_type) - - c = find_sub_data(find_sub_data(root_data, 'MainContainer'), 'VideoPlayer') - return c['props']['media']['video']['config']['uri'] From 886d51e368331ab8667fc4c862ad5a721c5d5f4c Mon Sep 17 00:00:00 2001 From: aarubui <aarubui@users.noreply.github.com> Date: Fri, 26 Feb 2021 16:17:10 +0100 Subject: [PATCH 321/384] [njpwworld] fix extraction (#27890) --- haruhi_dl/extractor/njpwworld.py | 54 +++++++++++++++++--------------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/haruhi_dl/extractor/njpwworld.py b/haruhi_dl/extractor/njpwworld.py index 025c5d249..3639d142f 100644 --- a/haruhi_dl/extractor/njpwworld.py +++ b/haruhi_dl/extractor/njpwworld.py @@ -6,30 +6,40 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( - extract_attributes, get_element_by_class, urlencode_postdata, ) class NJPWWorldIE(InfoExtractor): - _VALID_URL = r'https?://njpwworld\.com/p/(?P<id>[a-z0-9_]+)' + _VALID_URL = r'https?://(front\.)?njpwworld\.com/p/(?P<id>[a-z0-9_]+)' IE_DESC = '新日本プロレスワールド' _NETRC_MACHINE = 'njpwworld' - _TEST = { + _TESTS = [{ 'url': 'http://njpwworld.com/p/s_series_00155_1_9/', 'info_dict': { 
'id': 's_series_00155_1_9', 'ext': 'mp4', - 'title': '第9試合 ランディ・サベージ vs リック・スタイナー', + 'title': '闘強導夢2000 2000年1月4日 東京ドーム 第9試合 ランディ・サベージ VS リック・スタイナー', 'tags': list, }, 'params': { 'skip_download': True, # AES-encrypted m3u8 }, 'skip': 'Requires login', - } + }, { + 'url': 'https://front.njpwworld.com/p/s_series_00563_16_bs', + 'info_dict': { + 'id': 's_series_00563_16_bs', + 'ext': 'mp4', + 'title': 'WORLD TAG LEAGUE 2020 & BEST OF THE SUPER Jr.27 2020年12月6日 福岡・福岡国際センター バックステージコメント(字幕あり)', + 'tags': ["福岡・福岡国際センター", "バックステージコメント", "2020", "20年代"], + }, + 'params': { + 'skip_download': True, + }, + }] _LOGIN_URL = 'https://front.njpwworld.com/auth/login' @@ -64,35 +74,27 @@ class NJPWWorldIE(InfoExtractor): webpage = self._download_webpage(url, video_id) formats = [] - for mobj in re.finditer(r'<a[^>]+\bhref=(["\'])/player.+?[^>]*>', webpage): - player = extract_attributes(mobj.group(0)) - player_path = player.get('href') - if not player_path: - continue - kind = self._search_regex( - r'(low|high)$', player.get('class') or '', 'kind', - default='low') + for kind, vid in re.findall(r'if\s+\(\s*imageQualityType\s*==\s*\'([^\']+)\'\s*\)\s*{\s*video_id\s*=\s*"(\d+)"', webpage): + player_path = '/intent?id=%s&type=url' % vid player_url = compat_urlparse.urljoin(url, player_path) - player_page = self._download_webpage( - player_url, video_id, note='Downloading player page') - entries = self._parse_html5_media_entries( - player_url, player_page, video_id, m3u8_id='hls-%s' % kind, - m3u8_entry_protocol='m3u8_native') - kind_formats = entries[0]['formats'] - for f in kind_formats: - f['quality'] = 2 if kind == 'high' else 1 - formats.extend(kind_formats) + formats.append({ + 'url': player_url, + 'format_id': kind, + 'ext': 'mp4', + 'protocol': 'm3u8', + 'quality': 2 if kind == 'high' else 1, + }) self._sort_formats(formats) - post_content = get_element_by_class('post-content', webpage) + tag_block = get_element_by_class('tag-block', webpage) tags = re.findall( - r'<li[^>]+class="tag-[^"]+"><a[^>]*>([^<]+)</a></li>', post_content - ) if post_content else None + r'<a[^>]+class="tag-[^"]+"[^>]*>([^<]+)</a>', tag_block + ) if tag_block else None return { 'id': video_id, - 'title': self._og_search_title(webpage), + 'title': get_element_by_class('article-title', webpage) or self._og_search_title(webpage), 'formats': formats, 'tags': tags, } From 1752b8b8c8932642f767d02337e59ebd7c112e41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 16:17:36 +0100 Subject: [PATCH 322/384] Introduce --output-na-placeholder (closes #27896) --- haruhi_dl/HaruhiDL.py | 7 ++++--- haruhi_dl/__init__.py | 1 + haruhi_dl/options.py | 4 ++++ test/test_HaruhiDL.py | 15 +++++++++++---- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/haruhi_dl/HaruhiDL.py b/haruhi_dl/HaruhiDL.py index a04e0dab2..6d0a59ff6 100755 --- a/haruhi_dl/HaruhiDL.py +++ b/haruhi_dl/HaruhiDL.py @@ -163,6 +163,7 @@ class HaruhiDL(object): simulate: Do not download the video files. format: Video format code. See options.py for more information. outtmpl: Template for output names. + outtmpl_na_placeholder: Placeholder for unavailable meta fields. restrictfilenames: Do not allow "&" and spaces in file names ignoreerrors: Do not stop on download errors. 
force_generic_extractor: Force downloader to use the generic extractor @@ -662,7 +663,7 @@ class HaruhiDL(object): template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v)) for k, v in template_dict.items() if v is not None and not isinstance(v, (list, tuple, dict))) - template_dict = collections.defaultdict(lambda: 'NA', template_dict) + template_dict = collections.defaultdict(lambda: self.params.get('outtmpl_na_placeholder', 'NA'), template_dict) outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) @@ -682,8 +683,8 @@ class HaruhiDL(object): # Missing numeric fields used together with integer presentation types # in format specification will break the argument substitution since - # string 'NA' is returned for missing fields. We will patch output - # template for missing fields to meet string presentation type. + # string NA placeholder is returned for missing fields. We will patch + # output template for missing fields to meet string presentation type. for numeric_field in self._NUMERIC_FIELDS: if numeric_field not in template_dict: # As of [1] format syntax is: diff --git a/haruhi_dl/__init__.py b/haruhi_dl/__init__.py index b90ca151d..f9c495323 100644 --- a/haruhi_dl/__init__.py +++ b/haruhi_dl/__init__.py @@ -340,6 +340,7 @@ def _real_main(argv=None): 'format': opts.format, 'listformats': opts.listformats, 'outtmpl': outtmpl, + 'outtmpl_na_placeholder': opts.outtmpl_na_placeholder, 'autonumber_size': opts.autonumber_size, 'autonumber_start': opts.autonumber_start, 'restrictfilenames': opts.restrictfilenames, diff --git a/haruhi_dl/options.py b/haruhi_dl/options.py index 76f97f452..66472259e 100644 --- a/haruhi_dl/options.py +++ b/haruhi_dl/options.py @@ -693,6 +693,10 @@ def parseOpts(overrideArguments=None): '-o', '--output', dest='outtmpl', metavar='TEMPLATE', help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info')) + filesystem.add_option( + '--output-na-placeholder', + dest='outtmpl_na_placeholder', metavar='PLACEHOLDER', default='NA', + help=('Placeholder value for unavailable meta fields in output filename template (default is "%default")')) filesystem.add_option( '--autonumber-size', dest='autonumber_size', metavar='NUMBER', type=int, diff --git a/test/test_HaruhiDL.py b/test/test_HaruhiDL.py index f853342f3..cb9954716 100644 --- a/test/test_HaruhiDL.py +++ b/test/test_HaruhiDL.py @@ -636,13 +636,20 @@ class TestHaruhiDL(unittest.TestCase): 'title2': '%PATH%', } - def fname(templ): - hdl = HaruhiDL({'outtmpl': templ}) + def fname(templ, na_placeholder='NA'): + params = {'outtmpl': templ} + if na_placeholder != 'NA': + params['outtmpl_na_placeholder'] = na_placeholder + hdl = HaruhiDL(params) return hdl.prepare_filename(info) self.assertEqual(fname('%(id)s.%(ext)s'), '1234.mp4') self.assertEqual(fname('%(id)s-%(width)s.%(ext)s'), '1234-NA.mp4') - # Replace missing fields with 'NA' - self.assertEqual(fname('%(uploader_date)s-%(id)s.%(ext)s'), 'NA-1234.mp4') + NA_TEST_OUTTMPL = '%(uploader_date)s-%(width)d-%(id)s.%(ext)s' + # Replace missing fields with 'NA' by default + self.assertEqual(fname(NA_TEST_OUTTMPL), 'NA-NA-1234.mp4') + # Or by provided placeholder + self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder='none'), 'none-none-1234.mp4') + self.assertEqual(fname(NA_TEST_OUTTMPL, na_placeholder=''), '--1234.mp4') self.assertEqual(fname('%(height)d.%(ext)s'), '1080.mp4') self.assertEqual(fname('%(height)6d.%(ext)s'), ' 1080.mp4') self.assertEqual(fname('%(height)-6d.%(ext)s'), '1080 .mp4') From 
c5f29934842c190d2dda3c4657aad527b2302009 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 16:18:10 +0100 Subject: [PATCH 323/384] [options] Clarify --extract-audio help string (closes #27878) --- haruhi_dl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/options.py b/haruhi_dl/options.py index 66472259e..63e11b517 100644 --- a/haruhi_dl/options.py +++ b/haruhi_dl/options.py @@ -790,7 +790,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '-x', '--extract-audio', action='store_true', dest='extractaudio', default=False, - help='Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)') + help='Convert video files to audio-only files (requires ffmpeg/avconv and ffprobe/avprobe)') postproc.add_option( '--audio-format', metavar='FORMAT', dest='audioformat', default='best', help='Specify audio format: "best", "aac", "flac", "mp3", "m4a", "opus", "vorbis", or "wav"; "%default" by default; No effect without -x') From d2324df444cbb35b66fbd23f1514908e41121f2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FAur=3DC3=3DA9lien=3D20Grosdidier=3F=3D?= <aurelien.grosdidier@gmail.com> Date: Fri, 26 Feb 2021 16:18:19 +0100 Subject: [PATCH 324/384] [franceculture] Fix extraction (closes #27891) (#27903) Co-authored-by: Sergey M. <dstftw@gmail.com> --- haruhi_dl/extractor/franceculture.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/franceculture.py b/haruhi_dl/extractor/franceculture.py index 306b45fc9..7e9ceabbc 100644 --- a/haruhi_dl/extractor/franceculture.py +++ b/haruhi_dl/extractor/franceculture.py @@ -20,7 +20,7 @@ class FranceCultureIE(InfoExtractor): 'title': 'Rendez-vous au pays des geeks', 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140301', - 'timestamp': 1393642916, + 'timestamp': 1393700400, 'vcodec': 'none', } } @@ -36,12 +36,12 @@ class FranceCultureIE(InfoExtractor): </h1>| <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*> ).*? 
- (<button[^>]+data-asset-source="[^"]+"[^>]+>) + (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>) ''', webpage, 'video data')) - video_url = video_data['data-asset-source'] - title = video_data.get('data-asset-title') or self._og_search_title(webpage) + video_url = video_data.get('data-url') or video_data['data-asset-source'] + title = video_data.get('data-asset-title') or video_data.get('data-diffusion-title') or self._og_search_title(webpage) description = self._html_search_regex( r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>', @@ -64,6 +64,6 @@ class FranceCultureIE(InfoExtractor): 'ext': ext, 'vcodec': 'none' if ext == 'mp3' else None, 'uploader': uploader, - 'timestamp': int_or_none(video_data.get('data-asset-created-date')), + 'timestamp': int_or_none(video_data.get('data-start-time')) or int_or_none(video_data.get('data-asset-created-date')), 'duration': int_or_none(video_data.get('data-duration')), } From b94ec338adb69a32f6830bf388b8ab734b86830b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= <dstftw@gmail.com> Date: Fri, 26 Feb 2021 16:18:24 +0100 Subject: [PATCH 325/384] [franceculture] Make thumbnail optional (closes #18807) --- haruhi_dl/extractor/franceculture.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/haruhi_dl/extractor/franceculture.py b/haruhi_dl/extractor/franceculture.py index 7e9ceabbc..14f4cb489 100644 --- a/haruhi_dl/extractor/franceculture.py +++ b/haruhi_dl/extractor/franceculture.py @@ -11,7 +11,7 @@ from ..utils import ( class FranceCultureIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', 'info_dict': { 'id': 'rendez-vous-au-pays-des-geeks', @@ -23,7 +23,11 @@ class FranceCultureIE(InfoExtractor): 'timestamp': 1393700400, 'vcodec': 'none', } - } + }, { + # no thumbnail + 'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -48,7 +52,7 @@ class FranceCultureIE(InfoExtractor): webpage, 'description', default=None) thumbnail = self._search_regex( r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"', - webpage, 'thumbnail', fatal=False) + webpage, 'thumbnail', default=None) uploader = self._html_search_regex( r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None) From 5c5e0318162729b3a77bbe34fd5100b388dcd2af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FAdrian=3D20Heine=3D20n=3DC3=3DA9=3D20Lan?= =?UTF-8?q?g=3F=3D?= <mail@adrianheine.de> Date: Fri, 26 Feb 2021 16:18:31 +0100 Subject: [PATCH 326/384] [ADN] Implement login (#27937) closes #17091 closes #27841 --- haruhi_dl/extractor/adn.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/adn.py b/haruhi_dl/extractor/adn.py index d611ee237..40111586d 100644 --- a/haruhi_dl/extractor/adn.py +++ b/haruhi_dl/extractor/adn.py @@ -26,6 +26,7 @@ from ..utils import ( strip_or_none, try_get, unified_strdate, + urlencode_postdata, ) @@ -51,9 +52,11 @@ class ADNIE(InfoExtractor): } } + _NETRC_MACHINE = 'animedigitalnetwork' _BASE_URL = 'http://animedigitalnetwork.fr' _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' 
_PLAYER_BASE_URL = _API_BASE_URL + 'player/' + _HEADERS = {} _RSA_KEY = (0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537) _POS_ALIGN_MAP = { 'start': 1, @@ -129,19 +132,32 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' }]) return subtitles + def _real_initialize(self): + username, password = self._get_login_info() + if username: + access_token = (self._download_json( + self._API_BASE_URL + 'authentication/login', None, + 'Logging in', errnote='Unable to log in', fatal=False, + data=urlencode_postdata({ + 'password': password, + 'rememberMe': False, + 'source': 'Web', + 'username': username, + })) or {}).get('accessToken') + if access_token: + self._HEADERS = {'authorization': 'Bearer ' + access_token} + def _real_extract(self, url): video_id = self._match_id(url) video_base_url = self._PLAYER_BASE_URL + 'video/%s/' % video_id player = self._download_json( video_base_url + 'configuration', video_id, - 'Downloading player config JSON metadata')['player'] + 'Downloading player config JSON metadata', headers=self._HEADERS)['player'] options = player['options'] user = options['user'] if not user.get('hasAccess'): - raise ExtractorError( - 'This video is only available for paying users', expected=True) - # self.raise_login_required() # FIXME: Login is not implemented + self.raise_login_required() token = self._download_json( user.get('refreshTokenUrl') or (self._PLAYER_BASE_URL + 'refresh/token'), From 36552561a682df42523293c2073aa164a7c6e454 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 16:18:36 +0100 Subject: [PATCH 327/384] [zype] fix uplynk id extraction(closes #27956) --- haruhi_dl/extractor/zype.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/zype.py b/haruhi_dl/extractor/zype.py index 60dc6cb24..2ada5dd0b 100644 --- a/haruhi_dl/extractor/zype.py +++ b/haruhi_dl/extractor/zype.py @@ -87,11 +87,16 @@ class ZypeIE(InfoExtractor): r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', body, 'm3u8 url', group='url', default=None) if not m3u8_url: - source = self._parse_json(self._search_regex( - r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, - 'source'), video_id, js_to_json) - if source.get('integration') == 'verizon-media': - m3u8_url = 'https://content.uplynk.com/%s.m3u8' % source['id'] + source = self._search_regex( + r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, 'source') + + def get_attr(key): + return self._search_regex( + r'\b%s\s*:\s*([\'"])(?P<val>(?:(?!\1).)+)\1' % key, + source, key, group='val') + + if get_attr('integration') == 'verizon-media': + m3u8_url = 'https://content.uplynk.com/%s.m3u8' % get_attr('id') formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') text_tracks = self._search_regex( From d5cdaae9c8e0c9139240a7eddd80d55e7b04a35e Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 16:18:40 +0100 Subject: [PATCH 328/384] [adn] improve login warning reporting --- haruhi_dl/extractor/adn.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/adn.py b/haruhi_dl/extractor/adn.py index 40111586d..a55ebbcbd 100644 --- a/haruhi_dl/extractor/adn.py +++ b/haruhi_dl/extractor/adn.py @@ -57,6 +57,7 @@ 
class ADNIE(InfoExtractor): _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' _PLAYER_BASE_URL = _API_BASE_URL + 'player/' _HEADERS = {} + _LOGIN_ERR_MESSAGE = 'Unable to log in' _RSA_KEY = (0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537) _POS_ALIGN_MAP = { 'start': 1, @@ -134,10 +135,12 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' def _real_initialize(self): username, password = self._get_login_info() - if username: + if not username: + return + try: access_token = (self._download_json( self._API_BASE_URL + 'authentication/login', None, - 'Logging in', errnote='Unable to log in', fatal=False, + 'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False, data=urlencode_postdata({ 'password': password, 'rememberMe': False, @@ -146,13 +149,21 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' })) or {}).get('accessToken') if access_token: self._HEADERS = {'authorization': 'Bearer ' + access_token} + except ExtractorError as e: + message = None + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json( + e.cause.read().decode(), None, fatal=False) or {} + message = resp.get('message') or resp.get('code') + self.report_warning(message or self._LOGIN_ERR_MESSAGE) def _real_extract(self, url): video_id = self._match_id(url) video_base_url = self._PLAYER_BASE_URL + 'video/%s/' % video_id player = self._download_json( video_base_url + 'configuration', video_id, - 'Downloading player config JSON metadata', headers=self._HEADERS)['player'] + 'Downloading player config JSON metadata', + headers=self._HEADERS)['player'] options = player['options'] user = options['user'] @@ -204,8 +215,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' message = error.get('message') if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country': self.raise_geo_restricted(msg=message) - else: - raise ExtractorError(message) + raise ExtractorError(message) else: raise ExtractorError('Giving up retrying') From 6e3cdd851507288156743463cb1c00048b7b0a82 Mon Sep 17 00:00:00 2001 From: tpikonen <tpikonen@gmail.com> Date: Fri, 26 Feb 2021 16:18:45 +0100 Subject: [PATCH 329/384] [tv2] Add support for mtvuutiset.fi (#27744) --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/tv2.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 4ab45f479..a493b2699 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1313,6 +1313,7 @@ from .tv2 import ( TV2IE, TV2ArticleIE, KatsomoIE, + MTVuutisetIE, ) from .tv2dk import ( TV2DKIE, diff --git a/haruhi_dl/extractor/tv2.py b/haruhi_dl/extractor/tv2.py index 4a19b9be6..42a9af126 100644 --- a/haruhi_dl/extractor/tv2.py +++ b/haruhi_dl/extractor/tv2.py @@ -190,3 +190,32 @@ class KatsomoIE(TV2IE): _API_DOMAIN = 'api.katsomo.fi' _PROTOCOLS = ('HLS', 'MPD') _GEO_COUNTRIES = ['FI'] + + +class MTVuutisetIE(KatsomoIE): + _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/(?:artikkeli/[0-9a-z-]+/|video/prog)(?P<id>\d+)' + _TEST = { + 'url': 'https://www.mtvuutiset.fi/artikkeli/tallaisia-vaurioita-viking-amorellassa-on-useamman-osaston-alla-vetta/7931384', + 'info_dict': { + 'id': '1311159', + 'ext': 
'mp4', + 'title': 'MTV Uutiset Live', + 'description': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', + 'timestamp': 1600608966, + 'upload_date': '20200920', + 'duration': 153.7886666, + 'view_count': int, + 'categories': list, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + art_id = self._match_id(url) + webpage = self._download_webpage(url, art_id) + video_id = self._html_search_regex( + r'<div class=\'player-container\' .*data-katsomoid="(.+?)"', webpage, 'video_id') + return self.url_result("http://mtv.fi/a/0/a/%s" % video_id, video_id=video_id, ie="Katsomo") From 743a3f4c0019234c4f0ab0622b818e2257d66dd0 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 16:18:51 +0100 Subject: [PATCH 330/384] [tv2] improve MTV Uutiset Article extraction --- haruhi_dl/extractor/extractors.py | 2 +- haruhi_dl/extractor/tv2.py | 73 +++++++++++++++++++++---------- 2 files changed, 51 insertions(+), 24 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index a493b2699..47ea20ad5 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1313,7 +1313,7 @@ from .tv2 import ( TV2IE, TV2ArticleIE, KatsomoIE, - MTVuutisetIE, + MTVUutisetArticleIE, ) from .tv2dk import ( TV2DKIE, diff --git a/haruhi_dl/extractor/tv2.py b/haruhi_dl/extractor/tv2.py index 42a9af126..334b7d540 100644 --- a/haruhi_dl/extractor/tv2.py +++ b/haruhi_dl/extractor/tv2.py @@ -20,7 +20,7 @@ from ..utils import ( class TV2IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { 'id': '916509', @@ -33,7 +33,7 @@ class TV2IE(InfoExtractor): 'view_count': int, 'categories': list, }, - } + }] _API_DOMAIN = 'sumo.tv2.no' _PROTOCOLS = ('HDS', 'HLS', 'DASH') _GEO_COUNTRIES = ['NO'] @@ -42,6 +42,12 @@ class TV2IE(InfoExtractor): video_id = self._match_id(url) api_base = 'http://%s/api/web/asset/%s' % (self._API_DOMAIN, video_id) + asset = self._download_json( + api_base + '.json', video_id, + 'Downloading metadata JSON')['asset'] + title = asset.get('subtitle') or asset['title'] + is_live = asset.get('live') is True + formats = [] format_urls = [] for protocol in self._PROTOCOLS: @@ -81,7 +87,8 @@ class TV2IE(InfoExtractor): elif ext == 'm3u8': if not data.get('drmProtected'): formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', + video_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', m3u8_id=format_id, fatal=False)) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -99,11 +106,6 @@ class TV2IE(InfoExtractor): raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) - asset = self._download_json( - api_base + '.json', video_id, - 'Downloading metadata JSON')['asset'] - title = asset['title'] - thumbnails = [{ 'id': thumbnail.get('@type'), 'url': thumbnail.get('url'), @@ -112,7 +114,7 @@ class TV2IE(InfoExtractor): return { 'id': video_id, 'url': video_url, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'description': strip_or_none(asset.get('description')), 'thumbnails': thumbnails, 'timestamp': parse_iso8601(asset.get('createTime')), @@ -120,6 +122,7 @@ class TV2IE(InfoExtractor): 'view_count': int_or_none(asset.get('views')), 'categories': asset.get('keywords', '').split(','), 'formats': 
formats, + 'is_live': is_live, } @@ -168,13 +171,13 @@ class TV2ArticleIE(InfoExtractor): class KatsomoIE(TV2IE): - _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv)\.fi/(?:#!/)?(?:[^/]+/[0-9a-z-]+-\d+/[0-9a-z-]+-|[^/]+/\d+/[^/]+/)(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv(uutiset)?)\.fi/(?:sarja/[0-9a-z-]+-\d+/[0-9a-z-]+-|(?:#!/)?jakso/(?:\d+/[^/]+/)?|video/prog)(?P<id>\d+)' + _TESTS = [{ 'url': 'https://www.mtv.fi/sarja/mtv-uutiset-live-33001002003/lahden-pelicans-teki-kovan-ratkaisun-ville-nieminen-pihalle-1181321', 'info_dict': { 'id': '1181321', 'ext': 'mp4', - 'title': 'MTV Uutiset Live', + 'title': 'Lahden Pelicans teki kovan ratkaisun – Ville Nieminen pihalle', 'description': 'Päätöksen teki Pelicansin hallitus.', 'timestamp': 1575116484, 'upload_date': '20191130', @@ -186,20 +189,29 @@ class KatsomoIE(TV2IE): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://www.katsomo.fi/#!/jakso/33001005/studio55-fi/658521/jukka-kuoppamaki-tekee-yha-lauluja-vaikka-lentokoneessa', + 'only_matching': True, + }, { + 'url': 'https://www.mtvuutiset.fi/video/prog1311159', + 'only_matching': True, + }, { + 'url': 'https://www.katsomo.fi/#!/jakso/1311159', + 'only_matching': True, + }] _API_DOMAIN = 'api.katsomo.fi' _PROTOCOLS = ('HLS', 'MPD') _GEO_COUNTRIES = ['FI'] -class MTVuutisetIE(KatsomoIE): - _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/(?:artikkeli/[0-9a-z-]+/|video/prog)(?P<id>\d+)' - _TEST = { +class MTVUutisetArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/artikkeli/[^/]+/(?P<id>\d+)' + _TESTS = [{ 'url': 'https://www.mtvuutiset.fi/artikkeli/tallaisia-vaurioita-viking-amorellassa-on-useamman-osaston-alla-vetta/7931384', 'info_dict': { 'id': '1311159', 'ext': 'mp4', - 'title': 'MTV Uutiset Live', + 'title': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', 'description': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', 'timestamp': 1600608966, 'upload_date': '20200920', @@ -211,11 +223,26 @@ class MTVuutisetIE(KatsomoIE): # m3u8 download 'skip_download': True, }, - } + }, { + # multiple Youtube embeds + 'url': 'https://www.mtvuutiset.fi/artikkeli/50-vuotta-subarun-vastaiskua/6070962', + 'only_matching': True, + }] def _real_extract(self, url): - art_id = self._match_id(url) - webpage = self._download_webpage(url, art_id) - video_id = self._html_search_regex( - r'<div class=\'player-container\' .*data-katsomoid="(.+?)"', webpage, 'video_id') - return self.url_result("http://mtv.fi/a/0/a/%s" % video_id, video_id=video_id, ie="Katsomo") + article_id = self._match_id(url) + article = self._download_json( + 'http://api.mtvuutiset.fi/mtvuutiset/api/json/' + article_id, + article_id) + + def entries(): + for video in (article.get('videos') or []): + video_type = video.get('videotype') + video_url = video.get('url') + if not (video_url and video_type in ('katsomo', 'youtube')): + continue + yield self.url_result( + video_url, video_type.capitalize(), video.get('video_id')) + + return self.playlist_result( + entries(), article_id, article.get('title'), article.get('description')) From 95fa7a89859d322f5c99c417814df2bb07a1d63d Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 16:18:56 +0100 Subject: [PATCH 331/384] [tv4] relax _VALID_URL(closes #27964) --- haruhi_dl/extractor/tv4.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/tv4.py 
b/haruhi_dl/extractor/tv4.py index c498b0191..b73bab9a8 100644 --- a/haruhi_dl/extractor/tv4.py +++ b/haruhi_dl/extractor/tv4.py @@ -17,7 +17,7 @@ class TV4IE(InfoExtractor): tv4\.se/(?:[^/]+)/klipp/(?:.*)-| tv4play\.se/ (?: - (?:program|barn)/(?:[^/]+/|(?:[^\?]+)\?video_id=)| + (?:program|barn)/(?:(?:[^/]+/){1,2}|(?:[^\?]+)\?video_id=)| iframe/video/| film/| sport/| @@ -65,6 +65,10 @@ class TV4IE(InfoExtractor): { 'url': 'http://www.tv4play.se/program/farang/3922081', 'only_matching': True, + }, + { + 'url': 'https://www.tv4play.se/program/nyheterna/avsnitt/13315940', + 'only_matching': True, } ] From 288b2cc25b2c962630c71a67a778356d6ead94c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FAdrian=3D20Heine=3D20n=3DC3=3DA9=3D20Lan?= =?UTF-8?q?g=3F=3D?= <mail@adrianheine.de> Date: Fri, 26 Feb 2021 16:19:03 +0100 Subject: [PATCH 332/384] [AMP] Fix upload_date and timestamp extraction (#27970) --- haruhi_dl/extractor/abcnews.py | 2 ++ haruhi_dl/extractor/amp.py | 3 ++- haruhi_dl/extractor/bleacherreport.py | 10 ++++++++-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/haruhi_dl/extractor/abcnews.py b/haruhi_dl/extractor/abcnews.py index 8b407bf9c..64ea6e6ed 100644 --- a/haruhi_dl/extractor/abcnews.py +++ b/haruhi_dl/extractor/abcnews.py @@ -36,6 +36,8 @@ class AbcNewsVideoIE(AMPIE): 'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.', 'duration': 180, 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1380454200, + 'upload_date': '20130929', }, 'params': { # m3u8 download diff --git a/haruhi_dl/extractor/amp.py b/haruhi_dl/extractor/amp.py index 7ff098cfa..24c684cad 100644 --- a/haruhi_dl/extractor/amp.py +++ b/haruhi_dl/extractor/amp.py @@ -8,6 +8,7 @@ from ..utils import ( int_or_none, mimetype2ext, parse_iso8601, + unified_timestamp, url_or_none, ) @@ -88,7 +89,7 @@ class AMPIE(InfoExtractor): self._sort_formats(formats) - timestamp = parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) + timestamp = unified_timestamp(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) return { 'id': video_id, diff --git a/haruhi_dl/extractor/bleacherreport.py b/haruhi_dl/extractor/bleacherreport.py index dc60224d0..d1bf8e829 100644 --- a/haruhi_dl/extractor/bleacherreport.py +++ b/haruhi_dl/extractor/bleacherreport.py @@ -90,13 +90,19 @@ class BleacherReportCMSIE(AMPIE): _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36}|\d{5})' _TESTS = [{ 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms', - 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1', + 'md5': '670b2d73f48549da032861130488c681', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Cena vs. 
Rollins Would Expose the Heavyweight Division', 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', + 'upload_date': '20150723', + 'timestamp': 1437679032, + }, + 'expected_warnings': [ + 'Unable to download f4m manifest' + ] }] def _real_extract(self, url): From 8d47c811f115d7b0aab1698f70cb33ec308a9fa1 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 16:19:08 +0100 Subject: [PATCH 333/384] [abcnews] fix extraction(closes #12394)(closes #27920) --- haruhi_dl/extractor/abcnews.py | 126 ++++++++++++++++++--------------- 1 file changed, 67 insertions(+), 59 deletions(-) diff --git a/haruhi_dl/extractor/abcnews.py b/haruhi_dl/extractor/abcnews.py index 64ea6e6ed..908c83377 100644 --- a/haruhi_dl/extractor/abcnews.py +++ b/haruhi_dl/extractor/abcnews.py @@ -1,14 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar import re -import time from .amp import AMPIE from .common import InfoExtractor -from .youtube import YoutubeIE -from ..compat import compat_urlparse +from ..utils import ( + parse_duration, + parse_iso8601, + try_get, +) class AbcNewsVideoIE(AMPIE): @@ -18,8 +19,8 @@ class AbcNewsVideoIE(AMPIE): (?: abcnews\.go\.com/ (?: - [^/]+/video/(?P<display_id>[0-9a-z-]+)-| - video/embed\?.*?\bid= + (?:[^/]+/)*video/(?P<display_id>[0-9a-z-]+)-| + video/(?:embed|itemfeed)\?.*?\bid= )| fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/ ) @@ -49,6 +50,12 @@ class AbcNewsVideoIE(AMPIE): }, { 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478', 'only_matching': True, + }, { + 'url': 'http://abcnews.go.com/video/itemfeed?id=46979033', + 'only_matching': True, + }, { + 'url': 'https://abcnews.go.com/GMA/News/video/history-christmas-story-67894761', + 'only_matching': True, }] def _real_extract(self, url): @@ -69,28 +76,23 @@ class AbcNewsIE(InfoExtractor): _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)' _TESTS = [{ - 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', + # Youtube Embeds + 'url': 'https://abcnews.go.com/Entertainment/peter-billingsley-child-actor-christmas-story-hollywood-power/story?id=51286501', 'info_dict': { - 'id': '10505354', - 'ext': 'flv', - 'display_id': 'dramatic-video-rare-death-job-america', - 'title': 'Occupational Hazards', - 'description': 'Nightline investigates the dangers that lurk at various jobs.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20100428', - 'timestamp': 1272412800, + 'id': '51286501', + 'title': "Peter Billingsley: From child actor in 'A Christmas Story' to Hollywood power player", + 'description': 'Billingsley went from a child actor to Hollywood power player.', }, - 'add_ie': ['AbcNewsVideo'], + 'playlist_count': 5, }, { 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818', 'info_dict': { 'id': '38897857', 'ext': 'mp4', - 'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016', 'title': 'Justin Timberlake Drops Hints For Secret Single', 'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.', - 'upload_date': '20160515', - 'timestamp': 1463329500, + 'upload_date': '20160505', + 'timestamp': 1462442280, }, 'params': { # m3u8 download @@ -102,49 +104,55 @@ class AbcNewsIE(InfoExtractor): }, { 'url': 
'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', 'only_matching': True, + }, { + # inline.type == 'video' + 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - video_id = mobj.group('id') + story_id = self._match_id(url) + webpage = self._download_webpage(url, story_id) + story = self._parse_json(self._search_regex( + r"window\['__abcnews__'\]\s*=\s*({.+?});", + webpage, 'data'), story_id)['page']['content']['story']['everscroll'][0] + article_contents = story.get('articleContents') or {} - webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL') - full_video_url = compat_urlparse.urljoin(url, video_url) + def entries(): + featured_video = story.get('featuredVideo') or {} + feed = try_get(featured_video, lambda x: x['video']['feed']) + if feed: + yield { + '_type': 'url', + 'id': featured_video.get('id'), + 'title': featured_video.get('name'), + 'url': feed, + 'thumbnail': featured_video.get('images'), + 'description': featured_video.get('description'), + 'timestamp': parse_iso8601(featured_video.get('uploadDate')), + 'duration': parse_duration(featured_video.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } - youtube_url = YoutubeIE._extract_url(webpage) + for inline in (article_contents.get('inlines') or []): + inline_type = inline.get('type') + if inline_type == 'iframe': + iframe_url = try_get(inline, lambda x: x['attrs']['src']) + if iframe_url: + yield self.url_result(iframe_url) + elif inline_type == 'video': + video_id = inline.get('id') + if video_id: + yield { + '_type': 'url', + 'id': video_id, + 'url': 'http://abcnews.go.com/video/embed?id=' + video_id, + 'thumbnail': inline.get('imgSrc') or inline.get('imgDefault'), + 'description': inline.get('description'), + 'duration': parse_duration(inline.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } - timestamp = None - date_str = self._html_search_regex( - r'<span[^>]+class="timestamp">([^<]+)</span>', - webpage, 'timestamp', fatal=False) - if date_str: - tz_offset = 0 - if date_str.endswith(' ET'): # Eastern Time - tz_offset = -5 - date_str = date_str[:-3] - date_formats = ['%b. 
%d, %Y', '%b %d, %Y, %I:%M %p'] - for date_format in date_formats: - try: - timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format)) - except ValueError: - continue - if timestamp is not None: - timestamp -= tz_offset * 3600 - - entry = { - '_type': 'url_transparent', - 'ie_key': AbcNewsVideoIE.ie_key(), - 'url': full_video_url, - 'id': video_id, - 'display_id': display_id, - 'timestamp': timestamp, - } - - if youtube_url: - entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())] - return self.playlist_result(entries) - - return entry + return self.playlist_result( + entries(), story_id, article_contents.get('headline'), + article_contents.get('subHead')) From 6a18fcbd8a1359f95aeb5828f910fbf5c4ea8597 Mon Sep 17 00:00:00 2001 From: Remita Amine <remitamine@gmail.com> Date: Fri, 26 Feb 2021 16:20:26 +0100 Subject: [PATCH 334/384] [medialaan] add support DPG Media MyChannels based websites closes #14871 closes #15597 closes #16106 closes #16489 --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/generic.py | 19 ++ haruhi_dl/extractor/medialaan.py | 303 ++++++++---------------------- haruhi_dl/extractor/vtm.py | 62 ++++++ 4 files changed, 156 insertions(+), 229 deletions(-) create mode 100644 haruhi_dl/extractor/vtm.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 47ea20ad5..4beedb24a 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1517,6 +1517,7 @@ from .vrv import ( VRVSeriesIE, ) from .vshare import VShareIE +from .vtm import VTMIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index beb6ad2ad..4a885775a 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -134,6 +134,7 @@ from .xnews import XLinkIE from .libsyn import LibsynIE from .pulsembed import PulsEmbedIE from .arcpublishing import ArcPublishingIE +from .medialaan import MedialaanIE class GenericIE(InfoExtractor): @@ -2276,6 +2277,20 @@ class GenericIE(InfoExtractor): 'duration': 1581, }, }, + { + # MyChannels SDK embed + # https://www.24kitchen.nl/populair/deskundige-dit-waarom-sommigen-gevoelig-zijn-voor-voedselallergieen + 'url': 'https://www.demorgen.be/nieuws/burgemeester-rotterdam-richt-zich-in-videoboodschap-tot-relschoppers-voelt-het-goed~b0bcfd741/', + 'md5': '90c0699c37006ef18e198c032d81739c', + 'info_dict': { + 'id': '194165', + 'ext': 'mp4', + 'title': 'Burgemeester Aboutaleb spreekt relschoppers toe', + 'timestamp': 1611740340, + 'upload_date': '20210127', + 'duration': 159, + }, + }, ] def report_following_redirect(self, new_url): @@ -2515,6 +2530,9 @@ class GenericIE(InfoExtractor): webpage = self._webpage_read_content( full_response, url, video_id, prefix=first_bytes) + if '<title>DPG Media Privacy Gate' in webpage: + webpage = self._download_webpage(url, video_id) + self.report_extraction(video_id) # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest? 
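# The MedialaanIE hook registered in the probe loop below pairs with the
# rewritten medialaan.py later in this patch: _extract_urls() scans the page
# for <div data-mychannels-type="video"> embeds and collects their
# data-mychannels-id, and _real_extract() resolves each id against the
# MyChannels SDK endpoint. A rough standalone sketch of those two steps,
# assuming only the endpoint and attribute names shown in the patch (the
# attribute order in this regex is an assumption; the extractor itself uses
# extract_attributes(), which does not depend on ordering):
import json
import re
from urllib.request import urlopen

def find_mychannels_ids(webpage):
    # Mirrors MedialaanIE._extract_urls(): one id per embedded video div.
    return re.findall(
        r'<div[^>]+data-mychannels-type="video"[^>]+data-mychannels-id="(\d+)"',
        webpage)

def fetch_production(production_id):
    # Mirrors MedialaanIE._real_extract(): a single JSON call returns the
    # production record with its title, sources and metadata.
    url = ('https://embed.mychannels.video/sdk/production/%s?options=UUUU_default'
           % production_id)
    return json.loads(urlopen(url).read().decode())['productions'][0]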
@@ -2692,6 +2710,7 @@ class GenericIE(InfoExtractor): LibsynIE, VHXEmbedIE, ArcPublishingIE, + MedialaanIE, ): try: ie_key = embie.ie_key() diff --git a/haruhi_dl/extractor/medialaan.py b/haruhi_dl/extractor/medialaan.py index 50d5db802..788acf7fb 100644 --- a/haruhi_dl/extractor/medialaan.py +++ b/haruhi_dl/extractor/medialaan.py @@ -2,268 +2,113 @@ from __future__ import unicode_literals import re -from .gigya import GigyaBaseIE - -from ..compat import compat_str +from .common import InfoExtractor from ..utils import ( + extract_attributes, int_or_none, - parse_duration, - try_get, - unified_timestamp, + mimetype2ext, + parse_iso8601, ) -class MedialaanIE(GigyaBaseIE): +class MedialaanIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// - (?:www\.|nieuws\.)? (?: - (?Pvtm|q2|vtmkzoom)\.be/ - (?: - video(?:/[^/]+/id/|/?\?.*?\baid=)| - (?:[^/]+/)* - ) + (?:embed\.)?mychannels.video/embed/| + embed\.mychannels\.video/(?:s(?:dk|cript)/)?production/| + (?:www\.)?(?: + (?: + 7sur7| + demorgen| + hln| + joe| + qmusic + )\.be| + (?: + [abe]d| + bndestem| + destentor| + gelderlander| + pzc| + tubantia| + volkskrant + )\.nl + )/video/(?:[^/]+/)*[^/?&#]+~p ) - (?P[^/?#&]+) + (?P\d+) ''' - _NETRC_MACHINE = 'medialaan' - _APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-' - _SITE_TO_APP_ID = { - 'vtm': 'vtm_watch', - 'q2': 'q2', - 'vtmkzoom': 'vtmkzoom', - } _TESTS = [{ - # vod - 'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch', + 'url': 'https://www.bndestem.nl/video/de-terugkeer-van-ally-de-aap-en-wie-vertrekt-er-nog-bij-nac~p193993', 'info_dict': { - 'id': 'vtm_20170219_VM0678361_vtmwatch', + 'id': '193993', 'ext': 'mp4', - 'title': 'Allemaal Chris afl. 6', - 'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2', - 'timestamp': 1487533280, - 'upload_date': '20170219', - 'duration': 2562, - 'series': 'Allemaal Chris', - 'season': 'Allemaal Chris', - 'season_number': 1, - 'season_id': '256936078124527', - 'episode': 'Allemaal Chris afl. 
6', - 'episode_number': 6, - 'episode_id': '256936078591527', + 'title': 'De terugkeer van Ally de Aap en wie vertrekt er nog bij NAC?', + 'timestamp': 1611663540, + 'upload_date': '20210126', + 'duration': 238, }, 'params': { 'skip_download': True, }, - 'skip': 'Requires account credentials', }, { - # clip - 'url': 'http://vtm.be/video?aid=168332', - 'info_dict': { - 'id': '168332', - 'ext': 'mp4', - 'title': '"Veronique liegt!"', - 'description': 'md5:1385e2b743923afe54ba4adc38476155', - 'timestamp': 1489002029, - 'upload_date': '20170308', - 'duration': 96, - }, - }, { - # vod - 'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000', + 'url': 'https://www.gelderlander.nl/video/kanalen/degelderlander~c320/series/snel-nieuws~s984/noodbevel-in-doetinchem-politie-stuurt-mensen-centrum-uit~p194093', 'only_matching': True, }, { - # vod - 'url': 'http://vtm.be/video?aid=163157', + 'url': 'https://embed.mychannels.video/sdk/production/193993?options=TFTFF_default', 'only_matching': True, }, { - # vod - 'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2', + 'url': 'https://embed.mychannels.video/script/production/193993', 'only_matching': True, }, { - # clip - 'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio', + 'url': 'https://embed.mychannels.video/production/193993', 'only_matching': True, }, { - # http/s redirect - 'url': 'https://vtmkzoom.be/video?aid=45724', - 'info_dict': { - 'id': '257136373657000', - 'ext': 'mp4', - 'title': 'K3 Dansstudio Ushuaia afl.6', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Requires account credentials', + 'url': 'https://mychannels.video/embed/193993', + 'only_matching': True, }, { - # nieuws.vtm.be - 'url': 'https://nieuws.vtm.be/stadion/stadion/genk-nog-moeilijk-programma', + 'url': 'https://embed.mychannels.video/embed/193993', 'only_matching': True, }] - def _real_initialize(self): - self._logged_in = False - - def _login(self): - username, password = self._get_login_info() - if username is None: - self.raise_login_required() - - auth_data = { - 'APIKey': self._APIKEY, - 'sdk': 'js_6.1', - 'format': 'json', - 'loginID': username, - 'password': password, - } - - auth_info = self._gigya_login(auth_data) - - self._uid = auth_info['UID'] - self._uid_signature = auth_info['UIDSignature'] - self._signature_timestamp = auth_info['signatureTimestamp'] - - self._logged_in = True + @staticmethod + def _extract_urls(webpage): + entries = [] + for element in re.findall(r'(]+data-mychannels-type="video"[^>]*>)', webpage): + mychannels_id = extract_attributes(element).get('data-mychannels-id') + if mychannels_id: + entries.append('https://mychannels.video/embed/' + mychannels_id) + return entries def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, site_id = mobj.group('id', 'site_id') + production_id = self._match_id(url) + production = self._download_json( + 'https://embed.mychannels.video/sdk/production/' + production_id, + production_id, query={'options': 'UUUU_default'})['productions'][0] + title = production['title'] - webpage = self._download_webpage(url, video_id) - - config = self._parse_json( - self._search_regex( - r'videoJSConfig\s*=\s*JSON\.parse\(\'({.+?})\'\);', - webpage, 'config', default='{}'), video_id, - transform_source=lambda s: s.replace( - '\\\\', '\\').replace(r'\"', '"').replace(r"\'", "'")) - - vod_id = config.get('vodId') or self._search_regex( - (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"', - r'"vodId"\s*:\s*"(.+?)"', - 
r'<[^>]+id=["\']vod-(\d+)'), - webpage, 'video_id', default=None) - - # clip, no authentication required - if not vod_id: - player = self._parse_json( - self._search_regex( - r'vmmaplayer\(({.+?})\);', webpage, 'vmma player', - default=''), - video_id, transform_source=lambda s: '[%s]' % s, fatal=False) - if player: - video = player[-1] - if video['videoUrl'] in ('http', 'https'): - return self.url_result(video['url'], MedialaanIE.ie_key()) - info = { - 'id': video_id, - 'url': video['videoUrl'], - 'title': video['title'], - 'thumbnail': video.get('imageUrl'), - 'timestamp': int_or_none(video.get('createdDate')), - 'duration': int_or_none(video.get('duration')), - } + formats = [] + for source in (production.get('sources') or []): + src = source.get('src') + if not src: + continue + ext = mimetype2ext(source.get('type')) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, production_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) else: - info = self._parse_html5_media_entries( - url, webpage, video_id, m3u8_id='hls')[0] - info.update({ - 'id': video_id, - 'title': self._html_search_meta('description', webpage), - 'duration': parse_duration(self._html_search_meta('duration', webpage)), + formats.append({ + 'ext': ext, + 'url': src, }) - # vod, authentication required - else: - if not self._logged_in: - self._login() + self._sort_formats(formats) - settings = self._parse_json( - self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings', default='{}'), - video_id) - - def get(container, item): - return try_get( - settings, lambda x: x[container][item], - compat_str) or self._search_regex( - r'"%s"\s*:\s*"([^"]+)' % item, webpage, item, - default=None) - - app_id = get('vod', 'app_id') or self._SITE_TO_APP_ID.get(site_id, 'vtm_watch') - sso = get('vod', 'gigyaDatabase') or 'vtm-sso' - - data = self._download_json( - 'http://vod.medialaan.io/api/1.0/item/%s/video' % vod_id, - video_id, query={ - 'app_id': app_id, - 'user_network': sso, - 'UID': self._uid, - 'UIDSignature': self._uid_signature, - 'signatureTimestamp': self._signature_timestamp, - }) - - formats = self._extract_m3u8_formats( - data['response']['uri'], video_id, entry_protocol='m3u8_native', - ext='mp4', m3u8_id='hls') - - self._sort_formats(formats) - - info = { - 'id': vod_id, - 'formats': formats, - } - - api_key = get('vod', 'apiKey') - channel = get('medialaanGigya', 'channel') - - if api_key: - videos = self._download_json( - 'http://vod.medialaan.io/vod/v2/videos', video_id, fatal=False, - query={ - 'channels': channel, - 'ids': vod_id, - 'limit': 1, - 'apikey': api_key, - }) - if videos: - video = try_get( - videos, lambda x: x['response']['videos'][0], dict) - if video: - def get(container, item, expected_type=None): - return try_get( - video, lambda x: x[container][item], expected_type) - - def get_string(container, item): - return get(container, item, compat_str) - - info.update({ - 'series': get_string('program', 'title'), - 'season': get_string('season', 'title'), - 'season_number': int_or_none(get('season', 'number')), - 'season_id': get_string('season', 'id'), - 'episode': get_string('episode', 'title'), - 'episode_number': int_or_none(get('episode', 'number')), - 'episode_id': get_string('episode', 'id'), - 'duration': int_or_none( - video.get('duration')) or int_or_none( - video.get('durationMillis'), scale=1000), - 'title': get_string('episode', 'title'), - 'description': get_string('episode', 'text'), - 'timestamp': 
unified_timestamp(get_string( - 'publication', 'begin')), - }) - - if not info.get('title'): - info['title'] = try_get( - config, lambda x: x['videoConfig']['title'], - compat_str) or self._html_search_regex( - r'\\"title\\"\s*:\s*\\"(.+?)\\"', webpage, 'title', - default=None) or self._og_search_title(webpage) - - if not info.get('description'): - info['description'] = self._html_search_regex( - r'<div[^>]+class="field-item\s+even">\s*<p>(.+?)</p>
', - webpage, 'description', default=None) - - return info + return { + 'id': production_id, + 'title': title, + 'formats': formats, + 'thumbnail': production.get('posterUrl'), + 'timestamp': parse_iso8601(production.get('publicationDate'), ' '), + 'duration': int_or_none(production.get('duration')) or None, + } diff --git a/haruhi_dl/extractor/vtm.py b/haruhi_dl/extractor/vtm.py new file mode 100644 index 000000000..093f1aa69 --- /dev/null +++ b/haruhi_dl/extractor/vtm.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, +) + + +class VTMIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vtm\.be/([^/?&#]+)~v(?P[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12})' + _TEST = { + 'url': 'https://vtm.be/gast-vernielt-genkse-hotelkamer~ve7534523-279f-4b4d-a5c9-a33ffdbe23e1', + 'md5': '37dca85fbc3a33f2de28ceb834b071f8', + 'info_dict': { + 'id': '192445', + 'ext': 'mp4', + 'title': 'Gast vernielt Genkse hotelkamer', + 'timestamp': 1611060180, + 'upload_date': '20210119', + 'duration': 74, + # TODO: fix url _type result processing + # 'series': 'Op Interventie', + } + } + + def _real_extract(self, url): + uuid = self._match_id(url) + video = self._download_json( + 'https://omc4vm23offuhaxx6hekxtzspi.appsync-api.eu-west-1.amazonaws.com/graphql', + uuid, query={ + 'query': '''{ + getComponent(type: Video, uuid: "%s") { + ... on Video { + description + duration + myChannelsVideo + program { + title + } + publishedAt + title + } + } +}''' % uuid, + }, headers={ + 'x-api-key': 'da2-lz2cab4tfnah3mve6wiye4n77e', + })['data']['getComponent'] + + return { + '_type': 'url', + 'id': uuid, + 'title': video.get('title'), + 'url': 'http://mychannels.video/embed/%d' % video['myChannelsVideo'], + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('publishedAt')), + 'duration': int_or_none(video.get('duration')), + 'series': try_get(video, lambda x: x['program']['title']), + 'ie_key': 'Medialaan', + } From 00197b5fa8bda7a2b9dd67209413f44f3eb73376 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FAdrian=3D20Heine=3D20n=3DC3=3DA9=3D20Lan?= =?UTF-8?q?g=3F=3D?= Date: Fri, 26 Feb 2021 16:24:30 +0100 Subject: [PATCH 335/384] [awaan] Extract uploader id (#27963) --- haruhi_dl/extractor/awaan.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/haruhi_dl/extractor/awaan.py b/haruhi_dl/extractor/awaan.py index a2603bbff..3a7700cd4 100644 --- a/haruhi_dl/extractor/awaan.py +++ b/haruhi_dl/extractor/awaan.py @@ -48,6 +48,7 @@ class AWAANBaseIE(InfoExtractor): 'duration': int_or_none(video_data.get('duration')), 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), 'is_live': is_live, + 'uploader_id': video_data.get('user_id'), } @@ -107,6 +108,7 @@ class AWAANLiveIE(AWAANBaseIE): 'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'upload_date': '20150107', 'timestamp': 1420588800, + 'uploader_id': '71', }, 'params': { # m3u8 download From 148394b527321d1f76f772fbce353a836ffcef60 Mon Sep 17 00:00:00 2001 From: nixxo Date: Fri, 26 Feb 2021 16:25:08 +0100 Subject: [PATCH 336/384] [vvvvid] add support for youtube embeds (#27825) --- haruhi_dl/extractor/vvvvid.py | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/vvvvid.py b/haruhi_dl/extractor/vvvvid.py index f4cae7fe9..778ce8b76 100644 --- a/haruhi_dl/extractor/vvvvid.py +++ b/haruhi_dl/extractor/vvvvid.py @@ 
-4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( ExtractorError, int_or_none, @@ -47,6 +48,22 @@ class VVVVIDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # video_type == 'video/youtube' + 'url': 'https://www.vvvvid.it/show/404/one-punch-man/406/486683/trailer', + 'md5': '33e0edfba720ad73a8782157fdebc648', + 'info_dict': { + 'id': 'RzmFKUDOUgw', + 'ext': 'mp4', + 'title': 'Trailer', + 'upload_date': '20150906', + 'description': 'md5:a5e802558d35247fee285875328c0b80', + 'uploader_id': 'BandaiVisual', + 'uploader': 'BANDAI NAMCO Arts Channel', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', 'only_matching': True @@ -154,12 +171,13 @@ class VVVVIDIE(InfoExtractor): if season_number: info['season_number'] = int(season_number) - for quality in ('_sd', ''): + video_type = video_data.get('video_type') + is_youtube = False + for quality in ('', '_sd'): embed_code = video_data.get('embed_info' + quality) if not embed_code: continue embed_code = ds(embed_code) - video_type = video_data.get('video_type') if video_type in ('video/rcs', 'video/kenc'): if video_type == 'video/kenc': kenc = self._download_json( @@ -172,19 +190,28 @@ class VVVVIDIE(InfoExtractor): if kenc_message: embed_code += '?' + ds(kenc_message) formats.extend(self._extract_akamai_formats(embed_code, video_id)) + elif video_type == 'video/youtube': + info.update({ + '_type': 'url_transparent', + 'ie_key': YoutubeIE.ie_key(), + 'url': embed_code, + }) + is_youtube = True + break else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) metadata_from_url(embed_code) - self._sort_formats(formats) + if not is_youtube: + self._sort_formats(formats) + info['formats'] = formats metadata_from_url(video_data.get('thumbnail')) info.update(self._extract_common_video_info(video_data)) info.update({ 'id': video_id, 'title': title, - 'formats': formats, 'duration': int_or_none(video_data.get('length')), 'series': video_data.get('show_title'), 'season_id': season_id, From aec7e2fbb1d476fe6924f5d996432bb69eac647d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FAdrian=3D20Heine=3D20n=3DC3=3DA9=3D20Lan?= =?UTF-8?q?g=3F=3D?= Date: Fri, 26 Feb 2021 16:25:20 +0100 Subject: [PATCH 337/384] [AENetworks] update AENetworksShowIE test playlist id (#27851) --- haruhi_dl/extractor/aenetworks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/aenetworks.py b/haruhi_dl/extractor/aenetworks.py index a5d88ebbe..e55c03fd7 100644 --- a/haruhi_dl/extractor/aenetworks.py +++ b/haruhi_dl/extractor/aenetworks.py @@ -252,7 +252,7 @@ class AENetworksShowIE(AENetworksListBaseIE): _TESTS = [{ 'url': 'http://www.history.com/shows/ancient-aliens', 'info_dict': { - 'id': 'SH012427480000', + 'id': 'SERIES1574', 'title': 'Ancient Aliens', 'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f', }, From 591d23365c8268681f1f5d68d1bec36538654162 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:26:08 +0100 Subject: [PATCH 338/384] [vidio] improve metadata extraction --- haruhi_dl/extractor/vidio.py | 86 ++++++++++++++++++++---------------- 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/haruhi_dl/extractor/vidio.py b/haruhi_dl/extractor/vidio.py index b48baf00b..b1243e847 100644 --- 
a/haruhi_dl/extractor/vidio.py +++ b/haruhi_dl/extractor/vidio.py @@ -4,7 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + parse_iso8601, + str_or_none, + strip_or_none, + try_get, +) class VidioIE(InfoExtractor): @@ -21,57 +27,63 @@ class VidioIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 149, 'like_count': int, + 'uploader': 'TWELVE Pic', + 'timestamp': 1444902800, + 'upload_date': '20151015', + 'uploader_id': 'twelvepictures', + 'channel': 'Cover Music Video', + 'channel_id': '280236', + 'view_count': int, + 'dislike_count': int, + 'comment_count': int, + 'tags': 'count:4', }, }, { 'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north', 'only_matching': True, }] + def _real_initialize(self): + self._api_key = self._download_json( + 'https://www.vidio.com/auth', None, data=b'')['api_key'] + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, display_id = mobj.group('id', 'display_id') + video_id, display_id = re.match(self._VALID_URL, url).groups() + data = self._download_json( + 'https://api.vidio.com/videos/' + video_id, display_id, headers={ + 'Content-Type': 'application/vnd.api+json', + 'X-API-KEY': self._api_key, + }) + video = data['videos'][0] + title = video['title'].strip() - webpage = self._download_webpage(url, display_id) - - title = self._og_search_title(webpage) - - m3u8_url, duration, thumbnail = [None] * 3 - - clips = self._parse_json( - self._html_search_regex( - r'data-json-clips\s*=\s*(["\'])(?P\[.+?\])\1', - webpage, 'video data', default='[]', group='data'), - display_id, fatal=False) - if clips: - clip = clips[0] - m3u8_url = clip.get('sources', [{}])[0].get('file') - duration = clip.get('clip_duration') - thumbnail = clip.get('image') - - m3u8_url = m3u8_url or self._search_regex( - r'data(?:-vjs)?-clip-hls-url=(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'hls url', group='url') formats = self._extract_m3u8_formats( - m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') + data['clips'][0]['hls_url'], display_id, 'mp4', 'm3u8_native') self._sort_formats(formats) - duration = int_or_none(duration or self._search_regex( - r'data-video-duration=(["\'])(?P\d+)\1', webpage, - 'duration', fatal=False, group='duration')) - thumbnail = thumbnail or self._og_search_thumbnail(webpage) - - like_count = int_or_none(self._search_regex( - (r']+data-comment-vote-count=["\'](\d+)', - r']+class=["\'].*?\blike(?:__|-)count\b.*?["\'][^>]*>\s*(\d+)'), - webpage, 'like count', fatal=False)) + get_first = lambda x: try_get(data, lambda y: y[x + 's'][0], dict) or {} + channel = get_first('channel') + user = get_first('user') + username = user.get('username') + get_count = lambda x: int_or_none(video.get('total_' + x)) return { 'id': video_id, 'display_id': display_id, 'title': title, - 'description': self._og_search_description(webpage), - 'thumbnail': thumbnail, - 'duration': duration, - 'like_count': like_count, + 'description': strip_or_none(video.get('description')), + 'thumbnail': video.get('image_url_medium'), + 'duration': int_or_none(video.get('duration')), + 'like_count': get_count('likes'), 'formats': formats, + 'uploader': user.get('name'), + 'timestamp': parse_iso8601(video.get('created_at')), + 'uploader_id': username, + 'uploader_url': 'https://www.vidio.com/@' + username if username else None, + 'channel': channel.get('name'), + 'channel_id': 
str_or_none(channel.get('id')), + 'view_count': get_count('view_count'), + 'dislike_count': get_count('dislikes'), + 'comment_count': get_count('comments'), + 'tags': video.get('tag_list'), } From e2095ebc115afe03c2c3035d0f1402a0cab82cac Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:26:30 +0100 Subject: [PATCH 339/384] [vidzi] remove extractor(closes #12629) --- haruhi_dl/extractor/extractors.py | 1 - haruhi_dl/extractor/vidzi.py | 68 ------------------------------- 2 files changed, 69 deletions(-) delete mode 100644 haruhi_dl/extractor/vidzi.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 4beedb24a..44a24b3dc 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1457,7 +1457,6 @@ from .vidme import ( VidmeUserIE, VidmeUserLikesIE, ) -from .vidzi import VidziIE from .vier import VierIE, VierVideosIE from .viewlift import ( ViewLiftIE, diff --git a/haruhi_dl/extractor/vidzi.py b/haruhi_dl/extractor/vidzi.py deleted file mode 100644 index b95eaadc0..000000000 --- a/haruhi_dl/extractor/vidzi.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - decode_packed_codes, - js_to_json, - NO_DEFAULT, - PACKED_CODES_RE, -) - - -class VidziIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P[0-9a-zA-Z]+)' - _TESTS = [{ - 'url': 'http://vidzi.tv/cghql9yq6emu.html', - 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', - 'info_dict': { - 'id': 'cghql9yq6emu', - 'ext': 'mp4', - 'title': 'haruhi-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.cc/cghql9yq6emu.html', - 'only_matching': True, - }, { - 'url': 'https://vidzi.si/rph9gztxj1et.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.nu/cghql9yq6emu.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://vidzi.tv/%s' % video_id, video_id) - title = self._html_search_regex( - r'(?s)
<h1>(.*?)</h1>
', webpage, 'title') - - codes = [webpage] - codes.extend([ - decode_packed_codes(mobj.group(0)).replace('\\\'', '\'') - for mobj in re.finditer(PACKED_CODES_RE, webpage)]) - for num, code in enumerate(codes, 1): - jwplayer_data = self._parse_json( - self._search_regex( - r'setup\(([^)]+)\)', code, 'jwplayer data', - default=NO_DEFAULT if num == len(codes) else '{}'), - video_id, transform_source=lambda s: js_to_json( - re.sub(r'\s*\+\s*window\[.+?\]', '', s))) - if jwplayer_data: - break - - info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) - info_dict['title'] = title - - return info_dict From cd74c846a68eae8748a8d2cc50be8f9bb1d014db Mon Sep 17 00:00:00 2001 From: Viren Rajput Date: Fri, 26 Feb 2021 16:26:38 +0100 Subject: [PATCH 340/384] [egghead] update API domain(closes #28038) --- haruhi_dl/extractor/egghead.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/haruhi_dl/extractor/egghead.py b/haruhi_dl/extractor/egghead.py index df11dc206..94dd75b9b 100644 --- a/haruhi_dl/extractor/egghead.py +++ b/haruhi_dl/extractor/egghead.py @@ -12,7 +12,14 @@ from ..utils import ( ) -class EggheadCourseIE(InfoExtractor): +class EggheadBaseIE(InfoExtractor): + def _call_api(self, path, video_id, resource, fatal=True): + return self._download_json( + 'https://app.egghead.io/api/v1/' + path, + video_id, 'Downloading %s JSON' % resource) + + +class EggheadCourseIE(EggheadBaseIE): IE_DESC = 'egghead.io course' IE_NAME = 'egghead:course' _VALID_URL = r'https://egghead\.io/courses/(?P[^/?#&]+)' @@ -28,10 +35,9 @@ class EggheadCourseIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - - lessons = self._download_json( - 'https://egghead.io/api/v1/series/%s/lessons' % playlist_id, - playlist_id, 'Downloading course lessons JSON') + series_path = 'series/' + playlist_id + lessons = self._call_api( + series_path + '/lessons', playlist_id, 'course lessons') entries = [] for lesson in lessons: @@ -44,9 +50,8 @@ class EggheadCourseIE(InfoExtractor): entries.append(self.url_result( lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id)) - course = self._download_json( - 'https://egghead.io/api/v1/series/%s' % playlist_id, - playlist_id, 'Downloading course JSON', fatal=False) or {} + course = self._call_api( + series_path, playlist_id, 'course', False) or {} playlist_id = course.get('id') if playlist_id: @@ -57,7 +62,7 @@ class EggheadCourseIE(InfoExtractor): course.get('description')) -class EggheadLessonIE(InfoExtractor): +class EggheadLessonIE(EggheadBaseIE): IE_DESC = 'egghead.io lesson' IE_NAME = 'egghead:lesson' _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P[^/?#&]+)' @@ -74,7 +79,7 @@ class EggheadLessonIE(InfoExtractor): 'upload_date': '20161209', 'duration': 304, 'view_count': 0, - 'tags': ['javascript', 'free'], + 'tags': ['free', 'javascript'], }, 'params': { 'skip_download': True, @@ -88,8 +93,8 @@ class EggheadLessonIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - lesson = self._download_json( - 'https://egghead.io/api/v1/lessons/%s' % display_id, display_id) + lesson = self._call_api( + 'lessons/' + display_id, display_id, 'lesson') lesson_id = compat_str(lesson['id']) title = lesson['title'] From e9b3810524928b4a253870cbb58d105027f99a76 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:26:44 +0100 Subject: [PATCH 341/384] [egghead] fix typo --- haruhi_dl/extractor/egghead.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/egghead.py b/haruhi_dl/extractor/egghead.py index 94dd75b9b..aff9b88c0 100644 --- a/haruhi_dl/extractor/egghead.py +++ b/haruhi_dl/extractor/egghead.py @@ -16,7 +16,7 @@ class EggheadBaseIE(InfoExtractor): def _call_api(self, path, video_id, resource, fatal=True): return self._download_json( 'https://app.egghead.io/api/v1/' + path, - video_id, 'Downloading %s JSON' % resource) + video_id, 'Downloading %s JSON' % resource, fatal=fatal) class EggheadCourseIE(EggheadBaseIE): @@ -79,7 +79,7 @@ class EggheadLessonIE(EggheadBaseIE): 'upload_date': '20161209', 'duration': 304, 'view_count': 0, - 'tags': ['free', 'javascript'], + 'tags': 'count:2', }, 'params': { 'skip_download': True, From 6cf6a0cf154dfb5c114be495cc53c38202a4da2c Mon Sep 17 00:00:00 2001 From: Guillem Vela Date: Fri, 26 Feb 2021 16:28:15 +0100 Subject: [PATCH 342/384] [ccma] improve metadata extraction(closes #27994) - extract age_limit, alt_title, categories, series and episode_number - fix timestamp multiple subtitles extraction --- haruhi_dl/extractor/ccma.py | 65 ++++++++++++++++++++++++++++++------- 1 file changed, 54 insertions(+), 11 deletions(-) diff --git a/haruhi_dl/extractor/ccma.py b/haruhi_dl/extractor/ccma.py index 544647f92..4db51e650 100644 --- a/haruhi_dl/extractor/ccma.py +++ b/haruhi_dl/extractor/ccma.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import datetime import re from .common import InfoExtractor @@ -8,8 +9,8 @@ from ..utils import ( clean_html, int_or_none, parse_duration, - parse_iso8601, parse_resolution, + try_get, url_or_none, ) @@ -24,8 +25,9 @@ class CCMAIE(InfoExtractor): 'ext': 'mp4', 'title': 'L\'espot de La Marató de TV3', 'description': 'md5:f12987f320e2f6e988e9908e4fe97765', - 'timestamp': 1470918540, - 'upload_date': '20160811', + 'timestamp': 1478608140, + 'upload_date': '20161108', + 'age_limit': 0, } }, { 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', @@ -35,8 +37,24 @@ class CCMAIE(InfoExtractor): 'ext': 'mp3', 'title': 'El Consell de Savis analitza el derbi', 'description': 'md5:e2a3648145f3241cb9c6b4b624033e53', - 'upload_date': '20171205', - 'timestamp': 1512507300, + 'upload_date': '20170512', + 'timestamp': 1494622500, + 'vcodec': 'none', + 'categories': ['Esports'], + } + }, { + 'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/', + 'md5': 'b43c3d3486f430f3032b5b160d80cbc3', + 'info_dict': { + 'id': '6031387', + 'ext': 'mp4', + 'title': 'Crims - Josep Talleda, l\'"Espereu-me" (capítol 1)', + 'description': 'md5:7cbdafb640da9d0d2c0f62bad1e74e60', + 'timestamp': 1582577700, + 'upload_date': '20200224', + 'subtitles': 'mincount:4', + 'age_limit': 16, + 'series': 'Crims', } }] @@ -72,17 +90,27 @@ class CCMAIE(InfoExtractor): informacio = media['informacio'] title = informacio['titol'] - durada = informacio.get('durada', {}) + durada = informacio.get('durada') or {} duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) - timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc')) + tematica = try_get(informacio, lambda x: x['tematica']['text']) + + timestamp = None + data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) + try: + timestamp = datetime.datetime.strptime( + data_utc, '%Y-%d-%mT%H:%M:%S%z').timestamp() + except TypeError: + pass subtitles = {} - subtitols = media.get('subtitols', {}) - if subtitols: - 
sub_url = subtitols.get('url') + subtitols = media.get('subtitols') or [] + if isinstance(subtitols, dict): + subtitols = [subtitols] + for st in subtitols: + sub_url = st.get('url') if sub_url: subtitles.setdefault( - subtitols.get('iso') or subtitols.get('text') or 'ca', []).append({ + st.get('iso') or st.get('text') or 'ca', []).append({ 'url': sub_url, }) @@ -97,6 +125,16 @@ class CCMAIE(InfoExtractor): 'height': int_or_none(imatges.get('alcada')), }] + age_limit = None + codi_etic = try_get(informacio, lambda x: x['codi_etic']['id']) + if codi_etic: + codi_etic_s = codi_etic.split('_') + if len(codi_etic_s) == 2: + if codi_etic_s[1] == 'TP': + age_limit = 0 + else: + age_limit = int_or_none(codi_etic_s[1]) + return { 'id': media_id, 'title': title, @@ -106,4 +144,9 @@ class CCMAIE(InfoExtractor): 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, + 'age_limit': age_limit, + 'alt_title': informacio.get('titol_complet'), + 'episode_number': int_or_none(informacio.get('capitol')), + 'categories': [tematica] if tematica else None, + 'series': informacio.get('programa'), } From 21321d23dcc54bbddda2b82d666417586ec7f191 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 16:28:28 +0100 Subject: [PATCH 343/384] [bravotv] add support for oxygen.com(closes #13357)(closes #22500) --- haruhi_dl/extractor/bravotv.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/bravotv.py b/haruhi_dl/extractor/bravotv.py index b9715df00..bae2aedce 100644 --- a/haruhi_dl/extractor/bravotv.py +++ b/haruhi_dl/extractor/bravotv.py @@ -12,7 +12,7 @@ from ..utils import ( class BravoTVIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?Pbravotv|oxygen)\.com/(?:[^/]+/)+(?P[^/?#]+)' _TESTS = [{ 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', @@ -28,10 +28,13 @@ class BravoTVIE(AdobePassIE): }, { 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', 'only_matching': True, + }, { + 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) settings = self._parse_json(self._search_regex( r']+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})', webpage, 'drupal settings'), @@ -53,11 +56,14 @@ class BravoTVIE(AdobePassIE): tp_path = release_pid = tve['release_pid'] if tve.get('entitlement') == 'auth': adobe_pass = settings.get('tve_adobe_auth', {}) + if site == 'bravotv': + site = 'bravo' resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId', 'bravo'), + adobe_pass.get('adobePassResourceId') or site, tve['title'], release_pid, tve.get('rating')) query['auth'] = self._extract_mvpd_auth( - url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource) + url, release_pid, + adobe_pass.get('adobePassRequestorId') or site, resource) else: shared_playlist = settings['ls_playlist'] account_pid = shared_playlist['account_pid'] From 00fe24846c67f1965b77283570bb16feb546b286 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 16:28:34 +0100 Subject: [PATCH 344/384] 
=?UTF-8?q?[pornhub:user]=20Add=20support=20for=20?= =?UTF-8?q?URLs=20unavailable=20via=20/videos=20page=20and=20=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …improve paging (closes #27853) --- haruhi_dl/extractor/pornhub.py | 56 ++++++++++++++++++++++++++++------ 1 file changed, 47 insertions(+), 9 deletions(-) diff --git a/haruhi_dl/extractor/pornhub.py b/haruhi_dl/extractor/pornhub.py index a66152e98..9b6c8ad3e 100644 --- a/haruhi_dl/extractor/pornhub.py +++ b/haruhi_dl/extractor/pornhub.py @@ -22,6 +22,7 @@ from ..utils import ( orderedSet, remove_quotes, str_to_int, + update_url_query, url_or_none, ) @@ -404,6 +405,10 @@ class PornHubIE(PornHubBaseIE): class PornHubPlaylistBaseIE(PornHubBaseIE): + def _extract_page(self, url): + return int_or_none(self._search_regex( + r'\bpage=(\d+)', url, 'page', default=None)) + def _extract_entries(self, webpage, host): # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see @@ -462,14 +467,27 @@ class PornHubUserIE(PornHubPlaylistBaseIE): }, { 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1', 'only_matching': True, + }, { + # Unavailable via /videos page, but available with direct pagination + # on pornstar page (see [1]), requires premium + # 1. https://github.com/hdl-org/haruhi-dl/issues/27853 + 'url': 'https://www.pornhubpremium.com/pornstar/sienna-west', + 'only_matching': True, + }, { + # Same as before, multi page + 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user_id = mobj.group('id') + videos_url = '%s/videos' % mobj.group('url') + page = self._extract_page(url) + if page: + videos_url = update_url_query(videos_url, {'page': page}) return self.url_result( - '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(), - video_id=user_id) + videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id) class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): @@ -487,17 +505,37 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): host = mobj.group('host') item_id = mobj.group('id') - page = int_or_none(self._search_regex( - r'\bpage=(\d+)', url, 'page', default=None)) + page = self._extract_page(url) + + VIDEOS = '/videos' + + def download_page(base_url, num): + note = 'Downloading %spage %d' % ('' if VIDEOS in base_url else 'fallback ', num) + return self._download_webpage( + base_url, item_id, note, query={'page': num}) + + def is_404(e): + return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 entries = [] - for page_num in (page, ) if page is not None else itertools.count(1): + base_url = url + has_page = page is not None + first_page = page if has_page else 1 + for page_num in (first_page, ) if has_page else itertools.count(first_page): try: - webpage = self._download_webpage( - url, item_id, 'Downloading page %d' % page_num, - query={'page': page_num}) + try: + webpage = download_page(base_url, page_num) + except ExtractorError as e: + # Some sources may not be available via /videos page, + # trying to fallback to main page pagination (see [1]) + # 1. 
https://github.com/hdl-org/haruhi-dl/issues/27853 + if is_404(e) and page_num == first_page and VIDEOS in base_url: + base_url = base_url.replace(VIDEOS, '') + webpage = download_page(base_url, page_num) + else: + raise except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + if is_404(e) and page_num != first_page: break raise page_entries = self._extract_entries(webpage, host) From f2cffa26d4a76d93bd6e5a3a46721aabed564c2c Mon Sep 17 00:00:00 2001 From: Sergey M Date: Fri, 26 Feb 2021 18:00:16 +0100 Subject: [PATCH 345/384] [pornhub] Add support for authentication (closes #18797, closes #21416, closes #24294) --- haruhi_dl/extractor/pornhub.py | 106 +++++++++++++++++++++++---------- 1 file changed, 76 insertions(+), 30 deletions(-) diff --git a/haruhi_dl/extractor/pornhub.py b/haruhi_dl/extractor/pornhub.py index 9b6c8ad3e..525286b2c 100644 --- a/haruhi_dl/extractor/pornhub.py +++ b/haruhi_dl/extractor/pornhub.py @@ -23,6 +23,7 @@ from ..utils import ( remove_quotes, str_to_int, update_url_query, + urlencode_postdata, url_or_none, ) @@ -57,6 +58,66 @@ class PornHubBaseIE(InfoExtractor): return webpage, urlh + def _real_initialize(self): + self._logged_in = False + + def _login(self, host): + if self._logged_in: + return + + site = host.split('.')[0] + + # Both sites pornhub and pornhubpremium have separate accounts + # so there should be an option to provide credentials for both. + # At the same time some videos are available under the same video id + # on both sites so that we have to identify them as the same video. + # For that purpose we have to keep both in the same extractor + # but under different netrc machines. + username, password = self._get_login_info(netrc_machine=site) + if username is None: + return + + login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '') + login_page = self._download_webpage( + login_url, None, 'Downloading %s login page' % site) + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'class=["\']signOut', + r'>Sign\s+[Oo]ut\s*<')) + + if is_logged(login_page): + self._logged_in = True + return + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + response = self._download_json( + 'https://www.%s/front/authenticate' % host, None, + 'Logging in to %s' % site, + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': login_url, + 'X-Requested-With': 'XMLHttpRequest', + }) + + if response.get('success') == '1': + self._logged_in = True + return + + message = response.get('message') + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % message, expected=True) + + raise ExtractorError('Unable to log in') + class PornHubIE(PornHubBaseIE): IE_DESC = 'PornHub and Thumbzilla' @@ -167,12 +228,20 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', 'only_matching': True, + }, { + # Some videos are available with the same id on both premium + # and non-premium sites (e.g. 
this and the following test) + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage, **kwargs): return re.findall( - r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)', + r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)', webpage) def _extract_count(self, pattern, webpage, name): @@ -184,12 +253,7 @@ class PornHubIE(PornHubBaseIE): host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') - if 'premium' in host: - if not self._downloader.params.get('cookiefile'): - raise ExtractorError( - 'PornHub Premium requires authentication.' - ' You may want to use --cookies.', - expected=True) + self._login(host) self._set_cookie(host, 'age_verified', '1') @@ -426,26 +490,6 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): container)) ] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - playlist_id = mobj.group('id') - - webpage = self._download_webpage(url, playlist_id) - - entries = self._extract_entries(webpage, host) - - playlist = self._parse_json( - self._search_regex( - r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage, - 'playlist', default='{}'), - playlist_id, fatal=False) - title = playlist.get('title') or self._search_regex( - r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False) - - return self.playlist_result( - entries, playlist_id, title, playlist.get('description')) - class PornHubUserIE(PornHubPlaylistBaseIE): _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' @@ -505,12 +549,14 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): host = mobj.group('host') item_id = mobj.group('id') + self._login(host) + page = self._extract_page(url) VIDEOS = '/videos' - def download_page(base_url, num): - note = 'Downloading %spage %d' % ('' if VIDEOS in base_url else 'fallback ', num) + def download_page(base_url, num, fallback=False): + note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '') return self._download_webpage( base_url, item_id, note, query={'page': num}) @@ -531,7 +577,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): # 1. 
https://github.com/hdl-org/haruhi-dl/issues/27853 if is_404(e) and page_num == first_page and VIDEOS in base_url: base_url = base_url.replace(VIDEOS, '') - webpage = download_page(base_url, page_num) + webpage = download_page(base_url, page_num, fallback=True) else: raise except ExtractorError as e: From 044b166cda6055c0107f16fcb41c5333f8e8bf27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 18:00:39 +0100 Subject: [PATCH 346/384] [svtplay] Fix video id extraction (closes #28058) --- haruhi_dl/extractor/svt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/svt.py b/haruhi_dl/extractor/svt.py index 1c2e747c8..2f768edf4 100644 --- a/haruhi_dl/extractor/svt.py +++ b/haruhi_dl/extractor/svt.py @@ -254,8 +254,10 @@ class SVTPlayIE(SVTPlayBaseIE): svt_id = self._search_regex( (r']+data-video-id=["\']([\da-zA-Z-]+)', r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', - r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'), + r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']svtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)'), webpage, 'video id') info_dict = self._extract_by_video_id(svt_id, webpage) From f21660e9631e4e59c228cedefff62064f49d63c3 Mon Sep 17 00:00:00 2001 From: Sergey M Date: Fri, 26 Feb 2021 18:02:15 +0100 Subject: [PATCH 347/384] [pornhub] Add placeholder netrc machine --- haruhi_dl/extractor/pornhub.py | 1 + 1 file changed, 1 insertion(+) diff --git a/haruhi_dl/extractor/pornhub.py b/haruhi_dl/extractor/pornhub.py index 525286b2c..d2524a090 100644 --- a/haruhi_dl/extractor/pornhub.py +++ b/haruhi_dl/extractor/pornhub.py @@ -30,6 +30,7 @@ from ..utils import ( class PornHubBaseIE(InfoExtractor): _REQUIRES_PLAYWRIGHT = True + _NETRC_MACHINE = 'pornhub' def _download_webpage_handle(self, *args, **kwargs): def dl(*args, **kwargs): From 4d9300cc44ea283501f9a6be2afd7a511280714e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 18:02:37 +0100 Subject: [PATCH 348/384] [pornhub] Implement lazy playlist extraction --- haruhi_dl/extractor/pornhub.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/haruhi_dl/extractor/pornhub.py b/haruhi_dl/extractor/pornhub.py index d2524a090..97b5508eb 100644 --- a/haruhi_dl/extractor/pornhub.py +++ b/haruhi_dl/extractor/pornhub.py @@ -545,13 +545,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): ]+\bid=["\']moreDataBtn ''', webpage) is not None - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - item_id = mobj.group('id') - - self._login(host) - + def _entries(self, url, host, item_id): page = self._extract_page(url) VIDEOS = '/videos' @@ -564,7 +558,6 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): def is_404(e): return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 - entries = [] base_url = url has_page = page is not None first_page = page if has_page else 1 @@ -588,11 +581,19 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): page_entries = self._extract_entries(webpage, host) if not page_entries: break - entries.extend(page_entries) + for e in page_entries: + yield e if not self._has_more(webpage): break - return self.playlist_result(orderedSet(entries), item_id) + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + 
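# _entries above is now a generator and playlist_result accepts any
# iterable, so a page is only downloaded once the previous page's
# entries have been consumed; with --playlist-items the crawl stops at
# the last requested page instead of walking the whole listing up
# front. The pattern in isolation, as a minimal sketch (illustrative
# names, not haruhi-dl API; itertools is already imported by this
# module):
def _lazy_pages_sketch(download_page, extract_entries, first_page=1):
    for page_num in itertools.count(first_page):
        page_entries = extract_entries(download_page(page_num))
        if not page_entries:
            break
        for entry in page_entries:
            yield entry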
host = mobj.group('host') + item_id = mobj.group('id') + + self._login(host) + + return self.playlist_result(self._entries(url, host, item_id), item_id) class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): From 1da3c6765184a75f552ad466485f28130acce9bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FAdrian=3D20Heine=3D20n=3DC3=3DA9=3D20Lan?= =?UTF-8?q?g=3F=3D?= Date: Fri, 26 Feb 2021 18:03:38 +0100 Subject: [PATCH 349/384] [azmedien] Fix extraction (#28064) --- haruhi_dl/extractor/azmedien.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/azmedien.py b/haruhi_dl/extractor/azmedien.py index b1e20def5..930266990 100644 --- a/haruhi_dl/extractor/azmedien.py +++ b/haruhi_dl/extractor/azmedien.py @@ -47,7 +47,7 @@ class AZMedienIE(InfoExtractor): 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', 'only_matching': True }] - _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d' + _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/a4016f65fe62b81dc6664dd9f4910e4ab40383be' _PARTNER_ID = '1719221' def _real_extract(self, url): From 1432a02035522a81c963e616e0ee03a9cf71ab51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FAdrian=3D20Heine=3D20n=3DC3=3DA9=3D20Lan?= =?UTF-8?q?g=3F=3D?= Date: Fri, 26 Feb 2021 18:03:49 +0100 Subject: [PATCH 350/384] [urplay] Fix extraction (closes #28073) (#28074) --- haruhi_dl/extractor/urplay.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/urplay.py b/haruhi_dl/extractor/urplay.py index 10b817760..5452c7ca1 100644 --- a/haruhi_dl/extractor/urplay.py +++ b/haruhi_dl/extractor/urplay.py @@ -42,8 +42,8 @@ class URPlayIE(InfoExtractor): url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) urplayer_data = self._parse_json(self._html_search_regex( - r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"', - webpage, 'urplayer data'), video_id)['currentProduct'] + r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['accessibleEpisodes'][0] episode = urplayer_data['title'] raw_streaming_info = urplayer_data['streamingInfo']['raw'] host = self._download_json( From 2df9ac25262ef72488313054f15afb95cbc40bd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 18:03:59 +0100 Subject: [PATCH 351/384] =?UTF-8?q?[archiveorg]=20Fix=20and=20improve=20ex?= =?UTF-8?q?traction=20(closes=20#21330,=20closes=20#23586=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …, closes #25277, closes #26780, closes #27109, closes #27236, closes #28063) --- haruhi_dl/extractor/archiveorg.py | 54 ++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 12 deletions(-) diff --git a/haruhi_dl/extractor/archiveorg.py b/haruhi_dl/extractor/archiveorg.py index c79c58e82..e42ed5e79 100644 --- a/haruhi_dl/extractor/archiveorg.py +++ b/haruhi_dl/extractor/archiveorg.py @@ -2,15 +2,17 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - unified_strdate, clean_html, + extract_attributes, + unified_strdate, + unified_timestamp, ) class ArchiveOrgIE(InfoExtractor): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' - _VALID_URL = 
r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P[^/?#]+)(?:[?].*)?$' + _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'md5': '8af1d4cf447933ed3c7f4871162602db', @@ -19,8 +21,11 @@ class ArchiveOrgIE(InfoExtractor): 'ext': 'ogg', 'title': '1968 Demo - FJCC Conference Presentation Reel #1', 'description': 'md5:da45c349df039f1cc8075268eb1b5c25', - 'upload_date': '19681210', - 'uploader': 'SRI International' + 'creator': 'SRI International', + 'release_date': '19681210', + 'uploader': 'SRI International', + 'timestamp': 1268695290, + 'upload_date': '20100315', } }, { 'url': 'https://archive.org/details/Cops1922', @@ -29,22 +34,43 @@ class ArchiveOrgIE(InfoExtractor): 'id': 'Cops1922', 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6', + 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c', + 'timestamp': 1387699629, + 'upload_date': '20131222', } }, { 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'only_matching': True, + }, { + 'url': 'https://archive.org/details/MSNBCW_20131125_040000_To_Catch_a_Predator/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://archive.org/embed/' + video_id, video_id) - jwplayer_playlist = self._parse_json(self._search_regex( - r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", - webpage, 'jwplayer playlist'), video_id) - info = self._parse_jwplayer_data( - {'playlist': jwplayer_playlist}, video_id, base_url=url) + + playlist = None + play8 = self._search_regex( + r'(<[^>]+\bclass=["\']js-play8-playlist[^>]+>)', webpage, + 'playlist', default=None) + if play8: + attrs = extract_attributes(play8) + playlist = attrs.get('value') + if not playlist: + # Old jwplayer fallback + playlist = self._search_regex( + r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", + webpage, 'jwplayer playlist', default='[]') + jwplayer_playlist = self._parse_json(playlist, video_id, fatal=False) + if jwplayer_playlist: + info = self._parse_jwplayer_data( + {'playlist': jwplayer_playlist}, video_id, base_url=url) + else: + # HTML5 media fallback + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info['id'] = video_id def get_optional(metadata, field): return metadata.get(field, [None])[0] @@ -58,8 +84,12 @@ class ArchiveOrgIE(InfoExtractor): 'description': clean_html(get_optional(metadata, 'description')), }) if info.get('_type') != 'playlist': + creator = get_optional(metadata, 'creator') info.update({ - 'uploader': get_optional(metadata, 'creator'), - 'upload_date': unified_strdate(get_optional(metadata, 'date')), + 'creator': creator, + 'release_date': unified_strdate(get_optional(metadata, 'date')), + 'uploader': get_optional(metadata, 'publisher') or creator, + 'timestamp': unified_timestamp(get_optional(metadata, 'publicdate')), + 'language': get_optional(metadata, 'language'), }) return info From 22b2970a2d3b9d37fde5dcac9f7f02d9c73c41f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 18:04:08 +0100 Subject: [PATCH 352/384] =?UTF-8?q?[xhamster]=20Extract=20formats=20from?= =?UTF-8?q?=20xplayer=20settings=20and=20extract=20filesize=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …s (closes #28114) --- 
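The subtle part of this fix: xhamster's page JSON splits a rendition's URL and its byte size across two maps keyed by the same quality string, so sizes are collected once from sources['download'] and joined onto every emitted format afterwards. Reduced to a sketch (field names follow the patch; the helper itself is hypothetical):

def collect_sizes(sources):
    # sources['download'] maps quality -> {'size': ...}; the other maps
    # only carry URLs, so build the size table up front.
    sizes = {}
    for quality, fmt in (sources.get('download') or {}).items():
        if isinstance(fmt, dict):
            sizes[quality] = fmt.get('size')
    return sizes

Each format then sets 'filesize': sizes.get(quality), whichever map supplied its URL.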
haruhi_dl/extractor/xhamster.py | 80 ++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 12 deletions(-) diff --git a/haruhi_dl/extractor/xhamster.py b/haruhi_dl/extractor/xhamster.py index 19ec98de2..3a1e83043 100644 --- a/haruhi_dl/extractor/xhamster.py +++ b/haruhi_dl/extractor/xhamster.py @@ -11,11 +11,14 @@ from ..utils import ( dict_get, extract_attributes, ExtractorError, + float_or_none, int_or_none, parse_duration, + str_or_none, try_get, unified_strdate, url_or_none, + urljoin, ) @@ -146,36 +149,89 @@ class XHamsterIE(InfoExtractor): video = initials['videoModel'] title = video['title'] formats = [] - for format_id, formats_dict in video['sources'].items(): + format_urls = set() + format_sizes = {} + sources = try_get(video, lambda x: x['sources'], dict) or {} + for format_id, formats_dict in sources.items(): if not isinstance(formats_dict, dict): continue + download_sources = try_get(sources, lambda x: x['download'], dict) or {} + for quality, format_dict in download_sources.items(): + if not isinstance(format_dict, dict): + continue + format_sizes[quality] = float_or_none(format_dict.get('size')) for quality, format_item in formats_dict.items(): if format_id == 'download': # Download link takes some time to be generated, # skipping for now continue - if not isinstance(format_item, dict): - continue - format_url = format_item.get('link') - filesize = int_or_none( - format_item.get('size'), invscale=1000000) - else: - format_url = format_item - filesize = None + format_url = format_item format_url = url_or_none(format_url) - if not format_url: + if not format_url or format_url in format_urls: continue + format_urls.add(format_url) formats.append({ 'format_id': '%s-%s' % (format_id, quality), 'url': format_url, 'ext': determine_ext(format_url, 'mp4'), 'height': get_height(quality), - 'filesize': filesize, + 'filesize': format_sizes.get(quality), 'http_headers': { 'Referer': urlh.geturl(), }, }) - self._sort_formats(formats) + xplayer_sources = try_get( + initials, lambda x: x['xplayerSettings']['sources'], dict) + if xplayer_sources: + hls_sources = xplayer_sources.get('hls') + if isinstance(hls_sources, dict): + for hls_format_key in ('url', 'fallback'): + hls_url = hls_sources.get(hls_format_key) + if not hls_url: + continue + hls_url = urljoin(url, hls_url) + if not hls_url or hls_url in format_urls: + continue + format_urls.add(hls_url) + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + standard_sources = xplayer_sources.get('standard') + if isinstance(standard_sources, dict): + for format_id, formats_list in standard_sources.items(): + if not isinstance(formats_list, list): + continue + for standard_format in formats_list: + if not isinstance(standard_format, dict): + continue + for standard_format_key in ('url', 'fallback'): + standard_url = standard_format.get(standard_format_key) + if not standard_url: + continue + standard_url = urljoin(url, standard_url) + if not standard_url or standard_url in format_urls: + continue + format_urls.add(standard_url) + ext = determine_ext(standard_url, 'mp4') + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + standard_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + quality = (str_or_none(standard_format.get('quality')) + or str_or_none(standard_format.get('label')) + or '') + formats.append({ + 'format_id': '%s-%s' % (format_id, quality), + 'url': standard_url, + 'ext': ext, + 
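# Every URL in this xplayer walk was first checked against the
# format_urls set shared with the legacy 'sources' loop above, so a
# rendition reachable through both JSON trees is emitted only once.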
'height': get_height(quality), + 'filesize': format_sizes.get(quality), + 'http_headers': { + 'Referer': standard_url, + }, + }) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) categories_list = video.get('categories') if isinstance(categories_list, list): From 87dee740afd930ec86b101c35d2d63f104065166 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 18:04:15 +0100 Subject: [PATCH 353/384] [ign] fix extraction(closes #24771) --- haruhi_dl/extractor/extractors.py | 4 +- haruhi_dl/extractor/ign.py | 371 ++++++++++++++++-------------- 2 files changed, 200 insertions(+), 175 deletions(-) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 44a24b3dc..74d8320c4 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -500,8 +500,8 @@ from .hungama import ( from .hypem import HypemIE from .ign import ( IGNIE, - OneUPIE, - PCMagIE, + IGNVideoIE, + IGNArticleIE, ) from .iheart import ( IHeartRadioIE, diff --git a/haruhi_dl/extractor/ign.py b/haruhi_dl/extractor/ign.py index a96ea8010..0d9f50ed2 100644 --- a/haruhi_dl/extractor/ign.py +++ b/haruhi_dl/extractor/ign.py @@ -3,230 +3,255 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( + HEADRequest, + determine_ext, int_or_none, parse_iso8601, + strip_or_none, + try_get, ) -class IGNIE(InfoExtractor): +class IGNBaseIE(InfoExtractor): + def _call_api(self, slug): + return self._download_json( + 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug) + + +class IGNIE(IGNBaseIE): """ Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. Some videos of it.ign.com are also supported """ - _VALID_URL = r'https?://.+?\.ign\.com/(?:[^/]+/)?(?Pvideos|show_videos|articles|feature|(?:[^/]+/\d+/video))(/.+)?/(?P.+)' + _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P[^/?&#]+)' IE_NAME = 'ign.com' + _PAGE_TYPE = 'video' - _API_URL_TEMPLATE = 'http://apis.ign.com/video/v3/videos/%s' - _EMBED_RE = r']+?["\']((?:https?:)?//.+?\.ign\.com.+?/embed.+?)["\']' - - _TESTS = [ - { - 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', - 'md5': 'febda82c4bafecd2d44b6e1a18a595f8', - 'info_dict': { - 'id': '8f862beef863986b2785559b9e1aa599', - 'ext': 'mp4', - 'title': 'The Last of Us Review', - 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', - 'timestamp': 1370440800, - 'upload_date': '20130605', - 'uploader_id': 'cberidon@ign.com', - } - }, - { - 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', - 'info_dict': { - 'id': '100-little-things-in-gta-5-that-will-blow-your-mind', - }, - 'playlist': [ - { - 'info_dict': { - 'id': '5ebbd138523268b93c9141af17bec937', - 'ext': 'mp4', - 'title': 'GTA 5 Video Review', - 'description': 'Rockstar drops the mic on this generation of games. 
Watch our review of the masterly Grand Theft Auto V.', - 'timestamp': 1379339880, - 'upload_date': '20130916', - 'uploader_id': 'danieljkrupa@gmail.com', - }, - }, - { - 'info_dict': { - 'id': '638672ee848ae4ff108df2a296418ee2', - 'ext': 'mp4', - 'title': '26 Twisted Moments from GTA 5 in Slow Motion', - 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', - 'timestamp': 1386878820, - 'upload_date': '20131212', - 'uploader_id': 'togilvie@ign.com', - }, - }, - ], - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', - 'md5': '618fedb9c901fd086f6f093564ef8558', - 'info_dict': { - 'id': '078fdd005f6d3c02f63d795faa1b984f', - 'ext': 'mp4', - 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', - 'description': 'Brian and Jared explore Michel Ancel\'s captivating new preview.', - 'timestamp': 1408047180, - 'upload_date': '20140814', - 'uploader_id': 'jamesduggan1990@gmail.com', - }, - }, - { - 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', - 'only_matching': True, - }, - { - 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', - 'only_matching': True, - }, - { - # videoId pattern - 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', - 'only_matching': True, - }, - ] - - def _find_video_id(self, webpage): - res_id = [ - r'"video_id"\s*:\s*"(.*?)"', - r'class="hero-poster[^"]*?"[^>]*id="(.+?)"', - r'data-video-id="(.+?)"', - r']*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', - webpage) - if multiple_urls: - entries = [self.url_result(u, ie='IGN') for u in multiple_urls] - return { - '_type': 'playlist', - 'id': name_or_id, - 'entries': entries, - } - - video_id = self._find_video_id(webpage) - if not video_id: - return self.url_result(self._search_regex( - self._EMBED_RE, webpage, 'embed url')) - return self._get_video_info(video_id) - - def _get_video_info(self, video_id): - api_data = self._download_json( - self._API_URL_TEMPLATE % video_id, video_id) + display_id = self._match_id(url) + video = self._call_api(display_id) + video_id = video['videoId'] + metadata = video['metadata'] + title = metadata.get('longTitle') or metadata.get('title') or metadata['name'] formats = [] - m3u8_url = api_data['refs'].get('m3uUrl') + refs = video.get('refs') or {} + + m3u8_url = refs.get('m3uUrl') if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - f4m_url = api_data['refs'].get('f4mUrl') + + f4m_url = refs.get('f4mUrl') if f4m_url: formats.extend(self._extract_f4m_formats( f4m_url, video_id, f4m_id='hds', fatal=False)) - for asset in api_data['assets']: + + for asset in (video.get('assets') or []): + asset_url = asset.get('url') + if not asset_url: + continue formats.append({ - 'url': asset['url'], - 'tbr': asset.get('actual_bitrate_kbps'), - 'fps': asset.get('frame_rate'), + 'url': asset_url, + 'tbr': int_or_none(asset.get('bitrate'), 1000), + 'fps': int_or_none(asset.get('frame_rate')), 'height': int_or_none(asset.get('height')), 'width': int_or_none(asset.get('width')), }) + + mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl']) + if mezzanine_url: + formats.append({ + 'ext': determine_ext(mezzanine_url, 'mp4'), + 'format_id': 'mezzanine', + 'preference': 1, + 'url': mezzanine_url, + }) + self._sort_formats(formats) - thumbnails = [{ - 'url': 
thumbnail['url'] - } for thumbnail in api_data.get('thumbnails', [])] + thumbnails = [] + for thumbnail in (video.get('thumbnails') or []): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + }) - metadata = api_data['metadata'] + tags = [] + for tag in (video.get('tags') or []): + display_name = tag.get('displayName') + if not display_name: + continue + tags.append(display_name) return { - 'id': api_data.get('videoId') or video_id, - 'title': metadata.get('longTitle') or metadata.get('name') or metadata.get['title'], - 'description': metadata.get('description'), + 'id': video_id, + 'title': title, + 'description': strip_or_none(metadata.get('description')), 'timestamp': parse_iso8601(metadata.get('publishDate')), 'duration': int_or_none(metadata.get('duration')), - 'display_id': metadata.get('slug') or video_id, - 'uploader_id': metadata.get('creator'), + 'display_id': display_id, 'thumbnails': thumbnails, 'formats': formats, + 'tags': tags, } -class OneUPIE(IGNIE): - _VALID_URL = r'https?://gamevideos\.1up\.com/(?Pvideo)/id/(?P.+)\.html' - IE_NAME = '1up.com' - +class IGNVideoIE(InfoExtractor): + _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P\d+)/(?:video|trailer)/' _TESTS = [{ - 'url': 'http://gamevideos.1up.com/video/id/34976.html', - 'md5': 'c9cc69e07acb675c31a16719f909e347', + 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', + 'md5': 'dd9aca7ed2657c4e118d8b261e5e9de1', 'info_dict': { - 'id': '34976', + 'id': 'e9be7ea899a9bbfc0674accc22a36cc8', 'ext': 'mp4', - 'title': 'Sniper Elite V2 - Trailer', - 'description': 'md5:bf0516c5ee32a3217aa703e9b1bc7826', - 'timestamp': 1313099220, - 'upload_date': '20110811', - 'uploader_id': 'IGN', + 'title': 'How Hitman Aims to Be Different Than Every Other Stealth Game - NYCC 2015', + 'description': 'Taking out assassination targets in Hitman has never been more stylish.', + 'timestamp': 1444665600, + 'upload_date': '20151012', } + }, { + 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', + 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://me.ign.com/ar/ratchet-clank-rift-apart/144327/trailer/embed', + 'only_matching': True, + }, { + # Twitter embed + 'url': 'http://adria.ign.com/sherlock-season-4/9687/trailer/embed', + 'only_matching': True, + }, { + # Vimeo embed + 'url': 'https://kr.ign.com/bic-2018/3307/trailer/embed', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - result = super(OneUPIE, self)._real_extract(url) - result['id'] = mobj.group('name_or_id') - return result + video_id = self._match_id(url) + req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') + url = self._request_webpage(req, video_id).geturl() + ign_url = compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('url', [None])[0] + if ign_url: + return self.url_result(ign_url, IGNIE.ie_key()) + return self.url_result(url) -class PCMagIE(IGNIE): - _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?Pvideos|article2)(/.+)?/(?P.+)' - IE_NAME = 'pcmag' - - _EMBED_RE = r'iframe\.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content\.html?[^"]*url=([^"]+)["&]' - +class IGNArticleIE(IGNBaseIE): + _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P[^/?&#]+)' + _PAGE_TYPE = 'article' _TESTS = [{ - 'url': 
'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', - 'md5': '212d6154fd0361a2781075f1febbe9ad', + 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'info_dict': { - 'id': 'ee10d774b508c9b8ec07e763b9125b91', - 'ext': 'mp4', - 'title': '010615_What\'s New Now: Is GoGo Snooping on Your Data?', - 'description': 'md5:a7071ae64d2f68cc821c729d4ded6bb3', - 'timestamp': 1420571160, - 'upload_date': '20150106', - 'uploader_id': 'cozzipix@gmail.com', - } + 'id': '524497489e4e8ff5848ece34', + 'title': '100 Little Things in GTA 5 That Will Blow Your Mind', + }, + 'playlist': [ + { + 'info_dict': { + 'id': '5ebbd138523268b93c9141af17bec937', + 'ext': 'mp4', + 'title': 'GTA 5 Video Review', + 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', + 'timestamp': 1379339880, + 'upload_date': '20130916', + }, + }, + { + 'info_dict': { + 'id': '638672ee848ae4ff108df2a296418ee2', + 'ext': 'mp4', + 'title': '26 Twisted Moments from GTA 5 in Slow Motion', + 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', + 'timestamp': 1386878820, + 'upload_date': '20131212', + }, + }, + ], + 'params': { + 'playlist_items': '2-3', + 'skip_download': True, + }, }, { - 'url': 'http://www.pcmag.com/article2/0,2817,2470156,00.asp', - 'md5': '94130c1ca07ba0adb6088350681f16c1', + 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', 'info_dict': { - 'id': '042e560ba94823d43afcb12ddf7142ca', - 'ext': 'mp4', - 'title': 'HTC\'s Weird New Re Camera - What\'s New Now', - 'description': 'md5:53433c45df96d2ea5d0fda18be2ca908', - 'timestamp': 1412953920, - 'upload_date': '20141010', - 'uploader_id': 'chris_snyder@pcmag.com', - } + 'id': '53ee806780a81ec46e0790f8', + 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', + }, + 'playlist_count': 2, + }, { + # videoId pattern + 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', + 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://www.ign.com/articles/2021-mvp-named-in-puppy-bowl-xvii', + 'only_matching': True, + }, { + # IMDB embed + 'url': 'https://www.ign.com/articles/2014/08/07/sons-of-anarchy-final-season-trailer', + 'only_matching': True, + }, { + # Facebook embed + 'url': 'https://www.ign.com/articles/2017/09/20/marvels-the-punisher-watch-the-new-trailer-for-the-netflix-series', + 'only_matching': True, + }, { + # Brightcove embed + 'url': 'https://www.ign.com/articles/2016/01/16/supergirl-goes-flying-with-martian-manhunter-in-new-clip', + 'only_matching': True, }] + + def _real_extract(self, url): + display_id = self._match_id(url) + article = self._call_api(display_id) + + def entries(): + media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) + if media_url: + yield self.url_result(media_url, IGNIE.ie_key()) + for content in (article.get('content') or []): + for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|]+src)="([^"]+)"', content): + yield self.url_result(video_url) + + return self.playlist_result( + entries(), article.get('articleId'), + strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) From ec2eaef0cae96edb82f80bd6941d189fa86c28bb Mon Sep 17 00:00:00 2001 From: Kevin Velghe Date: Fri, 26 Feb 2021 18:04:23 +0100 Subject: [PATCH 354/384] [canvas] Add new extractor for Dagelijkse Kost (#28119) --- 
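DagelijkseKostIE below is a pure resolver: it scrapes the mediazone asset id plus the human-readable title and description from the recipe page, then defers through a url_transparent result so that CanvasIE supplies the formats while the fields collected here take precedence. The essential moves, as a sketch following the patch (not a drop-in replacement):

def _real_extract(self, url):
    display_id = self._match_id(url)
    webpage = self._download_webpage(url, display_id)
    # the recipe page carries the mediazone asset id in a data-url attribute
    video_id = self._html_search_regex(
        r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage,
        'video id', group='id')
    return {
        '_type': 'url_transparent',  # our fields override the delegate's
        'ie_key': CanvasIE.ie_key(),
        'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id,
        'id': video_id,
        'display_id': display_id,
    }

The same device reappears in the StoryFire patch later in this series, with Vimeo as the delegate.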
haruhi_dl/extractor/canvas.py | 56 +++++++++++++++++++++++++++++-- haruhi_dl/extractor/extractors.py | 1 + 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/haruhi_dl/extractor/canvas.py b/haruhi_dl/extractor/canvas.py index 8b76a0200..eefbab241 100644 --- a/haruhi_dl/extractor/canvas.py +++ b/haruhi_dl/extractor/canvas.py @@ -7,19 +7,21 @@ from .common import InfoExtractor from .gigya import GigyaBaseIE from ..compat import compat_HTTPError from ..utils import ( - extract_attributes, ExtractorError, - strip_or_none, + clean_html, + extract_attributes, float_or_none, + get_element_by_class, int_or_none, merge_dicts, str_or_none, + strip_or_none, url_or_none, ) class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P[^/?#&]+)' + _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?Pcanvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P[^/?#&]+)' _TESTS = [{ 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'md5': '68993eda72ef62386a15ea2cf3c93107', @@ -332,3 +334,51 @@ class VrtNUIE(GigyaBaseIE): 'display_id': display_id, 'season_number': int_or_none(page.get('episode_season')), }) + + +class DagelijkseKostIE(InfoExtractor): + IE_DESC = 'dagelijksekost.een.be' + _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P[^/?#&]+)' + _TEST = { + 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', + 'md5': '30bfffc323009a3e5f689bef6efa2365', + 'info_dict': { + 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', + 'display_id': 'hachis-parmentier-met-witloof', + 'ext': 'mp4', + 'title': 'Hachis parmentier met witloof', + 'description': 'md5:9960478392d87f63567b5b117688cdc5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 283.02, + }, + 'expected_warnings': ['is not a supported codec'], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + title = strip_or_none(get_element_by_class( + 'dish-metadata__title', webpage + ) or self._html_search_meta( + 'twitter:title', webpage)) + + description = clean_html(get_element_by_class( + 'dish-description', webpage) + ) or self._html_search_meta( + ('description', 'twitter:description', 'og:description'), + webpage) + + video_id = self._html_search_regex( + r'data-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video id', + group='id') + + return { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id, + 'ie_key': CanvasIE.ie_key(), + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + } diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 74d8320c4..b3fc1ad81 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -173,6 +173,7 @@ from .canvas import ( CanvasIE, CanvasEenIE, VrtNUIE, + DagelijkseKostIE, ) from .carambatv import ( CarambaTVIE, From 89b41a73aac98ff3f5f33ca62f0c8795daf9b4b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 18:04:40 +0100 Subject: [PATCH 355/384] [ard] Improve formats extraction (closes #28155) --- haruhi_dl/extractor/ard.py | 44 ++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/haruhi_dl/extractor/ard.py b/haruhi_dl/extractor/ard.py index 6bf5b3f13..143fc51e9 100644 --- 
a/haruhi_dl/extractor/ard.py +++ b/haruhi_dl/extractor/ard.py @@ -284,20 +284,42 @@ class ARDIE(InfoExtractor): formats = [] for a in video_node.findall('.//asset'): + file_name = xpath_text(a, './fileName', default=None) + if not file_name: + continue + format_type = a.attrib.get('type') + format_url = url_or_none(file_name) + if format_url: + ext = determine_ext(file_name) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_type or 'hls', fatal=False)) + continue + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), + display_id, f4m_id=format_type or 'hds', fatal=False)) + continue f = { - 'format_id': a.attrib['type'], - 'width': int_or_none(a.find('./frameWidth').text), - 'height': int_or_none(a.find('./frameHeight').text), - 'vbr': int_or_none(a.find('./bitrateVideo').text), - 'abr': int_or_none(a.find('./bitrateAudio').text), - 'vcodec': a.find('./codecVideo').text, - 'tbr': int_or_none(a.find('./totalBitrate').text), + 'format_id': format_type, + 'width': int_or_none(xpath_text(a, './frameWidth')), + 'height': int_or_none(xpath_text(a, './frameHeight')), + 'vbr': int_or_none(xpath_text(a, './bitrateVideo')), + 'abr': int_or_none(xpath_text(a, './bitrateAudio')), + 'vcodec': xpath_text(a, './codecVideo'), + 'tbr': int_or_none(xpath_text(a, './totalBitrate')), } - if a.find('./serverPrefix').text: - f['url'] = a.find('./serverPrefix').text - f['playpath'] = a.find('./fileName').text + server_prefix = xpath_text(a, './serverPrefix', default=None) + if server_prefix: + f.update({ + 'url': server_prefix, + 'playpath': file_name, + }) else: - f['url'] = a.find('./fileName').text + if not format_url: + continue + f['url'] = format_url formats.append(f) self._sort_formats(formats) From 221f01621a7666c8887e2534c54ccfb8a767ac56 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 18:04:46 +0100 Subject: [PATCH 356/384] [xboxclips] fix extraction(closes #27151) --- haruhi_dl/extractor/xboxclips.py | 45 +++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/haruhi_dl/extractor/xboxclips.py b/haruhi_dl/extractor/xboxclips.py index d9c277bc3..25f487e1e 100644 --- a/haruhi_dl/extractor/xboxclips.py +++ b/haruhi_dl/extractor/xboxclips.py @@ -1,40 +1,55 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( int_or_none, + month_by_abbreviation, parse_filesize, - unified_strdate, ) class XboxClipsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P[\w-]{36})' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:xboxclips\.com|gameclips\.io)/(?:video\.php\?.*vid=|[^/]+/)(?P[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _TESTS = [{ 'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', 'md5': 'fbe1ec805e920aeb8eced3c3e657df5d', 'info_dict': { 'id': '074a69a9-5faf-46aa-b93b-9909c1720325', 'ext': 'mp4', - 'title': 'Iabdulelah playing Titanfall', + 'title': 'iAbdulElah playing Titanfall', 'filesize_approx': 26800000, 'upload_date': '20140807', 'duration': 56, } - } + }, { + 'url': 'https://gameclips.io/iAbdulElah/074a69a9-5faf-46aa-b93b-9909c1720325', + 'only_matching': True, + }] def _real_extract(self, url): video_id = 
self._match_id(url) - webpage = self._download_webpage(url, video_id) + if '/video.php' in url: + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + url = 'https://gameclips.io/%s/%s' % (qs['gamertag'][0], qs['vid'][0]) - video_url = self._html_search_regex( - r'>(?:Link|Download): ]+href="([^"]+)"', webpage, 'video URL') - title = self._html_search_regex( - r'XboxClips \| ([^<]+)', webpage, 'title') - upload_date = unified_strdate(self._html_search_regex( - r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False)) + webpage = self._download_webpage(url, video_id) + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + + title = self._html_search_meta(['og:title', 'twitter:title'], webpage) + upload_date = None + mobj = re.search( + r'>Recorded: (\d{2})-(Jan|Feb|Mar|Apr|May|Ju[nl]|Aug|Sep|Oct|Nov|Dec)-(\d{4})', + webpage) + if mobj: + upload_date = '%s%.2d%s' % (mobj.group(3), month_by_abbreviation(mobj.group(2)), mobj.group(1)) filesize = parse_filesize(self._html_search_regex( r'>Size: ([^<]+)<', webpage, 'file size', fatal=False)) duration = int_or_none(self._html_search_regex( @@ -42,12 +57,12 @@ class XboxClipsIE(InfoExtractor): view_count = int_or_none(self._html_search_regex( r'>Views: (\d+)<', webpage, 'view count', fatal=False)) - return { + info.update({ 'id': video_id, - 'url': video_url, 'title': title, 'upload_date': upload_date, 'filesize_approx': filesize, 'duration': duration, 'view_count': view_count, - } + }) + return info From 3fc10250f2f72f6368295cba1060239f1f44ac9b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 18:04:54 +0100 Subject: [PATCH 357/384] =?UTF-8?q?[kakao]=20improve=20info=20extraction?= =?UTF-8?q?=20and=20detect=20geo=20restriction(closes=20#26=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …577) --- haruhi_dl/extractor/kakao.py | 64 +++++++++++++++++------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/haruhi_dl/extractor/kakao.py b/haruhi_dl/extractor/kakao.py index 32935bb28..31ce7a85c 100644 --- a/haruhi_dl/extractor/kakao.py +++ b/haruhi_dl/extractor/kakao.py @@ -3,10 +3,13 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, int_or_none, + str_or_none, strip_or_none, + try_get, unified_timestamp, update_url_query, ) @@ -23,7 +26,7 @@ class KakaoIE(InfoExtractor): 'id': '301965083', 'ext': 'mp4', 'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』', - 'uploader_id': 2671005, + 'uploader_id': '2671005', 'uploader': '그랑그랑이', 'timestamp': 1488160199, 'upload_date': '20170227', @@ -36,11 +39,15 @@ class KakaoIE(InfoExtractor): 'ext': 'mp4', 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', - 'uploader_id': 2653210, + 'uploader_id': '2653210', 'uploader': '쇼! 
음악중심', 'timestamp': 1485684628, 'upload_date': '20170129', } + }, { + # geo restricted + 'url': 'https://tv.kakao.com/channel/3643855/cliplink/412069491', + 'only_matching': True, }] def _real_extract(self, url): @@ -68,8 +75,7 @@ class KakaoIE(InfoExtractor): 'fields': ','.join([ '-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title', 'description', 'channelId', 'createTime', 'duration', 'playCount', - 'likeCount', 'commentCount', 'tagList', 'channel', 'name', - 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault', + 'likeCount', 'commentCount', 'tagList', 'channel', 'name', 'thumbnailUrl', 'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label']) } @@ -82,24 +88,28 @@ class KakaoIE(InfoExtractor): title = clip.get('title') or clip_link.get('displayTitle') - query['tid'] = impress.get('tid', '') + query.update({ + 'fields': '-*,code,message,url', + 'tid': impress.get('tid') or '', + }) formats = [] - for fmt in clip.get('videoOutputList', []): + for fmt in (clip.get('videoOutputList') or []): try: profile_name = fmt['profile'] if profile_name == 'AUDIO': continue - query.update({ - 'profile': profile_name, - 'fields': '-*,url', - }) - fmt_url_json = self._download_json( - api_base + 'raw/videolocation', display_id, - 'Downloading video URL for profile %s' % profile_name, - query=query, headers=player_header, fatal=False) - - if fmt_url_json is None: + query['profile'] = profile_name + try: + fmt_url_json = self._download_json( + api_base + 'raw/videolocation', display_id, + 'Downloading video URL for profile %s' % profile_name, + query=query, headers=player_header) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + resp = self._parse_json(e.cause.read().decode(), video_id) + if resp.get('code') == 'GeoBlocked': + self.raise_geo_restricted() continue fmt_url = fmt_url_json['url'] @@ -116,27 +126,13 @@ class KakaoIE(InfoExtractor): pass self._sort_formats(formats) - thumbs = [] - for thumb in clip.get('clipChapterThumbnailList', []): - thumbs.append({ - 'url': thumb.get('thumbnailUrl'), - 'id': compat_str(thumb.get('timeInSec')), - 'preference': -1 if thumb.get('isDefault') else 0 - }) - top_thumbnail = clip.get('thumbnailUrl') - if top_thumbnail: - thumbs.append({ - 'url': top_thumbnail, - 'preference': 10, - }) - return { 'id': display_id, 'title': title, 'description': strip_or_none(clip.get('description')), - 'uploader': clip_link.get('channel', {}).get('name'), - 'uploader_id': clip_link.get('channelId'), - 'thumbnails': thumbs, + 'uploader': try_get(clip_link, lambda x: x['channel']['name']), + 'uploader_id': str_or_none(clip_link.get('channelId')), + 'thumbnail': clip.get('thumbnailUrl'), 'timestamp': unified_timestamp(clip_link.get('createTime')), 'duration': int_or_none(clip.get('duration')), 'view_count': int_or_none(clip.get('playCount')), From eb88460be94f42f81953e9c5beea28e7d29f1c82 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 18:10:30 +0100 Subject: [PATCH 358/384] [videopress] add support for video.wordpress.com --- haruhi_dl/extractor/videopress.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/haruhi_dl/extractor/videopress.py b/haruhi_dl/extractor/videopress.py index 8938050a5..1f046b85b 100644 --- a/haruhi_dl/extractor/videopress.py +++ b/haruhi_dl/extractor/videopress.py @@ -4,21 +4,22 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( 
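# int_or_none is new here, used for the width/height now reported on
# the 'original' download; compat_str and try_get drop out because
# file_url_base is read with plain dict lookups further down.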
determine_ext, float_or_none, + int_or_none, parse_age_limit, qualities, random_birthday, - try_get, unified_timestamp, urljoin, ) class VideoPressIE(InfoExtractor): - _VALID_URL = r'https?://videopress\.com/embed/(?P[\da-zA-Z]+)' + _ID_REGEX = r'[\da-zA-Z]{8}' + _PATH_REGEX = r'video(?:\.word)?press\.com/embed/' + _VALID_URL = r'https?://%s(?P%s)' % (_PATH_REGEX, _ID_REGEX) _TESTS = [{ 'url': 'https://videopress.com/embed/kUJmAcSf', 'md5': '706956a6c875873d51010921310e4bc6', @@ -36,35 +37,36 @@ class VideoPressIE(InfoExtractor): # 17+, requires birth_* params 'url': 'https://videopress.com/embed/iH3gstfZ', 'only_matching': True, + }, { + 'url': 'https://video.wordpress.com/embed/kUJmAcSf', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage, **kwargs): return re.findall( - r']+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)', + r']+src=["\']((?:https?://)?%s%s)' % (VideoPressIE._PATH_REGEX, VideoPressIE._ID_REGEX), webpage) def _real_extract(self, url): video_id = self._match_id(url) query = random_birthday('birth_year', 'birth_month', 'birth_day') + query['fields'] = 'description,duration,file_url_base,files,height,original,poster,rating,title,upload_date,width' video = self._download_json( 'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, video_id, query=query) title = video['title'] - def base_url(scheme): - return try_get( - video, lambda x: x['file_url_base'][scheme], compat_str) - - base_url = base_url('https') or base_url('http') + file_url_base = video.get('file_url_base') or {} + base_url = file_url_base.get('https') or file_url_base.get('http') QUALITIES = ('std', 'dvd', 'hd') quality = qualities(QUALITIES) formats = [] - for format_id, f in video['files'].items(): + for format_id, f in (video.get('files') or {}).items(): if not isinstance(f, dict): continue for ext, path in f.items(): @@ -75,12 +77,14 @@ class VideoPressIE(InfoExtractor): 'ext': determine_ext(path, ext), 'quality': quality(format_id), }) - original_url = try_get(video, lambda x: x['original'], compat_str) + original_url = video.get('original') if original_url: formats.append({ 'url': original_url, 'format_id': 'original', 'quality': len(QUALITIES), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), }) self._sort_formats(formats) From 727a4a5b791426f0ab6d5e924b60bfcf0217e77e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 18:11:46 +0100 Subject: [PATCH 359/384] [ccma] fix timestamp parsing in python 2 --- haruhi_dl/extractor/ccma.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/ccma.py b/haruhi_dl/extractor/ccma.py index 4db51e650..e6ae49352 100644 --- a/haruhi_dl/extractor/ccma.py +++ b/haruhi_dl/extractor/ccma.py @@ -1,12 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals +import calendar import datetime import re from .common import InfoExtractor from ..utils import ( clean_html, + extract_timezone, int_or_none, parse_duration, parse_resolution, @@ -97,8 +99,9 @@ class CCMAIE(InfoExtractor): timestamp = None data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) try: - timestamp = datetime.datetime.strptime( - data_utc, '%Y-%d-%mT%H:%M:%S%z').timestamp() + timezone, data_utc = extract_timezone(data_utc) + timestamp = calendar.timegm((datetime.datetime.strptime( + data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple()) except TypeError: pass From a5d4fcbbd57a59004a93194375906e107b9dc3d7 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: 
Fri, 26 Feb 2021 18:11:59 +0100 Subject: [PATCH 360/384] [zhihu] Add new extractor(closes #28177) --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/zhihu.py | 69 +++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 haruhi_dl/extractor/zhihu.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index b3fc1ad81..cfbfe802b 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1656,5 +1656,6 @@ from .zattoo import ( ZattooLiveIE, ) from .zdf import ZDFIE, ZDFChannelIE +from .zhihu import ZhihuIE from .zingmp3 import ZingMp3IE from .zype import ZypeIE diff --git a/haruhi_dl/extractor/zhihu.py b/haruhi_dl/extractor/zhihu.py new file mode 100644 index 000000000..d1ed55be3 --- /dev/null +++ b/haruhi_dl/extractor/zhihu.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import float_or_none, int_or_none + + +class ZhihuIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?zhihu\.com/zvideo/(?P[0-9]+)' + _TEST = { + 'url': 'https://www.zhihu.com/zvideo/1342930761977176064', + 'md5': 'c8d4c9cd72dd58e6f9bc9c2c84266464', + 'info_dict': { + 'id': '1342930761977176064', + 'ext': 'mp4', + 'title': '写春联也太难了吧!', + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': '桥半舫', + 'timestamp': 1612959715, + 'upload_date': '20210210', + 'uploader_id': '244ecb13b0fd7daf92235288c8ca3365', + 'duration': 146.333, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + zvideo = self._download_json( + 'https://www.zhihu.com/api/v4/zvideos/' + video_id, video_id) + title = zvideo['title'] + video = zvideo.get('video') or {} + + formats = [] + for format_id, q in (video.get('playlist') or {}).items(): + play_url = q.get('url') or q.get('play_url') + if not play_url: + continue + formats.append({ + 'asr': int_or_none(q.get('sample_rate')), + 'filesize': int_or_none(q.get('size')), + 'format_id': format_id, + 'fps': int_or_none(q.get('fps')), + 'height': int_or_none(q.get('height')), + 'tbr': float_or_none(q.get('bitrate')), + 'url': play_url, + 'width': int_or_none(q.get('width')), + }) + self._sort_formats(formats) + + author = zvideo.get('author') or {} + url_token = author.get('url_token') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': video.get('thumbnail') or zvideo.get('image_url'), + 'uploader': author.get('name'), + 'timestamp': int_or_none(zvideo.get('published_at')), + 'uploader_id': author.get('id'), + 'uploader_url': 'https://www.zhihu.com/people/' + url_token if url_token else None, + 'duration': float_or_none(video.get('duration')), + 'view_count': int_or_none(zvideo.get('play_count')), + 'like_count': int_or_none(zvideo.get('liked_count')), + 'comment_count': int_or_none(zvideo.get('comment_count')), + } From 2a7bf89e70597c1e1dbca29f9b8577123541ff84 Mon Sep 17 00:00:00 2001 From: Stephen Stair Date: Fri, 26 Feb 2021 18:12:05 +0100 Subject: [PATCH 361/384] [storyfire] Add new extractor(closes #25628)(closes #26349) --- haruhi_dl/extractor/extractors.py | 5 + haruhi_dl/extractor/storyfire.py | 151 ++++++++++++++++++++++++++++++ 2 files changed, 156 insertions(+) create mode 100644 haruhi_dl/extractor/storyfire.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index cfbfe802b..bfb23f47b 100644 --- a/haruhi_dl/extractor/extractors.py +++ 
b/haruhi_dl/extractor/extractors.py @@ -1186,6 +1186,11 @@ from .srgssr import ( from .srmediathek import SRMediathekIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE +from .storyfire import ( + StoryFireIE, + StoryFireUserIE, + StoryFireSeriesIE, +) from .streamable import StreamableIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE diff --git a/haruhi_dl/extractor/storyfire.py b/haruhi_dl/extractor/storyfire.py new file mode 100644 index 000000000..9c698626f --- /dev/null +++ b/haruhi_dl/extractor/storyfire.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools + +from .common import InfoExtractor +from ..utils import ( + # HEADRequest, + int_or_none, + OnDemandPagedList, + smuggle_url, +) + + +class StoryFireBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?storyfire\.com/' + + def _call_api(self, path, video_id, resource, query=None): + return self._download_json( + 'https://storyfire.com/app/%s/%s' % (path, video_id), video_id, + 'Downloading %s JSON metadata' % resource, query=query) + + def _parse_video(self, video): + title = video['title'] + vimeo_id = self._search_regex( + r'https?://player\.vimeo\.com/external/(\d+)', + video['vimeoVideoURL'], 'vimeo id') + + # video_url = self._request_webpage( + # HEADRequest(video['vimeoVideoURL']), video_id).geturl() + # formats = [] + # for v_url, suffix in [(video_url, '_sep'), (video_url.replace('/sep/video/', '/video/'), '')]: + # formats.extend(self._extract_m3u8_formats( + # v_url, video_id, 'mp4', 'm3u8_native', + # m3u8_id='hls' + suffix, fatal=False)) + # formats.extend(self._extract_mpd_formats( + # v_url.replace('.m3u8', '.mpd'), video_id, + # mpd_id='dash' + suffix, fatal=False)) + # self._sort_formats(formats) + + uploader_id = video.get('hostID') + + return { + '_type': 'url_transparent', + 'id': vimeo_id, + 'title': title, + 'description': video.get('description'), + 'url': smuggle_url( + 'https://player.vimeo.com/video/' + vimeo_id, { + 'http_headers': { + 'Referer': 'https://storyfire.com/', + } + }), + # 'formats': formats, + 'thumbnail': video.get('storyImage'), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likesCount')), + 'comment_count': int_or_none(video.get('commentsCount')), + 'duration': int_or_none(video.get('videoDuration')), + 'timestamp': int_or_none(video.get('publishDate')), + 'uploader': video.get('username'), + 'uploader_id': uploader_id, + 'uploader_url': 'https://storyfire.com/user/%s/video' % uploader_id if uploader_id else None, + 'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')), + } + + +class StoryFireIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'video-details/(?P[0-9a-f]{24})' + _TEST = { + 'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181', + 'md5': 'caec54b9e4621186d6079c7ec100c1eb', + 'info_dict': { + 'id': '378954662', + 'ext': 'mp4', + 'title': 'Buzzfeed Teaches You About Memes', + 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1', + 'timestamp': 1576129028, + 'description': 'md5:0b4e28021548e144bed69bb7539e62ea', + 'uploader': 'whang!', + 'upload_date': '20191212', + 'duration': 418, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download JSON metadata'] + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._call_api( + 
'generic/video-detail', video_id, 'video')['video'] + return self._parse_video(video) + + +class StoryFireUserIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'user/(?P[^/]+)/video' + _TEST = { + 'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video', + 'info_dict': { + 'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2', + }, + 'playlist_mincount': 151, + } + _PAGE_SIZE = 20 + + def _fetch_page(self, user_id, page): + videos = self._call_api( + 'publicVideos', user_id, 'page %d' % (page + 1), { + 'skip': page * self._PAGE_SIZE, + })['videos'] + for video in videos: + yield self._parse_video(video) + + def _real_extract(self, url): + user_id = self._match_id(url) + entries = OnDemandPagedList(functools.partial( + self._fetch_page, user_id), self._PAGE_SIZE) + return self.playlist_result(entries, user_id) + + +class StoryFireSeriesIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'write/series/stories/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/', + 'info_dict': { + 'id': '-Lq6MsuIHLODO6d2dDkr', + }, + 'playlist_mincount': 13, + }, { + 'url': 'https://storyfire.com/write/series/stories/the_mortal_one/', + 'info_dict': { + 'id': 'the_mortal_one', + }, + 'playlist_count': 0, + }] + + def _extract_videos(self, stories): + for story in stories.values(): + if story.get('hasVideo'): + yield self._parse_video(story) + + def _real_extract(self, url): + series_id = self._match_id(url) + stories = self._call_api( + 'seriesStories', series_id, 'series stories') + return self.playlist_result(self._extract_videos(stories), series_id) From f23361c5d2739d5012c5de98ce451601bc05fbce Mon Sep 17 00:00:00 2001 From: Max Date: Fri, 26 Feb 2021 18:12:11 +0100 Subject: [PATCH 362/384] =?UTF-8?q?[postprocessor/embedthumbnail]=20Recogn?= =?UTF-8?q?ize=20atomicparsley=20binary=20in=20lowe=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …rcase (#28112) --- haruhi_dl/postprocessor/embedthumbnail.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/postprocessor/embedthumbnail.py b/haruhi_dl/postprocessor/embedthumbnail.py index 5a3359588..3990908b6 100644 --- a/haruhi_dl/postprocessor/embedthumbnail.py +++ b/haruhi_dl/postprocessor/embedthumbnail.py @@ -89,10 +89,14 @@ class EmbedThumbnailPP(FFmpegPostProcessor): os.rename(encodeFilename(temp_filename), encodeFilename(filename)) elif info['ext'] in ['m4a', 'mp4']: - if not check_executable('AtomicParsley', ['-v']): + atomicparsley = next((x + for x in ['AtomicParsley', 'atomicparsley'] + if check_executable(x, ['-v'])), None) + + if atomicparsley is None: raise EmbedThumbnailPPError('AtomicParsley was not found. 
Please install.') - cmd = [encodeFilename('AtomicParsley', True), + cmd = [encodeFilename(atomicparsley, True), encodeFilename(filename, True), encodeArgument('--artwork'), encodeFilename(thumbnail_filename, True), From fc9e0b111dd87e03a848b41d7067aa4be7c9eb61 Mon Sep 17 00:00:00 2001 From: dmsummers Date: Fri, 26 Feb 2021 18:13:28 +0100 Subject: [PATCH 363/384] [simplecast] Add new extractor(closes #24107) --- haruhi_dl/extractor/extractors.py | 5 + haruhi_dl/extractor/generic.py | 11 ++ haruhi_dl/extractor/simplecast.py | 160 ++++++++++++++++++++++++++++++ 3 files changed, 176 insertions(+) create mode 100644 haruhi_dl/extractor/simplecast.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index bfb23f47b..54e8f477b 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1102,6 +1102,11 @@ from .shared import ( VivoIE, ) from .showroomlive import ShowRoomLiveIE +from .simplecast import ( + SimplecastIE, + SimplecastEpisodeIE, + SimplecastPodcastIE, +) from .sina import SinaIE from .sixplay import SixPlayIE from .skyit import ( diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index 4a885775a..c2c930ac5 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -135,6 +135,7 @@ from .libsyn import LibsynIE from .pulsembed import PulsEmbedIE from .arcpublishing import ArcPublishingIE from .medialaan import MedialaanIE +from .simplecast import SimplecastIE class GenericIE(InfoExtractor): @@ -2291,6 +2292,15 @@ class GenericIE(InfoExtractor): 'duration': 159, }, }, + { + # Simplecast player embed + 'url': 'https://www.bio.org/podcast', + 'info_dict': { + 'id': 'podcast', + 'title': 'I AM BIO Podcast | BIO', + }, + 'playlist_mincount': 52, + }, ] def report_following_redirect(self, new_url): @@ -2711,6 +2721,7 @@ class GenericIE(InfoExtractor): VHXEmbedIE, ArcPublishingIE, MedialaanIE, + SimplecastIE, ): try: ie_key = embie.ie_key() diff --git a/haruhi_dl/extractor/simplecast.py b/haruhi_dl/extractor/simplecast.py new file mode 100644 index 000000000..98d8f7de6 --- /dev/null +++ b/haruhi_dl/extractor/simplecast.py @@ -0,0 +1,160 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + parse_iso8601, + strip_or_none, + try_get, + urlencode_postdata, +) + + +class SimplecastBaseIE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' + _API_BASE = 'https://api.simplecast.com/' + + def _call_api(self, path_tmpl, video_id): + return self._download_json( + self._API_BASE + path_tmpl % video_id, video_id) + + def _call_search_api(self, resource, resource_id, resource_url): + return self._download_json( + 'https://api.simplecast.com/%ss/search' % resource, resource_id, + data=urlencode_postdata({'url': resource_url})) + + def _parse_episode(self, episode): + episode_id = episode['id'] + title = episode['title'].strip() + audio_file = episode.get('audio_file') or {} + audio_file_url = audio_file.get('url') or episode.get('audio_file_url') or episode['enclosure_url'] + + season = episode.get('season') or {} + season_href = season.get('href') + season_id = None + if season_href: + season_id = self._search_regex( + r'https?://api.simplecast.com/seasons/(%s)' % self._UUID_REGEX, + season_href, 'season id', default=None) + + webpage_url = episode.get('episode_url') + channel_url = None + if webpage_url: + channel_url = self._search_regex( + 
r'(https?://[^/]+\.simplecast\.com)', + webpage_url, 'channel url', default=None) + + return { + 'id': episode_id, + 'display_id': episode.get('slug'), + 'title': title, + 'url': clean_podcast_url(audio_file_url), + 'webpage_url': webpage_url, + 'channel_url': channel_url, + 'series': try_get(episode, lambda x: x['podcast']['title']), + 'season_number': int_or_none(season.get('number')), + 'season_id': season_id, + 'thumbnail': episode.get('image_url'), + 'episode_id': episode_id, + 'episode_number': int_or_none(episode.get('number')), + 'description': strip_or_none(episode.get('description')), + 'timestamp': parse_iso8601(episode.get('published_at')), + 'duration': int_or_none(episode.get('duration')), + 'filesize': int_or_none(audio_file.get('size') or episode.get('audio_file_size')), + } + + +class SimplecastIE(SimplecastBaseIE): + IE_NAME = 'simplecast' + _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P%s)' % SimplecastBaseIE._UUID_REGEX + _COMMON_TEST_INFO = { + 'display_id': 'errant-signal-chris-franklin-new-wave-video-essays', + 'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'ext': 'mp3', + 'title': 'Errant Signal - Chris Franklin & New Wave Video Essays', + 'episode_number': 1, + 'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'description': 'md5:34752789d3d2702e2d2c975fbd14f357', + 'season_number': 1, + 'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13', + 'series': 'The RE:BIND.io Podcast', + 'duration': 5343, + 'timestamp': 1580979475, + 'upload_date': '20200206', + 'webpage_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', + 'channel_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com$', + } + _TESTS = [{ + 'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'md5': '8c93be7be54251bf29ee97464eabe61c', + 'info_dict': _COMMON_TEST_INFO, + }, { + 'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage, **kw): + return re.findall( + r'''(?x)]+src=["\'] + ( + https?://(?:embed\.simplecast\.com/[0-9a-f]{8}| + player\.simplecast\.com/%s + ))''' % SimplecastBaseIE._UUID_REGEX, webpage) + + def _real_extract(self, url): + episode_id = self._match_id(url) + episode = self._call_api('episodes/%s', episode_id) + return self._parse_episode(episode) + + +class SimplecastEpisodeIE(SimplecastBaseIE): + IE_NAME = 'simplecast:episode' + _VALID_URL = r'https?://(?!api\.)[^/]+\.simplecast\.com/episodes/(?P[^/?&#]+)' + _TEST = { + 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', + 'md5': '8c93be7be54251bf29ee97464eabe61c', + 'info_dict': SimplecastIE._COMMON_TEST_INFO, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + episode = self._call_search_api( + 'episode', mobj.group(1), mobj.group(0)) + return self._parse_episode(episode) + + +class SimplecastPodcastIE(SimplecastBaseIE): + IE_NAME = 'simplecast:podcast' + _VALID_URL = r'https?://(?!(?:api|cdn|embed|feeds|player)\.)(?P[^/]+)\.simplecast\.com(?!/episodes/[^/?&#]+)' + _TESTS = [{ + 'url': 'https://the-re-bind-io-podcast.simplecast.com', + 'playlist_mincount': 33, + 'info_dict': { + 'id': '07d28d26-7522-42eb-8c53-2bdcfc81c43c', + 'title': 'The RE:BIND.io Podcast', + }, + }, { + 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes', + 'only_matching': True, + }] + + def 
_real_extract(self, url): + subdomain = self._match_id(url) + site = self._call_search_api('site', subdomain, url) + podcast = site['podcast'] + podcast_id = podcast['id'] + podcast_title = podcast.get('title') + + def entries(): + episodes = self._call_api('podcasts/%s/episodes', podcast_id) + for episode in (episodes.get('collection') or []): + info = self._parse_episode(episode) + info['series'] = podcast_title + yield info + + return self.playlist_result(entries(), podcast_id, podcast_title) From f10f61fa0e3ad8f15cbed6d8f3b8c0c58a8b081e Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 18:17:01 +0100 Subject: [PATCH 364/384] [dplay] Add support for discoveryplus.com (closes #24698) --- haruhi_dl/extractor/dplay.py | 123 +++++++++++++++++++++++------- haruhi_dl/extractor/extractors.py | 5 +- 2 files changed, 99 insertions(+), 29 deletions(-) diff --git a/haruhi_dl/extractor/dplay.py b/haruhi_dl/extractor/dplay.py index 47501dbe6..540505719 100644 --- a/haruhi_dl/extractor/dplay.py +++ b/haruhi_dl/extractor/dplay.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor @@ -151,56 +152,79 @@ class DPlayIE(InfoExtractor): 'only_matching': True, }] + def _process_errors(self, e, geo_countries): + info = self._parse_json(e.cause.read().decode('utf-8'), None) + error = info['errors'][0] + error_code = error.get('code') + if error_code == 'access.denied.geoblocked': + self.raise_geo_restricted(countries=geo_countries) + elif error_code in ('access.denied.missingpackage', 'invalid.token'): + raise ExtractorError( + 'This video is only available for registered users. You may want to use --cookies.', expected=True) + raise ExtractorError(info['errors'][0]['detail'], expected=True) + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers['Authorization'] = 'Bearer ' + self._download_json( + disco_base + 'token', display_id, 'Downloading token', + query={ + 'realm': realm, + })['data']['attributes']['token'] + + def _download_video_playback_info(self, disco_base, video_id, headers): + streaming = self._download_json( + disco_base + 'playback/videoPlaybackInfo/' + video_id, + video_id, headers=headers)['data']['attributes']['streaming'] + streaming_list = [] + for format_id, format_dict in streaming.items(): + streaming_list.append({ + 'type': format_id, + 'url': format_dict.get('url'), + }) + return streaming_list + def _get_disco_api_info(self, url, display_id, disco_host, realm, country): geo_countries = [country.upper()] self._initialize_geo_bypass({ 'countries': geo_countries, }) disco_base = 'https://%s/' % disco_host - token = self._download_json( - disco_base + 'token', display_id, 'Downloading token', - query={ - 'realm': realm, - })['data']['attributes']['token'] headers = { 'Referer': url, - 'Authorization': 'Bearer ' + token, } - video = self._download_json( - disco_base + 'content/videos/' + display_id, display_id, - headers=headers, query={ - 'fields[channel]': 'name', - 'fields[image]': 'height,src,width', - 'fields[show]': 'name', - 'fields[tag]': 'name', - 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', - 'include': 'images,primaryChannel,show,tags' - }) + self._update_disco_api_headers(headers, disco_base, display_id, realm) + try: + video = self._download_json( + disco_base + 'content/videos/' + display_id, display_id, + headers=headers, query={ + 'fields[channel]': 'name', + 'fields[image]': 'height,src,width', + 
'fields[show]': 'name', + 'fields[tag]': 'name', + 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', + 'include': 'images,primaryChannel,show,tags' + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + self._process_errors(e, geo_countries) + raise video_id = video['data']['id'] info = video['data']['attributes'] title = info['name'].strip() formats = [] try: - streaming = self._download_json( - disco_base + 'playback/videoPlaybackInfo/' + video_id, - display_id, headers=headers)['data']['attributes']['streaming'] + streaming = self._download_video_playback_info( + disco_base, video_id, headers) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - info = self._parse_json(e.cause.read().decode('utf-8'), display_id) - error = info['errors'][0] - error_code = error.get('code') - if error_code == 'access.denied.geoblocked': - self.raise_geo_restricted(countries=geo_countries) - elif error_code == 'access.denied.missingpackage': - self.raise_login_required() - raise ExtractorError(info['errors'][0]['detail'], expected=True) + self._process_errors(e, geo_countries) raise - for format_id, format_dict in streaming.items(): + for format_dict in streaming: if not isinstance(format_dict, dict): continue format_url = format_dict.get('url') if not format_url: continue + format_id = format_dict.get('type') ext = determine_ext(format_url) if format_id == 'dash' or ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -268,3 +292,46 @@ class DPlayIE(InfoExtractor): host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' return self._get_disco_api_info( url, display_id, host, 'dplay' + country, country) + + +class DiscoveryPlusIE(DPlayIE): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video/(?P[^/]+/[^/]+)' + _TESTS = [{ + 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family', + 'info_dict': { + 'id': '1140794', + 'display_id': 'property-brothers-forever-home/food-and-family', + 'ext': 'mp4', + 'title': 'Food and Family', + 'description': 'The brothers help a Richmond family expand their single-level home.', + 'duration': 2583.113, + 'timestamp': 1609304400, + 'upload_date': '20201230', + 'creator': 'HGTV', + 'series': 'Property Brothers: Forever Home', + 'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + }] + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers['x-disco-client'] = 'WEB:UNKNOWN:dplus_us:15.0.0' + + def _download_video_playback_info(self, disco_base, video_id, headers): + return self._download_json( + disco_base + 'playback/v3/videoPlaybackInfo', + video_id, headers=headers, data=json.dumps({ + 'deviceInfo': { + 'adBlocker': False, + }, + 'videoId': video_id, + 'wisteriaProperties': { + 'platform': 'desktop', + }, + }).encode('utf-8'))['data']['attributes']['streaming'] + + def _real_extract(self, url): + display_id = self._match_id(url) + return self._get_disco_api_info( + url, display_id, 'us1-prod-direct.discoveryplus.com', 'go', 'us') diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 54e8f477b..b5d8343a4 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -298,7 +298,10 @@ from .douyutv import ( DouyuShowIE, DouyuTVIE, ) -from .dplay import DPlayIE +from .dplay import ( + DPlayIE, + DiscoveryPlusIE, +) from .dreisat import 
DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE From 131c65b8ba1aa0d60efe1c03ca10d52fff619a8b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 18:17:09 +0100 Subject: [PATCH 365/384] [dplay] add support for de.hgtv.com (closes #28182) --- haruhi_dl/extractor/dplay.py | 37 ++++++++++++++++++++++++++++--- haruhi_dl/extractor/extractors.py | 1 + 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/haruhi_dl/extractor/dplay.py b/haruhi_dl/extractor/dplay.py index 540505719..0f0632f26 100644 --- a/haruhi_dl/extractor/dplay.py +++ b/haruhi_dl/extractor/dplay.py @@ -11,11 +11,13 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + strip_or_none, unified_timestamp, ) class DPlayIE(InfoExtractor): + _PATH_REGEX = r'/(?P[^/]+/[^/?#]+)' _VALID_URL = r'''(?x)https?:// (?P (?:www\.)?(?Pd @@ -25,7 +27,7 @@ class DPlayIE(InfoExtractor): ) )| (?Pes|it)\.dplay\.com - )/[^/]+/(?P[^/]+/[^/?#]+)''' + )/[^/]+''' + _PATH_REGEX _TESTS = [{ # non geo restricted, via secure api, unsigned download hls URL @@ -272,7 +274,7 @@ class DPlayIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'title': title, - 'description': info.get('description'), + 'description': strip_or_none(info.get('description')), 'duration': float_or_none(info.get('videoDuration'), 1000), 'timestamp': unified_timestamp(info.get('publishStart')), 'series': series, @@ -295,7 +297,7 @@ class DPlayIE(InfoExtractor): class DiscoveryPlusIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video/(?P[^/]+/[^/]+)' + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX _TESTS = [{ 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family', 'info_dict': { @@ -335,3 +337,32 @@ class DiscoveryPlusIE(DPlayIE): display_id = self._match_id(url) return self._get_disco_api_info( url, display_id, 'us1-prod-direct.discoveryplus.com', 'go', 'us') + + +class HGTVDeIE(DPlayIE): + _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/', + 'info_dict': { + 'id': '151205', + 'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette', + 'ext': 'mp4', + 'title': 'Wer braucht schon eine Toilette', + 'description': 'md5:05b40a27e7aed2c9172de34d459134e2', + 'duration': 1177.024, + 'timestamp': 1595705400, + 'upload_date': '20200725', + 'creator': 'HGTV', + 'series': 'Tiny House - klein, aber oho', + 'season_number': 3, + 'episode_number': 3, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + return self._get_disco_api_info( + url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de') diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index b5d8343a4..6da45a869 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -301,6 +301,7 @@ from .douyutv import ( from .dplay import ( DPlayIE, DiscoveryPlusIE, + HGTVDeIE, ) from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE From e3b224a330833e1ecd2a66776ad3b6a1109da916 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 18:17:34 +0100 Subject: [PATCH 366/384] [ninegag] unscape title(#28201) --- haruhi_dl/extractor/ninegag.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/haruhi_dl/extractor/ninegag.py b/haruhi_dl/extractor/ninegag.py index 
440f865bc..14390823b 100644 --- a/haruhi_dl/extractor/ninegag.py +++ b/haruhi_dl/extractor/ninegag.py @@ -2,10 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - determine_ext, ExtractorError, + determine_ext, int_or_none, try_get, + unescapeHTML, url_or_none, ) @@ -14,7 +15,7 @@ class NineGagIE(InfoExtractor): IE_NAME = '9gag' _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[^/?&#]+)' - _TEST = { + _TESTS = [{ 'url': 'https://9gag.com/gag/ae5Ag7B', 'info_dict': { 'id': 'ae5Ag7B', @@ -29,7 +30,11 @@ class NineGagIE(InfoExtractor): 'dislike_count': int, 'comment_count': int, } - } + }, { + # HTML escaped title + 'url': 'https://9gag.com/gag/av5nvyb', + 'only_matching': True, + }] def _real_extract(self, url): post_id = self._match_id(url) @@ -43,7 +48,7 @@ class NineGagIE(InfoExtractor): 'The given url does not contain a video', expected=True) - title = post['title'] + title = unescapeHTML(post['title']) duration = None formats = [] From fa6393bfd568a00aadef00e08977c66f3d3825b3 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 18:17:39 +0100 Subject: [PATCH 367/384] [viki] improve extraction(closes #26522)(closes #28203) - extract uploader_url and episode_number - report login required error - extract 480p formats - fix API v4 calls --- haruhi_dl/extractor/viki.py | 69 +++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/haruhi_dl/extractor/viki.py b/haruhi_dl/extractor/viki.py index a311f21ef..2e9cbf148 100644 --- a/haruhi_dl/extractor/viki.py +++ b/haruhi_dl/extractor/viki.py @@ -21,6 +21,7 @@ from ..utils import ( parse_iso8601, sanitized_Request, std_headers, + try_get, ) @@ -30,7 +31,7 @@ class VikiBaseIE(InfoExtractor): _API_URL_TEMPLATE = 'https://api.viki.io%s&sig=%s' _APP = '100005a' - _APP_VERSION = '2.2.5.1428709186' + _APP_VERSION = '6.0.0' _APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad' _GEO_BYPASS = False @@ -41,7 +42,7 @@ class VikiBaseIE(InfoExtractor): _ERRORS = { 'geo': 'Sorry, this content is not available in your region.', 'upcoming': 'Sorry, this content is not yet available.', - # 'paywall': 'paywall', + 'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers', } def _prepare_call(self, path, timestamp=None, post_data=None): @@ -62,7 +63,8 @@ class VikiBaseIE(InfoExtractor): def _call_api(self, path, video_id, note, timestamp=None, post_data=None): resp = self._download_json( - self._prepare_call(path, timestamp, post_data), video_id, note) + self._prepare_call(path, timestamp, post_data), video_id, note, + headers={'x-viki-app-ver': self._APP_VERSION}) error = resp.get('error') if error: @@ -82,11 +84,13 @@ class VikiBaseIE(InfoExtractor): expected=True) def _check_errors(self, data): - for reason, status in data.get('blocking', {}).items(): + for reason, status in (data.get('blocking') or {}).items(): if status and reason in self._ERRORS: message = self._ERRORS[reason] if reason == 'geo': self.raise_geo_restricted(msg=message) + elif reason == 'paywall': + self.raise_login_required(message) raise ExtractorError('%s said: %s' % ( self.IE_NAME, message), expected=True) @@ -131,13 +135,19 @@ class VikiIE(VikiBaseIE): 'info_dict': { 'id': '1023585v', 'ext': 'mp4', - 'title': 'Heirs Episode 14', - 'uploader': 'SBS', - 'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e', + 'title': 'Heirs - Episode 14', + 'uploader': 'SBS Contents Hub', + 'timestamp': 1385047627, 'upload_date': '20131121', 
'age_limit': 13, + 'duration': 3570, + 'episode_number': 14, + }, + 'params': { + 'format': 'bestvideo', }, 'skip': 'Blocked in the US', + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # clip 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', @@ -153,7 +163,8 @@ class VikiIE(VikiBaseIE): 'uploader': 'Arirang TV', 'like_count': int, 'age_limit': 0, - } + }, + 'skip': 'Sorry. There was an error loading this video', }, { 'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi', 'info_dict': { @@ -171,7 +182,7 @@ class VikiIE(VikiBaseIE): }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '94e0e34fd58f169f40c184f232356cfe', + 'md5': '0a53dc252e6e690feccd756861495a8c', 'info_dict': { 'id': '44699v', 'ext': 'mp4', @@ -183,6 +194,10 @@ class VikiIE(VikiBaseIE): 'uploader': 'group8', 'like_count': int, 'age_limit': 13, + 'episode_number': 1, + }, + 'params': { + 'format': 'bestvideo', }, 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { @@ -209,7 +224,7 @@ class VikiIE(VikiBaseIE): }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': 'adf9e321a0ae5d0aace349efaaff7691', + 'md5': '41faaba0de90483fb4848952af7c7d0d', 'info_dict': { 'id': '158036v', 'ext': 'mp4', @@ -220,6 +235,10 @@ class VikiIE(VikiBaseIE): 'title': 'Love In Magic', 'age_limit': 13, }, + 'params': { + 'format': 'bestvideo', + }, + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }] def _real_extract(self, url): @@ -229,36 +248,33 @@ class VikiIE(VikiBaseIE): 'https://www.viki.com/api/videos/' + video_id, video_id, 'Downloading video JSON', headers={ 'x-client-user-agent': std_headers['User-Agent'], - 'x-viki-app-ver': '4.0.57', + 'x-viki-app-ver': '3.0.0', }) video = resp['video'] self._check_errors(video) title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False) + episode_number = int_or_none(video.get('number')) if not title: - title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id - container_titles = video.get('container', {}).get('titles', {}) + title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id + container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {} container_title = self.dict_selection(container_titles, 'en') title = '%s - %s' % (container_title, title) description = self.dict_selection(video.get('descriptions', {}), 'en') - duration = int_or_none(video.get('duration')) - timestamp = parse_iso8601(video.get('created_at')) - uploader = video.get('author') - like_count = int_or_none(video.get('likes', {}).get('count')) - age_limit = parse_age_limit(video.get('rating')) + like_count = int_or_none(try_get(video, lambda x: x['likes']['count'])) thumbnails = [] - for thumbnail_id, thumbnail in video.get('images', {}).items(): + for thumbnail_id, thumbnail in (video.get('images') or {}).items(): thumbnails.append({ 'id': thumbnail_id, 'url': thumbnail.get('url'), }) subtitles = {} - for subtitle_lang, _ in video.get('subtitle_completions', {}).items(): + for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items(): subtitles[subtitle_lang] = [{ 'ext': subtitles_format, 'url': self._prepare_call( @@ -269,13 +285,15 @@ class VikiIE(VikiBaseIE): 'id': video_id, 'title': title, 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 
'uploader': uploader, + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('created_at')), + 'uploader': video.get('author'), + 'uploader_url': video.get('author_url'), 'like_count': like_count, - 'age_limit': age_limit, + 'age_limit': parse_age_limit(video.get('rating')), 'thumbnails': thumbnails, 'subtitles': subtitles, + 'episode_number': episode_number, } formats = [] @@ -360,7 +378,7 @@ class VikiChannelIE(VikiBaseIE): 'info_dict': { 'id': '50c', 'title': 'Boys Over Flowers', - 'description': 'md5:ecd3cff47967fe193cff37c0bec52790', + 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59', }, 'playlist_mincount': 71, }, { @@ -371,6 +389,7 @@ class VikiChannelIE(VikiBaseIE): 'description': 'md5:05bf5471385aa8b21c18ad450e350525', }, 'playlist_count': 127, + 'skip': 'Page not found', }, { 'url': 'http://www.viki.com/news/24569c-showbiz-korea', 'only_matching': True, From fd1c09264dfb841da55d3093455f1be629d7f919 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 18:17:44 +0100 Subject: [PATCH 368/384] [vimeo] add support for unlisted video source format extraction --- haruhi_dl/extractor/vimeo.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/vimeo.py b/haruhi_dl/extractor/vimeo.py index 773296173..6fc7b76b4 100644 --- a/haruhi_dl/extractor/vimeo.py +++ b/haruhi_dl/extractor/vimeo.py @@ -226,10 +226,12 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'is_live': is_live, } - def _extract_original_format(self, url, video_id): + def _extract_original_format(self, url, video_id, unlisted_hash=None): + query = {'action': 'load_download_config'} + if unlisted_hash: + query['unlisted_hash'] = unlisted_hash download_data = self._download_json( - url, video_id, fatal=False, - query={'action': 'load_download_config'}, + url, video_id, fatal=False, query=query, headers={'X-Requested-With': 'XMLHttpRequest'}) if download_data: source_file = download_data.get('source_file') @@ -509,6 +511,11 @@ class VimeoIE(VimeoBaseInfoExtractor): { 'url': 'https://vimeo.com/160743502/abd0e13fb4', 'only_matching': True, + }, + { + # requires passing unlisted_hash(a52724358e) to load_download_config request + 'url': 'https://vimeo.com/392479337/a52724358e', + 'only_matching': True, } # https://gettingthingsdone.com/workflowmap/ # vimeo embed with check-password page protected by Referer header @@ -673,7 +680,8 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password(redirect_url, video_id, headers) - vod = config.get('video', {}).get('vod', {}) + video = config.get('video') or {} + vod = video.get('vod') or {} def is_rented(): if '>You rented this title.<' in webpage: @@ -733,7 +741,7 @@ class VimeoIE(VimeoBaseInfoExtractor): formats = [] source_format = self._extract_original_format( - 'https://vimeo.com/' + video_id, video_id) + 'https://vimeo.com/' + video_id, video_id, video.get('unlisted_hash')) if source_format: formats.append(source_format) From f9fa934413cb7bb4f5fcc37b7f4102d69d2e4c29 Mon Sep 17 00:00:00 2001 From: Isaac-the-Man Date: Fri, 26 Feb 2021 18:17:50 +0100 Subject: [PATCH 369/384] [samplefocus] Add new extractor(closes #27763) --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/samplefocus.py | 100 +++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 haruhi_dl/extractor/samplefocus.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 6da45a869..f6353a63c 
100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -1074,6 +1074,7 @@ from .safari import ( SafariApiIE, SafariCourseIE, ) +from .samplefocus import SampleFocusIE from .sapo import SapoIE from .savefrom import SaveFromIE from .sbs import SBSIE diff --git a/haruhi_dl/extractor/samplefocus.py b/haruhi_dl/extractor/samplefocus.py new file mode 100644 index 000000000..806c3c354 --- /dev/null +++ b/haruhi_dl/extractor/samplefocus.py @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + get_element_by_attribute, + int_or_none, +) + + +class SampleFocusIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?samplefocus\.com/samples/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://samplefocus.com/samples/lil-peep-sad-emo-guitar', + 'md5': '48c8d62d60be467293912e0e619a5120', + 'info_dict': { + 'id': '40316', + 'display_id': 'lil-peep-sad-emo-guitar', + 'ext': 'mp3', + 'title': 'Lil Peep Sad Emo Guitar', + 'thumbnail': r're:^https?://.+\.png', + 'license': 'Standard License', + 'uploader': 'CapsCtrl', + 'uploader_id': 'capsctrl', + 'like_count': int, + 'comment_count': int, + 'categories': ['Samples', 'Guitar', 'Electric guitar'], + }, + }, { + 'url': 'https://samplefocus.com/samples/dababy-style-bass-808', + 'only_matching': True + }, { + 'url': 'https://samplefocus.com/samples/young-chop-kick', + 'only_matching': True + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + sample_id = self._search_regex( + r']+id=(["\'])sample_id\1[^>]+value=(?:["\'])(?P\d+)', + webpage, 'sample id', group='id') + + title = self._og_search_title(webpage, fatal=False) or self._html_search_regex( + r'

(.+?)

', webpage, 'title') + + mp3_url = self._search_regex( + r']+id=(["\'])sample_mp3\1[^>]+value=(["\'])(?P(?:(?!\2).)+)', + webpage, 'mp3', fatal=False, group='url') or extract_attributes(self._search_regex( + r']+itemprop=(["\'])contentUrl\1[^>]*>', + webpage, 'mp3 url', group=0))['content'] + + thumbnail = self._og_search_thumbnail(webpage) or self._html_search_regex( + r']+class=(?:["\'])waveform responsive-img[^>]+src=(["\'])(?P(?:(?!\1).)+)', + webpage, 'mp3', fatal=False, group='url') + + comments = [] + for author_id, author, body in re.findall(r'(?s)]+class="comment-author">]+href="/users/([^"]+)">([^"]+).+?]+class="comment-body">([^>]+)

', webpage): + comments.append({ + 'author': author, + 'author_id': author_id, + 'text': body, + }) + + uploader_id = uploader = None + mobj = re.search(r'>By ]+href="/users/([^"]+)"[^>]*>([^<]+)', webpage) + if mobj: + uploader_id, uploader = mobj.groups() + + breadcrumb = get_element_by_attribute('typeof', 'BreadcrumbList', webpage) + categories = [] + if breadcrumb: + for _, name in re.findall(r']+property=(["\'])name\1[^>]*>([^<]+)', breadcrumb): + categories.append(name) + + def extract_count(klass): + return int_or_none(self._html_search_regex( + r']+class=(?:["\'])?%s-count[^>]*>(\d+)' % klass, + webpage, klass, fatal=False)) + + return { + 'id': sample_id, + 'title': title, + 'url': mp3_url, + 'display_id': display_id, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'license': self._html_search_regex( + r']+href=(["\'])/license\1[^>]*>(?P[^<]+)<', + webpage, 'license', fatal=False, group='license'), + 'uploader_id': uploader_id, + 'like_count': extract_count('sample-%s-favorites' % sample_id), + 'comment_count': extract_count('comments'), + 'comments': comments, + 'categories': categories, + } From a3b6d4d975634b7f16fc7b3f365077a608598b95 Mon Sep 17 00:00:00 2001 From: piplongrun Date: Fri, 26 Feb 2021 18:18:12 +0100 Subject: [PATCH 370/384] [youporn] Extract duration (#28019) Co-authored-by: Sergey M --- haruhi_dl/extractor/youporn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/haruhi_dl/extractor/youporn.py b/haruhi_dl/extractor/youporn.py index 4e4d1cb33..923cab4a9 100644 --- a/haruhi_dl/extractor/youporn.py +++ b/haruhi_dl/extractor/youporn.py @@ -25,6 +25,7 @@ class YouPornIE(InfoExtractor): 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 210, 'uploader': 'Ask Dan And Jennifer', 'upload_date': '20101217', 'average_rating': int, @@ -153,6 +154,8 @@ class YouPornIE(InfoExtractor): thumbnail = self._search_regex( r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration', fatal=False)) uploader = self._html_search_regex( r'(?s)]+class=["\']submitByLink["\'][^>]*>(.+?)', @@ -194,6 +197,7 @@ class YouPornIE(InfoExtractor): 'title': title, 'description': description, 'thumbnail': thumbnail, + 'duration': duration, 'uploader': uploader, 'upload_date': upload_date, 'average_rating': average_rating, From 6e796716f9de83de3e30bda5d8c8cbd6c9cacf0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 18:18:22 +0100 Subject: [PATCH 371/384] [youporn] Skip test --- haruhi_dl/extractor/youporn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/haruhi_dl/extractor/youporn.py b/haruhi_dl/extractor/youporn.py index 923cab4a9..03880cd01 100644 --- a/haruhi_dl/extractor/youporn.py +++ b/haruhi_dl/extractor/youporn.py @@ -55,6 +55,7 @@ class YouPornIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404', }, { 'url': 'https://www.youporn.com/embed/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'only_matching': True, From 21daa7ea91777d516c3e17fe9941c4e3f07a6f2c Mon Sep 17 00:00:00 2001 From: Adrian Heine Date: Fri, 26 Feb 2021 18:18:28 +0100 Subject: [PATCH 372/384] [apa] Fix extraction --- haruhi_dl/extractor/apa.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git 
a/haruhi_dl/extractor/apa.py b/haruhi_dl/extractor/apa.py index aa07e07b9..dd6797907 100644 --- a/haruhi_dl/extractor/apa.py +++ b/haruhi_dl/extractor/apa.py @@ -7,7 +7,6 @@ from .common import InfoExtractor from ..utils import ( determine_ext, js_to_json, - url_or_none, ) @@ -17,14 +16,10 @@ class APAIE(InfoExtractor): 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', 'md5': '2b12292faeb0a7d930c778c7a5b4759b', 'info_dict': { - 'id': 'jjv85FdZ', + 'id': '293f6d17-692a-44e3-9fd5-7b178f3a1029', 'ext': 'mp4', - 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'title': '293f6d17-692a-44e3-9fd5-7b178f3a1029', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 254, - 'timestamp': 1519211149, - 'upload_date': '20180221', }, }, { 'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78', @@ -48,7 +43,7 @@ class APAIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage('https://uvp.apa.at/player/%s' % video_id, video_id) jwplatform_id = self._search_regex( r'media[iI]d\s*:\s*["\'](?P[a-zA-Z0-9]{8})', webpage, @@ -59,18 +54,12 @@ class APAIE(InfoExtractor): 'jwplatform:' + jwplatform_id, ie='JWPlatform', video_id=video_id) - sources = self._parse_json( - self._search_regex( - r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'), - video_id, transform_source=js_to_json) + sources = self._parse_json("{" + self._search_regex( + r'("hls"\s*:\s*"[^"]+"\s*,\s*"progressive"\s*:\s*"[^"]+")', webpage, 'sources') + + "}", video_id, transform_source=js_to_json) formats = [] - for source in sources: - if not isinstance(source, dict): - continue - source_url = url_or_none(source.get('file')) - if not source_url: - continue + for (format, source_url) in sources.items(): ext = determine_ext(source_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( @@ -83,7 +72,7 @@ class APAIE(InfoExtractor): self._sort_formats(formats) thumbnail = self._search_regex( - r'image\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + r'"poster"\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, 'thumbnail', fatal=False, group='url') return { From a52155b1297c8c177cd034a2741ef9a07dddcb76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 18:18:33 +0100 Subject: [PATCH 373/384] [apa] Improve extraction (closes #27750) --- haruhi_dl/extractor/apa.py | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/haruhi_dl/extractor/apa.py b/haruhi_dl/extractor/apa.py index dd6797907..af1398bf8 100644 --- a/haruhi_dl/extractor/apa.py +++ b/haruhi_dl/extractor/apa.py @@ -6,12 +6,13 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, - js_to_json, + int_or_none, + url_or_none, ) class APAIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _VALID_URL = r'(?Phttps?://[^/]+\.apa\.at)/embed/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _TESTS = [{ 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', 'md5': '2b12292faeb0a7d930c778c7a5b4759b', @@ -41,9 +42,11 @@ class APAIE(InfoExtractor): webpage)] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id, base_url = mobj.group('id', 'base_url') - webpage = 
self._download_webpage('https://uvp.apa.at/player/%s' % video_id, video_id) + webpage = self._download_webpage( + '%s/player/%s' % (base_url, video_id), video_id) jwplatform_id = self._search_regex( r'media[iI]d\s*:\s*["\'](?P[a-zA-Z0-9]{8})', webpage, @@ -54,30 +57,39 @@ class APAIE(InfoExtractor): 'jwplatform:' + jwplatform_id, ie='JWPlatform', video_id=video_id) - sources = self._parse_json("{" + self._search_regex( - r'("hls"\s*:\s*"[^"]+"\s*,\s*"progressive"\s*:\s*"[^"]+")', webpage, 'sources') - + "}", video_id, transform_source=js_to_json) + def extract(field, name=None): + return self._search_regex( + r'\b%s["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' % field, + webpage, name or field, default=None, group='value') + + title = extract('title') or video_id + description = extract('description') + thumbnail = extract('poster', 'thumbnail') formats = [] - for (format, source_url) in sources.items(): + for format_id in ('hls', 'progressive'): + source_url = url_or_none(extract(format_id)) + if not source_url: + continue ext = determine_ext(source_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) else: + height = int_or_none(self._search_regex( + r'(\d+)\.mp4', source_url, 'height', default=None)) formats.append({ 'url': source_url, + 'format_id': format_id, + 'height': height, }) self._sort_formats(formats) - thumbnail = self._search_regex( - r'"poster"\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'thumbnail', fatal=False, group='url') - return { 'id': video_id, - 'title': video_id, + 'title': title, + 'description': description, 'thumbnail': thumbnail, 'formats': formats, } From 1c08ff576b673e5bce29d9ee406acc1fe4be904a Mon Sep 17 00:00:00 2001 From: nixxo Date: Fri, 26 Feb 2021 18:18:58 +0100 Subject: [PATCH 374/384] [gedidigital] Add new extractor(closes #7347)(closes #26946) --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/gedidigital.py | 161 +++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 haruhi_dl/extractor/gedidigital.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index f6353a63c..db6166a93 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -446,6 +446,7 @@ from .gamestar import GameStarIE from .gaskrank import GaskrankIE from .gazeta import GazetaIE from .gdcvault import GDCVaultIE +from .gedidigital import GediDigitalIE from .generic import GenericIE from .gfycat import GfycatIE from .giantbomb import GiantBombIE diff --git a/haruhi_dl/extractor/gedidigital.py b/haruhi_dl/extractor/gedidigital.py new file mode 100644 index 000000000..1b47a4e27 --- /dev/null +++ b/haruhi_dl/extractor/gedidigital.py @@ -0,0 +1,161 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, +) + + +class GediDigitalIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://video\. 
+ (?: + (?: + (?:espresso\.)?repubblica + |lastampa + |ilsecoloxix + )| + (?: + iltirreno + |messaggeroveneto + |ilpiccolo + |gazzettadimantova + |mattinopadova + |laprovinciapavese + |tribunatreviso + |nuovavenezia + |gazzettadimodena + |lanuovaferrara + |corrierealpi + |lasentinella + )\.gelocal + )\.it(?:/[^/]+){2,3}/(?P\d+)''' + _TESTS = [{ + 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', + 'md5': '84658d7fb9e55a6e57ecc77b73137494', + 'info_dict': { + 'id': '121559', + 'ext': 'mp4', + 'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso', + 'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca', + 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-full-.+?\.jpg$', + 'duration': 125, + }, + }, { + 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', + 'only_matching': True, + }, { + 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', + 'only_matching': True, + }, { + 'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', + 'only_matching': True, + }, { + 'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', + 'only_matching': True, + }, { + 'url': 'https://video.messaggeroveneto.gelocal.it/locale/maria-giovanna-elmi-covid-vaccino/138155/139268', + 'only_matching': True, + }, { + 'url': 'https://video.ilpiccolo.gelocal.it/dossier/big-john/dinosauro-big-john-al-via-le-visite-guidate-a-trieste/135226/135751', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimantova.gelocal.it/locale/dal-ponte-visconteo-di-valeggio-l-and-8217sos-dei-ristoratori-aprire-anche-a-cena/137310/137818', + 'only_matching': True, + }, { + 'url': 'https://video.mattinopadova.gelocal.it/dossier/coronavirus-in-veneto/covid-a-vo-un-anno-dopo-un-cuore-tricolore-per-non-dimenticare/138402/138964', + 'only_matching': True, + }, { + 'url': 'https://video.laprovinciapavese.gelocal.it/locale/mede-zona-rossa-via-alle-vaccinazioni-per-gli-over-80/137545/138120', + 'only_matching': True, + }, { + 'url': 'https://video.tribunatreviso.gelocal.it/dossier/coronavirus-in-veneto/ecco-le-prima-vaccinazioni-di-massa-nella-marca/134485/135024', + 'only_matching': True, + }, { + 'url': 'https://video.nuovavenezia.gelocal.it/locale/camion-troppo-alto-per-il-ponte-ferroviario-perde-il-carico/135734/136266', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimodena.gelocal.it/locale/modena-scoperta-la-proteina-che-predice-il-livello-di-gravita-del-covid/139109/139796', + 'only_matching': True, + }, { + 'url': 'https://video.lanuovaferrara.gelocal.it/locale/due-bombole-di-gpl-aperte-e-abbandonate-i-vigili-bruciano-il-gas/134391/134957', + 'only_matching': True, + }, { + 'url': 'https://video.corrierealpi.gelocal.it/dossier/cortina-2021-i-mondiali-di-sci-alpino/mondiali-di-sci-il-timelapse-sulla-splendida-olympia/133760/134331', + 'only_matching': True, + }, { + 'url': 'https://video.lasentinella.gelocal.it/locale/vestigne-centra-un-auto-e-si-ribalta/138931/139466', + 'only_matching': True, + }, { + 'url': 'https://video.espresso.repubblica.it/tutti-i-video/01-ted-villa/14772', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = 
self._match_id(url) + + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta( + ['twitter:title', 'og:title'], webpage, fatal=True) + player_data = re.findall( + r"PlayerFactory\.setParam\('(?Pformat|param)',\s*'(?P[^']+)',\s*'(?P[^']+)'\);", + webpage) + + formats = [] + duration = thumb = None + for t, n, v in player_data: + if t == 'format': + if n in ('video-hds-vod-ec', 'video-hls-vod-ec', 'video-viralize', 'video-youtube-pfp'): + continue + elif n.endswith('-vod-ak'): + formats.extend(self._extract_akamai_formats( + v, video_id, {'http': 'media.gedidigital.it'})) + else: + ext = determine_ext(v) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + v, video_id, 'mp4', 'm3u8_native', m3u8_id=n, fatal=False)) + continue + f = { + 'format_id': n, + 'url': v, + } + if ext == 'mp3': + abr = int_or_none(self._search_regex( + r'-mp3-audio-(\d+)', v, 'abr', default=None)) + f.update({ + 'abr': abr, + 'tbr': abr, + 'vcodec': 'none' + }) + else: + mobj = re.match(r'^video-rrtv-(\d+)(?:-(\d+))?$', n) + if mobj: + f.update({ + 'height': int(mobj.group(1)), + 'vbr': int_or_none(mobj.group(2)), + }) + if not f.get('vbr'): + f['vbr'] = int_or_none(self._search_regex( + r'-video-rrtv-(\d+)', v, 'abr', default=None)) + formats.append(f) + elif t == 'param': + if n in ['image_full', 'image']: + thumb = v + elif n == 'videoDuration': + duration = int_or_none(v) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta( + ['twitter:description', 'og:description', 'description'], webpage), + 'thumbnail': thumb or self._og_search_thumbnail(webpage), + 'formats': formats, + 'duration': duration, + } From 77ba7006269ea16e9bc7f11cb8a873ff791499eb Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 18:19:07 +0100 Subject: [PATCH 375/384] [gedidigital] improve asset id matching --- haruhi_dl/extractor/gedidigital.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/gedidigital.py b/haruhi_dl/extractor/gedidigital.py index 1b47a4e27..6c4153b40 100644 --- a/haruhi_dl/extractor/gedidigital.py +++ b/haruhi_dl/extractor/gedidigital.py @@ -32,7 +32,7 @@ class GediDigitalIE(InfoExtractor): |corrierealpi |lasentinella )\.gelocal - )\.it(?:/[^/]+){2,3}/(?P\d+)''' + )\.it(?:/[^/]+){2,3}?/(?P\d+)(?:[/?&#]|$)''' _TESTS = [{ 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', 'md5': '84658d7fb9e55a6e57ecc77b73137494', From 7a3bf913b8a1a98e1a788bb90f12df07e96203cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=3D=3FUTF-8=3Fq=3FSergey=3D20M=3DE2=3D80=3DA4=3F=3D?= Date: Fri, 26 Feb 2021 18:19:11 +0100 Subject: [PATCH 376/384] =?UTF-8?q?[tmz]=20Fix=20and=20improve=20extractio?= =?UTF-8?q?n=20(closes=20#24603,=20closes=20#24687,=20close=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …s #28211) --- haruhi_dl/extractor/tmz.py | 101 ++++++++++++++++++++++++++++--------- 1 file changed, 78 insertions(+), 23 deletions(-) diff --git a/haruhi_dl/extractor/tmz.py b/haruhi_dl/extractor/tmz.py index 419f9d92e..3d1bf75ff 100644 --- a/haruhi_dl/extractor/tmz.py +++ b/haruhi_dl/extractor/tmz.py @@ -2,55 +2,110 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .jwplatform import JWPlatformIE +from .kaltura import KalturaIE +from ..utils import ( + int_or_none, + unified_timestamp, +) class TMZIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?tmz\.com/videos/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.tmz.com/videos/0_okj015ty/', - 'md5': '4d22a51ef205b6c06395d8394f72d560', - 'info_dict': { - 'id': '0_okj015ty', - 'ext': 'mp4', - 'title': 'Kim Kardashian\'s Boobs Unlock a Mystery!', - 'description': 'Did Kim Kardasain try to one-up Khloe by one-upping Kylie??? Or is she just showing off her amazing boobs?', - 'timestamp': 1394747163, - 'uploader_id': 'batchUser', - 'upload_date': '20140313', - } - }, { 'url': 'http://www.tmz.com/videos/0-cegprt2p/', + 'md5': '31f9223e20eef55954973359afa61a20', + 'info_dict': { + 'id': 'P6YjLBLk', + 'ext': 'mp4', + 'title': "No Charges Against Hillary Clinton? Harvey Says It Ain't Over Yet", + 'description': 'md5:b714359fc18607715ebccbd2da8ff488', + 'timestamp': 1467831837, + 'upload_date': '20160706', + }, + 'add_ie': [JWPlatformIE.ie_key()], + }, { + 'url': 'http://www.tmz.com/videos/0_okj015ty/', + 'only_matching': True, + }, { + 'url': 'https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/', + 'only_matching': True, + }, { + 'url': 'https://www.tmz.com/videos/2021-02-19-021921-floyd-mayweather-1043872/', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url).replace('-', '_') - return self.url_result('kaltura:591531:%s' % video_id, 'Kaltura', video_id) + + webpage = self._download_webpage(url, video_id, fatal=False) + if webpage: + tmz_video_id = self._search_regex( + r'nodeRef\s*:\s*["\']tmz:video:([\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12})', + webpage, 'video id', default=None) + video = self._download_json( + 'https://www.tmz.com/_/video/%s' % tmz_video_id, video_id, + fatal=False) + if video: + message = video['message'] + info = { + '_type': 'url_transparent', + 'title': message.get('title'), + 'description': message.get('description'), + 'timestamp': unified_timestamp(message.get('published_at')), + 'duration': int_or_none(message.get('duration')), + } + jwplatform_id = message.get('jwplayer_media_id') + if jwplatform_id: + info.update({ + 'url': 'jwplatform:%s' % jwplatform_id, + 'ie_key': JWPlatformIE.ie_key(), + }) + else: + kaltura_entry_id = message.get('kaltura_entry_id') or video_id + kaltura_partner_id = message.get('kaltura_partner_id') or '591531' + info.update({ + 'url': 'kaltura:%s:%s' % (kaltura_partner_id, kaltura_entry_id), + 'ie_key': KalturaIE.ie_key(), + }) + return info + + return self.url_result( + 'kaltura:591531:%s' % video_id, KalturaIE.ie_key(), video_id) class TMZArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P[^/]+)/?' + _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P[^/?#&]+)' _TEST = { 'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert', - 'md5': '3316ff838ae5bb7f642537825e1e90d2', 'info_dict': { - 'id': '0_6snoelag', - 'ext': 'mov', + 'id': 'PAKZa97W', + 'ext': 'mp4', 'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake', 'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. 
She\'s watching me."', - 'timestamp': 1429467813, + 'timestamp': 1429466400, 'upload_date': '20150419', - 'uploader_id': 'batchUser', - } + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [JWPlatformIE.ie_key()], } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + + tmz_url = self._search_regex( + r'clickLink\s*\(\s*["\'](?P<url>%s)' % TMZIE._VALID_URL, webpage, + 'video id', default=None, group='url') + if tmz_url: + return self.url_result(tmz_url, ie=TMZIE.ie_key()) + embedded_video_info = self._parse_json(self._html_search_regex( r'tmzVideoEmbed\(({.+?})\);', webpage, 'embedded video info'), video_id) - return self.url_result( - 'http://www.tmz.com/videos/%s/' % embedded_video_info['id']) + return self.url_result( + 'http://www.tmz.com/videos/%s/' % embedded_video_info['id'], + ie=TMZIE.ie_key()) From ff1ee8a80efd089f15b6acd5c1164f8d4dd7e50d Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 18:19:16 +0100 Subject: [PATCH 377/384] [tf1] improve extraction(closes #27980)(closes #28040) --- haruhi_dl/extractor/tf1.py | 127 ++++++++++++++++++------------------- haruhi_dl/extractor/wat.py | 95 ++++++++++++--------------- 2 files changed, 101 insertions(+), 121 deletions(-) diff --git a/haruhi_dl/extractor/tf1.py b/haruhi_dl/extractor/tf1.py index 55e2a0721..23c2808a1 100644 --- a/haruhi_dl/extractor/tf1.py +++ b/haruhi_dl/extractor/tf1.py @@ -1,92 +1,87 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor -from ..compat import compat_str +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, +) class TF1IE(InfoExtractor): - """TF1 uses the wat.tv player.""" - _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|(?:www\.)?(?:tfou|ushuaiatv|histoire|tvbreizh))\.fr/(?:[^/]+/)*(?P<id>[^/?#.]+)' + _VALID_URL = r'https?://(?:www\.)?tf1\.fr/[^/]+/(?P<program_slug>[^/]+)/videos/(?P<id>[^/?&#]+)\.html' _TESTS = [{ - 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', - 'info_dict': { - 'id': '10635995', - 'ext': 'mp4', - 'title': 'Citroën Grand C4 Picasso 2013 : présentation officielle', - 'description': 'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.', - }, - 'params': { - # Sometimes wat serves the whole file with the --test option - 'skip_download': True, - }, - 'expected_warnings': ['HTTP Error 404'], - }, { - 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', - 'info_dict': { - 'id': 'le-grand-mysterioso-chuggington-7085291-739', - 'ext': 'mp4', - 'title': 'Le grand Mystérioso - Chuggington', - 'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.', - 'upload_date': '20150103', - }, - 'params': { - # Sometimes wat serves the whole file with the --test option - 'skip_download': True, - }, - 'skip': 'HTTP Error 410: Gone', - }, { - 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', - 'only_matching': True, - }, { - 'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html', - 'only_matching': True, - }, { - 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', - 'only_matching': True, - }, { 'url': 'https://www.tf1.fr/tmc/quotidien-avec-yann-barthes/videos/quotidien-premiere-partie-11-juin-2019.html', 'info_dict': { 'id': '13641379', 'ext': 'mp4', 'title': 'md5:f392bc52245dc5ad43771650c96fb620', - 'description': 
'md5:44bc54f0a21322f5b91d68e76a544eae', + 'description': 'md5:a02cdb217141fb2d469d6216339b052f', 'upload_date': '20190611', + 'timestamp': 1560273989, + 'duration': 1738, + 'series': 'Quotidien avec Yann Barthès', + 'tags': ['intégrale', 'quotidien', 'Replay'], }, 'params': { # Sometimes wat serves the whole file with the --test option 'skip_download': True, + 'format': 'bestvideo', }, + }, { + 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', + 'only_matching': True, + }, { + 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + program_slug, slug = re.match(self._VALID_URL, url).groups() + video = self._download_json( + 'https://www.tf1.fr/graphql/web', slug, query={ + 'id': '9b80783950b85247541dd1d851f9cc7fa36574af015621f853ab111a679ce26f', + 'variables': json.dumps({ + 'programSlug': program_slug, + 'slug': slug, + }) + })['data']['videoBySlug'] + wat_id = video['streamId'] - webpage = self._download_webpage(url, video_id) + tags = [] + for tag in (video.get('tags') or []): + label = tag.get('label') + if not label: + continue + tags.append(label) - wat_id = None + decoration = video.get('decoration') or {} - data = self._parse_json( - self._search_regex( - r'__APOLLO_STATE__\s*=\s*({.+?})\s*(?:;|</script>)', webpage, - 'data', default='{}'), video_id, fatal=False) + thumbnails = [] + for source in (try_get(decoration, lambda x: x['image']['sources'], list) or []): + source_url = source.get('url') + if not source_url: + continue + thumbnails.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + }) - if data: - try: - wat_id = next( - video.get('streamId') - for key, video in data.items() - if isinstance(video, dict) - and video.get('slug') == video_id) - if not isinstance(wat_id, compat_str) or not wat_id.isdigit(): - wat_id = None - except StopIteration: - pass - - if not wat_id: - wat_id = self._html_search_regex( - (r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1', - r'(["\']?)streamId\1\s*:\s*(["\']?)(?P<id>\d+)\2'), - webpage, 'wat id', group='id') - - return self.url_result('wat:%s' % wat_id, 'Wat') + return { + '_type': 'url_transparent', + 'id': wat_id, + 'url': 'wat:' + wat_id, + 'title': video.get('title'), + 'thumbnails': thumbnails, + 'description': decoration.get('description'), + 'timestamp': parse_iso8601(video.get('date')), + 'duration': int_or_none(try_get(video, lambda x: x['publicPlayingInfos']['duration'])), + 'tags': tags, + 'series': decoration.get('programLabel'), + 'season_number': int_or_none(video.get('season')), + 'episode_number': int_or_none(video.get('episode')), + } diff --git a/haruhi_dl/extractor/wat.py b/haruhi_dl/extractor/wat.py index f6940b371..147931d73 100644 --- a/haruhi_dl/extractor/wat.py +++ b/haruhi_dl/extractor/wat.py @@ -4,9 +4,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - unified_strdate, - HEADRequest, + ExtractorError, int_or_none, + try_get, + unified_strdate, ) class WatIE(InfoExtractor): _VALID_URL = r'(?:wat:|https?://(?:www\.)?wat\.tv/video/.*-)(?P<id>[0-9a-z]+)' _TESTS = [ { 'url': 'http://www.wat.tv/video/soupe-figues-l-orange-aux-epices-6z1uz_2hvf7_.html', 'info_dict': { 'id': '11713067', 'ext': 'mp4', 'title': 'Soupe de figues à l\'orange et aux épices', 'description': 'Retrouvez l\'émission "Petits plats en équilibre", diffusée le 18 août 2014.', 'upload_date': '20140819', 'duration': 120, }, 'params': { 'skip_download': True, }, 'expected_warnings': ['HTTP Error 404'], + 'skip': 'This content is no longer available', }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', 'md5': 'fbc84e4378165278e743956d9c1bf16b', 'info_dict': { 'id': '11713075', 'ext': 'mp4', 'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)', 'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3', 'upload_date': '20140816', }, 'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."], + 'skip': 'This 
content is no longer available', }, ] + _GEO_BYPASS = False def _real_extract(self, url): video_id = self._match_id(url) @@ -49,71 +53,52 @@ class WatIE(InfoExtractor): # 'contentv4' is used in the website, but it also returns the related # videos, we don't need them + # video_data = self._download_json( + # 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) video_data = self._download_json( - 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) + 'https://mediainfo.tf1.fr/mediainfocombo/' + video_id, + video_id, query={'context': 'MYTF1'}) video_info = video_data['media'] error_desc = video_info.get('error_desc') if error_desc: - self.report_warning( - '%s returned error: %s' % (self.IE_NAME, error_desc)) + if video_info.get('error_code') == 'GEOBLOCKED': + self.raise_geo_restricted(error_desc, video_info.get('geoList')) + raise ExtractorError(error_desc) - chapters = video_info['chapters'] - if chapters: - first_chapter = chapters[0] - - def video_id_for_chapter(chapter): - return chapter['tc_start'].split('-')[0] - - if video_id_for_chapter(first_chapter) != video_id: - self.to_screen('Multipart video detected') - entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] - return self.playlist_result(entries, video_id, video_info['title']) - # Otherwise we can continue and extract just one part, we have to use - # the video id for getting the video url - else: - first_chapter = video_info - - title = first_chapter['title'] - - def extract_url(path_template, url_type): - req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id) - head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type, fatal=False) - if head: - red_url = head.geturl() - if req_url != red_url: - return red_url - return None + title = video_info['title'] formats = [] - manifest_urls = self._download_json( - 'http://www.wat.tv/get/webhtml/' + video_id, video_id) - m3u8_url = manifest_urls.get('hls') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - mpd_url = manifest_urls.get('mpd') - if mpd_url: - formats.extend(self._extract_mpd_formats( - mpd_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), - video_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) - date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4') - upload_date = unified_strdate(date_diffusion) if date_diffusion else None - duration = None - files = video_info['files'] - if files: - duration = int_or_none(files[0].get('duration')) + def extract_formats(manifest_urls): + for f, f_url in manifest_urls.items(): + if not f_url: + continue + if f in ('dash', 'mpd'): + formats.extend(self._extract_mpd_formats( + f_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), + video_id, mpd_id='dash', fatal=False)) + elif f == 'hls': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + delivery = video_data.get('delivery') or {} + extract_formats({delivery.get('format'): delivery.get('url')}) + if not formats: + manifest_urls = self._download_json( + 'http://www.wat.tv/get/webhtml/' + video_id, video_id, fatal=False) + if manifest_urls: + extract_formats(manifest_urls) + + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'thumbnail': first_chapter.get('preview'), - 'description': first_chapter.get('description'), - 'view_count': 
int_or_none(video_info.get('views')), - 'upload_date': upload_date, - 'duration': duration, + 'thumbnail': video_info.get('preview'), + 'upload_date': unified_strdate(try_get( + video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])), + 'duration': int_or_none(video_info.get('duration')), 'formats': formats, } From 36ee1ad35d37dae31130009d8c3d0c9f3f37d70b Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 18:19:22 +0100 Subject: [PATCH 378/384] [wat] detect DRM protected videos(closes #27958) --- haruhi_dl/extractor/wat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/wat.py b/haruhi_dl/extractor/wat.py index 147931d73..f1bccc2d6 100644 --- a/haruhi_dl/extractor/wat.py +++ b/haruhi_dl/extractor/wat.py @@ -64,7 +64,7 @@ class WatIE(InfoExtractor): if error_desc: if video_info.get('error_code') == 'GEOBLOCKED': self.raise_geo_restricted(error_desc, video_info.get('geoList')) - raise ExtractorError(error_desc) + raise ExtractorError(error_desc, expected=True) title = video_info['title'] @@ -86,6 +86,8 @@ class WatIE(InfoExtractor): delivery = video_data.get('delivery') or {} extract_formats({delivery.get('format'): delivery.get('url')}) if not formats: + if delivery.get('drm'): + raise ExtractorError('This video is DRM protected.', expected=True) manifest_urls = self._download_json( 'http://www.wat.tv/get/webhtml/' + video_id, video_id, fatal=False) if manifest_urls: From cf883f24cc602e3b50eda1db898aa8dc2f851b72 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 18:19:28 +0100 Subject: [PATCH 379/384] [dplay] Extract Ad-Free uplynk URLs(#28160) --- haruhi_dl/extractor/dplay.py | 1 + 1 file changed, 1 insertion(+) diff --git a/haruhi_dl/extractor/dplay.py b/haruhi_dl/extractor/dplay.py index 0f0632f26..bbb199094 100644 --- a/haruhi_dl/extractor/dplay.py +++ b/haruhi_dl/extractor/dplay.py @@ -330,6 +330,7 @@ class DiscoveryPlusIE(DPlayIE): 'videoId': video_id, 'wisteriaProperties': { 'platform': 'desktop', + 'product': 'dplus_us', }, }).encode('utf-8'))['data']['attributes']['streaming'] From 2b6555f2ebe2b8b64bfe6858b89f6103ea02f29b Mon Sep 17 00:00:00 2001 From: nixxo Date: Fri, 26 Feb 2021 18:19:33 +0100 Subject: [PATCH 380/384] [vvvvid] extract series sublists playlist_title (#27601) (#27618) --- haruhi_dl/extractor/vvvvid.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/haruhi_dl/extractor/vvvvid.py b/haruhi_dl/extractor/vvvvid.py index 778ce8b76..d62404cf3 100644 --- a/haruhi_dl/extractor/vvvvid.py +++ b/haruhi_dl/extractor/vvvvid.py @@ -247,9 +247,13 @@ class VVVVIDShowIE(VVVVIDIE): show_info = self._download_info( show_id, 'info/', show_title, fatal=False) + if not show_title: + base_url += "/title" + entries = [] for season in (seasons or []): episodes = season.get('episodes') or [] + playlist_title = season.get('name') or show_info.get('title') for episode in episodes: if episode.get('playable') is False: continue @@ -259,12 +263,13 @@ class VVVVIDShowIE(VVVVIDIE): continue info = self._extract_common_video_info(episode) info.update({ - '_type': 'url', + '_type': 'url_transparent', 'ie_key': VVVVIDIE.ie_key(), 'url': '/'.join([base_url, season_id, video_id]), 'title': episode.get('title'), 'description': episode.get('description'), 'season_id': season_id, + 'playlist_title': playlist_title, }) entries.append(info) From 302b6ffb09910d3a4d5c2ad2d9ffa8b1dec5cd78 Mon Sep 17 00:00:00 2001 From: Remita Amine Date: Fri, 26 Feb 2021 18:19:38 +0100 Subject: [PATCH 381/384] [vvvvid] 
reduce season request payload size --- haruhi_dl/extractor/vvvvid.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/haruhi_dl/extractor/vvvvid.py b/haruhi_dl/extractor/vvvvid.py index d62404cf3..7c94c4ee2 100644 --- a/haruhi_dl/extractor/vvvvid.py +++ b/haruhi_dl/extractor/vvvvid.py @@ -75,12 +75,15 @@ class VVVVIDIE(InfoExtractor): 'https://www.vvvvid.it/user/login', None, headers=self.geo_verification_headers())['data']['conn_id'] - def _download_info(self, show_id, path, video_id, fatal=True): + def _download_info(self, show_id, path, video_id, fatal=True, query=None): + q = { + 'conn_id': self._conn_id, + } + if query: + q.update(query) response = self._download_json( 'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path), - video_id, headers=self.geo_verification_headers(), query={ - 'conn_id': self._conn_id, - }, fatal=fatal) + video_id, headers=self.geo_verification_headers(), query=q, fatal=fatal) if not (response or fatal): return if response.get('result') == 'error': @@ -98,7 +101,8 @@ class VVVVIDIE(InfoExtractor): show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() response = self._download_info( - show_id, 'season/%s' % season_id, video_id) + show_id, 'season/%s' % season_id, + video_id, query={'video_id': video_id}) vid = int(video_id) video_data = list(filter( From 57da386d5c50110faf2b5eaa9c451bcca0cdbdf2 Mon Sep 17 00:00:00 2001 From: Alexander Seiler Date: Fri, 26 Feb 2021 18:19:42 +0100 Subject: [PATCH 382/384] [srgssr] improve extraction - extract subtitle - fix extraction for new videos - update srf download domains closes #14717 closes #14725 closes #27231 closes #28238 --- haruhi_dl/extractor/rts.py | 15 ++- haruhi_dl/extractor/srgssr.py | 206 ++++++++++++++++++++++------------ 2 files changed, 143 insertions(+), 78 deletions(-) diff --git a/haruhi_dl/extractor/rts.py b/haruhi_dl/extractor/rts.py index 48f17b828..aed35f8a9 100644 --- a/haruhi_dl/extractor/rts.py +++ b/haruhi_dl/extractor/rts.py @@ -6,11 +6,12 @@ import re from .srgssr import SRGSSRIE from ..compat import compat_str from ..utils import ( + determine_ext, int_or_none, parse_duration, parse_iso8601, unescapeHTML, - determine_ext, + urljoin, ) @@ -21,7 +22,7 @@ class RTSIE(SRGSSRIE): _TESTS = [ { 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', - 'md5': 'ff7f8450a90cf58dacb64e29707b4a8e', + 'md5': '753b877968ad8afaeddccc374d4256a5', 'info_dict': { 'id': '3449373', 'display_id': 'les-enfants-terribles', @@ -35,6 +36,7 @@ class RTSIE(SRGSSRIE): 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', @@ -63,11 +65,12 @@ class RTSIE(SRGSSRIE): # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], 'skip': 'Blocked outside Switzerland', }, { 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', - 'md5': '1bae984fe7b1f78e94abc74e802ed99f', + 'md5': '9bb06503773c07ce83d3cbd793cebb91', 'info_dict': { 'id': '5745356', 'display_id': 'londres-cachee-par-un-epais-smog', @@ -81,6 +84,7 @@ class RTSIE(SRGSSRIE): 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }, { 'url': 
'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', @@ -160,7 +164,7 @@ class RTSIE(SRGSSRIE): media_type = 'video' if 'video' in all_info else 'audio' # check for errors - self.get_media_data('rts', media_type, media_id) + self._get_media_data('rts', media_type, media_id) info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] @@ -194,6 +198,7 @@ class RTSIE(SRGSSRIE): 'tbr': extract_bitrate(format_url), }) + download_base = 'http://rtsww%s-d.rts.ch/' % ('-a' if media_type == 'audio' else '') for media in info.get('media', []): media_url = media.get('url') if not media_url or re.match(r'https?://', media_url): @@ -205,7 +210,7 @@ class RTSIE(SRGSSRIE): format_id += '-%dk' % rate formats.append({ 'format_id': format_id, - 'url': 'http://download-video.rts.ch/' + media_url, + 'url': urljoin(download_base, media_url), 'tbr': rate or extract_bitrate(media_url), }) diff --git a/haruhi_dl/extractor/srgssr.py b/haruhi_dl/extractor/srgssr.py index f63a1359a..ac018e740 100644 --- a/haruhi_dl/extractor/srgssr.py +++ b/haruhi_dl/extractor/srgssr.py @@ -4,16 +4,32 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..utils import ( ExtractorError, + float_or_none, + int_or_none, parse_iso8601, qualities, + try_get, ) class SRGSSRIE(InfoExtractor): - _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)' + _VALID_URL = r'''(?x) + (?: + https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn| + srgssr + ): + (?P<bu> + srf|rts|rsi|rtr|swi + ):(?:[^:]+:)? + (?P<type> + video|audio + ): + (?P<id> + [0-9a-f\-]{36}|\d+ + ) + ''' _GEO_BYPASS = False _GEO_COUNTRIES = ['CH'] @@ -25,25 +41,39 @@ class SRGSSRIE(InfoExtractor): 'LEGAL': 'The video cannot be transmitted for legal reasons.', 'STARTDATE': 'This video is not yet available. Please try again later.', } + _DEFAULT_LANGUAGE_CODES = { + 'srf': 'de', + 'rts': 'fr', + 'rsi': 'it', + 'rtr': 'rm', + 'swi': 'en', + } def _get_tokenized_src(self, url, video_id, format_id): - sp = compat_urllib_parse_urlparse(url).path.split('/') token = self._download_json( - 'http://tp.srgssr.ch/akahd/token?acl=/%s/%s/*' % (sp[1], sp[2]), + 'http://tp.srgssr.ch/akahd/token?acl=*', video_id, 'Downloading %s token' % format_id, fatal=False) or {} - auth_params = token.get('token', {}).get('authparams') + auth_params = try_get(token, lambda x: x['token']['authparams']) if auth_params: - url += '?' + auth_params + url += ('?' if '?' 
not in url else '&') + auth_params return url - def get_media_data(self, bu, media_type, media_id): - media_data = self._download_json( - 'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id), - media_id)[media_type.capitalize()] + def _get_media_data(self, bu, media_type, media_id): + query = {'onlyChapters': True} if media_type == 'video' else {} + full_media_data = self._download_json( + 'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json' + % (bu, media_type, media_id), + media_id, query=query)['chapterList'] + try: + media_data = next( + x for x in full_media_data if x.get('id') == media_id) + except StopIteration: + raise ExtractorError('No media information found') - if media_data.get('block') and media_data['block'] in self._ERRORS: - message = self._ERRORS[media_data['block']] - if media_data['block'] == 'GEOBLOCK': + block_reason = media_data.get('blockReason') + if block_reason and block_reason in self._ERRORS: + message = self._ERRORS[block_reason] + if block_reason == 'GEOBLOCK': self.raise_geo_restricted( msg=message, countries=self._GEO_COUNTRIES) raise ExtractorError( @@ -53,53 +83,75 @@ class SRGSSRIE(InfoExtractor): def _real_extract(self, url): bu, media_type, media_id = re.match(self._VALID_URL, url).groups() + media_data = self._get_media_data(bu, media_type, media_id) + title = media_data['title'] - media_data = self.get_media_data(bu, media_type, media_id) - - metadata = media_data['AssetMetadatas']['AssetMetadata'][0] - title = metadata['title'] - description = metadata.get('description') - created_date = media_data.get('createdDate') or metadata.get('createdDate') - timestamp = parse_iso8601(created_date) - - thumbnails = [{ - 'id': image.get('id'), - 'url': image['url'], - } for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])] - - preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD']) formats = [] - for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []): - protocol = source.get('@protocol') - for asset in source['url']: - asset_url = asset['text'] - quality = asset['@quality'] - format_id = '%s-%s' % (protocol, quality) - if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'): - asset_url = self._get_tokenized_src(asset_url, media_id, format_id) - if protocol.startswith('HTTP-HDS'): - formats.extend(self._extract_f4m_formats( - asset_url + ('?' if '?' 
not in asset_url else '&') + 'hdcore=3.4.0', - media_id, f4m_id=format_id, fatal=False)) - elif protocol.startswith('HTTP-HLS'): - formats.extend(self._extract_m3u8_formats( - asset_url, media_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - else: - formats.append({ - 'format_id': format_id, - 'url': asset_url, - 'preference': preference(quality), - 'ext': 'flv' if protocol == 'RTMP' else None, - }) + q = qualities(['SD', 'HD']) + for source in (media_data.get('resourceList') or []): + format_url = source.get('url') + if not format_url: + continue + protocol = source.get('protocol') + quality = source.get('quality') + format_id = [] + for e in (protocol, source.get('encoding'), quality): + if e: + format_id.append(e) + format_id = '-'.join(format_id) + + if protocol in ('HDS', 'HLS'): + if source.get('tokenType') == 'AKAMAI': + format_url = self._get_tokenized_src( + format_url, media_id, format_id) + formats.extend(self._extract_akamai_formats( + format_url, media_id)) + elif protocol == 'HLS': + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) + elif protocol in ('HTTP', 'HTTPS'): + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'quality': q(quality), + }) + + # This is needed because for audio medias the podcast url is usually + # always included, even if is only an audio segment and not the + # whole episode. + if int_or_none(media_data.get('position')) == 0: + for p in ('S', 'H'): + podcast_url = media_data.get('podcast%sdUrl' % p) + if not podcast_url: + continue + quality = p + 'D' + formats.append({ + 'format_id': 'PODCAST-' + quality, + 'url': podcast_url, + 'quality': q(quality), + }) self._sort_formats(formats) + subtitles = {} + if media_type == 'video': + for sub in (media_data.get('subtitleList') or []): + sub_url = sub.get('url') + if not sub_url: + continue + lang = sub.get('locale') or self._DEFAULT_LANGUAGE_CODES[bu] + subtitles.setdefault(lang, []).append({ + 'url': sub_url, + }) + return { 'id': media_id, 'title': title, - 'description': description, - 'timestamp': timestamp, - 'thumbnails': thumbnails, + 'description': media_data.get('description'), + 'timestamp': parse_iso8601(media_data.get('date')), + 'thumbnail': media_data.get('imageUrl'), + 'duration': float_or_none(media_data.get('duration'), 1000), + 'subtitles': subtitles, 'formats': formats, } @@ -119,26 +171,17 @@ class SRGSSRPlayIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'md5': 'da6b5b3ac9fa4761a942331cef20fcb3', + 'md5': '6db2226ba97f62ad42ce09783680046c', 'info_dict': { 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', 'ext': 'mp4', 'upload_date': '20130701', 'title': 'Snowden beantragt Asyl in Russland', - 'timestamp': 1372713995, - } - }, { - # No Speichern (Save) button - 'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa', - 'md5': '0a274ce38fda48c53c01890651985bc6', - 'info_dict': { - 'id': '677f5829-e473-4823-ac83-a1087fe97faa', - 'ext': 'flv', - 'upload_date': '20130710', - 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', - 'description': 'md5:88604432b60d5a38787f152dec89cd56', - 'timestamp': 1373493600, + 'timestamp': 1372708215, + 'duration': 113.827, + 'thumbnail': r're:^https?://.*1383719781\.png$', }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 
'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', 'info_dict': { @@ -146,7 +189,8 @@ class SRGSSRPlayIE(InfoExtractor): 'ext': 'mp3', 'upload_date': '20151013', 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', - 'timestamp': 1444750398, + 'timestamp': 1444709160, + 'duration': 336.816, }, 'params': { # rtmp download @@ -159,19 +203,32 @@ class SRGSSRPlayIE(InfoExtractor): 'id': '6348260', 'display_id': '6348260', 'ext': 'mp4', - 'duration': 1796, + 'duration': 1796.76, 'title': 'Le 19h30', - 'description': '', - 'uploader': '19h30', 'upload_date': '20141201', 'timestamp': 1417458600, 'thumbnail': r're:^https?://.*\.image', - 'view_count': int, }, 'params': { # m3u8 download 'skip_download': True, } + }, { + 'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270', + 'info_dict': { + 'id': '42960270', + 'ext': 'mp4', + 'title': 'Why people were against tax reforms', + 'description': 'md5:7ac442c558e9630e947427469c4b824d', + 'duration': 94.0, + 'upload_date': '20170215', + 'timestamp': 1487173560, + 'thumbnail': r're:https?://www\.swissinfo\.ch/srgscalableimage/42961964', + 'subtitles': 'count:9', + }, + 'params': { + 'skip_download': True, + } }, { 'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01', 'only_matching': True, @@ -181,6 +238,10 @@ class SRGSSRPlayIE(InfoExtractor): }, { 'url': 'https://www.rts.ch/play/tv/19h30/video/le-19h30?urn=urn:rts:video:6348260', 'only_matching': True, + }, { + # audio segment, has podcastSdUrl of the full episode + 'url': 'https://www.srf.ch/play/radio/popupaudioplayer?id=50b20dc8-f05b-4972-bf03-e438ff2833eb', + 'only_matching': True, }] def _real_extract(self, url): @@ -188,5 +249,4 @@ class SRGSSRPlayIE(InfoExtractor): bu = mobj.group('bu') media_type = mobj.group('type') or mobj.group('type_2') media_id = mobj.group('id') - # other info can be extracted from url + '&layout=json' return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') From 293eada0f410505501172d391a979143ca3087b1 Mon Sep 17 00:00:00 2001 From: Laura Liberda Date: Fri, 26 Feb 2021 18:39:45 +0100 Subject: [PATCH 383/384] fix hdl tests --- test/test_HaruhiDL.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/test_HaruhiDL.py b/test/test_HaruhiDL.py index cb9954716..1165ceafe 100644 --- a/test/test_HaruhiDL.py +++ b/test/test_HaruhiDL.py @@ -933,14 +933,14 @@ class TestHaruhiDL(unittest.TestCase): # Test case for https://github.com/hdl-org/haruhi-dl/issues/27064 def test_ignoreerrors_for_playlist_with_url_transparent_iterable_entries(self): - class _YDL(YDL): + class _HDL(HDL): def __init__(self, *args, **kwargs): - super(_YDL, self).__init__(*args, **kwargs) + super(_HDL, self).__init__(*args, **kwargs) def trouble(self, s, tb=None): pass - ydl = _YDL({ + hdl = _HDL({ 'format': 'extra', 'ignoreerrors': True, }) @@ -984,15 +984,15 @@ class TestHaruhiDL(unittest.TestCase): def _real_extract(self, url): return self.playlist_result(self._entries()) - ydl.add_info_extractor(VideoIE(ydl)) - ydl.add_info_extractor(PlaylistIE(ydl)) - info = ydl.extract_info('playlist:') + hdl.add_info_extractor(VideoIE(hdl)) + hdl.add_info_extractor(PlaylistIE(hdl)) + info = hdl.extract_info('playlist:') entries = info['entries'] self.assertEqual(len(entries), 3) self.assertTrue(entries[0] is None) self.assertTrue(entries[1] is None) - 
self.assertEqual(len(ydl.downloaded_info_dicts), 1) - downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(len(hdl.downloaded_info_dicts), 1) + downloaded = hdl.downloaded_info_dicts[0] self.assertEqual(entries[2], downloaded) self.assertEqual(downloaded['url'], TEST_URL) self.assertEqual(downloaded['title'], 'Video Transparent 2') From 67692545da2ba59f95a4dbe838b9569bcd00ad7f Mon Sep 17 00:00:00 2001 From: Laura Liberda Date: Fri, 26 Feb 2021 18:47:47 +0100 Subject: [PATCH 384/384] fix crash in generic extractor --- haruhi_dl/extractor/arcpublishing.py | 2 +- haruhi_dl/extractor/medialaan.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/haruhi_dl/extractor/arcpublishing.py b/haruhi_dl/extractor/arcpublishing.py index ca6a6c4d8..6ac7df303 100644 --- a/haruhi_dl/extractor/arcpublishing.py +++ b/haruhi_dl/extractor/arcpublishing.py @@ -74,7 +74,7 @@ class ArcPublishingIE(InfoExtractor): ] @staticmethod - def _extract_urls(webpage): + def _extract_urls(webpage, **kw): entries = [] # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage): diff --git a/haruhi_dl/extractor/medialaan.py b/haruhi_dl/extractor/medialaan.py index 788acf7fb..469212ec6 100644 --- a/haruhi_dl/extractor/medialaan.py +++ b/haruhi_dl/extractor/medialaan.py @@ -72,7 +72,7 @@ class MedialaanIE(InfoExtractor): }] @staticmethod - def _extract_urls(webpage): + def _extract_urls(webpage, **kw): entries = [] for element in re.findall(r'(<div[^>]+data-mychannels-type="video"[^>]*>)', webpage): mychannels_id = extract_attributes(element).get('data-mychannels-id')
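Why the **kw change in the last patch matters: the generic extractor probes every extractor's _extract_urls(webpage, ...) hook while scanning a downloaded page, and in this fork the caller passes extra keyword arguments along with the page HTML. A hook declared as _extract_urls(webpage) then raises TypeError on the first extra keyword and crashes the whole probe loop; accepting **kw makes the signature forward-compatible. A minimal sketch of that calling convention follows — FooIE, extract_embeds and the source_url keyword are illustrative assumptions, not haruhi-dl's actual internals:

import re


class FooIE(object):
    @staticmethod
    def _extract_urls(webpage, **kw):  # **kw swallows keywords such as source_url
        # hypothetical embed markup; real extractors match their own patterns
        return re.findall(r'data-foo-embed="([^"]+)"', webpage)


def extract_embeds(webpage, url):
    # sketch of the generic extractor's probe loop
    urls = []
    for ie in (FooIE,):
        hook = getattr(ie, '_extract_urls', None)
        if hook:
            # without **kw in the hook signature, this call raises
            # TypeError and aborts the loop for every later extractor
            urls.extend(hook(webpage, source_url=url) or [])
    return urls

Widening each signature instead of threading the new argument through every extractor keeps the fix to a two-token change per file, which is why the patch above touches only the def lines.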