diff --git a/devscripts/copykitku-patch-hook.js b/devscripts/copykitku-patch-hook.js index cd6e93a84..1db74257a 100644 --- a/devscripts/copykitku-patch-hook.js +++ b/devscripts/copykitku-patch-hook.js @@ -4,16 +4,18 @@ module.exports = function patchHook(patchContent) { [ + [/(?:youtube-|yt-?)dl\.org/g, 'haruhi.download'], [/youtube_dl/g, 'haruhi_dl'], [/youtube-dl/g, 'haruhi-dl'], [/youtubedl/g, 'haruhidl'], [/YoutubeDL/g, 'HaruhiDL'], [/ytdl/g, 'hdl'], - [/(?:youtube-|yt-?)dl\.org/g, 'haruhi.download'], [/yt-dl/g, 'h-dl'], + [/ydl/g, 'hdl'], // prevent from linking to non-existent repository [/github\.com\/ytdl-org\/haruhi-dl/g, 'github.com/ytdl-org/youtube-dl'], + [/github\.com\/rg3\/haruhi-dl/g, 'github.com/ytdl-org/youtube-dl'], // prevent changing the smuggle URLs (for compatibility with ytdl) [/__haruhidl_smuggle/g, '__youtubedl_smuggle'], ].forEach(([regex, replacement]) => patchContent = patchContent.replace(regex, replacement)); diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 0cfdf37ca..32f344201 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -77,7 +77,7 @@ def build_lazy_ie(ie, name): return s -# find the correct sorting and add the required base classes so that sublcasses +# find the correct sorting and add the required base classes so that subclasses # can be correctly created classes = _ALL_CLASSES[:-1] ordered_cls = [] diff --git a/haruhi_dl/HaruhiDL.py b/haruhi_dl/HaruhiDL.py index 0a1a5a5a9..6d0a59ff6 100755 --- a/haruhi_dl/HaruhiDL.py +++ b/haruhi_dl/HaruhiDL.py @@ -163,6 +163,7 @@ class HaruhiDL(object): simulate: Do not download the video files. format: Video format code. See options.py for more information. outtmpl: Template for output names. + outtmpl_na_placeholder: Placeholder for unavailable meta fields. restrictfilenames: Do not allow "&" and spaces in file names ignoreerrors: Do not stop on download errors. force_generic_extractor: Force downloader to use the generic extractor @@ -338,6 +339,8 @@ class HaruhiDL(object): _pps = [] _download_retcode = None _num_downloads = None + _playlist_level = 0 + _playlist_urls = set() _screen_file = None def __init__(self, params=None, auto_init=True): @@ -660,7 +663,7 @@ class HaruhiDL(object): template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v)) for k, v in template_dict.items() if v is not None and not isinstance(v, (list, tuple, dict))) - template_dict = collections.defaultdict(lambda: 'NA', template_dict) + template_dict = collections.defaultdict(lambda: self.params.get('outtmpl_na_placeholder', 'NA'), template_dict) outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) @@ -680,8 +683,8 @@ class HaruhiDL(object): # Missing numeric fields used together with integer presentation types # in format specification will break the argument substitution since - # string 'NA' is returned for missing fields. We will patch output - # template for missing fields to meet string presentation type. + # string NA placeholder is returned for missing fields. We will patch + # output template for missing fields to meet string presentation type. for numeric_field in self._NUMERIC_FIELDS: if numeric_field not in template_dict: # As of [1] format syntax is: @@ -797,21 +800,14 @@ class HaruhiDL(object): self.report_warning('The program functionality for this site has been marked as broken, ' 'and will probably not work.') + return self.__extract_info(url, ie, download, extra_info, process) + else: + self.report_error('no suitable InfoExtractor for URL %s' % url) + + def __handle_extraction_exceptions(func): + def wrapper(self, *args, **kwargs): try: - ie_result = ie.extract(url) - if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) - break - if isinstance(ie_result, list): - # Backwards compatibility: old IE result format - ie_result = { - '_type': 'compat_list', - 'entries': ie_result, - } - self.add_default_extra_info(ie_result, ie, url) - if process: - return self.process_ie_result(ie_result, download, extra_info) - else: - return ie_result + return func(self, *args, **kwargs) except GeoRestrictedError as e: msg = e.msg if e.countries: @@ -819,20 +815,33 @@ class HaruhiDL(object): map(ISO3166Utils.short2full, e.countries)) msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.' self.report_error(msg) - break except ExtractorError as e: # An error we somewhat expected self.report_error(compat_str(e), e.format_traceback()) - break except MaxDownloadsReached: raise except Exception as e: if self.params.get('ignoreerrors', False): self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) - break else: raise + return wrapper + + @__handle_extraction_exceptions + def __extract_info(self, url, ie, download, extra_info, process): + ie_result = ie.extract(url) + if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) + return + if isinstance(ie_result, list): + # Backwards compatibility: old IE result format + ie_result = { + '_type': 'compat_list', + 'entries': ie_result, + } + self.add_default_extra_info(ie_result, ie, url) + if process: + return self.process_ie_result(ie_result, download, extra_info) else: - self.report_error('no suitable InfoExtractor for URL %s' % url) + return ie_result def add_default_extra_info(self, ie_result, ie, url): self.add_extra_info(ie_result, { @@ -904,116 +913,23 @@ class HaruhiDL(object): return self.process_ie_result( new_result, download=download, extra_info=extra_info) elif result_type in ('playlist', 'multi_video'): - # We process each entry in the playlist - playlist = ie_result.get('title') or ie_result.get('id') - self.to_screen('[download] Downloading playlist: %s' % playlist) - - playlist_results = [] - - playliststart = self.params.get('playliststart', 1) - 1 - playlistend = self.params.get('playlistend') - # For backwards compatibility, interpret -1 as whole list - if playlistend == -1: - playlistend = None - - playlistitems_str = self.params.get('playlist_items') - playlistitems = None - if playlistitems_str is not None: - def iter_playlistitems(format): - for string_segment in format.split(','): - if '-' in string_segment: - start, end = string_segment.split('-') - for item in range(int(start), int(end) + 1): - yield int(item) - else: - yield int(string_segment) - playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) - - ie_entries = ie_result['entries'] - - def make_playlistitems_entries(list_ie_entries): - num_entries = len(list_ie_entries) - return [ - list_ie_entries[i - 1] for i in playlistitems - if -num_entries <= i - 1 < num_entries] - - def report_download(num_entries): + # Protect from infinite recursion due to recursively nested playlists + # (see https://github.com/hdl-org/haruhi-dl/issues/27833) + webpage_url = ie_result['webpage_url'] + if webpage_url in self._playlist_urls: self.to_screen( - '[%s] playlist %s: Downloading %d videos' % - (ie_result['extractor'], playlist, num_entries)) + '[download] Skipping already downloaded playlist: %s' + % ie_result.get('title') or ie_result.get('id')) + return - if isinstance(ie_entries, list): - n_all_entries = len(ie_entries) - if playlistitems: - entries = make_playlistitems_entries(ie_entries) - else: - entries = ie_entries[playliststart:playlistend] - n_entries = len(entries) - self.to_screen( - '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % - (ie_result['extractor'], playlist, n_all_entries, n_entries)) - elif isinstance(ie_entries, PagedList): - if playlistitems: - entries = [] - for item in playlistitems: - entries.extend(ie_entries.getslice( - item - 1, item - )) - else: - entries = ie_entries.getslice( - playliststart, playlistend) - n_entries = len(entries) - report_download(n_entries) - else: # iterable - if playlistitems: - entries = make_playlistitems_entries(list(itertools.islice( - ie_entries, 0, max(playlistitems)))) - else: - entries = list(itertools.islice( - ie_entries, playliststart, playlistend)) - n_entries = len(entries) - report_download(n_entries) - - if self.params.get('playlistreverse', False): - entries = entries[::-1] - - if self.params.get('playlistrandom', False): - random.shuffle(entries) - - x_forwarded_for = ie_result.get('__x_forwarded_for_ip') - - for i, entry in enumerate(entries, 1): - self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) - # This __x_forwarded_for_ip thing is a bit ugly but requires - # minimal changes - if x_forwarded_for: - entry['__x_forwarded_for_ip'] = x_forwarded_for - extra = { - 'n_entries': n_entries, - 'playlist': playlist, - 'playlist_id': ie_result.get('id'), - 'playlist_title': ie_result.get('title'), - 'playlist_uploader': ie_result.get('uploader'), - 'playlist_uploader_id': ie_result.get('uploader_id'), - 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, - 'extractor': ie_result['extractor'], - 'webpage_url': ie_result['webpage_url'], - 'webpage_url_basename': url_basename(ie_result['webpage_url']), - 'extractor_key': ie_result['extractor_key'], - } - - reason = self._match_entry(entry, incomplete=True) - if reason is not None: - self.to_screen('[download] ' + reason) - continue - - entry_result = self.process_ie_result(entry, - download=download, - extra_info=extra) - playlist_results.append(entry_result) - ie_result['entries'] = playlist_results - self.to_screen('[download] Finished downloading playlist: %s' % playlist) - return ie_result + self._playlist_level += 1 + self._playlist_urls.add(webpage_url) + try: + return self.__process_playlist(ie_result, download) + finally: + self._playlist_level -= 1 + if not self._playlist_level: + self._playlist_urls.clear() elif result_type == 'compat_list': self.report_warning( 'Extractor %s returned a compat_list result. ' @@ -1038,6 +954,123 @@ class HaruhiDL(object): else: raise Exception('Invalid result type: %s' % result_type) + def __process_playlist(self, ie_result, download): + # We process each entry in the playlist + playlist = ie_result.get('title') or ie_result.get('id') + + self.to_screen('[download] Downloading playlist: %s' % playlist) + + playlist_results = [] + + playliststart = self.params.get('playliststart', 1) - 1 + playlistend = self.params.get('playlistend') + # For backwards compatibility, interpret -1 as whole list + if playlistend == -1: + playlistend = None + + playlistitems_str = self.params.get('playlist_items') + playlistitems = None + if playlistitems_str is not None: + def iter_playlistitems(format): + for string_segment in format.split(','): + if '-' in string_segment: + start, end = string_segment.split('-') + for item in range(int(start), int(end) + 1): + yield int(item) + else: + yield int(string_segment) + playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) + + ie_entries = ie_result['entries'] + + def make_playlistitems_entries(list_ie_entries): + num_entries = len(list_ie_entries) + return [ + list_ie_entries[i - 1] for i in playlistitems + if -num_entries <= i - 1 < num_entries] + + def report_download(num_entries): + self.to_screen( + '[%s] playlist %s: Downloading %d videos' % + (ie_result['extractor'], playlist, num_entries)) + + if isinstance(ie_entries, list): + n_all_entries = len(ie_entries) + if playlistitems: + entries = make_playlistitems_entries(ie_entries) + else: + entries = ie_entries[playliststart:playlistend] + n_entries = len(entries) + self.to_screen( + '[%s] playlist %s: Collected %d video ids (downloading %d of them)' % + (ie_result['extractor'], playlist, n_all_entries, n_entries)) + elif isinstance(ie_entries, PagedList): + if playlistitems: + entries = [] + for item in playlistitems: + entries.extend(ie_entries.getslice( + item - 1, item + )) + else: + entries = ie_entries.getslice( + playliststart, playlistend) + n_entries = len(entries) + report_download(n_entries) + else: # iterable + if playlistitems: + entries = make_playlistitems_entries(list(itertools.islice( + ie_entries, 0, max(playlistitems)))) + else: + entries = list(itertools.islice( + ie_entries, playliststart, playlistend)) + n_entries = len(entries) + report_download(n_entries) + + if self.params.get('playlistreverse', False): + entries = entries[::-1] + + if self.params.get('playlistrandom', False): + random.shuffle(entries) + + x_forwarded_for = ie_result.get('__x_forwarded_for_ip') + + for i, entry in enumerate(entries, 1): + self.to_screen('[download] Downloading video %s of %s' % (i, n_entries)) + # This __x_forwarded_for_ip thing is a bit ugly but requires + # minimal changes + if x_forwarded_for: + entry['__x_forwarded_for_ip'] = x_forwarded_for + extra = { + 'n_entries': n_entries, + 'playlist': playlist, + 'playlist_id': ie_result.get('id'), + 'playlist_title': ie_result.get('title'), + 'playlist_uploader': ie_result.get('uploader'), + 'playlist_uploader_id': ie_result.get('uploader_id'), + 'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart, + 'extractor': ie_result['extractor'], + 'webpage_url': ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'extractor_key': ie_result['extractor_key'], + } + + reason = self._match_entry(entry, incomplete=True) + if reason is not None: + self.to_screen('[download] ' + reason) + continue + + entry_result = self.__process_iterable_entry(entry, download, extra) + # TODO: skip failed (empty) entries? + playlist_results.append(entry_result) + ie_result['entries'] = playlist_results + self.to_screen('[download] Finished downloading playlist: %s' % playlist) + return ie_result + + @__handle_extraction_exceptions + def __process_iterable_entry(self, entry, download, extra_info): + return self.process_ie_result( + entry, download=download, extra_info=extra_info) + def _build_format_filter(self, filter_spec): " Returns a function to filter the formats according to the filter_spec " @@ -1077,7 +1110,7 @@ class HaruhiDL(object): '*=': lambda attr, value: value in attr, } str_operator_rex = re.compile(r'''(?x) - \s*(?Pext|acodec|vcodec|container|protocol|format_id) + \s*(?Pext|acodec|vcodec|container|protocol|format_id|language) \s*(?P!\s*)?(?P%s)(?P\s*\?)? \s*(?P[a-zA-Z0-9._-]+) \s*$ @@ -1220,6 +1253,8 @@ class HaruhiDL(object): group = _parse_format_selection(tokens, inside_group=True) current_selector = FormatSelector(GROUP, group, []) elif string == '+': + if inside_merge: + raise syntax_error('Unexpected "+"', start) video_selector = current_selector audio_selector = _parse_format_selection(tokens, inside_merge=True) if not video_selector or not audio_selector: @@ -1604,7 +1639,7 @@ class HaruhiDL(object): if req_format is None: req_format = self._default_format_spec(info_dict, download=download) if self.params.get('verbose'): - self.to_stdout('[debug] Default format spec: %s' % req_format) + self._write_string('[debug] Default format spec: %s\n' % req_format) format_selector = self.build_format_selector(req_format) @@ -1771,6 +1806,8 @@ class HaruhiDL(object): os.makedirs(dn) return True except (OSError, IOError) as err: + if isinstance(err, OSError) and err.errno == errno.EEXIST: + return True self.report_error('unable to create directory ' + error_to_compat_str(err)) return False @@ -1865,7 +1902,7 @@ class HaruhiDL(object): for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): - self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) + self.to_screen('[debug] Invoking downloader on %r' % info.get('url')) return fd.download(name, info) if info_dict.get('requested_formats') is not None: @@ -2404,7 +2441,7 @@ class HaruhiDL(object): thumb_ext = determine_ext(t['url'], 'jpg') suffix = '_%s' % t['id'] if len(thumbnails) > 1 else '' thumb_display_id = '%s ' % t['id'] if len(thumbnails) > 1 else '' - t['filename'] = thumb_filename = os.path.splitext(filename)[0] + suffix + '.' + thumb_ext + t['filename'] = thumb_filename = replace_extension(filename + suffix, thumb_ext, info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)): self.to_screen('[%s] %s: Thumbnail %sis already present' % diff --git a/haruhi_dl/__init__.py b/haruhi_dl/__init__.py index b90ca151d..f9c495323 100644 --- a/haruhi_dl/__init__.py +++ b/haruhi_dl/__init__.py @@ -340,6 +340,7 @@ def _real_main(argv=None): 'format': opts.format, 'listformats': opts.listformats, 'outtmpl': outtmpl, + 'outtmpl_na_placeholder': opts.outtmpl_na_placeholder, 'autonumber_size': opts.autonumber_size, 'autonumber_start': opts.autonumber_start, 'restrictfilenames': opts.restrictfilenames, diff --git a/haruhi_dl/downloader/fragment.py b/haruhi_dl/downloader/fragment.py index 65d8c881d..090941024 100644 --- a/haruhi_dl/downloader/fragment.py +++ b/haruhi_dl/downloader/fragment.py @@ -97,12 +97,15 @@ class FragmentFD(FileDownloader): def _download_fragment(self, ctx, frag_url, info_dict, headers=None): fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index']) - success = ctx['dl'].download(fragment_filename, { + fragment_info_dict = { 'url': frag_url, 'http_headers': headers or info_dict.get('http_headers'), - }) + } + success = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False, None + if fragment_info_dict.get('filetime'): + ctx['fragment_filetime'] = fragment_info_dict.get('filetime') down, frag_sanitized = sanitize_open(fragment_filename, 'rb') ctx['fragment_filename_sanitized'] = frag_sanitized frag_content = down.read() @@ -258,6 +261,13 @@ class FragmentFD(FileDownloader): downloaded_bytes = ctx['complete_frags_downloaded_bytes'] else: self.try_rename(ctx['tmpfilename'], ctx['filename']) + if self.params.get('updatetime', True): + filetime = ctx.get('fragment_filetime') + if filetime: + try: + os.utime(ctx['filename'], (time.time(), filetime)) + except Exception: + pass downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) self._hook_progress({ diff --git a/haruhi_dl/downloader/hls.py b/haruhi_dl/downloader/hls.py index 0cd16db87..3aa58e1c0 100644 --- a/haruhi_dl/downloader/hls.py +++ b/haruhi_dl/downloader/hls.py @@ -42,11 +42,13 @@ class HlsFD(FragmentFD): # no segments will definitely be appended to the end of the playlist. # r'#EXT-X-PLAYLIST-TYPE:EVENT', # media segments may be appended to the end of # # event media playlists [4] + r'#EXT-X-MAP:', # media initialization [5] # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.4 # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.2 # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.2 # 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5 + # 5. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.2.5 ) check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES] is_aes128_enc = '#EXT-X-KEY:METHOD=AES-128' in manifest @@ -170,8 +172,12 @@ class HlsFD(FragmentFD): iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) decrypt_info['KEY'] = decrypt_info.get('KEY') or self.hdl.urlopen( self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read() - frag_content = AES.new( - decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) + # Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block + # size (see https://github.com/hdl-org/haruhi-dl/pull/27660). Tests only care that the correct data downloaded, + # not what it decrypts to. + if not test: + frag_content = AES.new( + decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) self._append_fragment(ctx, frag_content) # We only download the first fragment during the test if test: diff --git a/haruhi_dl/downloader/http.py b/haruhi_dl/downloader/http.py index c9c908b34..6d3d8f1e5 100644 --- a/haruhi_dl/downloader/http.py +++ b/haruhi_dl/downloader/http.py @@ -109,7 +109,9 @@ class HttpFD(FileDownloader): try: ctx.data = self.hdl.urlopen(request) except (compat_urllib_error.URLError, ) as err: - if isinstance(err.reason, socket.timeout): + # reason may not be available, e.g. for urllib2.HTTPError on python 2.6 + reason = getattr(err, 'reason', None) + if isinstance(reason, socket.timeout): raise RetryDownload(err) raise err # When trying to resume, Content-Range HTTP header of response has to be checked diff --git a/haruhi_dl/extractor/abcnews.py b/haruhi_dl/extractor/abcnews.py index 8b407bf9c..908c83377 100644 --- a/haruhi_dl/extractor/abcnews.py +++ b/haruhi_dl/extractor/abcnews.py @@ -1,14 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar import re -import time from .amp import AMPIE from .common import InfoExtractor -from .youtube import YoutubeIE -from ..compat import compat_urlparse +from ..utils import ( + parse_duration, + parse_iso8601, + try_get, +) class AbcNewsVideoIE(AMPIE): @@ -18,8 +19,8 @@ class AbcNewsVideoIE(AMPIE): (?: abcnews\.go\.com/ (?: - [^/]+/video/(?P[0-9a-z-]+)-| - video/embed\?.*?\bid= + (?:[^/]+/)*video/(?P[0-9a-z-]+)-| + video/(?:embed|itemfeed)\?.*?\bid= )| fivethirtyeight\.abcnews\.go\.com/video/embed/\d+/ ) @@ -36,6 +37,8 @@ class AbcNewsVideoIE(AMPIE): 'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.', 'duration': 180, 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1380454200, + 'upload_date': '20130929', }, 'params': { # m3u8 download @@ -47,6 +50,12 @@ class AbcNewsVideoIE(AMPIE): }, { 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478', 'only_matching': True, + }, { + 'url': 'http://abcnews.go.com/video/itemfeed?id=46979033', + 'only_matching': True, + }, { + 'url': 'https://abcnews.go.com/GMA/News/video/history-christmas-story-67894761', + 'only_matching': True, }] def _real_extract(self, url): @@ -67,28 +76,23 @@ class AbcNewsIE(InfoExtractor): _VALID_URL = r'https?://abcnews\.go\.com/(?:[^/]+/)+(?P[0-9a-z-]+)/story\?id=(?P\d+)' _TESTS = [{ - 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', + # Youtube Embeds + 'url': 'https://abcnews.go.com/Entertainment/peter-billingsley-child-actor-christmas-story-hollywood-power/story?id=51286501', 'info_dict': { - 'id': '10505354', - 'ext': 'flv', - 'display_id': 'dramatic-video-rare-death-job-america', - 'title': 'Occupational Hazards', - 'description': 'Nightline investigates the dangers that lurk at various jobs.', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20100428', - 'timestamp': 1272412800, + 'id': '51286501', + 'title': "Peter Billingsley: From child actor in 'A Christmas Story' to Hollywood power player", + 'description': 'Billingsley went from a child actor to Hollywood power player.', }, - 'add_ie': ['AbcNewsVideo'], + 'playlist_count': 5, }, { 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818', 'info_dict': { 'id': '38897857', 'ext': 'mp4', - 'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016', 'title': 'Justin Timberlake Drops Hints For Secret Single', 'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.', - 'upload_date': '20160515', - 'timestamp': 1463329500, + 'upload_date': '20160505', + 'timestamp': 1462442280, }, 'params': { # m3u8 download @@ -100,49 +104,55 @@ class AbcNewsIE(InfoExtractor): }, { 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', 'only_matching': True, + }, { + # inline.type == 'video' + 'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('display_id') - video_id = mobj.group('id') + story_id = self._match_id(url) + webpage = self._download_webpage(url, story_id) + story = self._parse_json(self._search_regex( + r"window\['__abcnews__'\]\s*=\s*({.+?});", + webpage, 'data'), story_id)['page']['content']['story']['everscroll'][0] + article_contents = story.get('articleContents') or {} - webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL') - full_video_url = compat_urlparse.urljoin(url, video_url) + def entries(): + featured_video = story.get('featuredVideo') or {} + feed = try_get(featured_video, lambda x: x['video']['feed']) + if feed: + yield { + '_type': 'url', + 'id': featured_video.get('id'), + 'title': featured_video.get('name'), + 'url': feed, + 'thumbnail': featured_video.get('images'), + 'description': featured_video.get('description'), + 'timestamp': parse_iso8601(featured_video.get('uploadDate')), + 'duration': parse_duration(featured_video.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } - youtube_url = YoutubeIE._extract_url(webpage) + for inline in (article_contents.get('inlines') or []): + inline_type = inline.get('type') + if inline_type == 'iframe': + iframe_url = try_get(inline, lambda x: x['attrs']['src']) + if iframe_url: + yield self.url_result(iframe_url) + elif inline_type == 'video': + video_id = inline.get('id') + if video_id: + yield { + '_type': 'url', + 'id': video_id, + 'url': 'http://abcnews.go.com/video/embed?id=' + video_id, + 'thumbnail': inline.get('imgSrc') or inline.get('imgDefault'), + 'description': inline.get('description'), + 'duration': parse_duration(inline.get('duration')), + 'ie_key': AbcNewsVideoIE.ie_key(), + } - timestamp = None - date_str = self._html_search_regex( - r']+class="timestamp">([^<]+)', - webpage, 'timestamp', fatal=False) - if date_str: - tz_offset = 0 - if date_str.endswith(' ET'): # Eastern Time - tz_offset = -5 - date_str = date_str[:-3] - date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p'] - for date_format in date_formats: - try: - timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format)) - except ValueError: - continue - if timestamp is not None: - timestamp -= tz_offset * 3600 - - entry = { - '_type': 'url_transparent', - 'ie_key': AbcNewsVideoIE.ie_key(), - 'url': full_video_url, - 'id': video_id, - 'display_id': display_id, - 'timestamp': timestamp, - } - - if youtube_url: - entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())] - return self.playlist_result(entries) - - return entry + return self.playlist_result( + entries(), story_id, article_contents.get('headline'), + article_contents.get('subHead')) diff --git a/haruhi_dl/extractor/acast.py b/haruhi_dl/extractor/acast.py index b17c792d2..b9355a2c8 100644 --- a/haruhi_dl/extractor/acast.py +++ b/haruhi_dl/extractor/acast.py @@ -2,21 +2,48 @@ from __future__ import unicode_literals import re -import functools from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( clean_html, - float_or_none, + clean_podcast_url, int_or_none, - try_get, - unified_timestamp, - OnDemandPagedList, + parse_iso8601, ) -class ACastIE(InfoExtractor): +class ACastBaseIE(InfoExtractor): + def _extract_episode(self, episode, show_info): + title = episode['title'] + info = { + 'id': episode['id'], + 'display_id': episode.get('episodeUrl'), + 'url': clean_podcast_url(episode['url']), + 'title': title, + 'description': clean_html(episode.get('description') or episode.get('summary')), + 'thumbnail': episode.get('image'), + 'timestamp': parse_iso8601(episode.get('publishDate')), + 'duration': int_or_none(episode.get('duration')), + 'filesize': int_or_none(episode.get('contentLength')), + 'season_number': int_or_none(episode.get('season')), + 'episode': title, + 'episode_number': int_or_none(episode.get('episode')), + } + info.update(show_info) + return info + + def _extract_show_info(self, show): + return { + 'creator': show.get('author'), + 'series': show.get('title'), + } + + def _call_api(self, path, video_id, query=None): + return self._download_json( + 'https://feeder.acast.com/api/v1/shows/' + path, video_id, query=query) + + +class ACastIE(ACastBaseIE): IE_NAME = 'acast' _VALID_URL = r'''(?x) https?:// @@ -28,15 +55,15 @@ class ACastIE(InfoExtractor): ''' _TESTS = [{ 'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna', - 'md5': '16d936099ec5ca2d5869e3a813ee8dc4', + 'md5': 'f5598f3ad1e4776fed12ec1407153e4b', 'info_dict': { 'id': '2a92b283-1a75-4ad8-8396-499c641de0d9', 'ext': 'mp3', 'title': '2. Raggarmordet - Röster ur det förflutna', - 'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4', + 'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67', 'timestamp': 1477346700, 'upload_date': '20161024', - 'duration': 2766.602563, + 'duration': 2766, 'creator': 'Anton Berg & Martin Johnson', 'series': 'Spår', 'episode': '2. Raggarmordet - Röster ur det förflutna', @@ -45,7 +72,7 @@ class ACastIE(InfoExtractor): 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015', 'only_matching': True, }, { - 'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22', + 'url': 'https://play.acast.com/s/rattegangspodden/s04e09styckmordetihelenelund-del2-2', 'only_matching': True, }, { 'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9', @@ -54,40 +81,14 @@ class ACastIE(InfoExtractor): def _real_extract(self, url): channel, display_id = re.match(self._VALID_URL, url).groups() - s = self._download_json( - 'https://feeder.acast.com/api/v1/shows/%s/episodes/%s' % (channel, display_id), - display_id) - media_url = s['url'] - if re.search(r'[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}', display_id): - episode_url = s.get('episodeUrl') - if episode_url: - display_id = episode_url - else: - channel, display_id = re.match(self._VALID_URL, s['link']).groups() - cast_data = self._download_json( - 'https://play-api.acast.com/splash/%s/%s' % (channel, display_id), - display_id)['result'] - e = cast_data['episode'] - title = e.get('name') or s['title'] - return { - 'id': compat_str(e['id']), - 'display_id': display_id, - 'url': media_url, - 'title': title, - 'description': e.get('summary') or clean_html(e.get('description') or s.get('description')), - 'thumbnail': e.get('image'), - 'timestamp': unified_timestamp(e.get('publishingDate') or s.get('publishDate')), - 'duration': float_or_none(e.get('duration') or s.get('duration')), - 'filesize': int_or_none(e.get('contentLength')), - 'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str), - 'series': try_get(cast_data, lambda x: x['show']['name'], compat_str), - 'season_number': int_or_none(e.get('seasonNumber')), - 'episode': title, - 'episode_number': int_or_none(e.get('episodeNumber')), - } + episode = self._call_api( + '%s/episodes/%s' % (channel, display_id), + display_id, {'showInfo': 'true'}) + return self._extract_episode( + episode, self._extract_show_info(episode.get('show') or {})) -class ACastChannelIE(InfoExtractor): +class ACastChannelIE(ACastBaseIE): IE_NAME = 'acast:channel' _VALID_URL = r'''(?x) https?:// @@ -102,34 +103,24 @@ class ACastChannelIE(InfoExtractor): 'info_dict': { 'id': '4efc5294-5385-4847-98bd-519799ce5786', 'title': 'Today in Focus', - 'description': 'md5:9ba5564de5ce897faeb12963f4537a64', + 'description': 'md5:c09ce28c91002ce4ffce71d6504abaae', }, - 'playlist_mincount': 35, + 'playlist_mincount': 200, }, { 'url': 'http://play.acast.com/s/ft-banking-weekly', 'only_matching': True, }] - _API_BASE_URL = 'https://play.acast.com/api/' - _PAGE_SIZE = 10 @classmethod def suitable(cls, url): return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url) - def _fetch_page(self, channel_slug, page): - casts = self._download_json( - self._API_BASE_URL + 'channels/%s/acasts?page=%s' % (channel_slug, page), - channel_slug, note='Download page %d of channel data' % page) - for cast in casts: - yield self.url_result( - 'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']), - 'ACast', cast['id']) - def _real_extract(self, url): - channel_slug = self._match_id(url) - channel_data = self._download_json( - self._API_BASE_URL + 'channels/%s' % channel_slug, channel_slug) - entries = OnDemandPagedList(functools.partial( - self._fetch_page, channel_slug), self._PAGE_SIZE) - return self.playlist_result(entries, compat_str( - channel_data['id']), channel_data['name'], channel_data.get('description')) + show_slug = self._match_id(url) + show = self._call_api(show_slug, show_slug) + show_info = self._extract_show_info(show) + entries = [] + for episode in (show.get('episodes') or []): + entries.append(self._extract_episode(episode, show_info)) + return self.playlist_result( + entries, show.get('id'), show.get('title'), show.get('description')) diff --git a/haruhi_dl/extractor/adn.py b/haruhi_dl/extractor/adn.py index c95ad2173..a55ebbcbd 100644 --- a/haruhi_dl/extractor/adn.py +++ b/haruhi_dl/extractor/adn.py @@ -10,6 +10,7 @@ import random from .common import InfoExtractor from ..aes import aes_cbc_decrypt from ..compat import ( + compat_HTTPError, compat_b64decode, compat_ord, ) @@ -18,11 +19,14 @@ from ..utils import ( bytes_to_long, ExtractorError, float_or_none, + int_or_none, intlist_to_bytes, long_to_bytes, pkcs1pad, strip_or_none, - urljoin, + try_get, + unified_strdate, + urlencode_postdata, ) @@ -31,16 +35,30 @@ class ADNIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P\d+)' _TEST = { 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', - 'md5': 'e497370d847fd79d9d4c74be55575c7a', + 'md5': '0319c99885ff5547565cacb4f3f9348d', 'info_dict': { 'id': '7778', 'ext': 'mp4', - 'title': 'Blue Exorcist - Kyôto Saga - Épisode 1', + 'title': 'Blue Exorcist - Kyôto Saga - Episode 1', 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5', + 'series': 'Blue Exorcist - Kyôto Saga', + 'duration': 1467, + 'release_date': '20170106', + 'comment_count': int, + 'average_rating': float, + 'season_number': 2, + 'episode': 'Début des hostilités', + 'episode_number': 1, } } + + _NETRC_MACHINE = 'animedigitalnetwork' _BASE_URL = 'http://animedigitalnetwork.fr' - _RSA_KEY = (0xc35ae1e4356b65a73b551493da94b8cb443491c0aa092a357a5aee57ffc14dda85326f42d716e539a34542a0d3f363adf16c5ec222d713d5997194030ee2e4f0d1fb328c01a81cf6868c090d50de8e169c6b13d1675b9eeed1cbc51e1fffca9b38af07f37abd790924cd3bee59d0257cfda4fe5f3f0534877e21ce5821447d1b, 65537) + _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' + _PLAYER_BASE_URL = _API_BASE_URL + 'player/' + _HEADERS = {} + _LOGIN_ERR_MESSAGE = 'Unable to log in' + _RSA_KEY = (0x9B42B08905199A5CCE2026274399CA560ECB209EE9878A708B1C0812E1BB8CB5D1FB7441861147C1A1F2F3A0476DD63A9CAC20D3E983613346850AA6CB38F16DC7D720FD7D86FC6E5B3D5BBC72E14CD0BF9E869F2CEA2CCAD648F1DCE38F1FF916CEFB2D339B64AA0264372344BC775E265E8A852F88144AB0BD9AA06C1A4ABB, 65537) _POS_ALIGN_MAP = { 'start': 1, 'end': 3, @@ -54,26 +72,24 @@ class ADNIE(InfoExtractor): def _ass_subtitles_timecode(seconds): return '%01d:%02d:%02d.%02d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 100) - def _get_subtitles(self, sub_path, video_id): - if not sub_path: + def _get_subtitles(self, sub_url, video_id): + if not sub_url: return None enc_subtitles = self._download_webpage( - urljoin(self._BASE_URL, sub_path), - video_id, 'Downloading subtitles location', fatal=False) or '{}' + sub_url, video_id, 'Downloading subtitles location', fatal=False) or '{}' subtitle_location = (self._parse_json(enc_subtitles, video_id, fatal=False) or {}).get('location') if subtitle_location: enc_subtitles = self._download_webpage( - urljoin(self._BASE_URL, subtitle_location), - video_id, 'Downloading subtitles data', fatal=False, - headers={'Origin': 'https://animedigitalnetwork.fr'}) + subtitle_location, video_id, 'Downloading subtitles data', + fatal=False, headers={'Origin': 'https://animedigitalnetwork.fr'}) if not enc_subtitles: return None # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), - bytes_to_intlist(binascii.unhexlify(self._K + '4b8ef13ec1872730')), + bytes_to_intlist(binascii.unhexlify(self._K + 'ab9f52f5baae7c72')), bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( @@ -117,61 +133,100 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' }]) return subtitles + def _real_initialize(self): + username, password = self._get_login_info() + if not username: + return + try: + access_token = (self._download_json( + self._API_BASE_URL + 'authentication/login', None, + 'Logging in', self._LOGIN_ERR_MESSAGE, fatal=False, + data=urlencode_postdata({ + 'password': password, + 'rememberMe': False, + 'source': 'Web', + 'username': username, + })) or {}).get('accessToken') + if access_token: + self._HEADERS = {'authorization': 'Bearer ' + access_token} + except ExtractorError as e: + message = None + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json( + e.cause.read().decode(), None, fatal=False) or {} + message = resp.get('message') or resp.get('code') + self.report_warning(message or self._LOGIN_ERR_MESSAGE) + def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - player_config = self._parse_json(self._search_regex( - r'playerConfig\s*=\s*({.+});', webpage, - 'player config', default='{}'), video_id, fatal=False) - if not player_config: - config_url = urljoin(self._BASE_URL, self._search_regex( - r'(?:id="player"|class="[^"]*adn-player-container[^"]*")[^>]+data-url="([^"]+)"', - webpage, 'config url')) - player_config = self._download_json( - config_url, video_id, - 'Downloading player config JSON metadata')['player'] + video_base_url = self._PLAYER_BASE_URL + 'video/%s/' % video_id + player = self._download_json( + video_base_url + 'configuration', video_id, + 'Downloading player config JSON metadata', + headers=self._HEADERS)['player'] + options = player['options'] - video_info = {} - video_info_str = self._search_regex( - r'videoInfo\s*=\s*({.+});', webpage, - 'video info', fatal=False) - if video_info_str: - video_info = self._parse_json( - video_info_str, video_id, fatal=False) or {} + user = options['user'] + if not user.get('hasAccess'): + self.raise_login_required() - options = player_config.get('options') or {} - metas = options.get('metas') or {} - links = player_config.get('links') or {} - sub_path = player_config.get('subtitles') - error = None - if not links: - links_url = player_config.get('linksurl') or options['videoUrl'] - token = options['token'] - self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) - message = bytes_to_intlist(json.dumps({ - 'k': self._K, - 'e': 60, - 't': token, - })) + token = self._download_json( + user.get('refreshTokenUrl') or (self._PLAYER_BASE_URL + 'refresh/token'), + video_id, 'Downloading access token', headers={ + 'x-player-refresh-token': user['refreshToken'] + }, data=b'')['token'] + + links_url = try_get(options, lambda x: x['video']['url']) or (video_base_url + 'link') + self._K = ''.join([random.choice('0123456789abcdef') for _ in range(16)]) + message = bytes_to_intlist(json.dumps({ + 'k': self._K, + 't': token, + })) + + # Sometimes authentication fails for no good reason, retry with + # a different random padding + links_data = None + for _ in range(3): padded_message = intlist_to_bytes(pkcs1pad(message, 128)) n, e = self._RSA_KEY encrypted_message = long_to_bytes(pow(bytes_to_long(padded_message), e, n)) authorization = base64.b64encode(encrypted_message).decode() - links_data = self._download_json( - urljoin(self._BASE_URL, links_url), video_id, - 'Downloading links JSON metadata', headers={ - 'Authorization': 'Bearer ' + authorization, - }) - links = links_data.get('links') or {} - metas = metas or links_data.get('meta') or {} - sub_path = sub_path or links_data.get('subtitles') or \ - 'index.php?option=com_vodapi&task=subtitles.getJSON&format=json&id=' + video_id - sub_path += '&token=' + token - error = links_data.get('error') - title = metas.get('title') or video_info['title'] + + try: + links_data = self._download_json( + links_url, video_id, 'Downloading links JSON metadata', headers={ + 'X-Player-Token': authorization + }, query={ + 'freeWithAds': 'true', + 'adaptive': 'false', + 'withMetadata': 'true', + 'source': 'Web' + }) + break + except ExtractorError as e: + if not isinstance(e.cause, compat_HTTPError): + raise e + + if e.cause.code == 401: + # This usually goes away with a different random pkcs1pad, so retry + continue + + error = self._parse_json(e.cause.read(), video_id) + message = error.get('message') + if e.cause.code == 403 and error.get('code') == 'player-bad-geolocation-country': + self.raise_geo_restricted(msg=message) + raise ExtractorError(message) + else: + raise ExtractorError('Giving up retrying') + + links = links_data.get('links') or {} + metas = links_data.get('metadata') or {} + sub_url = (links.get('subtitles') or {}).get('all') + video_info = links_data.get('video') or {} + title = metas['title'] formats = [] - for format_id, qualities in links.items(): + for format_id, qualities in (links.get('streaming') or {}).items(): if not isinstance(qualities, dict): continue for quality, load_balancer_url in qualities.items(): @@ -189,19 +244,26 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' for f in m3u8_formats: f['language'] = 'fr' formats.extend(m3u8_formats) - if not error: - error = options.get('error') - if not formats and error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) self._sort_formats(formats) + video = (self._download_json( + self._API_BASE_URL + 'video/%s' % video_id, video_id, + 'Downloading additional video metadata', fatal=False) or {}).get('video') or {} + show = video.get('show') or {} + return { 'id': video_id, 'title': title, - 'description': strip_or_none(metas.get('summary') or video_info.get('resume')), - 'thumbnail': video_info.get('image'), + 'description': strip_or_none(metas.get('summary') or video.get('summary')), + 'thumbnail': video_info.get('image') or player.get('image'), 'formats': formats, - 'subtitles': self.extract_subtitles(sub_path, video_id), - 'episode': metas.get('subtitle') or video_info.get('videoTitle'), - 'series': video_info.get('playlistTitle'), + 'subtitles': self.extract_subtitles(sub_url, video_id), + 'episode': metas.get('subtitle') or video.get('name'), + 'episode_number': int_or_none(video.get('shortNumber')), + 'series': show.get('title'), + 'season_number': int_or_none(video.get('season')), + 'duration': int_or_none(video_info.get('duration') or video.get('duration')), + 'release_date': unified_strdate(video.get('releaseDate')), + 'average_rating': float_or_none(video.get('rating') or metas.get('rating')), + 'comment_count': int_or_none(video.get('commentsCount')), } diff --git a/haruhi_dl/extractor/aenetworks.py b/haruhi_dl/extractor/aenetworks.py index 611b948f5..e55c03fd7 100644 --- a/haruhi_dl/extractor/aenetworks.py +++ b/haruhi_dl/extractor/aenetworks.py @@ -5,20 +5,32 @@ import re from .theplatform import ThePlatformIE from ..utils import ( - extract_attributes, ExtractorError, + GeoRestrictedError, int_or_none, - smuggle_url, update_url_query, -) -from ..compat import ( - compat_urlparse, + urlencode_postdata, ) class AENetworksBaseIE(ThePlatformIE): + _BASE_URL_REGEX = r'''(?x)https?:// + (?:(?:www|play|watch)\.)? + (?P + (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| + fyi\.tv + )/''' _THEPLATFORM_KEY = 'crazyjava' _THEPLATFORM_SECRET = 's3cr3t' + _DOMAIN_MAP = { + 'history.com': ('HISTORY', 'history'), + 'aetv.com': ('AETV', 'aetv'), + 'mylifetime.com': ('LIFETIME', 'lifetime'), + 'lifetimemovieclub.com': ('LIFETIMEMOVIECLUB', 'lmc'), + 'fyi.tv': ('FYI', 'fyi'), + 'historyvault.com': (None, 'historyvault'), + 'biography.com': (None, 'biography'), + } def _extract_aen_smil(self, smil_url, video_id, auth=None): query = {'mbr': 'true'} @@ -31,7 +43,7 @@ class AENetworksBaseIE(ThePlatformIE): 'assetTypes': 'high_video_s3' }, { 'assetTypes': 'high_video_s3', - 'switch': 'hls_ingest_fastly' + 'switch': 'hls_high_fastly', }] formats = [] subtitles = {} @@ -44,6 +56,8 @@ class AENetworksBaseIE(ThePlatformIE): tp_formats, tp_subtitles = self._extract_theplatform_smil( m_url, video_id, 'Downloading %s SMIL data' % (q.get('switch') or q['assetTypes'])) except ExtractorError as e: + if isinstance(e, GeoRestrictedError): + raise last_e = e continue formats.extend(tp_formats) @@ -57,24 +71,45 @@ class AENetworksBaseIE(ThePlatformIE): 'subtitles': subtitles, } + def _extract_aetn_info(self, domain, filter_key, filter_value, url): + requestor_id, brand = self._DOMAIN_MAP[domain] + result = self._download_json( + 'https://feeds.video.aetnd.com/api/v2/%s/videos' % brand, + filter_value, query={'filter[%s]' % filter_key: filter_value})['results'][0] + title = result['title'] + video_id = result['id'] + media_url = result['publicUrl'] + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + auth = None + if theplatform_metadata.get('AETN$isBehindWall'): + resource = self._get_mvpd_resource( + requestor_id, theplatform_metadata['title'], + theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), + theplatform_metadata['ratings'][0]['rating']) + auth = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) + info.update(self._extract_aen_smil(media_url, video_id, auth)) + info.update({ + 'title': title, + 'series': result.get('seriesName'), + 'season_number': int_or_none(result.get('tvSeasonNumber')), + 'episode_number': int_or_none(result.get('tvSeasonEpisodeNumber')), + }) + return info + class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault' - _VALID_URL = r'''(?x) - https?:// - (?:www\.)? - (?P - (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com| - fyi\.tv - )/ - (?: - shows/(?P[^/]+(?:/[^/]+){0,2})| - movies/(?P[^/]+)(?:/full-movie)?| - specials/(?P[^/]+)/(?:full-special|preview-)| - collections/[^/]+/(?P[^/]+) - ) - ''' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'''(?P + shows/[^/]+/season-\d+/episode-\d+| + (?: + (?:movie|special)s/[^/]+| + (?:shows/[^/]+/)?videos + )/[^/?#&]+ + )''' _TESTS = [{ 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', 'info_dict': { @@ -91,22 +126,23 @@ class AENetworksIE(AENetworksBaseIE): 'skip_download': True, }, 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.history.com/shows/ancient-aliens/season-1', - 'info_dict': { - 'id': '71889446852', - }, - 'playlist_mincount': 5, - }, { - 'url': 'http://www.mylifetime.com/shows/atlanta-plastic', - 'info_dict': { - 'id': 'SERIES4317', - 'title': 'Atlanta Plastic', - }, - 'playlist_mincount': 2, + 'skip': 'This video is only available for users of participating TV providers.', }, { 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', - 'only_matching': True + 'info_dict': { + 'id': '600587331957', + 'ext': 'mp4', + 'title': 'Inlawful Entry', + 'description': 'md5:57c12115a2b384d883fe64ca50529e08', + 'timestamp': 1452634428, + 'upload_date': '20160112', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], }, { 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'only_matching': True @@ -117,78 +153,125 @@ class AENetworksIE(AENetworksBaseIE): 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', 'only_matching': True }, { - 'url': 'https://www.lifetimemovieclub.com/movies/a-killer-among-us', + 'url': 'https://watch.lifetimemovieclub.com/movies/10-year-reunion/full-movie', 'only_matching': True }, { 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special', 'only_matching': True - }, { - 'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward', - 'only_matching': True }, { 'url': 'https://www.aetv.com/specials/hunting-jonbenets-killer-the-untold-story/preview-hunting-jonbenets-killer-the-untold-story', 'only_matching': True + }, { + 'url': 'http://www.history.com/videos/history-of-valentines-day', + 'only_matching': True + }, { + 'url': 'https://play.aetv.com/shows/duck-dynasty/videos/best-of-duck-dynasty-getting-quack-in-shape', + 'only_matching': True }] - _DOMAIN_TO_REQUESTOR_ID = { - 'history.com': 'HISTORY', - 'aetv.com': 'AETV', - 'mylifetime.com': 'LIFETIME', - 'lifetimemovieclub.com': 'LIFETIMEMOVIECLUB', - 'fyi.tv': 'FYI', - } def _real_extract(self, url): - domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups() - display_id = show_path or movie_display_id or special_display_id or collection_display_id - webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers()) - if show_path: - url_parts = show_path.split('/') - url_parts_len = len(url_parts) - if url_parts_len == 1: - entries = [] - for season_url_path in re.findall(r'(?s)]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): - entries.append(self.url_result( - compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) - if entries: - return self.playlist_result( - entries, self._html_search_meta('aetn:SeriesId', webpage), - self._html_search_meta('aetn:SeriesTitle', webpage)) - else: - # single season - url_parts_len = 2 - if url_parts_len == 2: - entries = [] - for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage): - episode_attributes = extract_attributes(episode_item) - episode_url = compat_urlparse.urljoin( - url, episode_attributes['data-canonical']) - entries.append(self.url_result( - episode_url, 'AENetworks', - episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id'))) - return self.playlist_result( - entries, self._html_search_meta('aetn:SeasonId', webpage)) + domain, canonical = re.match(self._VALID_URL, url).groups() + return self._extract_aetn_info(domain, 'canonical', '/' + canonical, url) - video_id = self._html_search_meta('aetn:VideoID', webpage) - media_url = self._search_regex( - [r"media_url\s*=\s*'(?P[^']+)'", - r'data-media-url=(?P(?:https?:)?//[^\s>]+)', - r'data-media-url=(["\'])(?P(?:(?!\1).)+?)\1'], - webpage, 'video url', group='url') - theplatform_metadata = self._download_theplatform_metadata(self._search_regex( - r'https?://link\.theplatform\.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) - info = self._parse_theplatform_metadata(theplatform_metadata) - auth = None - if theplatform_metadata.get('AETN$isBehindWall'): - requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] - resource = self._get_mvpd_resource( - requestor_id, theplatform_metadata['title'], - theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), - theplatform_metadata['ratings'][0]['rating']) - auth = self._extract_mvpd_auth( - url, video_id, requestor_id, resource) - info.update(self._search_json_ld(webpage, video_id, fatal=False)) - info.update(self._extract_aen_smil(media_url, video_id, auth)) - return info + +class AENetworksListBaseIE(AENetworksBaseIE): + def _call_api(self, resource, slug, brand, fields): + return self._download_json( + 'https://yoga.appsvcs.aetnd.com/graphql', + slug, query={'brand': brand}, data=urlencode_postdata({ + 'query': '''{ + %s(slug: "%s") { + %s + } +}''' % (resource, slug, fields), + }))['data'][resource] + + def _real_extract(self, url): + domain, slug = re.match(self._VALID_URL, url).groups() + _, brand = self._DOMAIN_MAP[domain] + playlist = self._call_api(self._RESOURCE, slug, brand, self._FIELDS) + base_url = 'http://watch.%s' % domain + + entries = [] + for item in (playlist.get(self._ITEMS_KEY) or []): + doc = self._get_doc(item) + canonical = doc.get('canonical') + if not canonical: + continue + entries.append(self.url_result( + base_url + canonical, AENetworksIE.ie_key(), doc.get('id'))) + + description = None + if self._PLAYLIST_DESCRIPTION_KEY: + description = playlist.get(self._PLAYLIST_DESCRIPTION_KEY) + + return self.playlist_result( + entries, playlist.get('id'), + playlist.get(self._PLAYLIST_TITLE_KEY), description) + + +class AENetworksCollectionIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:collection' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'(?:[^/]+/)*(?:list|collections)/(?P[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://watch.historyvault.com/list/america-the-story-of-us', + 'info_dict': { + 'id': '282', + 'title': 'America The Story of Us', + }, + 'playlist_mincount': 12, + }, { + 'url': 'https://watch.historyvault.com/shows/america-the-story-of-us-2/season-1/list/america-the-story-of-us', + 'only_matching': True + }, { + 'url': 'https://www.historyvault.com/collections/mysteryquest', + 'only_matching': True + }] + _RESOURCE = 'list' + _ITEMS_KEY = 'items' + _PLAYLIST_TITLE_KEY = 'display_title' + _PLAYLIST_DESCRIPTION_KEY = None + _FIELDS = '''id + display_title + items { + ... on ListVideoItem { + doc { + canonical + id + } + } + }''' + + def _get_doc(self, item): + return item.get('doc') or {} + + +class AENetworksShowIE(AENetworksListBaseIE): + IE_NAME = 'aenetworks:show' + _VALID_URL = AENetworksBaseIE._BASE_URL_REGEX + r'shows/(?P[^/?#&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'http://www.history.com/shows/ancient-aliens', + 'info_dict': { + 'id': 'SERIES1574', + 'title': 'Ancient Aliens', + 'description': 'md5:3f6d74daf2672ff3ae29ed732e37ea7f', + }, + 'playlist_mincount': 150, + }] + _RESOURCE = 'series' + _ITEMS_KEY = 'episodes' + _PLAYLIST_TITLE_KEY = 'title' + _PLAYLIST_DESCRIPTION_KEY = 'description' + _FIELDS = '''description + id + title + episodes { + canonical + id + }''' + + def _get_doc(self, item): + return item class HistoryTopicIE(AENetworksBaseIE): @@ -204,6 +287,7 @@ class HistoryTopicIE(AENetworksBaseIE): 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', 'timestamp': 1375819729, 'upload_date': '20130806', + 'uploader': 'AENE-NEW', }, 'params': { # m3u8 download @@ -212,36 +296,47 @@ class HistoryTopicIE(AENetworksBaseIE): 'add_ie': ['ThePlatform'], }] - def theplatform_url_result(self, theplatform_url, video_id, query): - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': smuggle_url( - update_url_query(theplatform_url, query), - { - 'sig': { - 'key': self._THEPLATFORM_KEY, - 'secret': self._THEPLATFORM_SECRET, - }, - 'force_smil_url': True - }), - 'ie_key': 'ThePlatform', - } + def _real_extract(self, url): + display_id = self._match_id(url) + return self.url_result( + 'http://www.history.com/videos/' + display_id, + AENetworksIE.ie_key()) + + +class HistoryPlayerIE(AENetworksBaseIE): + IE_NAME = 'history:player' + _VALID_URL = r'https?://(?:www\.)?(?P(?:history|biography)\.com)/player/(?P\d+)' + _TESTS = [] + + def _real_extract(self, url): + domain, video_id = re.match(self._VALID_URL, url).groups() + return self._extract_aetn_info(domain, 'id', video_id, url) + + +class BiographyIE(AENetworksBaseIE): + _VALID_URL = r'https?://(?:www\.)?biography\.com/video/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.biography.com/video/vincent-van-gogh-full-episode-2075049808', + 'info_dict': { + 'id': '30322987', + 'ext': 'mp4', + 'title': 'Vincent Van Gogh - Full Episode', + 'description': 'A full biography about the most influential 20th century painter, Vincent Van Gogh.', + 'timestamp': 1311970571, + 'upload_date': '20110729', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r']+src="[^"]+\btpid=(\d+)', webpage, 'tpid') - result = self._download_json( - 'https://feeds.video.aetnd.com/api/v2/history/videos', - video_id, query={'filter[id]': video_id})['results'][0] - title = result['title'] - info = self._extract_aen_smil(result['publicUrl'], video_id) - info.update({ - 'title': title, - 'description': result.get('description'), - 'duration': int_or_none(result.get('duration')), - 'timestamp': int_or_none(result.get('added'), 1000), - }) - return info + player_url = self._search_regex( + r']+src="(%s)' % HistoryPlayerIE._VALID_URL, + webpage, 'player URL') + return self.url_result(player_url, HistoryPlayerIE.ie_key()) diff --git a/haruhi_dl/extractor/aljazeera.py b/haruhi_dl/extractor/aljazeera.py index c68be3134..c4f915a3c 100644 --- a/haruhi_dl/extractor/aljazeera.py +++ b/haruhi_dl/extractor/aljazeera.py @@ -1,13 +1,16 @@ from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?Pprogram/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P[^/?&#]+)' _TESTS = [{ - 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', + 'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance', 'info_dict': { 'id': '3792260579001', 'ext': 'mp4', @@ -20,14 +23,34 @@ class AlJazeeraIE(InfoExtractor): 'add_ie': ['BrightcoveNew'], 'skip': 'Not accessible from Travis CI server', }, { - 'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html', + 'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off', + 'only_matching': True, + }, { + 'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' def _real_extract(self, url): - program_name = self._match_id(url) - webpage = self._download_webpage(url, program_name) - brightcove_id = self._search_regex( - r'RenderPagesVideo\(\'(.+?)\'', webpage, 'brightcove id') - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + post_type, name = re.match(self._VALID_URL, url).groups() + post_type = { + 'features': 'post', + 'program': 'episode', + 'videos': 'video', + }[post_type.split('/')[0]] + video = self._download_json( + 'https://www.aljazeera.com/graphql', name, query={ + 'operationName': 'SingleArticleQuery', + 'variables': json.dumps({ + 'name': name, + 'postType': post_type, + }), + }, headers={ + 'wp-site': 'aje', + })['data']['article']['video'] + video_id = video['id'] + account_id = video.get('accountId') or '665003303001' + player_id = video.get('playerId') or 'BkeSH5BDb' + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), + 'BrightcoveNew', video_id) diff --git a/haruhi_dl/extractor/amara.py b/haruhi_dl/extractor/amara.py new file mode 100644 index 000000000..61d469574 --- /dev/null +++ b/haruhi_dl/extractor/amara.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .youtube import YoutubeIE +from .vimeo import VimeoIE +from ..utils import ( + int_or_none, + parse_iso8601, + update_url_query, +) + + +class AmaraIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P\w+)' + _TESTS = [{ + # Youtube + 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', + 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', + 'info_dict': { + 'id': 'h6ZuVdvYnfE', + 'ext': 'mp4', + 'title': 'Why jury trials are becoming less common', + 'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20160813', + 'uploader': 'PBS NewsHour', + 'uploader_id': 'PBSNewsHour', + 'timestamp': 1549639570, + } + }, { + # Vimeo + 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', + 'md5': '99392c75fa05d432a8f11df03612195e', + 'info_dict': { + 'id': '18622084', + 'ext': 'mov', + 'title': 'Vimeo at CES 2011!', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'timestamp': 1294763658, + 'upload_date': '20110111', + 'uploader': 'Sam Morrill', + 'uploader_id': 'sammorrill' + } + }, { + # Direct Link + 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', + 'md5': 'd3970f08512738ee60c5807311ff5d3f', + 'info_dict': { + 'id': 's8KL7I3jLmh6', + 'ext': 'mp4', + 'title': 'The danger of a single story', + 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', + 'thumbnail': r're:^https?://.*\.jpg$', + 'subtitles': dict, + 'upload_date': '20091007', + 'timestamp': 1254942511, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._download_json( + 'https://amara.org/api/videos/%s/' % video_id, + video_id, query={'format': 'json'}) + title = meta['title'] + video_url = meta['all_urls'][0] + + subtitles = {} + for language in (meta.get('languages') or []): + subtitles_uri = language.get('subtitles_uri') + if not (subtitles_uri and language.get('published')): + continue + subtitle = subtitles.setdefault(language.get('code') or 'en', []) + for f in ('json', 'srt', 'vtt'): + subtitle.append({ + 'ext': f, + 'url': update_url_query(subtitles_uri, {'format': f}), + }) + + info = { + 'url': video_url, + 'id': video_id, + 'subtitles': subtitles, + 'title': title, + 'description': meta.get('description'), + 'thumbnail': meta.get('thumbnail'), + 'duration': int_or_none(meta.get('duration')), + 'timestamp': parse_iso8601(meta.get('created')), + } + + for ie in (YoutubeIE, VimeoIE): + if ie.suitable(video_url): + info.update({ + '_type': 'url_transparent', + 'ie_key': ie.ie_key(), + }) + break + + return info diff --git a/haruhi_dl/extractor/amcnetworks.py b/haruhi_dl/extractor/amcnetworks.py index 6fb3d6c53..b8027bbca 100644 --- a/haruhi_dl/extractor/amcnetworks.py +++ b/haruhi_dl/extractor/amcnetworks.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .theplatform import ThePlatformIE from ..utils import ( int_or_none, @@ -11,25 +13,22 @@ from ..utils import ( class AMCNetworksIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?Pamc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1', - 'md5': '', + 'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631', 'info_dict': { - 'id': 's3MX01Nl4vPH', + 'id': '4Lq1dzOnZGt0', 'ext': 'mp4', - 'title': 'Maron - Season 4 - Step 1', - 'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. Starring Marc Maron and Constance Zimmer.', - 'age_limit': 17, - 'upload_date': '20160505', - 'timestamp': 1462468831, + 'title': "The Graham Norton Show - Season 28 - Tina Fey's Adorable Airline-Themed Family Dinner", + 'description': "It turns out child stewardesses are very generous with the wine! All-new episodes of 'The Graham Norton Show' premiere Fridays at 11/10c on BBC America.", + 'upload_date': '20201120', + 'timestamp': 1605904350, 'uploader': 'AMCN', }, 'params': { # m3u8 download 'skip_download': True, }, - 'skip': 'Requires TV provider accounts', }, { 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', 'only_matching': True, @@ -55,32 +54,34 @@ class AMCNetworksIE(ThePlatformIE): 'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1', 'only_matching': True, }] + _REQUESTOR_ID_MAP = { + 'amc': 'AMC', + 'bbcamerica': 'BBCA', + 'ifc': 'IFC', + 'sundancetv': 'SUNDANCE', + 'wetv': 'WETV', + } def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + site, display_id = re.match(self._VALID_URL, url).groups() + requestor_id = self._REQUESTOR_ID_MAP[site] + properties = self._download_json( + 'https://content-delivery-gw.svc.ds.amcn.com/api/v2/content/amcn/%s/url/%s' % (requestor_id.lower(), display_id), + display_id)['data']['properties'] query = { 'mbr': 'true', 'manifest': 'm3u', } - media_url = self._search_regex( - r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)', - webpage, 'media url') - theplatform_metadata = self._download_theplatform_metadata(self._search_regex( - r'link\.theplatform\.com/s/([^?]+)', - media_url, 'theplatform_path'), display_id) + tp_path = 'M_UwQC/media/' + properties['videoPid'] + media_url = 'https://link.theplatform.com/s/' + tp_path + theplatform_metadata = self._download_theplatform_metadata(tp_path, display_id) info = self._parse_theplatform_metadata(theplatform_metadata) video_id = theplatform_metadata['pid'] title = theplatform_metadata['title'] rating = try_get( theplatform_metadata, lambda x: x['ratings'][0]['rating']) - auth_required = self._search_regex( - r'window\.authRequired\s*=\s*(true|false);', - webpage, 'auth required') - if auth_required == 'true': - requestor_id = self._search_regex( - r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)', - webpage, 'requestor id') + video_category = properties.get('videoCategory') + if video_category and video_category.endswith('-Auth'): resource = self._get_mvpd_resource( requestor_id, title, video_id, rating) query['auth'] = self._extract_mvpd_auth( diff --git a/haruhi_dl/extractor/americastestkitchen.py b/haruhi_dl/extractor/americastestkitchen.py index 9c9d77ae1..be960c0f9 100644 --- a/haruhi_dl/extractor/americastestkitchen.py +++ b/haruhi_dl/extractor/americastestkitchen.py @@ -1,82 +1,159 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor from ..utils import ( clean_html, int_or_none, - js_to_json, try_get, unified_strdate, + unified_timestamp, ) class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?Pepisode|videos)/(?P\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', 'info_dict': { 'id': '5b400b9ee338f922cb06450c', - 'title': 'Weeknight Japanese Suppers', + 'title': 'Japanese Suppers', 'ext': 'mp4', - 'description': 'md5:3d0c1a44bb3b27607ce82652db25b4a8', + 'description': 'md5:64e606bfee910627efc4b5f050de92b3', 'thumbnail': r're:^https?://', - 'timestamp': 1523664000, - 'upload_date': '20180414', - 'release_date': '20180414', + 'timestamp': 1523318400, + 'upload_date': '20180410', + 'release_date': '20180410', 'series': "America's Test Kitchen", 'season_number': 18, - 'episode': 'Weeknight Japanese Suppers', + 'episode': 'Japanese Suppers', 'episode_number': 15, }, 'params': { 'skip_download': True, }, + }, { + # Metadata parsing behaves differently for newer episodes (705) as opposed to older episodes (582 above) + 'url': 'https://www.americastestkitchen.com/episode/705-simple-chicken-dinner', + 'md5': '06451608c57651e985a498e69cec17e5', + 'info_dict': { + 'id': '5fbe8c61bda2010001c6763b', + 'title': 'Simple Chicken Dinner', + 'ext': 'mp4', + 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7', + 'thumbnail': r're:^https?://', + 'timestamp': 1610755200, + 'upload_date': '20210116', + 'release_date': '20210116', + 'series': "America's Test Kitchen", + 'season_number': 21, + 'episode': 'Simple Chicken Dinner', + 'episode_number': 3, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, + }, { + 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', + 'only_matching': True, + }, { + 'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + resource_type, video_id = re.match(self._VALID_URL, url).groups() + is_episode = resource_type == 'episode' + if is_episode: + resource_type = 'episodes' - webpage = self._download_webpage(url, video_id) - - video_data = self._parse_json( - self._search_regex( - r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*', - webpage, 'initial context'), - video_id, js_to_json) - - ep_data = try_get( - video_data, - (lambda x: x['episodeDetail']['content']['data'], - lambda x: x['videoDetail']['content']['data']), dict) - ep_meta = ep_data.get('full_video', {}) - - zype_id = ep_data.get('zype_id') or ep_meta['zype_id'] - - title = ep_data.get('title') or ep_meta.get('title') - description = clean_html(ep_meta.get('episode_description') or ep_data.get( - 'description') or ep_meta.get('description')) - thumbnail = try_get(ep_meta, lambda x: x['photo']['image_url']) - release_date = unified_strdate(ep_data.get('aired_at')) - - season_number = int_or_none(ep_meta.get('season_number')) - episode = ep_meta.get('title') - episode_number = int_or_none(ep_meta.get('episode_number')) + resource = self._download_json( + 'https://www.americastestkitchen.com/api/v6/%s/%s' % (resource_type, video_id), video_id) + video = resource['video'] if is_episode else resource + episode = resource if is_episode else resource.get('episode') or {} return { '_type': 'url_transparent', - 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id, + 'url': 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % video['zypeId'], 'ie_key': 'Zype', - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'release_date': release_date, - 'series': "America's Test Kitchen", - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, + 'description': clean_html(video.get('description')), + 'timestamp': unified_timestamp(video.get('publishDate')), + 'release_date': unified_strdate(video.get('publishDate')), + 'episode_number': int_or_none(episode.get('number')), + 'season_number': int_or_none(episode.get('season')), + 'series': try_get(episode, lambda x: x['show']['title']), + 'episode': episode.get('title'), } + + +class AmericasTestKitchenSeasonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?(?Pamericastestkitchen|cookscountry)\.com/episodes/browse/season_(?P\d+)' + _TESTS = [{ + # ATK Season + 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', + 'info_dict': { + 'id': 'season_1', + 'title': 'Season 1', + }, + 'playlist_count': 13, + }, { + # Cooks Country Season + 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'info_dict': { + 'id': 'season_12', + 'title': 'Season 12', + }, + 'playlist_count': 13, + }] + + def _real_extract(self, url): + show_name, season_number = re.match(self._VALID_URL, url).groups() + season_number = int(season_number) + + slug = 'atk' if show_name == 'americastestkitchen' else 'cco' + + season = 'Season %d' % season_number + + season_search = self._download_json( + 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, + season, headers={ + 'Origin': 'https://www.%s.com' % show_name, + 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', + 'X-Algolia-Application-Id': 'Y1FNZXUI30', + }, query={ + 'facetFilters': json.dumps([ + 'search_season_list:' + season, + 'search_document_klass:episode', + 'search_show_slug:' + slug, + ]), + 'attributesToRetrieve': 'description,search_%s_episode_number,search_document_date,search_url,title' % slug, + 'attributesToHighlight': '', + 'hitsPerPage': 1000, + }) + + def entries(): + for episode in (season_search.get('hits') or []): + search_url = episode.get('search_url') + if not search_url: + continue + yield { + '_type': 'url', + 'url': 'https://www.%s.com%s' % (show_name, search_url), + 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), + 'title': episode.get('title'), + 'description': episode.get('description'), + 'timestamp': unified_timestamp(episode.get('search_document_date')), + 'season_number': season_number, + 'episode_number': int_or_none(episode.get('search_%s_episode_number' % slug)), + 'ie_key': AmericasTestKitchenIE.ie_key(), + } + + return self.playlist_result( + entries(), 'season_%d' % season_number, season) diff --git a/haruhi_dl/extractor/amp.py b/haruhi_dl/extractor/amp.py index 7ff098cfa..24c684cad 100644 --- a/haruhi_dl/extractor/amp.py +++ b/haruhi_dl/extractor/amp.py @@ -8,6 +8,7 @@ from ..utils import ( int_or_none, mimetype2ext, parse_iso8601, + unified_timestamp, url_or_none, ) @@ -88,7 +89,7 @@ class AMPIE(InfoExtractor): self._sort_formats(formats) - timestamp = parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) + timestamp = unified_timestamp(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) return { 'id': video_id, diff --git a/haruhi_dl/extractor/animeondemand.py b/haruhi_dl/extractor/animeondemand.py index 00ce684d1..54e097d2f 100644 --- a/haruhi_dl/extractor/animeondemand.py +++ b/haruhi_dl/extractor/animeondemand.py @@ -116,8 +116,6 @@ class AnimeOnDemandIE(InfoExtractor): r'(?s)]+itemprop="description"[^>]*>(.+?)', webpage, 'anime description', default=None) - entries = [] - def extract_info(html, video_id, num=None): title, description = [None] * 2 formats = [] @@ -233,7 +231,7 @@ class AnimeOnDemandIE(InfoExtractor): self._sort_formats(info['formats']) f = common_info.copy() f.update(info) - entries.append(f) + yield f # Extract teaser/trailer only when full episode is not available if not info['formats']: @@ -247,7 +245,7 @@ class AnimeOnDemandIE(InfoExtractor): 'title': m.group('title'), 'url': urljoin(url, m.group('href')), }) - entries.append(f) + yield f def extract_episodes(html): for num, episode_html in enumerate(re.findall( @@ -275,7 +273,8 @@ class AnimeOnDemandIE(InfoExtractor): 'episode_number': episode_number, } - extract_entries(episode_html, video_id, common_info) + for e in extract_entries(episode_html, video_id, common_info): + yield e def extract_film(html, video_id): common_info = { @@ -283,11 +282,18 @@ class AnimeOnDemandIE(InfoExtractor): 'title': anime_title, 'description': anime_description, } - extract_entries(html, video_id, common_info) + for e in extract_entries(html, video_id, common_info): + yield e - extract_episodes(webpage) + def entries(): + has_episodes = False + for e in extract_episodes(webpage): + has_episodes = True + yield e - if not entries: - extract_film(webpage, anime_id) + if not has_episodes: + for e in extract_film(webpage, anime_id): + yield e - return self.playlist_result(entries, anime_id, anime_title, anime_description) + return self.playlist_result( + entries(), anime_id, anime_title, anime_description) diff --git a/haruhi_dl/extractor/anvato.py b/haruhi_dl/extractor/anvato.py index 84e841035..b7398563b 100644 --- a/haruhi_dl/extractor/anvato.py +++ b/haruhi_dl/extractor/anvato.py @@ -116,7 +116,76 @@ class AnvatoIE(InfoExtractor): 'anvato_scripps_app_ios_prod_409c41960c60b308db43c3cc1da79cab9f1c3d93': 'WPxj5GraLTkYCyj3M7RozLqIycjrXOEcDGFMIJPn', 'EZqvRyKBJLrgpClDPDF8I7Xpdp40Vx73': '4OxGd2dEakylntVKjKF0UK9PDPYB6A9W', 'M2v78QkpleXm9hPp9jUXI63x5vA6BogR': 'ka6K32k7ZALmpINkjJUGUo0OE42Md1BQ', - 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ' + 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', + 'X8POa4zPPaKVZHqmWjuEzfP31b1QM9VN': 'Dn5vOY9ooDw7VSl9qztjZI5o0g08mA0z', + 'M2v78QkBMpNJlSPp9diX5F2PBmBy6Bog': 'ka6K32kyo7nDZfNkjQCGWf1lpApXMd1B', + 'bvJ0dQpav07l0hG5JgfVLF2dv1vARwpP': 'BzoQW24GrJZoJfmNodiJKSPeB9B8NOxj', + 'lxQMLg2XZKuEZaWgsqubBxV9INZ6bryY': 'Vm2Mx6noKds9jB71h6urazwlTG3m9x8l', + '04EnjvXeoSmkbJ9ckPs7oY0mcxv7PlyN': 'aXERQP9LMfQVlEDsgGs6eEA1SWznAQ8P', + 'mQbO2ge6BFRWVPYCYpU06YvNt80XLvAX': 'E2BV1NGmasN5v7eujECVPJgwflnLPm2A', + 'g43oeBzJrCml7o6fa5fRL1ErCdeD8z4K': 'RX34mZ6zVH4Nr6whbxIGLv9WSbxEKo8V', + 'VQrDJoP7mtdBzkxhXbSPwGB1coeElk4x': 'j2VejQx0VFKQepAF7dI0mJLKtOVJE18z', + 'WxA5NzLRjCrmq0NUgaU5pdMDuZO7RJ4w': 'lyY5ADLKaIOLEgAsGQCveEMAcqnx3rY9', + 'M4lpMXB71ie0PjMCjdFzVXq0SeRVqz49': 'n2zVkOqaLIv3GbLfBjcwW51LcveWOZ2e', + 'dyDZGEqN8u8nkJZcJns0oxYmtP7KbGAn': 'VXOEqQW9BtEVLajfZQSLEqxgS5B7qn2D', + 'E7QNjrVY5u5mGvgu67IoDgV1CjEND8QR': 'rz8AaDmdKIkLmPNhB5ILPJnjS5PnlL8d', + 'a4zrqjoKlfzg0dwHEWtP31VqcLBpjm4g': 'LY9J16gwETdGWa3hjBu5o0RzuoQDjqXQ', + 'dQP5BZroMsMVLO1hbmT5r2Enu86GjxA6': '7XR3oOdbPF6x3PRFLDCq9RkgsRjAo48V', + 'M4lKNBO1NFe0PjMCj1tzVXq0SeRVqzA9': 'n2zoRqGLRUv3GbLfBmTwW51LcveWOZYe', + 'nAZ7MZdpGCGg1pqFEbsoJOz2C60mv143': 'dYJgdqA9aT4yojETqGi7yNgoFADxqmXP', + '3y1MERYgOuE9NzbFgwhV6Wv2F0YKvbyz': '081xpZDQgC4VadLTavhWQxrku56DAgXV', + 'bmQvmEXr5HWklBMCZOcpE2Z3HBYwqGyl': 'zxXPbVNyMiMAZldhr9FkOmA0fl4aKr2v', + 'wA7oDNYldfr6050Hwxi52lPZiVlB86Ap': 'ZYK16aA7ni0d3l3c34uwpxD7CbReMm8Q', + 'g43MbKMWmFml7o7sJoSRkXxZiXRvJ3QK': 'RX3oBJonvs4Nr6rUWBCGn3matRGqJPXV', + 'mA9VdlqpLS0raGaSDvtoqNrBTzb8XY4q': '0XN4OjBD3fnW7r7IbmtJB4AyfOmlrE2r', + 'mAajOwgkGt17oGoFmEuklMP9H0GnW54d': 'lXbBLPGyzikNGeGujAuAJGjZiwLRxyXR', + 'vy8vjJ9kbUwrRqRu59Cj5dWZfzYErlAb': 'K8l7gpwaGcBpnAnCLNCmPZRdin3eaQX0', + 'xQMWBpR8oHEZaWaSMGUb0avOHjLVYn4Y': 'm2MrN4vEaf9jB7BFy5Srb40jTrN67AYl', + 'xyKEmVO3miRr6D6UVkt7oB8jtD6aJEAv': 'g2ddDebqDfqdgKgswyUKwGjbTWwzq923', + '7Qk0wa2D9FjKapacoJF27aLvUDKkLGA0': 'b2kgBEkephJaMkMTL7s1PLe4Ua6WyP2P', + '3QLg6nqmNTJ5VvVTo7f508LPidz1xwyY': 'g2L1GgpraipmAOAUqmIbBnPxHOmw4MYa', + '3y1B7zZjXTE9NZNSzZSVNPZaTNLjo6Qz': '081b5G6wzH4VagaURmcWbN5mT4JGEe2V', + 'lAqnwvkw6SG6D8DSqmUg6DRLUp0w3G4x': 'O2pbP0xPDFNJjpjIEvcdryOJtpkVM4X5', + 'awA7xd1N0Hr6050Hw2c52lPZiVlB864p': 'GZYKpn4aoT0d3l3c3PiwpxD7CbReMmXQ', + 'jQVqPLl9YHL1WGWtR1HDgWBGT63qRNyV': '6X03ne6vrU4oWyWUN7tQVoajikxJR3Ye', + 'GQRMR8mL7uZK797t7xH3eNzPIP5dOny1': 'm2vqPWGd4U31zWzSyasDRAoMT1PKRp8o', + 'zydq9RdmRhXLkNkfNoTJlMzaF0lWekQB': '3X7LnvE7vH5nkEkSqLiey793Un7dLB8e', + 'VQrDzwkB2IdBzjzu9MHPbEYkSB50gR4x': 'j2VebLzoKUKQeEesmVh0gM1eIp9jKz8z', + 'mAa2wMamBs17oGoFmktklMP9H0GnW54d': 'lXbgP74xZTkNGeGujVUAJGjZiwLRxy8R', + '7yjB6ZLG6sW8R6RF2xcan1KGfJ5dNoyd': 'wXQkPorvPHZ45N5t4Jf6qwg5Tp4xvw29', + 'a4zPpNeWGuzg0m0iX3tPeanGSkRKWXQg': 'LY9oa3QAyHdGW9Wu3Ri5JGeEik7l1N8Q', + 'k2rneA2M38k25cXDwwSknTJlxPxQLZ6M': '61lyA2aEVDzklfdwmmh31saPxQx2VRjp', + 'bK9Zk4OvPnvxduLgxvi8VUeojnjA02eV': 'o5jANYjbeMb4nfBaQvcLAt1jzLzYx6ze', + '5VD6EydM3R9orHmNMGInGCJwbxbQvGRw': 'w3zjmX7g4vnxzCxElvUEOiewkokXprkZ', + '70X35QbVYVYNPUmP9YfbzI06YqYQk2R1': 'vG4Aj2BMjMjoztB7zeFOnCVPJpJ8lMOa', + '26qYwQVG9p1Bks2GgBckjfDJOXOAMgG1': 'r4ev9X0mv5zqJc0yk5IBDcQOwZw8mnwQ', + 'rvVKpA56MBXWlSxMw3cobT5pdkd4Dm7q': '1J7ZkY53pZ645c93owcLZuveE7E8B3rL', + 'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo': 'qN1zdy1zlYL23IWZGWtDvfV6WeWQWkJo', + 'jdKqRGF16dKsBviMDae7IGDl7oTjEbVV': 'Q09l7vhlNxPFErIOK6BVCe7KnwUW5DVV', + '3QLkogW1OUJ5VvPsrDH56DY2u7lgZWyY': 'g2LRE1V9espmAOPhE4ubj4ZdUA57yDXa', + 'wyJvWbXGBSdbkEzhv0CW8meou82aqRy8': 'M2wolPvyBIpQGkbT4juedD4ruzQGdK2y', + '7QkdZrzEkFjKap6IYDU2PB0oCNZORmA0': 'b2kN1l96qhJaMkPs9dt1lpjBfwqZoA8P', + 'pvA05113MHG1w3JTYxc6DVlRCjErVz4O': 'gQXeAbblBUnDJ7vujbHvbRd1cxlz3AXO', + 'mA9blJDZwT0raG1cvkuoeVjLC7ZWd54q': '0XN9jRPwMHnW7rvumgfJZOD9CJgVkWYr', + '5QwRN5qKJTvGKlDTmnf7xwNZcjRmvEy9': 'R2GP6LWBJU1QlnytwGt0B9pytWwAdDYy', + 'eyn5rPPbkfw2KYxH32fG1q58CbLJzM40': 'p2gyqooZnS56JWeiDgfmOy1VugOQEBXn', + '3BABn3b5RfPJGDwilbHe7l82uBoR05Am': '7OYZG7KMVhbPdKJS3xcWEN3AuDlLNmXj', + 'xA5zNGXD3HrmqMlF6OS5pdMDuZO7RJ4w': 'yY5DAm6r1IOLE3BCVMFveEMAcqnx3r29', + 'g43PgW3JZfml7o6fDEURL1ErCdeD8zyK': 'RX3aQn1zrS4Nr6whDgCGLv9WSbxEKo2V', + 'lAqp8WbGgiG6D8LTKJcg3O72CDdre1Qx': 'O2pnm6473HNJjpKuVosd3vVeh975yrX5', + 'wyJbYEDxKSdbkJ6S6RhW8meou82aqRy8': 'M2wPm7EgRSpQGlAh70CedD4ruzQGdKYy', + 'M4lgW28nLCe0PVdtaXszVXq0SeRVqzA9': 'n2zmJvg4jHv3G0ETNgiwW51LcveWOZ8e', + '5Qw3OVvp9FvGKlDTmOC7xwNZcjRmvEQ9': 'R2GzDdml9F1Qlnytw9s0B9pytWwAdD8y', + 'vy8a98X7zCwrRqbHrLUjYzwDiK2b70Qb': 'K8lVwzyjZiBpnAaSGeUmnAgxuGOBxmY0', + 'g4eGjJLLoiqRD3Pf9oT5O03LuNbLRDQp': '6XqD59zzpfN4EwQuaGt67qNpSyRBlnYy', + 'g43OPp9boIml7o6fDOIRL1ErCdeD8z4K': 'RX33alNB4s4Nr6whDPUGLv9WSbxEKoXV', + 'xA2ng9OkBcGKzDbTkKsJlx7dUK8R3dA5': 'z2aPnJvzBfObkwGC3vFaPxeBhxoMqZ8K', + 'xyKEgBajZuRr6DEC0Kt7XpD1cnNW9gAv': 'g2ddlEBvRsqdgKaI4jUK9PrgfMexGZ23', + 'BAogww51jIMa2JnH1BcYpXM5F658RNAL': 'rYWDmm0KptlkGv4FGJFMdZmjs9RDE6XR', + 'BAokpg62VtMa2JnH1mHYpXM5F658RNAL': 'rYWryDnlNslkGv4FG4HMdZmjs9RDE62R', + 'a4z1Px5e2hzg0m0iMMCPeanGSkRKWXAg': 'LY9eorNQGUdGW9WuKKf5JGeEik7l1NYQ', + 'kAx69R58kF9nY5YcdecJdl2pFXP53WyX': 'gXyRxELpbfPvLeLSaRil0mp6UEzbZJ8L', + 'BAoY13nwViMa2J2uo2cY6BlETgmdwryL': 'rYWwKzJmNFlkGvGtNoUM9bzwIJVzB1YR', } _MCP_TO_ACCESS_KEY_TABLE = { @@ -189,19 +258,17 @@ class AnvatoIE(InfoExtractor): video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii') anvrid = md5_text(time.time() * 1000 * random.random())[:30] - payload = { - 'api': { - 'anvrid': anvrid, - 'anvstk': md5_text('%s|%s|%d|%s' % ( - access_key, anvrid, server_time, - self._ANVACK_TABLE.get(access_key, self._API_KEY))), - 'anvts': server_time, - }, + api = { + 'anvrid': anvrid, + 'anvts': server_time, } + api['anvstk'] = md5_text('%s|%s|%d|%s' % ( + access_key, anvrid, server_time, + self._ANVACK_TABLE.get(access_key, self._API_KEY))) return self._download_json( video_data_url, video_id, transform_source=strip_jsonp, - data=json.dumps(payload).encode('utf-8')) + data=json.dumps({'api': api}).encode('utf-8')) def _get_anvato_videos(self, access_key, video_id): video_data = self._get_video_json(access_key, video_id) @@ -259,7 +326,7 @@ class AnvatoIE(InfoExtractor): 'description': video_data.get('def_description'), 'tags': video_data.get('def_tags', '').split(','), 'categories': video_data.get('categories'), - 'thumbnail': video_data.get('thumbnail'), + 'thumbnail': video_data.get('src_image_url') or video_data.get('thumbnail'), 'timestamp': int_or_none(video_data.get( 'ts_published') or video_data.get('ts_added')), 'uploader': video_data.get('mcp_id'), diff --git a/haruhi_dl/extractor/aol.py b/haruhi_dl/extractor/aol.py index e87994a6a..f6ecb8438 100644 --- a/haruhi_dl/extractor/aol.py +++ b/haruhi_dl/extractor/aol.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .yahoo import YahooIE from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, @@ -15,9 +15,9 @@ from ..utils import ( ) -class AolIE(InfoExtractor): +class AolIE(YahooIE): IE_NAME = 'aol.com' - _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P[0-9a-f]+)' + _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' _TESTS = [{ # video with 5min ID @@ -76,10 +76,16 @@ class AolIE(InfoExtractor): }, { 'url': 'https://www.aol.jp/video/playlist/5a28e936a1334d000137da0c/5a28f3151e642219fde19831/', 'only_matching': True, + }, { + # Yahoo video + 'url': 'https://www.aol.com/video/play/991e6700-ac02-11ea-99ff-357400036f61/24bbc846-3e30-3c46-915e-fe8ccd7fcc46/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + if '-' in video_id: + return self._extract_yahoo_video(video_id, 'us') response = self._download_json( 'https://feedapi.b2c.on.aol.com/v1.0/app/videos/aolon/%s/details' % video_id, diff --git a/haruhi_dl/extractor/apa.py b/haruhi_dl/extractor/apa.py index aa07e07b9..af1398bf8 100644 --- a/haruhi_dl/extractor/apa.py +++ b/haruhi_dl/extractor/apa.py @@ -6,25 +6,21 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, - js_to_json, + int_or_none, url_or_none, ) class APAIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.apa\.at/embed/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _VALID_URL = r'(?Phttps?://[^/]+\.apa\.at)/embed/(?P[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _TESTS = [{ 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', 'md5': '2b12292faeb0a7d930c778c7a5b4759b', 'info_dict': { - 'id': 'jjv85FdZ', + 'id': '293f6d17-692a-44e3-9fd5-7b178f3a1029', 'ext': 'mp4', - 'title': '"Blau ist mysteriös": Die Blue Man Group im Interview', - 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'title': '293f6d17-692a-44e3-9fd5-7b178f3a1029', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 254, - 'timestamp': 1519211149, - 'upload_date': '20180221', }, }, { 'url': 'https://uvp-apapublisher.sf.apa.at/embed/2f94e9e6-d945-4db2-9548-f9a41ebf7b78', @@ -46,9 +42,11 @@ class APAIE(InfoExtractor): webpage)] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id, base_url = mobj.group('id', 'base_url') - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + '%s/player/%s' % (base_url, video_id), video_id) jwplatform_id = self._search_regex( r'media[iI]d\s*:\s*["\'](?P[a-zA-Z0-9]{8})', webpage, @@ -59,16 +57,18 @@ class APAIE(InfoExtractor): 'jwplatform:' + jwplatform_id, ie='JWPlatform', video_id=video_id) - sources = self._parse_json( - self._search_regex( - r'sources\s*=\s*(\[.+?\])\s*;', webpage, 'sources'), - video_id, transform_source=js_to_json) + def extract(field, name=None): + return self._search_regex( + r'\b%s["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' % field, + webpage, name or field, default=None, group='value') + + title = extract('title') or video_id + description = extract('description') + thumbnail = extract('poster', 'thumbnail') formats = [] - for source in sources: - if not isinstance(source, dict): - continue - source_url = url_or_none(source.get('file')) + for format_id in ('hls', 'progressive'): + source_url = url_or_none(extract(format_id)) if not source_url: continue ext = determine_ext(source_url) @@ -77,18 +77,19 @@ class APAIE(InfoExtractor): source_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) else: + height = int_or_none(self._search_regex( + r'(\d+)\.mp4', source_url, 'height', default=None)) formats.append({ 'url': source_url, + 'format_id': format_id, + 'height': height, }) self._sort_formats(formats) - thumbnail = self._search_regex( - r'image\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'thumbnail', fatal=False, group='url') - return { 'id': video_id, - 'title': video_id, + 'title': title, + 'description': description, 'thumbnail': thumbnail, 'formats': formats, } diff --git a/haruhi_dl/extractor/aparat.py b/haruhi_dl/extractor/aparat.py index 883dcee7a..a9527e785 100644 --- a/haruhi_dl/extractor/aparat.py +++ b/haruhi_dl/extractor/aparat.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + get_element_by_id, int_or_none, merge_dicts, mimetype2ext, @@ -39,23 +40,15 @@ class AparatIE(InfoExtractor): webpage = self._download_webpage(url, video_id, fatal=False) if not webpage: - # Note: There is an easier-to-parse configuration at - # http://www.aparat.com/video/video/config/videohash/%video_id - # but the URL in there does not work webpage = self._download_webpage( 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, video_id) - options = self._parse_json( - self._search_regex( - r'options\s*=\s*JSON\.parse\(\s*(["\'])(?P(?:(?!\1).)+)\1\s*\)', - webpage, 'options', group='value'), - video_id) - - player = options['plugins']['sabaPlayerPlugin'] + options = self._parse_json(self._search_regex( + r'options\s*=\s*({.+?})\s*;', webpage, 'options'), video_id) formats = [] - for sources in player['multiSRC']: + for sources in (options.get('multiSRC') or []): for item in sources: if not isinstance(item, dict): continue @@ -85,11 +78,12 @@ class AparatIE(InfoExtractor): info = self._search_json_ld(webpage, video_id, default={}) if not info.get('title'): - info['title'] = player['title'] + info['title'] = get_element_by_id('videoTitle', webpage) or \ + self._html_search_meta(['og:title', 'twitter:title', 'DC.Title', 'title'], webpage, fatal=True) return merge_dicts(info, { 'id': video_id, 'thumbnail': url_or_none(options.get('poster')), - 'duration': int_or_none(player.get('duration')), + 'duration': int_or_none(options.get('duration')), 'formats': formats, }) diff --git a/haruhi_dl/extractor/applepodcasts.py b/haruhi_dl/extractor/applepodcasts.py new file mode 100644 index 000000000..95758fece --- /dev/null +++ b/haruhi_dl/extractor/applepodcasts.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + parse_iso8601, + try_get, +) + + +class ApplePodcastsIE(InfoExtractor): + _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P\d+)' + _TESTS = [{ + 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', + 'md5': 'df02e6acb11c10e844946a39e7222b08', + 'info_dict': { + 'id': '1000482637777', + 'ext': 'mp3', + 'title': '207 - Whitney Webb Returns', + 'description': 'md5:13a73bade02d2e43737751e3987e1399', + 'upload_date': '20200705', + 'timestamp': 1593921600, + 'duration': 6425, + 'series': 'The Tim Dillon Show', + } + }, { + 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', + 'only_matching': True, + }, { + 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777', + 'only_matching': True, + }, { + 'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777', + 'only_matching': True, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + webpage = self._download_webpage(url, episode_id) + ember_data = self._parse_json(self._search_regex( + r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', + webpage, 'ember data'), episode_id) + episode = ember_data['data']['attributes'] + description = episode.get('description') or {} + + series = None + for inc in (ember_data.get('included') or []): + if inc.get('type') == 'media/podcast': + series = try_get(inc, lambda x: x['attributes']['name']) + + return { + 'id': episode_id, + 'title': episode['name'], + 'url': clean_podcast_url(episode['assetUrl']), + 'description': description.get('standard') or description.get('short'), + 'timestamp': parse_iso8601(episode.get('releaseDateTime')), + 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), + 'series': series, + } diff --git a/haruhi_dl/extractor/archiveorg.py b/haruhi_dl/extractor/archiveorg.py index c79c58e82..e42ed5e79 100644 --- a/haruhi_dl/extractor/archiveorg.py +++ b/haruhi_dl/extractor/archiveorg.py @@ -2,15 +2,17 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - unified_strdate, clean_html, + extract_attributes, + unified_strdate, + unified_timestamp, ) class ArchiveOrgIE(InfoExtractor): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' - _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P[^/?#]+)(?:[?].*)?$' + _VALID_URL = r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'md5': '8af1d4cf447933ed3c7f4871162602db', @@ -19,8 +21,11 @@ class ArchiveOrgIE(InfoExtractor): 'ext': 'ogg', 'title': '1968 Demo - FJCC Conference Presentation Reel #1', 'description': 'md5:da45c349df039f1cc8075268eb1b5c25', - 'upload_date': '19681210', - 'uploader': 'SRI International' + 'creator': 'SRI International', + 'release_date': '19681210', + 'uploader': 'SRI International', + 'timestamp': 1268695290, + 'upload_date': '20100315', } }, { 'url': 'https://archive.org/details/Cops1922', @@ -29,22 +34,43 @@ class ArchiveOrgIE(InfoExtractor): 'id': 'Cops1922', 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6', + 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c', + 'timestamp': 1387699629, + 'upload_date': '20131222', } }, { 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'only_matching': True, + }, { + 'url': 'https://archive.org/details/MSNBCW_20131125_040000_To_Catch_a_Predator/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( 'http://archive.org/embed/' + video_id, video_id) - jwplayer_playlist = self._parse_json(self._search_regex( - r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", - webpage, 'jwplayer playlist'), video_id) - info = self._parse_jwplayer_data( - {'playlist': jwplayer_playlist}, video_id, base_url=url) + + playlist = None + play8 = self._search_regex( + r'(<[^>]+\bclass=["\']js-play8-playlist[^>]+>)', webpage, + 'playlist', default=None) + if play8: + attrs = extract_attributes(play8) + playlist = attrs.get('value') + if not playlist: + # Old jwplayer fallback + playlist = self._search_regex( + r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", + webpage, 'jwplayer playlist', default='[]') + jwplayer_playlist = self._parse_json(playlist, video_id, fatal=False) + if jwplayer_playlist: + info = self._parse_jwplayer_data( + {'playlist': jwplayer_playlist}, video_id, base_url=url) + else: + # HTML5 media fallback + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + info['id'] = video_id def get_optional(metadata, field): return metadata.get(field, [None])[0] @@ -58,8 +84,12 @@ class ArchiveOrgIE(InfoExtractor): 'description': clean_html(get_optional(metadata, 'description')), }) if info.get('_type') != 'playlist': + creator = get_optional(metadata, 'creator') info.update({ - 'uploader': get_optional(metadata, 'creator'), - 'upload_date': unified_strdate(get_optional(metadata, 'date')), + 'creator': creator, + 'release_date': unified_strdate(get_optional(metadata, 'date')), + 'uploader': get_optional(metadata, 'publisher') or creator, + 'timestamp': unified_timestamp(get_optional(metadata, 'publicdate')), + 'language': get_optional(metadata, 'language'), }) return info diff --git a/haruhi_dl/extractor/arcpublishing.py b/haruhi_dl/extractor/arcpublishing.py new file mode 100644 index 000000000..6ac7df303 --- /dev/null +++ b/haruhi_dl/extractor/arcpublishing.py @@ -0,0 +1,174 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + parse_iso8601, + try_get, +) + + +class ArcPublishingIE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' + _VALID_URL = r'arcpublishing:(?P[a-z]+):(?P%s)' % _UUID_REGEX + _TESTS = [{ + # https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/ + 'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab', + 'only_matching': True, + }, { + # https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/ + 'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1', + 'only_matching': True, + }, { + # https://www.actionnewsjax.com/video/live-stream/ + 'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a', + 'only_matching': True, + }, { + # https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/ + 'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3', + 'only_matching': True, + }, { + # https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/ + 'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe', + 'only_matching': True, + }, { + # https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/ + 'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e', + 'only_matching': True, + }, { + # https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/ + 'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143', + 'only_matching': True, + }, { + # https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/ + 'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055', + 'only_matching': True, + }, { + # https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/ + 'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d', + 'only_matching': True, + }, { + # https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/ + 'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7', + 'only_matching': True, + }, { + # https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/ + 'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b', + 'only_matching': True, + }, { + # https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html + 'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685', + 'only_matching': True, + }] + _POWA_DEFAULTS = [ + (['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'), + ([ + 'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo', + 'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom', + 'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek', + ], 'video-api-cdn.%s.arcpublishing.com/api'), + ] + + @staticmethod + def _extract_urls(webpage, **kw): + entries = [] + # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview + for powa_el in re.findall(r'(]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage): + powa = extract_attributes(powa_el) or {} + org = powa.get('data-org') + uuid = powa.get('data-uuid') + if org and uuid: + entries.append('arcpublishing:%s:%s' % (org, uuid)) + return entries + + def _real_extract(self, url): + org, uuid = re.match(self._VALID_URL, url).groups() + for orgs, tmpl in self._POWA_DEFAULTS: + if org in orgs: + base_api_tmpl = tmpl + break + else: + base_api_tmpl = '%s-prod-cdn.video-api.arcpublishing.com/api' + if org == 'wapo': + org = 'washpost' + video = self._download_json( + 'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org), + uuid, query={'uuid': uuid})[0] + title = video['headlines']['basic'] + is_live = video.get('status') == 'live' + + urls = [] + formats = [] + for s in video.get('streams', []): + s_url = s.get('url') + if not s_url or s_url in urls: + continue + urls.append(s_url) + stream_type = s.get('stream_type') + if stream_type == 'smil': + smil_formats = self._extract_smil_formats( + s_url, uuid, fatal=False) + for f in smil_formats: + if f['url'].endswith('/cfx/st'): + f['app'] = 'cfx/st' + if not f['play_path'].startswith('mp4:'): + f['play_path'] = 'mp4:' + f['play_path'] + if isinstance(f['tbr'], float): + f['vbr'] = f['tbr'] * 1000 + del f['tbr'] + f['format_id'] = 'rtmp-%d' % f['vbr'] + formats.extend(smil_formats) + elif stream_type in ('ts', 'hls'): + m3u8_formats = self._extract_m3u8_formats( + s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native', + m3u8_id='hls', fatal=False) + if all([f.get('acodec') == 'none' for f in m3u8_formats]): + continue + for f in m3u8_formats: + if f.get('acodec') == 'none': + f['preference'] = -40 + elif f.get('vcodec') == 'none': + f['preference'] = -50 + height = f.get('height') + if not height: + continue + vbr = self._search_regex( + r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None) + if vbr: + f['vbr'] = int(vbr) + formats.extend(m3u8_formats) + else: + vbr = int_or_none(s.get('bitrate')) + formats.append({ + 'format_id': '%s-%d' % (stream_type, vbr) if vbr else stream_type, + 'vbr': vbr, + 'width': int_or_none(s.get('width')), + 'height': int_or_none(s.get('height')), + 'filesize': int_or_none(s.get('filesize')), + 'url': s_url, + 'preference': -1, + }) + self._sort_formats( + formats, ('preference', 'width', 'height', 'vbr', 'filesize', 'tbr', 'ext', 'format_id')) + + subtitles = {} + for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []): + subtitle_url = subtitle.get('url') + if subtitle_url: + subtitles.setdefault('en', []).append({'url': subtitle_url}) + + return { + 'id': uuid, + 'title': self._live_title(title) if is_live else title, + 'thumbnail': try_get(video, lambda x: x['promo_image']['url']), + 'description': try_get(video, lambda x: x['subheadlines']['basic']), + 'formats': formats, + 'duration': int_or_none(video.get('duration'), 100), + 'timestamp': parse_iso8601(video.get('created_date')), + 'subtitles': subtitles, + 'is_live': is_live, + } diff --git a/haruhi_dl/extractor/ard.py b/haruhi_dl/extractor/ard.py index 5b7b2dd6d..143fc51e9 100644 --- a/haruhi_dl/extractor/ard.py +++ b/haruhi_dl/extractor/ard.py @@ -187,13 +187,13 @@ class ARDMediathekIE(ARDMediathekBaseIE): if doc.tag == 'rss': return GenericIE()._extract_rss(url, video_id, doc) - title = self._html_search_regex( + title = self._og_search_title(webpage, default=None) or self._html_search_regex( [r'(.*?)', r'', r'

(.*?)

', r']*>(.*?)'], webpage, 'title') - description = self._html_search_meta( + description = self._og_search_description(webpage, default=None) or self._html_search_meta( 'dcterms.abstract', webpage, 'description', default=None) if description is None: description = self._html_search_meta( @@ -249,18 +249,18 @@ class ARDMediathekIE(ARDMediathekBaseIE): class ARDIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P[^/?#]+)-(?P[0-9]+))\.html' + _VALID_URL = r'(?Phttps?://(?:www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P[^/?#]+)-(?:video-?)?(?P[0-9]+))\.html' _TESTS = [{ - # available till 14.02.2019 - 'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html', - 'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49', + # available till 7.01.2022 + 'url': 'https://www.daserste.de/information/talk/maischberger/videos/maischberger-die-woche-video100.html', + 'md5': '867d8aa39eeaf6d76407c5ad1bb0d4c1', 'info_dict': { - 'display_id': 'das-groko-drama-zerlegen-sich-die-volksparteien-video', - 'id': '102', + 'display_id': 'maischberger-die-woche', + 'id': '100', 'ext': 'mp4', - 'duration': 4435.0, - 'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?', - 'upload_date': '20180214', + 'duration': 3687.0, + 'title': 'maischberger. die woche vom 7. Januar 2021', + 'upload_date': '20210107', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -284,20 +284,42 @@ class ARDIE(InfoExtractor): formats = [] for a in video_node.findall('.//asset'): + file_name = xpath_text(a, './fileName', default=None) + if not file_name: + continue + format_type = a.attrib.get('type') + format_url = url_or_none(file_name) + if format_url: + ext = determine_ext(file_name) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_type or 'hls', fatal=False)) + continue + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), + display_id, f4m_id=format_type or 'hds', fatal=False)) + continue f = { - 'format_id': a.attrib['type'], - 'width': int_or_none(a.find('./frameWidth').text), - 'height': int_or_none(a.find('./frameHeight').text), - 'vbr': int_or_none(a.find('./bitrateVideo').text), - 'abr': int_or_none(a.find('./bitrateAudio').text), - 'vcodec': a.find('./codecVideo').text, - 'tbr': int_or_none(a.find('./totalBitrate').text), + 'format_id': format_type, + 'width': int_or_none(xpath_text(a, './frameWidth')), + 'height': int_or_none(xpath_text(a, './frameHeight')), + 'vbr': int_or_none(xpath_text(a, './bitrateVideo')), + 'abr': int_or_none(xpath_text(a, './bitrateAudio')), + 'vcodec': xpath_text(a, './codecVideo'), + 'tbr': int_or_none(xpath_text(a, './totalBitrate')), } - if a.find('./serverPrefix').text: - f['url'] = a.find('./serverPrefix').text - f['playpath'] = a.find('./fileName').text + server_prefix = xpath_text(a, './serverPrefix', default=None) + if server_prefix: + f.update({ + 'url': server_prefix, + 'playpath': file_name, + }) else: - f['url'] = a.find('./fileName').text + if not format_url: + continue + f['url'] = format_url formats.append(f) self._sort_formats(formats) @@ -315,17 +337,17 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(ARDMediathekBaseIE): _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P[^/]+)/(?:player|live|video)/(?P(?:[^/]+/)*)(?P[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', - 'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f', + 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', + 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', 'info_dict': { 'display_id': 'die-robuste-roswita', - 'id': '70153354', + 'id': '78566716', 'title': 'Die robuste Roswita', - 'description': r're:^Der Mord.*trüber ist als die Ilm.', + 'description': r're:^Der Mord.*totgeglaubte Ehefrau Roswita', 'duration': 5316, - 'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard', - 'timestamp': 1577047500, - 'upload_date': '20191222', + 'thumbnail': 'https://img.ardmediathek.de/standard/00/78/56/67/84/575672121/16x9/960?mandant=ard', + 'timestamp': 1596658200, + 'upload_date': '20200805', 'ext': 'mp4', }, }, { diff --git a/haruhi_dl/extractor/arte.py b/haruhi_dl/extractor/arte.py index 2bd3bfe8a..03abdbfaf 100644 --- a/haruhi_dl/extractor/arte.py +++ b/haruhi_dl/extractor/arte.py @@ -4,23 +4,57 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( ExtractorError, int_or_none, qualities, try_get, unified_strdate, + url_or_none, ) -# There are different sources of video in arte.tv, the extraction process -# is different for each one. The videos usually expire in 7 days, so we can't -# add tests. - class ArteTVBaseIE(InfoExtractor): - def _extract_from_json_url(self, json_url, video_id, lang, title=None): - info = self._download_json(json_url, video_id) + _ARTE_LANGUAGES = 'fr|de|en|es|it|pl' + _API_BASE = 'https://api.arte.tv/api/player/v1' + + +class ArteTVIE(ArteTVBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?arte\.tv/(?P%(langs)s)/videos| + api\.arte\.tv/api/player/v\d+/config/(?P%(langs)s) + ) + /(?P\d{6}-\d{3}-[AF]) + ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES} + _TESTS = [{ + 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'info_dict': { + 'id': '088501-000-A', + 'ext': 'mp4', + 'title': 'Mexico: Stealing Petrol to Survive', + 'upload_date': '20190628', + }, + }, { + 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', + 'only_matching': True, + }, { + 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + lang = mobj.group('lang') or mobj.group('lang_2') + + info = self._download_json( + '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id) player_info = info['videoJsonPlayer'] vsr = try_get(player_info, lambda x: x['VSR'], dict) @@ -37,18 +71,11 @@ class ArteTVBaseIE(InfoExtractor): if not upload_date_str: upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] - title = (player_info.get('VTI') or title or player_info['VID']).strip() + title = (player_info.get('VTI') or player_info['VID']).strip() subtitle = player_info.get('VSU', '').strip() if subtitle: title += ' - %s' % subtitle - info_dict = { - 'id': player_info['VID'], - 'title': title, - 'description': player_info.get('VDE'), - 'upload_date': unified_strdate(upload_date_str), - 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), - } qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ']) LANGS = { @@ -65,6 +92,10 @@ class ArteTVBaseIE(InfoExtractor): formats = [] for format_id, format_dict in vsr.items(): f = dict(format_dict) + format_url = url_or_none(f.get('url')) + streamer = f.get('streamer') + if not format_url and not streamer: + continue versionCode = f.get('versionCode') l = re.escape(langcode) @@ -107,6 +138,16 @@ class ArteTVBaseIE(InfoExtractor): else: lang_pref = -1 + media_type = f.get('mediaType') + if media_type == 'hls': + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False) + for m3u8_format in m3u8_formats: + m3u8_format['language_preference'] = lang_pref + formats.extend(m3u8_formats) + continue + format = { 'format_id': format_id, 'preference': -10 if f.get('videoFormat') == 'M3U8' else None, @@ -118,7 +159,7 @@ class ArteTVBaseIE(InfoExtractor): 'quality': qfunc(f.get('quality')), } - if f.get('mediaType') == 'rtmp': + if media_type == 'rtmp': format['url'] = f['streamer'] format['play_path'] = 'mp4:' + f['url'] format['ext'] = 'flv' @@ -127,56 +168,50 @@ class ArteTVBaseIE(InfoExtractor): formats.append(format) - self._check_formats(formats, video_id) self._sort_formats(formats) - info_dict['formats'] = formats - return info_dict + return { + 'id': player_info.get('VID') or video_id, + 'title': title, + 'description': player_info.get('VDE'), + 'upload_date': unified_strdate(upload_date_str), + 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), + 'formats': formats, + } -class ArteTVPlus7IE(ArteTVBaseIE): - IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?Pfr|de|en|es|it|pl)/videos/(?P\d{6}-\d{3}-[AF])' - +class ArteTVEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+' _TESTS = [{ - 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A', 'info_dict': { - 'id': '088501-000-A', + 'id': '100605-013-A', 'ext': 'mp4', - 'title': 'Mexico: Stealing Petrol to Survive', - 'upload_date': '20190628', + 'title': 'United we Stream November Lockdown Edition #13', + 'description': 'md5:be40b667f45189632b78c1425c7c2ce1', + 'upload_date': '20201116', }, + }, { + 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A', + 'only_matching': True, }] - def _real_extract(self, url): - lang, video_id = re.match(self._VALID_URL, url).groups() - return self._extract_from_json_url( - 'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id), - video_id, lang) - - -class ArteTVEmbedIE(ArteTVPlus7IE): - IE_NAME = 'arte.tv:embed' - _VALID_URL = r'''(?x) - https://www\.arte\.tv - /player/v3/index\.php\?json_url= - (?P - https?://api\.arte\.tv/api/player/v1/config/ - (?P[^/]+)/(?P\d{6}-\d{3}-[AF]) - ) - ''' - - _TESTS = [] + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r'<(?:iframe|script)[^>]+src=(["\'])(?P(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1', + webpage)] def _real_extract(self, url): - json_url, lang, video_id = re.match(self._VALID_URL, url).groups() - return self._extract_from_json_url(json_url, video_id, lang) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + json_url = qs['json_url'][0] + video_id = ArteTVIE._match_id(json_url) + return self.url_result( + json_url, ie=ArteTVIE.ie_key(), video_id=video_id) class ArteTVPlaylistIE(ArteTVBaseIE): - IE_NAME = 'arte.tv:playlist' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?Pfr|de|en|es|it|pl)/videos/(?PRC-\d{6})' - + _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P%s)/videos/(?PRC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', 'info_dict': { @@ -185,17 +220,35 @@ class ArteTVPlaylistIE(ArteTVBaseIE): 'description': 'md5:d322c55011514b3a7241f7fb80d494c2', }, 'playlist_mincount': 6, + }, { + 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/', + 'only_matching': True, }] def _real_extract(self, url): lang, playlist_id = re.match(self._VALID_URL, url).groups() collection = self._download_json( - 'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' - % (lang, playlist_id), playlist_id) + '%s/collectionData/%s/%s?source=videos' + % (self._API_BASE, lang, playlist_id), playlist_id) + entries = [] + for video in collection['videos']: + if not isinstance(video, dict): + continue + video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl')) + if not video_url: + continue + video_id = video.get('programId') + entries.append({ + '_type': 'url_transparent', + 'url': video_url, + 'id': video_id, + 'title': video.get('title'), + 'alt_title': video.get('subtitle'), + 'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)), + 'duration': int_or_none(video.get('durationSeconds')), + 'view_count': int_or_none(video.get('views')), + 'ie_key': ArteTVIE.ie_key(), + }) title = collection.get('title') description = collection.get('shortDescription') or collection.get('teaserText') - entries = [ - self._extract_from_json_url( - video['jsonUrl'], video.get('programId') or playlist_id, lang) - for video in collection['videos'] if video.get('jsonUrl')] return self.playlist_result(entries, playlist_id, title, description) diff --git a/haruhi_dl/extractor/asiancrush.py b/haruhi_dl/extractor/asiancrush.py index 0348e680c..66ce7c686 100644 --- a/haruhi_dl/extractor/asiancrush.py +++ b/haruhi_dl/extractor/asiancrush.py @@ -1,27 +1,91 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import re from .common import InfoExtractor from .kaltura import KalturaIE -from ..utils import extract_attributes +from ..utils import ( + extract_attributes, + int_or_none, + OnDemandPagedList, + parse_age_limit, + strip_or_none, + try_get, +) -class AsianCrushIE(InfoExtractor): - _VALID_URL_BASE = r'https?://(?:www\.)?(?P(?:(?:asiancrush|yuyutv|midnightpulp)\.com|cocoro\.tv))' - _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P\d+)v\b' % _VALID_URL_BASE +class AsianCrushBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?(?P(?:(?:asiancrush|yuyutv|midnightpulp)\.com|(?:cocoro|retrocrush)\.tv))' + _KALTURA_KEYS = [ + 'video_url', 'progressive_url', 'download_url', 'thumbnail_url', + 'widescreen_thumbnail_url', 'screencap_widescreen', + ] + _API_SUFFIX = {'retrocrush.tv': '-ott'} + + def _call_api(self, host, endpoint, video_id, query, resource): + return self._download_json( + 'https://api%s.%s/%s' % (self._API_SUFFIX.get(host, ''), host, endpoint), video_id, + 'Downloading %s JSON metadata' % resource, query=query, + headers=self.geo_verification_headers())['objects'] + + def _download_object_data(self, host, object_id, resource): + return self._call_api( + host, 'search', object_id, {'id': object_id}, resource)[0] + + def _get_object_description(self, obj): + return strip_or_none(obj.get('long_description') or obj.get('short_description')) + + def _parse_video_data(self, video): + title = video['name'] + + entry_id, partner_id = [None] * 2 + for k in self._KALTURA_KEYS: + k_url = video.get(k) + if k_url: + mobj = re.search(r'/p/(\d+)/.+?/entryId/([^/]+)/', k_url) + if mobj: + partner_id, entry_id = mobj.groups() + break + + meta_categories = try_get(video, lambda x: x['meta']['categories'], list) or [] + categories = list(filter(None, [c.get('name') for c in meta_categories])) + + show_info = video.get('show_info') or {} + + return { + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, entry_id), + 'ie_key': KalturaIE.ie_key(), + 'id': entry_id, + 'title': title, + 'description': self._get_object_description(video), + 'age_limit': parse_age_limit(video.get('mpaa_rating') or video.get('tv_rating')), + 'categories': categories, + 'series': show_info.get('show_name'), + 'season_number': int_or_none(show_info.get('season_num')), + 'season_id': show_info.get('season_id'), + 'episode_number': int_or_none(show_info.get('episode_num')), + } + + +class AsianCrushIE(AsianCrushBaseIE): + _VALID_URL = r'%s/video/(?:[^/]+/)?0+(?P\d+)v\b' % AsianCrushBaseIE._VALID_URL_BASE _TESTS = [{ - 'url': 'https://www.asiancrush.com/video/012869v/women-who-flirt/', + 'url': 'https://www.asiancrush.com/video/004289v/women-who-flirt', 'md5': 'c3b740e48d0ba002a42c0b72857beae6', 'info_dict': { 'id': '1_y4tmjm5r', 'ext': 'mp4', 'title': 'Women Who Flirt', - 'description': 'md5:7e986615808bcfb11756eb503a751487', + 'description': 'md5:b65c7e0ae03a85585476a62a186f924c', 'timestamp': 1496936429, 'upload_date': '20170608', 'uploader_id': 'craig@crifkin.com', + 'age_limit': 13, + 'categories': 'count:5', + 'duration': 5812, }, }, { 'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/', @@ -41,67 +105,35 @@ class AsianCrushIE(InfoExtractor): }, { 'url': 'https://www.cocoro.tv/video/the-wonderful-wizard-of-oz/008878v-the-wonderful-wizard-of-oz-ep01/', 'only_matching': True, + }, { + 'url': 'https://www.retrocrush.tv/video/true-tears/012328v-i...gave-away-my-tears', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - video_id = mobj.group('id') + host, video_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, video_id) - - entry_id, partner_id, title = [None] * 3 - - vars = self._parse_json( - self._search_regex( + if host == 'cocoro.tv': + webpage = self._download_webpage(url, video_id) + embed_vars = self._parse_json(self._search_regex( r'iEmbedVars\s*=\s*({.+?})', webpage, 'embed vars', - default='{}'), video_id, fatal=False) - if vars: - entry_id = vars.get('entry_id') - partner_id = vars.get('partner_id') - title = vars.get('vid_label') + default='{}'), video_id, fatal=False) or {} + video_id = embed_vars.get('entry_id') or video_id - if not entry_id: - entry_id = self._search_regex( - r'\bentry_id["\']\s*:\s*["\'](\d+)', webpage, 'entry id') - - player = self._download_webpage( - 'https://api.%s/embeddedVideoPlayer' % host, video_id, - query={'id': entry_id}) - - kaltura_id = self._search_regex( - r'entry_id["\']\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', player, - 'kaltura id', group='id') - - if not partner_id: - partner_id = self._search_regex( - r'/p(?:artner_id)?/(\d+)', player, 'partner id', - default='513551') - - description = self._html_search_regex( - r'(?s)]+\bclass=["\']description["\'][^>]*>(.+?)', - webpage, 'description', fatal=False) - - return { - '_type': 'url_transparent', - 'url': 'kaltura:%s:%s' % (partner_id, kaltura_id), - 'ie_key': KalturaIE.ie_key(), - 'id': video_id, - 'title': title, - 'description': description, - } + video = self._download_object_data(host, video_id, 'video') + return self._parse_video_data(video) -class AsianCrushPlaylistIE(InfoExtractor): - _VALID_URL = r'%s/series/0+(?P\d+)s\b' % AsianCrushIE._VALID_URL_BASE +class AsianCrushPlaylistIE(AsianCrushBaseIE): + _VALID_URL = r'%s/series/0+(?P\d+)s\b' % AsianCrushBaseIE._VALID_URL_BASE _TESTS = [{ - 'url': 'https://www.asiancrush.com/series/012481s/scholar-walks-night/', + 'url': 'https://www.asiancrush.com/series/006447s/fruity-samurai', 'info_dict': { - 'id': '12481', - 'title': 'Scholar Who Walks the Night', - 'description': 'md5:7addd7c5132a09fd4741152d96cce886', + 'id': '6447', + 'title': 'Fruity Samurai', + 'description': 'md5:7535174487e4a202d3872a7fc8f2f154', }, - 'playlist_count': 20, + 'playlist_count': 13, }, { 'url': 'https://www.yuyutv.com/series/013920s/peep-show/', 'only_matching': True, @@ -111,35 +143,58 @@ class AsianCrushPlaylistIE(InfoExtractor): }, { 'url': 'https://www.cocoro.tv/series/008549s/the-wonderful-wizard-of-oz/', 'only_matching': True, + }, { + 'url': 'https://www.retrocrush.tv/series/012355s/true-tears', + 'only_matching': True, }] + _PAGE_SIZE = 1000000000 + + def _fetch_page(self, domain, parent_id, page): + videos = self._call_api( + domain, 'getreferencedobjects', parent_id, { + 'max': self._PAGE_SIZE, + 'object_type': 'video', + 'parent_id': parent_id, + 'start': page * self._PAGE_SIZE, + }, 'page %d' % (page + 1)) + for video in videos: + yield self._parse_video_data(video) def _real_extract(self, url): - playlist_id = self._match_id(url) + host, playlist_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, playlist_id) + if host == 'cocoro.tv': + webpage = self._download_webpage(url, playlist_id) - entries = [] + entries = [] - for mobj in re.finditer( - r']+href=(["\'])(?P%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL, - webpage): - attrs = extract_attributes(mobj.group(0)) - if attrs.get('class') == 'clearfix': - entries.append(self.url_result( - mobj.group('url'), ie=AsianCrushIE.ie_key())) + for mobj in re.finditer( + r']+href=(["\'])(?P%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL, + webpage): + attrs = extract_attributes(mobj.group(0)) + if attrs.get('class') == 'clearfix': + entries.append(self.url_result( + mobj.group('url'), ie=AsianCrushIE.ie_key())) - title = self._html_search_regex( - r'(?s)]\bid=["\']movieTitle[^>]+>(.+?)', webpage, - 'title', default=None) or self._og_search_title( - webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', - default=None) or self._search_regex( - r'([^<]+)', webpage, 'title', fatal=False) - if title: - title = re.sub(r'\s*\|\s*.+?$', '', title) + title = self._html_search_regex( + r'(?s)]\bid=["\']movieTitle[^>]+>(.+?)', webpage, + 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', + default=None) or self._search_regex( + r'([^<]+)', webpage, 'title', fatal=False) + if title: + title = re.sub(r'\s*\|\s*.+?$', '', title) - description = self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'twitter:description', webpage, 'description', fatal=False) + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage, 'description', fatal=False) + else: + show = self._download_object_data(host, playlist_id, 'show') + title = show.get('name') + description = self._get_object_description(show) + entries = OnDemandPagedList( + functools.partial(self._fetch_page, host, playlist_id), + self._PAGE_SIZE) return self.playlist_result(entries, playlist_id, title, description) diff --git a/haruhi_dl/extractor/awaan.py b/haruhi_dl/extractor/awaan.py index a2603bbff..3a7700cd4 100644 --- a/haruhi_dl/extractor/awaan.py +++ b/haruhi_dl/extractor/awaan.py @@ -48,6 +48,7 @@ class AWAANBaseIE(InfoExtractor): 'duration': int_or_none(video_data.get('duration')), 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), 'is_live': is_live, + 'uploader_id': video_data.get('user_id'), } @@ -107,6 +108,7 @@ class AWAANLiveIE(AWAANBaseIE): 'title': 're:Dubai Al Oula [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'upload_date': '20150107', 'timestamp': 1420588800, + 'uploader_id': '71', }, 'params': { # m3u8 download diff --git a/haruhi_dl/extractor/azmedien.py b/haruhi_dl/extractor/azmedien.py index b1e20def5..930266990 100644 --- a/haruhi_dl/extractor/azmedien.py +++ b/haruhi_dl/extractor/azmedien.py @@ -47,7 +47,7 @@ class AZMedienIE(InfoExtractor): 'url': 'https://www.telebaern.tv/telebaern-news/montag-1-oktober-2018-ganze-sendung-133531189#video=0_7xjo9lf1', 'only_matching': True }] - _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/cb9f2f81ed22e9b47f4ca64ea3cc5a5d13e88d1d' + _API_TEMPL = 'https://www.%s/api/pub/gql/%s/NewsArticleTeaser/a4016f65fe62b81dc6664dd9f4910e4ab40383be' _PARTNER_ID = '1719221' def _real_extract(self, url): diff --git a/haruhi_dl/extractor/bandcamp.py b/haruhi_dl/extractor/bandcamp.py index 9ac93645e..4c6b55035 100644 --- a/haruhi_dl/extractor/bandcamp.py +++ b/haruhi_dl/extractor/bandcamp.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import random @@ -5,10 +6,7 @@ import re import time from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_str from ..utils import ( ExtractorError, float_or_none, @@ -17,30 +15,32 @@ from ..utils import ( parse_filesize, str_or_none, try_get, - unescapeHTML, update_url_query, unified_strdate, unified_timestamp, url_or_none, + urljoin, ) class BandcampIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P[^/?#&]+)' + _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://haruhi-dl.bandcamp.com/track/haruhi-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "haruhi-dl \"'/\\\u00e4\u21ad - haruhi-dl test song \"'/\\\u00e4\u21ad", + 'title': "haruhi-dl \"'/\\ä↭ - haruhi-dl \"'/\\ä↭ - haruhi-dl test song \"'/\\ä↭", 'duration': 9.8485, + 'uploader': 'haruhi-dl "\'/\\ä↭', + 'upload_date': '20121129', + 'timestamp': 1354224127, }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { # free download 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '853e35bf34aa1d6fe2615ae612564b36', 'info_dict': { 'id': '2650410135', 'ext': 'aiff', @@ -79,11 +79,16 @@ class BandcampIE(InfoExtractor): }, }] + def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True): + return self._parse_json(self._html_search_regex( + r'data-%s=(["\'])({.+?})\1' % attr, webpage, + attr + ' data', group=2), video_id, fatal=fatal) + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') + title = self._match_id(url) webpage = self._download_webpage(url, title) - thumbnail = self._html_search_meta('og:image', webpage, default=None) + tralbum = self._extract_data_attr(webpage, title) + thumbnail = self._og_search_thumbnail(webpage) track_id = None track = None @@ -91,10 +96,7 @@ class BandcampIE(InfoExtractor): duration = None formats = [] - track_info = self._parse_json( - self._search_regex( - r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', - webpage, 'track info', default='{}'), title) + track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict) if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -111,37 +113,25 @@ class BandcampIE(InfoExtractor): 'abr': int_or_none(abr_str), }) track = track_info.get('title') - track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) + track_id = str_or_none( + track_info.get('track_id') or track_info.get('id')) track_number = int_or_none(track_info.get('track_num')) duration = float_or_none(track_info.get('duration')) - def extract(key): - return self._search_regex( - r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key, - webpage, key, default=None, group='value') - - artist = extract('artist') - album = extract('album_title') + embed = self._extract_data_attr(webpage, title, 'embed', False) + current = tralbum.get('current') or {} + artist = embed.get('artist') or current.get('artist') or tralbum.get('artist') timestamp = unified_timestamp( - extract('publish_date') or extract('album_publish_date')) - release_date = unified_strdate(extract('album_release_date')) + current.get('publish_date') or tralbum.get('album_publish_date')) - download_link = self._search_regex( - r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'download link', default=None, group='url') + download_link = tralbum.get('freeDownloadPage') if download_link: - track_id = self._search_regex( - r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$', - webpage, 'track id') + track_id = compat_str(tralbum['id']) download_webpage = self._download_webpage( download_link, track_id, 'Downloading free downloads page') - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, - 'blob', group='blob'), - track_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(download_webpage, track_id, 'blob') info = try_get( blob, (lambda x: x['digital_items'][0], @@ -207,20 +197,20 @@ class BandcampIE(InfoExtractor): 'thumbnail': thumbnail, 'uploader': artist, 'timestamp': timestamp, - 'release_date': release_date, + 'release_date': unified_strdate(tralbum.get('album_release_date')), 'duration': duration, 'track': track, 'track_number': track_number, 'track_id': track_id, 'artist': artist, - 'album': album, + 'album': embed.get('album_title'), 'formats': formats, } -class BandcampAlbumIE(InfoExtractor): +class BandcampAlbumIE(BandcampIE): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?' _TESTS = [{ 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -230,7 +220,10 @@ class BandcampAlbumIE(InfoExtractor): 'info_dict': { 'id': '1353101989', 'ext': 'mp3', - 'title': 'Intro', + 'title': 'Blazo - Intro', + 'timestamp': 1311756226, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, { @@ -238,7 +231,10 @@ class BandcampAlbumIE(InfoExtractor): 'info_dict': { 'id': '38097443', 'ext': 'mp3', - 'title': 'Kero One - Keep It Alive (Blazo remix)', + 'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)', + 'timestamp': 1311757238, + 'upload_date': '20110727', + 'uploader': 'Blazo', } }, ], @@ -274,6 +270,7 @@ class BandcampAlbumIE(InfoExtractor): 'title': '"Entropy" EP', 'uploader_id': 'jstrecords', 'id': 'entropy-ep', + 'description': 'md5:0ff22959c943622972596062f2f366a5', }, 'playlist_mincount': 3, }, { @@ -283,6 +280,7 @@ class BandcampAlbumIE(InfoExtractor): 'id': 'we-are-the-plague', 'title': 'WE ARE THE PLAGUE', 'uploader_id': 'insulters', + 'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f', }, 'playlist_count': 2, }] @@ -294,41 +292,34 @@ class BandcampAlbumIE(InfoExtractor): else super(BandcampAlbumIE, cls).suitable(url)) def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - uploader_id = mobj.group('subdomain') - album_id = mobj.group('album_id') + uploader_id, album_id = re.match(self._VALID_URL, url).groups() playlist_id = album_id or uploader_id webpage = self._download_webpage(url, playlist_id) - track_elements = re.findall( - r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage) - if not track_elements: + tralbum = self._extract_data_attr(webpage, playlist_id) + track_info = tralbum.get('trackinfo') + if not track_info: raise ExtractorError('The page doesn\'t contain any tracks') # Only tracks with duration info have songs entries = [ self.url_result( - compat_urlparse.urljoin(url, t_path), - ie=BandcampIE.ie_key(), - video_title=self._search_regex( - r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', - elem_content, 'track title', fatal=False)) - for elem_content, t_path in track_elements - if self._html_search_meta('duration', elem_content, default=None)] + urljoin(url, t['title_link']), BandcampIE.ie_key(), + str_or_none(t.get('track_id') or t.get('id')), t.get('title')) + for t in track_info + if t.get('duration')] + + current = tralbum.get('current') or {} - title = self._html_search_regex( - r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', - webpage, 'title', fatal=False) - if title: - title = title.replace(r'\"', '"') return { '_type': 'playlist', 'uploader_id': uploader_id, 'id': playlist_id, - 'title': title, + 'title': current.get('title'), + 'description': current.get('about'), 'entries': entries, } -class BandcampWeeklyIE(InfoExtractor): +class BandcampWeeklyIE(BandcampIE): IE_NAME = 'Bandcamp:weekly' _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' _TESTS = [{ @@ -343,29 +334,23 @@ class BandcampWeeklyIE(InfoExtractor): 'release_date': '20170404', 'series': 'Bandcamp Weekly', 'episode': 'Magic Moments', - 'episode_number': 208, 'episode_id': '224', - } + }, + 'params': { + 'format': 'opus-lo', + }, }, { 'url': 'https://bandcamp.com/?blah/blah@&show=228', 'only_matching': True }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + show_id = self._match_id(url) + webpage = self._download_webpage(url, show_id) - blob = self._parse_json( - self._search_regex( - r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage, - 'blob', group='blob'), - video_id, transform_source=unescapeHTML) + blob = self._extract_data_attr(webpage, show_id, 'blob') - show = blob['bcw_show'] - - # This is desired because any invalid show id redirects to `bandcamp.com` - # which happens to expose the latest Bandcamp Weekly episode. - show_id = int_or_none(show.get('show_id')) or int_or_none(video_id) + show = blob['bcw_data'][show_id] formats = [] for format_id, format_url in show['audio_stream'].items(): @@ -390,20 +375,8 @@ class BandcampWeeklyIE(InfoExtractor): if subtitle: title += ' - %s' % subtitle - episode_number = None - seq = blob.get('bcw_seq') - - if seq and isinstance(seq, list): - try: - episode_number = next( - int_or_none(e.get('episode_number')) - for e in seq - if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) - except StopIteration: - pass - return { - 'id': video_id, + 'id': show_id, 'title': title, 'description': show.get('desc') or show.get('short_desc'), 'duration': float_or_none(show.get('audio_duration')), @@ -411,7 +384,6 @@ class BandcampWeeklyIE(InfoExtractor): 'release_date': unified_strdate(show.get('published_date')), 'series': 'Bandcamp Weekly', 'episode': show.get('subtitle'), - 'episode_number': episode_number, - 'episode_id': compat_str(video_id), + 'episode_id': show_id, 'formats': formats } diff --git a/haruhi_dl/extractor/bbc.py b/haruhi_dl/extractor/bbc.py index 1ff7834bf..b73521043 100644 --- a/haruhi_dl/extractor/bbc.py +++ b/haruhi_dl/extractor/bbc.py @@ -49,22 +49,17 @@ class BBCCoUkIE(InfoExtractor): _LOGIN_URL = 'https://account.bbc.com/signin' _NETRC_MACHINE = 'bbc' - _MEDIASELECTOR_URLS = [ + _MEDIA_SELECTOR_URL_TEMPL = 'https://open.live.bbc.co.uk/mediaselector/6/select/version/2.0/mediaset/%s/vpid/%s' + _MEDIA_SETS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails # with geolocation in some cases when it's even not geo restricted at all (e.g. # http://www.bbc.co.uk/programmes/b06bp7lf). Also may fail with selectionunavailable. - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s', + 'iptv-all', + 'pc', ] - _MEDIASELECTION_NS = 'http://bbc.co.uk/2008/mp/mediaselection' _EMP_PLAYLIST_NS = 'http://bbc.co.uk/2008/emp/playlist' - _NAMESPACES = ( - _MEDIASELECTION_NS, - _EMP_PLAYLIST_NS, - ) - _TESTS = [ { 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', @@ -261,8 +256,6 @@ class BBCCoUkIE(InfoExtractor): 'only_matching': True, }] - _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' - def _login(self): username, password = self._get_login_info() if username is None: @@ -307,22 +300,14 @@ class BBCCoUkIE(InfoExtractor): def _extract_items(self, playlist): return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) - def _findall_ns(self, element, xpath): - elements = [] - for ns in self._NAMESPACES: - elements.extend(element.findall(xpath % ns)) - return elements - def _extract_medias(self, media_selection): - error = media_selection.find('./{%s}error' % self._MEDIASELECTION_NS) - if error is None: - media_selection.find('./{%s}error' % self._EMP_PLAYLIST_NS) - if error is not None: - raise BBCCoUkIE.MediaSelectionError(error.get('id')) - return self._findall_ns(media_selection, './{%s}media') + error = media_selection.get('result') + if error: + raise BBCCoUkIE.MediaSelectionError(error) + return media_selection.get('media') or [] def _extract_connections(self, media): - return self._findall_ns(media, './{%s}connection') + return media.get('connection') or [] def _get_subtitles(self, media, programme_id): subtitles = {} @@ -334,13 +319,13 @@ class BBCCoUkIE(InfoExtractor): cc_url, programme_id, 'Downloading captions', fatal=False) if not isinstance(captions, compat_etree_Element): continue - lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') - subtitles[lang] = [ + subtitles['en'] = [ { 'url': connection.get('href'), 'ext': 'ttml', }, ] + break return subtitles def _raise_extractor_error(self, media_selection_error): @@ -350,10 +335,10 @@ class BBCCoUkIE(InfoExtractor): def _download_media_selector(self, programme_id): last_exception = None - for mediaselector_url in self._MEDIASELECTOR_URLS: + for media_set in self._MEDIA_SETS: try: return self._download_media_selector_url( - mediaselector_url % programme_id, programme_id) + self._MEDIA_SELECTOR_URL_TEMPL % (media_set, programme_id), programme_id) except BBCCoUkIE.MediaSelectionError as e: if e.id in ('notukerror', 'geolocation', 'selectionunavailable'): last_exception = e @@ -362,8 +347,8 @@ class BBCCoUkIE(InfoExtractor): self._raise_extractor_error(last_exception) def _download_media_selector_url(self, url, programme_id=None): - media_selection = self._download_xml( - url, programme_id, 'Downloading media selection XML', + media_selection = self._download_json( + url, programme_id, 'Downloading media selection JSON', expected_status=(403, 404)) return self._process_media_selector(media_selection, programme_id) @@ -377,7 +362,6 @@ class BBCCoUkIE(InfoExtractor): if kind in ('video', 'audio'): bitrate = int_or_none(media.get('bitrate')) encoding = media.get('encoding') - service = media.get('service') width = int_or_none(media.get('width')) height = int_or_none(media.get('height')) file_size = int_or_none(media.get('media_file_size')) @@ -392,8 +376,6 @@ class BBCCoUkIE(InfoExtractor): supplier = connection.get('supplier') transfer_format = connection.get('transferFormat') format_id = supplier or conn_kind or protocol - if service: - format_id = '%s_%s' % (service, format_id) # ASX playlist if supplier == 'asx': for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): @@ -408,20 +390,11 @@ class BBCCoUkIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( href, programme_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)) - if re.search(self._USP_RE, href): - usp_formats = self._extract_m3u8_formats( - re.sub(self._USP_RE, r'/\1.ism/\1.m3u8', href), - programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False) - for f in usp_formats: - if f.get('height') and f['height'] > 720: - continue - formats.append(f) elif transfer_format == 'hds': formats.extend(self._extract_f4m_formats( href, programme_id, f4m_id=format_id, fatal=False)) else: - if not service and not supplier and bitrate: + if not supplier and bitrate: format_id += '-%d' % bitrate fmt = { 'format_id': format_id, @@ -554,7 +527,7 @@ class BBCCoUkIE(InfoExtractor): webpage = self._download_webpage(url, group_id, 'Downloading video page') error = self._search_regex( - r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<', + r'<div\b[^>]+\bclass=["\'](?:smp|playout)__message delta["\'][^>]*>\s*([^<]+?)\s*<', webpage, 'error', default=None) if error: raise ExtractorError(error, expected=True) @@ -607,16 +580,9 @@ class BBCIE(BBCCoUkIE): IE_DESC = 'BBC' _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)' - _MEDIASELECTOR_URLS = [ - # Provides HQ HLS streams but fails with geolocation in some cases when it's - # even not geo restricted at all - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/iptv-all/vpid/%s', - # Provides more formats, namely direct mp4 links, but fails on some videos with - # notukerror for non UK (?) users (e.g. - # http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret) - 'http://open.live.bbc.co.uk/mediaselector/4/mtis/stream/%s', - # Provides fewer formats, but works everywhere for everybody (hopefully) - 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/journalism-pc/vpid/%s', + _MEDIA_SETS = [ + 'mobile-tablet-main', + 'pc', ] _TESTS = [{ @@ -981,7 +947,7 @@ class BBCIE(BBCCoUkIE): group_id = self._search_regex( r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX, webpage, 'group id', default=None) - if playlist_id: + if group_id: return self.url_result( 'https://www.bbc.co.uk/programmes/%s' % group_id, ie=BBCCoUkIE.ie_key()) @@ -1092,10 +1058,26 @@ class BBCIE(BBCCoUkIE): self._search_regex( r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage, 'bbcthree config', default='{}'), - playlist_id, transform_source=js_to_json, fatal=False) - if bbc3_config: + playlist_id, transform_source=js_to_json, fatal=False) or {} + payload = bbc3_config.get('payload') or {} + if payload: + clip = payload.get('currentClip') or {} + clip_vpid = clip.get('vpid') + clip_title = clip.get('title') + if clip_vpid and clip_title: + formats, subtitles = self._download_media_selector(clip_vpid) + self._sort_formats(formats) + return { + 'id': clip_vpid, + 'title': clip_title, + 'thumbnail': dict_get(clip, ('poster', 'imageUrl')), + 'description': clip.get('description'), + 'duration': parse_duration(clip.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + } bbc3_playlist = try_get( - bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], + payload, lambda x: x['content']['bbcMedia']['playlist'], dict) if bbc3_playlist: playlist_title = bbc3_playlist.get('title') or playlist_title @@ -1118,6 +1100,39 @@ class BBCIE(BBCCoUkIE): return self.playlist_result( entries, playlist_id, playlist_title, playlist_description) + initial_data = self._parse_json(self._search_regex( + r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, + 'preload state', default='{}'), playlist_id, fatal=False) + if initial_data: + def parse_media(media): + if not media: + return + for item in (try_get(media, lambda x: x['media']['items'], list) or []): + item_id = item.get('id') + item_title = item.get('title') + if not (item_id and item_title): + continue + formats, subtitles = self._download_media_selector(item_id) + self._sort_formats(formats) + entries.append({ + 'id': item_id, + 'title': item_title, + 'thumbnail': item.get('holdingImageUrl'), + 'formats': formats, + 'subtitles': subtitles, + }) + for resp in (initial_data.get('data') or {}).values(): + name = resp.get('name') + if name == 'media-experience': + parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) + elif name == 'article': + for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): + if block.get('type') != 'media': + continue + parse_media(block.get('model')) + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description) + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), diff --git a/haruhi_dl/extractor/beampro.py b/haruhi_dl/extractor/beampro.py deleted file mode 100644 index 86abdae00..000000000 --- a/haruhi_dl/extractor/beampro.py +++ /dev/null @@ -1,194 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - clean_html, - compat_str, - float_or_none, - int_or_none, - parse_iso8601, - try_get, - urljoin, -) - - -class BeamProBaseIE(InfoExtractor): - _API_BASE = 'https://mixer.com/api/v1' - _RATINGS = {'family': 0, 'teen': 13, '18+': 18} - - def _extract_channel_info(self, chan): - user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id']) - return { - 'uploader': chan.get('token') or try_get( - chan, lambda x: x['user']['username'], compat_str), - 'uploader_id': compat_str(user_id) if user_id else None, - 'age_limit': self._RATINGS.get(chan.get('audience')), - } - - -class BeamProLiveIE(BeamProBaseIE): - IE_NAME = 'Mixer:live' - _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'http://mixer.com/niterhayven', - 'info_dict': { - 'id': '261562', - 'ext': 'mp4', - 'title': 'Introducing The Witcher 3 // The Grind Starts Now!', - 'description': 'md5:0b161ac080f15fe05d18a07adb44a74d', - 'thumbnail': r're:https://.*\.jpg$', - 'timestamp': 1483477281, - 'upload_date': '20170103', - 'uploader': 'niterhayven', - 'uploader_id': '373396', - 'age_limit': 18, - 'is_live': True, - 'view_count': int, - }, - 'skip': 'niterhayven is offline', - 'params': { - 'skip_download': True, - }, - } - - _MANIFEST_URL_TEMPLATE = '%s/channels/%%s/manifest.%%s' % BeamProBaseIE._API_BASE - - @classmethod - def suitable(cls, url): - return False if BeamProVodIE.suitable(url) else super(BeamProLiveIE, cls).suitable(url) - - def _real_extract(self, url): - channel_name = self._match_id(url) - - chan = self._download_json( - '%s/channels/%s' % (self._API_BASE, channel_name), channel_name) - - if chan.get('online') is False: - raise ExtractorError( - '{0} is offline'.format(channel_name), expected=True) - - channel_id = chan['id'] - - def manifest_url(kind): - return self._MANIFEST_URL_TEMPLATE % (channel_id, kind) - - formats = self._extract_m3u8_formats( - manifest_url('m3u8'), channel_name, ext='mp4', m3u8_id='hls', - fatal=False) - formats.extend(self._extract_smil_formats( - manifest_url('smil'), channel_name, fatal=False)) - self._sort_formats(formats) - - info = { - 'id': compat_str(chan.get('id') or channel_name), - 'title': self._live_title(chan.get('name') or channel_name), - 'description': clean_html(chan.get('description')), - 'thumbnail': try_get( - chan, lambda x: x['thumbnail']['url'], compat_str), - 'timestamp': parse_iso8601(chan.get('updatedAt')), - 'is_live': True, - 'view_count': int_or_none(chan.get('viewersTotal')), - 'formats': formats, - } - info.update(self._extract_channel_info(chan)) - - return info - - -class BeamProVodIE(BeamProBaseIE): - IE_NAME = 'Mixer:vod' - _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/[^/?#&]+\?.*?\bvod=(?P<id>[^?#&]+)' - _TESTS = [{ - 'url': 'https://mixer.com/willow8714?vod=2259830', - 'md5': 'b2431e6e8347dc92ebafb565d368b76b', - 'info_dict': { - 'id': '2259830', - 'ext': 'mp4', - 'title': 'willow8714\'s Channel', - 'duration': 6828.15, - 'thumbnail': r're:https://.*source\.png$', - 'timestamp': 1494046474, - 'upload_date': '20170506', - 'uploader': 'willow8714', - 'uploader_id': '6085379', - 'age_limit': 13, - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://mixer.com/streamer?vod=IxFno1rqC0S_XJ1a2yGgNw', - 'only_matching': True, - }, { - 'url': 'https://mixer.com/streamer?vod=Rh3LY0VAqkGpEQUe2pN-ig', - 'only_matching': True, - }] - - @staticmethod - def _extract_format(vod, vod_type): - if not vod.get('baseUrl'): - return [] - - if vod_type == 'hls': - filename, protocol = 'manifest.m3u8', 'm3u8_native' - elif vod_type == 'raw': - filename, protocol = 'source.mp4', 'https' - else: - assert False - - data = vod.get('data') if isinstance(vod.get('data'), dict) else {} - - format_id = [vod_type] - if isinstance(data.get('Height'), compat_str): - format_id.append('%sp' % data['Height']) - - return [{ - 'url': urljoin(vod['baseUrl'], filename), - 'format_id': '-'.join(format_id), - 'ext': 'mp4', - 'protocol': protocol, - 'width': int_or_none(data.get('Width')), - 'height': int_or_none(data.get('Height')), - 'fps': int_or_none(data.get('Fps')), - 'tbr': int_or_none(data.get('Bitrate'), 1000), - }] - - def _real_extract(self, url): - vod_id = self._match_id(url) - - vod_info = self._download_json( - '%s/recordings/%s' % (self._API_BASE, vod_id), vod_id) - - state = vod_info.get('state') - if state != 'AVAILABLE': - raise ExtractorError( - 'VOD %s is not available (state: %s)' % (vod_id, state), - expected=True) - - formats = [] - thumbnail_url = None - - for vod in vod_info['vods']: - vod_type = vod.get('format') - if vod_type in ('hls', 'raw'): - formats.extend(self._extract_format(vod, vod_type)) - elif vod_type == 'thumbnail': - thumbnail_url = urljoin(vod.get('baseUrl'), 'source.png') - - self._sort_formats(formats) - - info = { - 'id': vod_id, - 'title': vod_info.get('name') or vod_id, - 'duration': float_or_none(vod_info.get('duration')), - 'thumbnail': thumbnail_url, - 'timestamp': parse_iso8601(vod_info.get('createdAt')), - 'view_count': int_or_none(vod_info.get('viewsTotal')), - 'formats': formats, - } - info.update(self._extract_channel_info(vod_info.get('channel') or {})) - - return info diff --git a/haruhi_dl/extractor/bfmtv.py b/haruhi_dl/extractor/bfmtv.py new file mode 100644 index 000000000..501f69d80 --- /dev/null +++ b/haruhi_dl/extractor/bfmtv.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import extract_attributes + + +class BFMTVBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?bfmtv\.com/' + _VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P<id>\d{12})\.html' + _VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block"[^>]*>)' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + + def _brightcove_url_result(self, video_id, video_block): + account_id = video_block.get('accountid') or '876450612001' + player_id = video_block.get('playerid') or 'I2qBTln4u' + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), + 'BrightcoveNew', video_id) + + +class BFMTVIE(BFMTVBaseIE): + IE_NAME = 'bfmtv' + _VALID_URL = BFMTVBaseIE._VALID_URL_TMPL % 'V' + _TESTS = [{ + 'url': 'https://www.bfmtv.com/politique/emmanuel-macron-l-islam-est-une-religion-qui-vit-une-crise-aujourd-hui-partout-dans-le-monde_VN-202010020146.html', + 'info_dict': { + 'id': '6196747868001', + 'ext': 'mp4', + 'title': 'Emmanuel Macron: "L\'Islam est une religion qui vit une crise aujourd’hui, partout dans le monde"', + 'description': 'Le Président s\'exprime sur la question du séparatisme depuis les Mureaux, dans les Yvelines.', + 'uploader_id': '876450610001', + 'upload_date': '20201002', + 'timestamp': 1601629620, + }, + }] + + def _real_extract(self, url): + bfmtv_id = self._match_id(url) + webpage = self._download_webpage(url, bfmtv_id) + video_block = extract_attributes(self._search_regex( + self._VIDEO_BLOCK_REGEX, webpage, 'video block')) + return self._brightcove_url_result(video_block['videoid'], video_block) + + +class BFMTVLiveIE(BFMTVIE): + IE_NAME = 'bfmtv:live' + _VALID_URL = BFMTVBaseIE._VALID_URL_BASE + '(?P<id>(?:[^/]+/)?en-direct)' + _TESTS = [{ + 'url': 'https://www.bfmtv.com/en-direct/', + 'info_dict': { + 'id': '5615950982001', + 'ext': 'mp4', + 'title': r're:^le direct BFMTV WEB \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'uploader_id': '876450610001', + 'upload_date': '20171018', + 'timestamp': 1508329950, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.bfmtv.com/economie/en-direct/', + 'only_matching': True, + }] + + +class BFMTVArticleIE(BFMTVBaseIE): + IE_NAME = 'bfmtv:article' + _VALID_URL = BFMTVBaseIE._VALID_URL_TMPL % 'A' + _TESTS = [{ + 'url': 'https://www.bfmtv.com/sante/covid-19-un-responsable-de-l-institut-pasteur-se-demande-quand-la-france-va-se-reconfiner_AV-202101060198.html', + 'info_dict': { + 'id': '202101060198', + 'title': 'Covid-19: un responsable de l\'Institut Pasteur se demande "quand la France va se reconfiner"', + 'description': 'md5:947974089c303d3ac6196670ae262843', + }, + 'playlist_count': 2, + }, { + 'url': 'https://www.bfmtv.com/international/pour-bolsonaro-le-bresil-est-en-faillite-mais-il-ne-peut-rien-faire_AD-202101060232.html', + 'only_matching': True, + }, { + 'url': 'https://www.bfmtv.com/sante/covid-19-oui-le-vaccin-de-pfizer-distribue-en-france-a-bien-ete-teste-sur-des-personnes-agees_AN-202101060275.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + bfmtv_id = self._match_id(url) + webpage = self._download_webpage(url, bfmtv_id) + + entries = [] + for video_block_el in re.findall(self._VIDEO_BLOCK_REGEX, webpage): + video_block = extract_attributes(video_block_el) + video_id = video_block.get('videoid') + if not video_id: + continue + entries.append(self._brightcove_url_result(video_id, video_block)) + + return self.playlist_result( + entries, bfmtv_id, self._og_search_title(webpage, fatal=False), + self._html_search_meta(['og:description', 'description'], webpage)) diff --git a/haruhi_dl/extractor/bibeltv.py b/haruhi_dl/extractor/bibeltv.py new file mode 100644 index 000000000..56c2bfee8 --- /dev/null +++ b/haruhi_dl/extractor/bibeltv.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BibelTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?:crn/)?(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.bibeltv.de/mediathek/videos/329703-sprachkurs-in-malaiisch', + 'md5': '252f908192d611de038b8504b08bf97f', + 'info_dict': { + 'id': 'ref:329703', + 'ext': 'mp4', + 'title': 'Sprachkurs in Malaiisch', + 'description': 'md5:3e9f197d29ee164714e67351cf737dfe', + 'timestamp': 1608316701, + 'uploader_id': '5840105145001', + 'upload_date': '20201218', + } + }, { + 'url': 'https://www.bibeltv.de/mediathek/videos/crn/326374', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5840105145001/default_default/index.html?videoId=ref:%s' + + def _real_extract(self, url): + crn_id = self._match_id(url) + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % crn_id, 'BrightcoveNew') diff --git a/haruhi_dl/extractor/bleacherreport.py b/haruhi_dl/extractor/bleacherreport.py index dc60224d0..d1bf8e829 100644 --- a/haruhi_dl/extractor/bleacherreport.py +++ b/haruhi_dl/extractor/bleacherreport.py @@ -90,13 +90,19 @@ class BleacherReportCMSIE(AMPIE): _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36}|\d{5})' _TESTS = [{ 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1&library=video-cms', - 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1', + 'md5': '670b2d73f48549da032861130488c681', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', + 'upload_date': '20150723', + 'timestamp': 1437679032, + }, + 'expected_warnings': [ + 'Unable to download f4m manifest' + ] }] def _real_extract(self, url): diff --git a/haruhi_dl/extractor/bongacams.py b/haruhi_dl/extractor/bongacams.py new file mode 100644 index 000000000..180542fbc --- /dev/null +++ b/haruhi_dl/extractor/bongacams.py @@ -0,0 +1,60 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + try_get, + urlencode_postdata, +) + + +class BongaCamsIE(InfoExtractor): + _VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.com)/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://de.bongacams.com/azumi-8', + 'only_matching': True, + }, { + 'url': 'https://cn.bongacams.com/azumi-8', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + channel_id = mobj.group('id') + + amf = self._download_json( + 'https://%s/tools/amf.php' % host, channel_id, + data=urlencode_postdata(( + ('method', 'getRoomData'), + ('args[]', channel_id), + ('args[]', 'false'), + )), headers={'X-Requested-With': 'XMLHttpRequest'}) + + server_url = amf['localData']['videoServerUrl'] + + uploader_id = try_get( + amf, lambda x: x['performerData']['username'], compat_str) or channel_id + uploader = try_get( + amf, lambda x: x['performerData']['displayName'], compat_str) + like_count = int_or_none(try_get( + amf, lambda x: x['performerData']['loversCount'])) + + formats = self._extract_m3u8_formats( + '%s/hls/stream_%s/playlist.m3u8' % (server_url, uploader_id), + channel_id, 'mp4', m3u8_id='hls', live=True) + self._sort_formats(formats) + + return { + 'id': channel_id, + 'title': self._live_title(uploader or uploader_id), + 'uploader': uploader, + 'uploader_id': uploader_id, + 'like_count': like_count, + 'age_limit': 18, + 'is_live': True, + 'formats': formats, + } diff --git a/haruhi_dl/extractor/box.py b/haruhi_dl/extractor/box.py new file mode 100644 index 000000000..aae82d1af --- /dev/null +++ b/haruhi_dl/extractor/box.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + parse_iso8601, + # try_get, + update_url_query, +) + + +class BoxIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/]+)/file/(?P<id>\d+)' + _TEST = { + 'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538', + 'md5': '1f81b2fd3960f38a40a3b8823e5fcd43', + 'info_dict': { + 'id': '510727257538', + 'ext': 'mp4', + 'title': 'Garber St. Louis will be 28th MLS team +scarving.mp4', + 'uploader': 'MLS Video', + 'timestamp': 1566320259, + 'upload_date': '20190820', + 'uploader_id': '235196876', + } + } + + def _real_extract(self, url): + shared_name, file_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, file_id) + request_token = self._parse_json(self._search_regex( + r'Box\.config\s*=\s*({.+?});', webpage, + 'Box config'), file_id)['requestToken'] + access_token = self._download_json( + 'https://app.box.com/app-api/enduserapp/elements/tokens', file_id, + 'Downloading token JSON metadata', + data=json.dumps({'fileIDs': [file_id]}).encode(), headers={ + 'Content-Type': 'application/json', + 'X-Request-Token': request_token, + 'X-Box-EndUser-API': 'sharedName=' + shared_name, + })[file_id]['read'] + shared_link = 'https://app.box.com/s/' + shared_name + f = self._download_json( + 'https://api.box.com/2.0/files/' + file_id, file_id, + 'Downloading file JSON metadata', headers={ + 'Authorization': 'Bearer ' + access_token, + 'BoxApi': 'shared_link=' + shared_link, + 'X-Rep-Hints': '[dash]', # TODO: extract `hls` formats + }, query={ + 'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size' + }) + title = f['name'] + + query = { + 'access_token': access_token, + 'shared_link': shared_link + } + + formats = [] + + # for entry in (try_get(f, lambda x: x['representations']['entries'], list) or []): + # entry_url_template = try_get( + # entry, lambda x: x['content']['url_template']) + # if not entry_url_template: + # continue + # representation = entry.get('representation') + # if representation == 'dash': + # TODO: append query to every fragment URL + # formats.extend(self._extract_mpd_formats( + # entry_url_template.replace('{+asset_path}', 'manifest.mpd'), + # file_id, query=query)) + + authenticated_download_url = f.get('authenticated_download_url') + if authenticated_download_url and f.get('is_download_available'): + formats.append({ + 'ext': f.get('extension') or determine_ext(title), + 'filesize': f.get('size'), + 'format_id': 'download', + 'url': update_url_query(authenticated_download_url, query), + }) + + self._sort_formats(formats) + + creator = f.get('created_by') or {} + + return { + 'id': file_id, + 'title': title, + 'formats': formats, + 'description': f.get('description') or None, + 'uploader': creator.get('name'), + 'timestamp': parse_iso8601(f.get('created_at')), + 'uploader_id': creator.get('id'), + } diff --git a/haruhi_dl/extractor/bravotv.py b/haruhi_dl/extractor/bravotv.py index b9715df00..bae2aedce 100644 --- a/haruhi_dl/extractor/bravotv.py +++ b/haruhi_dl/extractor/bravotv.py @@ -12,7 +12,7 @@ from ..utils import ( class BravoTVIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?bravotv\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<req_id>bravotv|oxygen)\.com/(?:[^/]+/)+(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.bravotv.com/top-chef/season-16/episode-15/videos/the-top-chef-season-16-winner-is', 'md5': 'e34684cfea2a96cd2ee1ef3a60909de9', @@ -28,10 +28,13 @@ class BravoTVIE(AdobePassIE): }, { 'url': 'http://www.bravotv.com/below-deck/season-3/ep-14-reunion-part-1', 'only_matching': True, + }, { + 'url': 'https://www.oxygen.com/in-ice-cold-blood/season-2/episode-16/videos/handling-the-horwitz-house-after-the-murder-season-2', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) + site, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) settings = self._parse_json(self._search_regex( r'<script[^>]+data-drupal-selector="drupal-settings-json"[^>]*>({.+?})</script>', webpage, 'drupal settings'), @@ -53,11 +56,14 @@ class BravoTVIE(AdobePassIE): tp_path = release_pid = tve['release_pid'] if tve.get('entitlement') == 'auth': adobe_pass = settings.get('tve_adobe_auth', {}) + if site == 'bravotv': + site = 'bravo' resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId', 'bravo'), + adobe_pass.get('adobePassResourceId') or site, tve['title'], release_pid, tve.get('rating')) query['auth'] = self._extract_mvpd_auth( - url, release_pid, adobe_pass.get('adobePassRequestorId', 'bravo'), resource) + url, release_pid, + adobe_pass.get('adobePassRequestorId') or site, resource) else: shared_playlist = settings['ls_playlist'] account_pid = shared_playlist['account_pid'] diff --git a/haruhi_dl/extractor/brightcove.py b/haruhi_dl/extractor/brightcove.py index ee2867ecc..0a2ec3879 100644 --- a/haruhi_dl/extractor/brightcove.py +++ b/haruhi_dl/extractor/brightcove.py @@ -28,6 +28,7 @@ from ..utils import ( parse_iso8601, smuggle_url, str_or_none, + try_get, unescapeHTML, unsmuggle_url, UnsupportedError, @@ -470,13 +471,18 @@ class BrightcoveNewIE(AdobePassIE): def _parse_brightcove_metadata(self, json_data, video_id, headers={}): title = json_data['name'].strip() + num_drm_sources = 0 formats = [] - for source in json_data.get('sources', []): + sources = json_data.get('sources') or [] + for source in sources: container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object - if ext == 'ism' or container == 'WVM' or source.get('key_systems'): + if container == 'WVM' or source.get('key_systems'): + num_drm_sources += 1 + continue + elif ext == 'ism': continue elif ext == 'm3u8' or container == 'M2TS': if not src: @@ -533,20 +539,15 @@ class BrightcoveNewIE(AdobePassIE): 'format_id': build_format_id('rtmp'), }) formats.append(f) - if not formats: - # for sonyliv.com DRM protected videos - s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl') - if s3_source_url: - formats.append({ - 'url': s3_source_url, - 'format_id': 'source', - }) - errors = json_data.get('errors') - if not formats and errors: - error = errors[0] - raise ExtractorError( - error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + if not formats: + errors = json_data.get('errors') + if errors: + error = errors[0] + raise ExtractorError( + error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + if sources and num_drm_sources == len(sources): + raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) @@ -600,24 +601,27 @@ class BrightcoveNewIE(AdobePassIE): store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x) def extract_policy_key(): - webpage = self._download_webpage( - 'http://players.brightcove.net/%s/%s_%s/index.min.js' - % (account_id, player_id, embed), video_id) - - policy_key = None - - catalog = self._search_regex( - r'catalog\(({.+?})\);', webpage, 'catalog', default=None) - if catalog: - catalog = self._parse_json( - js_to_json(catalog), video_id, fatal=False) - if catalog: - policy_key = catalog.get('policyKey') - + base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed) + config = self._download_json( + base_url + 'config.json', video_id, fatal=False) or {} + policy_key = try_get( + config, lambda x: x['video_cloud']['policy_key']) if not policy_key: - policy_key = self._search_regex( - r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', - webpage, 'policy key', group='pk') + webpage = self._download_webpage( + base_url + 'index.min.js', video_id) + + catalog = self._search_regex( + r'catalog\(({.+?})\);', webpage, 'catalog', default=None) + if catalog: + catalog = self._parse_json( + js_to_json(catalog), video_id, fatal=False) + if catalog: + policy_key = catalog.get('policyKey') + + if not policy_key: + policy_key = self._search_regex( + r'policyKey\s*:\s*(["\'])(?P<pk>.+?)\1', + webpage, 'policy key', group='pk') store_pk(policy_key) return policy_key diff --git a/haruhi_dl/extractor/canvas.py b/haruhi_dl/extractor/canvas.py index 8667a0d04..eefbab241 100644 --- a/haruhi_dl/extractor/canvas.py +++ b/haruhi_dl/extractor/canvas.py @@ -8,18 +8,20 @@ from .gigya import GigyaBaseIE from ..compat import compat_HTTPError from ..utils import ( ExtractorError, - strip_or_none, + clean_html, + extract_attributes, float_or_none, + get_element_by_class, int_or_none, merge_dicts, - parse_iso8601, str_or_none, + strip_or_none, url_or_none, ) class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'md5': '68993eda72ef62386a15ea2cf3c93107', @@ -37,6 +39,7 @@ class CanvasIE(InfoExtractor): 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 'only_matching': True, }] + _GEO_BYPASS = False _HLS_ENTRY_PROTOCOLS_MAP = { 'HLS': 'm3u8_native', 'HLS_AES': 'm3u8', @@ -47,29 +50,34 @@ class CanvasIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) site_id, video_id = mobj.group('site_id'), mobj.group('id') - # Old API endpoint, serves more formats but may fail for some videos - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), video_id, 'Downloading asset JSON', - 'Unable to download asset JSON', fatal=False) + data = None + if site_id != 'vrtvideo': + # Old API endpoint, serves more formats but may fail for some videos + data = self._download_json( + 'https://mediazone.vrt.be/api/v1/%s/assets/%s' + % (site_id, video_id), video_id, 'Downloading asset JSON', + 'Unable to download asset JSON', fatal=False) # New API endpoint if not data: + headers = self.geo_verification_headers() + headers.update({'Content-Type': 'application/json'}) token = self._download_json( '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', data=b'', - headers={'Content-Type': 'application/json'})['vrtPlayerToken'] + 'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] data = self._download_json( '%s/videos/%s' % (self._REST_API_BASE, video_id), - video_id, 'Downloading video JSON', fatal=False, query={ + video_id, 'Downloading video JSON', query={ 'vrtPlayerToken': token, 'client': '%s@PROD' % site_id, }, expected_status=400) - message = data.get('message') - if message and not data.get('title'): - if data.get('code') == 'AUTHENTICATION_REQUIRED': - self.raise_login_required(message) - raise ExtractorError(message, expected=True) + if not data.get('title'): + code = data.get('code') + if code == 'AUTHENTICATION_REQUIRED': + self.raise_login_required() + elif code == 'INVALID_LOCATION': + self.raise_geo_restricted(countries=['BE']) + raise ExtractorError(data.get('message') or code, expected=True) title = data['title'] description = data.get('description') @@ -205,20 +213,24 @@ class CanvasEenIE(InfoExtractor): class VrtNUIE(GigyaBaseIE): IE_DESC = 'VrtNU.be' - _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)' _TESTS = [{ # Available via old API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/', + 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', 'info_dict': { - 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', + 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', 'ext': 'mp4', - 'title': 'De zwarte weduwe', - 'description': 'md5:db1227b0f318c849ba5eab1fef895ee4', + 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)', + 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7', 'duration': 1457.04, 'thumbnail': r're:^https?://.*\.jpg$', - 'season': 'Season 1', - 'season_number': 1, + 'series': 'Postbus X', + 'season': 'Seizoen 1989', + 'season_number': 1989, + 'episode': 'De zwarte weduwe', 'episode_number': 1, + 'timestamp': 1595822400, + 'upload_date': '20200727', }, 'skip': 'This video is only available for registered users', 'params': { @@ -300,69 +312,73 @@ class VrtNUIE(GigyaBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - webpage, urlh = self._download_webpage_handle(url, display_id) + webpage = self._download_webpage(url, display_id) + + attrs = extract_attributes(self._search_regex( + r'(<nui-media[^>]+>)', webpage, 'media element')) + video_id = attrs['videoid'] + publication_id = attrs.get('publicationid') + if publication_id: + video_id = publication_id + '$' + video_id + + page = (self._parse_json(self._search_regex( + r'digitalData\s*=\s*({.+?});', webpage, 'digial data', + default='{}'), video_id, fatal=False) or {}).get('page') or {} info = self._search_json_ld(webpage, display_id, default={}) - - # title is optional here since it may be extracted by extractor - # that is delegated from here - title = strip_or_none(self._html_search_regex( - r'(?ms)<h1 class="content__heading">(.+?)</h1>', - webpage, 'title', default=None)) - - description = self._html_search_regex( - r'(?ms)<div class="content__description">(.+?)</div>', - webpage, 'description', default=None) - - season = self._html_search_regex( - [r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s* - <span>seizoen\ (.+?)</span>\s* - </div>''', - r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>'], - webpage, 'season', default=None) - - season_number = int_or_none(season) - - episode_number = int_or_none(self._html_search_regex( - r'''(?xms)<div\ class="content__episode">\s* - <abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span> - </div>''', - webpage, 'episode_number', default=None)) - - release_date = parse_iso8601(self._html_search_regex( - r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"', - webpage, 'release_date', default=None)) - - # If there's a ? or a # in the URL, remove them and everything after - clean_url = urlh.geturl().split('?')[0].split('#')[0].strip('/') - securevideo_url = clean_url + '.mssecurevideo.json' - - try: - video = self._download_json(securevideo_url, display_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - self.raise_login_required() - raise - - # We are dealing with a '../<show>.relevant' URL - redirect_url = video.get('url') - if redirect_url: - return self.url_result(self._proto_relative_url(redirect_url, 'https:')) - - # There is only one entry, but with an unknown key, so just get - # the first one - video_id = list(video.values())[0].get('videoid') - return merge_dicts(info, { '_type': 'url_transparent', 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, 'ie_key': CanvasIE.ie_key(), 'id': video_id, 'display_id': display_id, + 'season_number': int_or_none(page.get('episode_season')), + }) + + +class DagelijkseKostIE(InfoExtractor): + IE_DESC = 'dagelijksekost.een.be' + _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof', + 'md5': '30bfffc323009a3e5f689bef6efa2365', + 'info_dict': { + 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa', + 'display_id': 'hachis-parmentier-met-witloof', + 'ext': 'mp4', + 'title': 'Hachis parmentier met witloof', + 'description': 'md5:9960478392d87f63567b5b117688cdc5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 283.02, + }, + 'expected_warnings': ['is not a supported codec'], + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + title = strip_or_none(get_element_by_class( + 'dish-metadata__title', webpage + ) or self._html_search_meta( + 'twitter:title', webpage)) + + description = clean_html(get_element_by_class( + 'dish-description', webpage) + ) or self._html_search_meta( + ('description', 'twitter:description', 'og:description'), + webpage) + + video_id = self._html_search_regex( + r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id', + group='id') + + return { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id, + 'ie_key': CanvasIE.ie_key(), + 'id': video_id, + 'display_id': display_id, 'title': title, 'description': description, - 'season': season, - 'season_number': season_number, - 'episode_number': episode_number, - 'release_date': release_date, - }) + } diff --git a/haruhi_dl/extractor/cbslocal.py b/haruhi_dl/extractor/cbslocal.py index 90852a9ef..3b7e1a8b9 100644 --- a/haruhi_dl/extractor/cbslocal.py +++ b/haruhi_dl/extractor/cbslocal.py @@ -11,7 +11,47 @@ from ..utils import ( class CBSLocalIE(AnvatoIE): - _VALID_URL = r'https?://[a-z]+\.cbslocal\.com/(?:\d+/\d+/\d+|video)/(?P<id>[0-9a-z-]+)' + _VALID_URL_BASE = r'https?://[a-z]+\.cbslocal\.com/' + _VALID_URL = _VALID_URL_BASE + r'video/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/', + 'info_dict': { + 'id': '3580809', + 'ext': 'mp4', + 'title': 'A Very Blue Anniversary', + 'description': 'CBS2’s Cindy Hsu has more.', + 'thumbnail': 're:^https?://.*', + 'timestamp': int, + 'upload_date': r're:^\d{8}$', + 'uploader': 'CBS', + 'subtitles': { + 'en': 'mincount:5', + }, + 'categories': [ + 'Stations\\Spoken Word\\WCBSTV', + 'Syndication\\AOL', + 'Syndication\\MSN', + 'Syndication\\NDN', + 'Syndication\\Yahoo', + 'Content\\News', + 'Content\\News\\Local News', + ], + 'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'], + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + mcp_id = self._match_id(url) + return self.url_result( + 'anvato:anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67:' + mcp_id, 'Anvato', mcp_id) + + +class CBSLocalArticleIE(AnvatoIE): + _VALID_URL = CBSLocalIE._VALID_URL_BASE + r'\d+/\d+/\d+/(?P<id>[0-9a-z-]+)' _TESTS = [{ # Anvato backend @@ -52,31 +92,6 @@ class CBSLocalIE(AnvatoIE): # m3u8 download 'skip_download': True, }, - }, { - 'url': 'http://newyork.cbslocal.com/video/3580809-a-very-blue-anniversary/', - 'info_dict': { - 'id': '3580809', - 'ext': 'mp4', - 'title': 'A Very Blue Anniversary', - 'description': 'CBS2’s Cindy Hsu has more.', - 'thumbnail': 're:^https?://.*', - 'timestamp': int, - 'upload_date': r're:^\d{8}$', - 'uploader': 'CBS', - 'subtitles': { - 'en': 'mincount:5', - }, - 'categories': [ - 'Stations\\Spoken Word\\WCBSTV', - 'Syndication\\AOL', - 'Syndication\\MSN', - 'Syndication\\NDN', - 'Syndication\\Yahoo', - 'Content\\News', - 'Content\\News\\Local News', - ], - 'tags': ['CBS 2 News Weekends', 'Cindy Hsu', 'Blue Man Group'], - }, }] def _real_extract(self, url): diff --git a/haruhi_dl/extractor/ccma.py b/haruhi_dl/extractor/ccma.py index 544647f92..e6ae49352 100644 --- a/haruhi_dl/extractor/ccma.py +++ b/haruhi_dl/extractor/ccma.py @@ -1,15 +1,18 @@ # coding: utf-8 from __future__ import unicode_literals +import calendar +import datetime import re from .common import InfoExtractor from ..utils import ( clean_html, + extract_timezone, int_or_none, parse_duration, - parse_iso8601, parse_resolution, + try_get, url_or_none, ) @@ -24,8 +27,9 @@ class CCMAIE(InfoExtractor): 'ext': 'mp4', 'title': 'L\'espot de La Marató de TV3', 'description': 'md5:f12987f320e2f6e988e9908e4fe97765', - 'timestamp': 1470918540, - 'upload_date': '20160811', + 'timestamp': 1478608140, + 'upload_date': '20161108', + 'age_limit': 0, } }, { 'url': 'http://www.ccma.cat/catradio/alacarta/programa/el-consell-de-savis-analitza-el-derbi/audio/943685/', @@ -35,8 +39,24 @@ class CCMAIE(InfoExtractor): 'ext': 'mp3', 'title': 'El Consell de Savis analitza el derbi', 'description': 'md5:e2a3648145f3241cb9c6b4b624033e53', - 'upload_date': '20171205', - 'timestamp': 1512507300, + 'upload_date': '20170512', + 'timestamp': 1494622500, + 'vcodec': 'none', + 'categories': ['Esports'], + } + }, { + 'url': 'http://www.ccma.cat/tv3/alacarta/crims/crims-josep-tallada-lespereu-me-capitol-1/video/6031387/', + 'md5': 'b43c3d3486f430f3032b5b160d80cbc3', + 'info_dict': { + 'id': '6031387', + 'ext': 'mp4', + 'title': 'Crims - Josep Talleda, l\'"Espereu-me" (capítol 1)', + 'description': 'md5:7cbdafb640da9d0d2c0f62bad1e74e60', + 'timestamp': 1582577700, + 'upload_date': '20200224', + 'subtitles': 'mincount:4', + 'age_limit': 16, + 'series': 'Crims', } }] @@ -72,17 +92,28 @@ class CCMAIE(InfoExtractor): informacio = media['informacio'] title = informacio['titol'] - durada = informacio.get('durada', {}) + durada = informacio.get('durada') or {} duration = int_or_none(durada.get('milisegons'), 1000) or parse_duration(durada.get('text')) - timestamp = parse_iso8601(informacio.get('data_emissio', {}).get('utc')) + tematica = try_get(informacio, lambda x: x['tematica']['text']) + + timestamp = None + data_utc = try_get(informacio, lambda x: x['data_emissio']['utc']) + try: + timezone, data_utc = extract_timezone(data_utc) + timestamp = calendar.timegm((datetime.datetime.strptime( + data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple()) + except TypeError: + pass subtitles = {} - subtitols = media.get('subtitols', {}) - if subtitols: - sub_url = subtitols.get('url') + subtitols = media.get('subtitols') or [] + if isinstance(subtitols, dict): + subtitols = [subtitols] + for st in subtitols: + sub_url = st.get('url') if sub_url: subtitles.setdefault( - subtitols.get('iso') or subtitols.get('text') or 'ca', []).append({ + st.get('iso') or st.get('text') or 'ca', []).append({ 'url': sub_url, }) @@ -97,6 +128,16 @@ class CCMAIE(InfoExtractor): 'height': int_or_none(imatges.get('alcada')), }] + age_limit = None + codi_etic = try_get(informacio, lambda x: x['codi_etic']['id']) + if codi_etic: + codi_etic_s = codi_etic.split('_') + if len(codi_etic_s) == 2: + if codi_etic_s[1] == 'TP': + age_limit = 0 + else: + age_limit = int_or_none(codi_etic_s[1]) + return { 'id': media_id, 'title': title, @@ -106,4 +147,9 @@ class CCMAIE(InfoExtractor): 'thumbnails': thumbnails, 'subtitles': subtitles, 'formats': formats, + 'age_limit': age_limit, + 'alt_title': informacio.get('titol_complet'), + 'episode_number': int_or_none(informacio.get('capitol')), + 'categories': [tematica] if tematica else None, + 'series': informacio.get('programa'), } diff --git a/haruhi_dl/extractor/cnbc.py b/haruhi_dl/extractor/cnbc.py index 6889b0f40..7b9f4536a 100644 --- a/haruhi_dl/extractor/cnbc.py +++ b/haruhi_dl/extractor/cnbc.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import re from .common import InfoExtractor from ..utils import smuggle_url @@ -38,7 +39,7 @@ class CNBCIE(InfoExtractor): class CNBCVideoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)' + _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)' _TEST = { 'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', 'info_dict': { @@ -56,11 +57,15 @@ class CNBCVideoIE(InfoExtractor): } def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id, - 'video id') + path, display_id = re.match(self._VALID_URL, url).groups() + video_id = self._download_json( + 'https://webql-redesign.cnbcfm.com/graphql', display_id, query={ + 'query': '''{ + page(path: "%s") { + vcpsId + } +}''' % path, + })['data']['page']['vcpsId'] return self.url_result( - 'http://video.cnbc.com/gallery/?video=%s' % video_id, + 'http://video.cnbc.com/gallery/?video=%d' % video_id, CNBCIE.ie_key()) diff --git a/haruhi_dl/extractor/cnn.py b/haruhi_dl/extractor/cnn.py index 774b71055..2d950fa05 100644 --- a/haruhi_dl/extractor/cnn.py +++ b/haruhi_dl/extractor/cnn.py @@ -96,7 +96,10 @@ class CNNIE(TurnerBaseIE): config['data_src'] % path, page_title, { 'default': { 'media_src': config['media_src'], - } + }, + 'f4m': { + 'host': 'cnn-vh.akamaihd.net', + }, }) diff --git a/haruhi_dl/extractor/comedycentral.py b/haruhi_dl/extractor/comedycentral.py index d08b909a6..1bfa912be 100644 --- a/haruhi_dl/extractor/comedycentral.py +++ b/haruhi_dl/extractor/comedycentral.py @@ -1,142 +1,51 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor -from .common import InfoExtractor class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ - (video-clips|episodes|cc-studios|video-collections|shows(?=/[^/]+/(?!full-episodes))) - /(?P<title>.*)''' + _VALID_URL = r'https?://(?:www\.)?cc\.com/(?:episodes|video(?:-clips)?)/(?P<id>[0-9a-z]{6})' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TESTS = [{ - 'url': 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', - 'md5': 'c4f48e9eda1b16dd10add0744344b6d8', + 'url': 'http://www.cc.com/video-clips/5ke9v2/the-daily-show-with-trevor-noah-doc-rivers-and-steve-ballmer---the-nba-player-strike', + 'md5': 'b8acb347177c680ff18a292aa2166f80', 'info_dict': { - 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', + 'id': '89ccc86e-1b02-4f83-b0c9-1d9592ecd025', 'ext': 'mp4', - 'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother', - 'description': 'After a certain point, breastfeeding becomes c**kblocking.', - 'timestamp': 1376798400, - 'upload_date': '20130818', + 'title': 'The Daily Show with Trevor Noah|August 28, 2020|25|25149|Doc Rivers and Steve Ballmer - The NBA Player Strike', + 'description': 'md5:5334307c433892b85f4f5e5ac9ef7498', + 'timestamp': 1598670000, + 'upload_date': '20200829', }, }, { - 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview', + 'url': 'http://www.cc.com/episodes/pnzzci/drawn-together--american-idol--parody-clip-show-season-3-ep-314', 'only_matching': True, - }] - - -class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?cc\.com/ - (?:full-episodes|shows(?=/[^/]+/full-episodes)) - /(?P<id>[^?]+)''' - _FEED_URL = 'http://comedycentral.com/feeds/mrss/' - - _TESTS = [{ - 'url': 'http://www.cc.com/full-episodes/pv391a/the-daily-show-with-trevor-noah-november-28--2016---ryan-speedo-green-season-22-ep-22028', - 'info_dict': { - 'description': 'Donald Trump is accused of exploiting his president-elect status for personal gain, Cuban leader Fidel Castro dies, and Ryan Speedo Green discusses "Sing for Your Life."', - 'title': 'November 28, 2016 - Ryan Speedo Green', - }, - 'playlist_count': 4, }, { - 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'only_matching': True, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1') - videos_info = self._get_videos_info(mgid) - return videos_info - - -class ToshIE(MTVServicesInfoExtractor): - IE_DESC = 'Tosh.0' - _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)' - _FEED_URL = 'http://tosh.cc.com/feeds/mrss' - - _TESTS = [{ - 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', - 'info_dict': { - 'description': 'Tosh asked fans to share their summer plans.', - 'title': 'Twitter Users Share Summer Plans', - }, - 'playlist': [{ - 'md5': 'f269e88114c1805bb6d7653fecea9e06', - 'info_dict': { - 'id': '90498ec2-ed00-11e0-aca6-0026b9414f30', - 'ext': 'mp4', - 'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans', - 'description': 'Tosh asked fans to share their summer plans.', - 'thumbnail': r're:^https?://.*\.jpg', - # It's really reported to be published on year 2077 - 'upload_date': '20770610', - 'timestamp': 3390510600, - 'subtitles': { - 'en': 'mincount:3', - }, - }, - }] - }, { - 'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp', + 'url': 'https://www.cc.com/video/k3sdvm/the-daily-show-with-jon-stewart-exclusive-the-fourth-estate', 'only_matching': True, }] class ComedyCentralTVIE(MTVServicesInfoExtractor): - _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/folgen/(?P<id>[0-9a-z]{6})' _TESTS = [{ - 'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4', + 'url': 'https://www.comedycentral.tv/folgen/pxdpec/josh-investigates-klimawandel-staffel-1-ep-1', 'info_dict': { - 'id': 'local_playlist-f99b626bdfe13568579a', - 'ext': 'flv', - 'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1', + 'id': '15907dc3-ec3c-11e8-a442-0e40cf2fc285', + 'ext': 'mp4', + 'title': 'Josh Investigates', + 'description': 'Steht uns das Ende der Welt bevor?', }, - 'params': { - # rtmp download - 'skip_download': True, - }, - }, { - 'url': 'http://www.comedycentral.tv/shows/1074-workaholics', - 'only_matching': True, - }, { - 'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus', - 'only_matching': True, }] + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' + _GEO_COUNTRIES = ['DE'] - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - mrss_url = self._search_regex( - r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'mrss url', group='url') - - return self._get_videos_info_from_url(mrss_url, video_id) - - -class ComedyCentralShortnameIE(InfoExtractor): - _VALID_URL = r'^:(?P<id>tds|thedailyshow|theopposition)$' - _TESTS = [{ - 'url': ':tds', - 'only_matching': True, - }, { - 'url': ':thedailyshow', - 'only_matching': True, - }, { - 'url': ':theopposition', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - shortcut_map = { - 'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', - 'theopposition': 'http://www.cc.com/shows/the-opposition-with-jordan-klepper/full-episodes', + def _get_feed_query(self, uri): + return { + 'accountOverride': 'intl.mtvi.com', + 'arcEp': 'web.cc.tv', + 'ep': 'b9032c3a', + 'imageEp': 'web.cc.tv', + 'mgid': uri, } - return self.url_result(shortcut_map[video_id]) diff --git a/haruhi_dl/extractor/common.py b/haruhi_dl/extractor/common.py index 699aced61..6b8ebbdef 100644 --- a/haruhi_dl/extractor/common.py +++ b/haruhi_dl/extractor/common.py @@ -336,8 +336,8 @@ class InfoExtractor(object): object, each element of which is a valid dictionary by this specification. Additionally, playlists can have "id", "title", "description", "uploader", - "uploader_id", "uploader_url" attributes with the same semantics as videos - (see above). + "uploader_id", "uploader_url", "duration" attributes with the same semantics + as videos (see above). _type "multi_video" indicates that there are multiple videos that @@ -1239,8 +1239,16 @@ class InfoExtractor(object): 'ViewAction': 'view', } + def extract_interaction_type(e): + interaction_type = e.get('interactionType') + if isinstance(interaction_type, dict): + interaction_type = interaction_type.get('@type') + return str_or_none(interaction_type) + def extract_interaction_statistic(e): interaction_statistic = e.get('interactionStatistic') + if isinstance(interaction_statistic, dict): + interaction_statistic = [interaction_statistic] if not isinstance(interaction_statistic, list): return for is_e in interaction_statistic: @@ -1248,8 +1256,8 @@ class InfoExtractor(object): continue if is_e.get('@type') != 'InteractionCounter': continue - interaction_type = is_e.get('interactionType') - if not isinstance(interaction_type, compat_str): + interaction_type = extract_interaction_type(is_e) + if not interaction_type: continue # For interaction count some sites provide string instead of # an integer (as per spec) with non digit characters (e.g. ",") @@ -1474,9 +1482,10 @@ class InfoExtractor(object): try: self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) return True - except ExtractorError: + except ExtractorError as e: self.to_screen( - '%s: %s URL is invalid, skipping' % (video_id, item)) + '%s: %s URL is invalid, skipping: %s' + % (video_id, item, error_to_compat_str(e.cause))) return False def http_scheme(self): @@ -2612,7 +2621,15 @@ class InfoExtractor(object): return entries def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): + signed = 'hdnea=' in manifest_url + if not signed: + # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html + manifest_url = re.sub( + r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?', + '', manifest_url).strip('?') + formats = [] + hdcore_sign = 'hdcore=3.7.0' f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') hds_host = hosts.get('hds') @@ -2625,13 +2642,38 @@ class InfoExtractor(object): for entry in f4m_formats: entry.update({'extra_param_to_segment_url': hdcore_sign}) formats.extend(f4m_formats) + m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') hls_host = hosts.get('hls') if hls_host: m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url) - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + + http_host = hosts.get('http') + if http_host and m3u8_formats and not signed: + REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+' + qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') + qualities_length = len(qualities) + if len(m3u8_formats) in (qualities_length, qualities_length + 1): + i = 0 + for f in m3u8_formats: + if f['vcodec'] != 'none': + for protocol in ('http', 'https'): + http_f = f.copy() + del http_f['manifest_url'] + http_url = re.sub( + REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url']) + http_f.update({ + 'format_id': http_f['format_id'].replace('hls-', protocol + '-'), + 'url': http_url, + 'protocol': protocol, + }) + formats.append(http_f) + i += 1 + return formats def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): diff --git a/haruhi_dl/extractor/condenast.py b/haruhi_dl/extractor/condenast.py index ed278fefc..d5e77af32 100644 --- a/haruhi_dl/extractor/condenast.py +++ b/haruhi_dl/extractor/condenast.py @@ -16,6 +16,8 @@ from ..utils import ( mimetype2ext, orderedSet, parse_iso8601, + strip_or_none, + try_get, ) @@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor): 'uploader': 'gq', 'upload_date': '20170321', 'timestamp': 1490126427, + 'description': 'How much grimmer would things be if these people were competent?', }, }, { # JS embed @@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor): 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', 'uploader': 'arstechnica', 'upload_date': '20150916', - 'timestamp': 1442434955, + 'timestamp': 1442434920, } }, { 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', @@ -196,6 +199,13 @@ class CondeNastIE(InfoExtractor): }) self._sort_formats(formats) + subtitles = {} + for t, caption in video_info.get('captions', {}).items(): + caption_url = caption.get('src') + if not (t in ('vtt', 'srt', 'tml') and caption_url): + continue + subtitles.setdefault('en', []).append({'url': caption_url}) + return { 'id': video_id, 'formats': formats, @@ -208,6 +218,7 @@ class CondeNastIE(InfoExtractor): 'season': video_info.get('season_title'), 'timestamp': parse_iso8601(video_info.get('premiere_date')), 'categories': video_info.get('categories'), + 'subtitles': subtitles, } def _real_extract(self, url): @@ -225,8 +236,16 @@ class CondeNastIE(InfoExtractor): if url_type == 'series': return self._extract_series(url, webpage) else: - params = self._extract_video_params(webpage, display_id) - info = self._search_json_ld( - webpage, display_id, fatal=False) + video = try_get(self._parse_json(self._search_regex( + r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage, + 'preload state', '{}'), display_id), + lambda x: x['transformed']['video']) + if video: + params = {'videoId': video['id']} + info = {'description': strip_or_none(video.get('description'))} + else: + params = self._extract_video_params(webpage, display_id) + info = self._search_json_ld( + webpage, display_id, fatal=False) info.update(self._extract_video(params)) return info diff --git a/haruhi_dl/extractor/cspan.py b/haruhi_dl/extractor/cspan.py index 67d6df4b0..2e01aff48 100644 --- a/haruhi_dl/extractor/cspan.py +++ b/haruhi_dl/extractor/cspan.py @@ -8,9 +8,14 @@ from ..utils import ( ExtractorError, extract_attributes, find_xpath_attr, + get_element_by_attribute, get_element_by_class, int_or_none, + js_to_json, + merge_dicts, + parse_iso8601, smuggle_url, + str_to_int, unescapeHTML, ) from .senateisvp import SenateISVPIE @@ -98,6 +103,48 @@ class CSpanIE(InfoExtractor): bc_attr['data-bcid']) return self.url_result(smuggle_url(bc_url, {'source_url': url})) + def add_referer(formats): + for f in formats: + f.setdefault('http_headers', {})['Referer'] = url + + # As of 01.12.2020 this path looks to cover all cases making the rest + # of the code unnecessary + jwsetup = self._parse_json( + self._search_regex( + r'(?s)jwsetup\s*=\s*({.+?})\s*;', webpage, 'jwsetup', + default='{}'), + video_id, transform_source=js_to_json, fatal=False) + if jwsetup: + info = self._parse_jwplayer_data( + jwsetup, video_id, require_title=False, m3u8_id='hls', + base_url=url) + add_referer(info['formats']) + for subtitles in info['subtitles'].values(): + for subtitle in subtitles: + ext = determine_ext(subtitle['url']) + if ext == 'php': + ext = 'vtt' + subtitle['ext'] = ext + ld_info = self._search_json_ld(webpage, video_id, default={}) + title = get_element_by_class('video-page-title', webpage) or \ + self._og_search_title(webpage) + description = get_element_by_attribute('itemprop', 'description', webpage) or \ + self._html_search_meta(['og:description', 'description'], webpage) + return merge_dicts(info, ld_info, { + 'title': title, + 'thumbnail': get_element_by_attribute('itemprop', 'thumbnailUrl', webpage), + 'description': description, + 'timestamp': parse_iso8601(get_element_by_attribute('itemprop', 'uploadDate', webpage)), + 'location': get_element_by_attribute('itemprop', 'contentLocation', webpage), + 'duration': int_or_none(self._search_regex( + r'jwsetup\.seclength\s*=\s*(\d+);', + webpage, 'duration', fatal=False)), + 'view_count': str_to_int(self._search_regex( + r"<span[^>]+class='views'[^>]*>([\d,]+)\s+Views</span>", + webpage, 'views', fatal=False)), + }) + + # Obsolete # We first look for clipid, because clipprog always appears before patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] results = list(filter(None, (re.search(p, webpage) for p in patterns))) @@ -165,6 +212,7 @@ class CSpanIE(InfoExtractor): formats = self._extract_m3u8_formats( path, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }] + add_referer(formats) self._sort_formats(formats) entries.append({ 'id': '%s_%d' % (video_id, partnum + 1), diff --git a/haruhi_dl/extractor/ctv.py b/haruhi_dl/extractor/ctv.py new file mode 100644 index 000000000..756bcc2be --- /dev/null +++ b/haruhi_dl/extractor/ctv.py @@ -0,0 +1,52 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ctv\.ca/(?P<id>(?:show|movie)s/[^/]+/[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.ctv.ca/shows/your-morning/wednesday-december-23-2020-s5e88', + 'info_dict': { + 'id': '2102249', + 'ext': 'flv', + 'title': 'Wednesday, December 23, 2020', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'Your Morning delivers original perspectives and unique insights into the headlines of the day.', + 'timestamp': 1608732000, + 'upload_date': '20201223', + 'series': 'Your Morning', + 'season': '2020-2021', + 'season_number': 5, + 'episode_number': 88, + 'tags': ['Your Morning'], + 'categories': ['Talk Show'], + 'duration': 7467.126, + }, + }, { + 'url': 'https://www.ctv.ca/movies/adam-sandlers-eight-crazy-nights/adam-sandlers-eight-crazy-nights', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + content = self._download_json( + 'https://www.ctv.ca/space-graphql/graphql', display_id, query={ + 'query': '''{ + resolvedPath(path: "/%s") { + lastSegment { + content { + ... on AxisContent { + axisId + videoPlayerDestCode + } + } + } + } +}''' % display_id, + })['data']['resolvedPath']['lastSegment']['content'] + video_id = content['axisId'] + return self.url_result( + '9c9media:%s:%s' % (content['videoPlayerDestCode'], video_id), + 'NineCNineMedia', video_id) diff --git a/haruhi_dl/extractor/discoverynetworks.py b/haruhi_dl/extractor/discoverynetworks.py index 607a54948..c512b95d0 100644 --- a/haruhi_dl/extractor/discoverynetworks.py +++ b/haruhi_dl/extractor/discoverynetworks.py @@ -7,7 +7,7 @@ from .dplay import DPlayIE class DiscoveryNetworksDeIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show)/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)' _TESTS = [{ 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', @@ -29,6 +29,9 @@ class DiscoveryNetworksDeIE(DPlayIE): }, { 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', 'only_matching': True, + }, { + 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/haruhi_dl/extractor/dplay.py b/haruhi_dl/extractor/dplay.py index a7b9db568..bbb199094 100644 --- a/haruhi_dl/extractor/dplay.py +++ b/haruhi_dl/extractor/dplay.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor @@ -10,16 +11,23 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + strip_or_none, unified_timestamp, ) class DPlayIE(InfoExtractor): + _PATH_REGEX = r'/(?P<id>[^/]+/[^/?#]+)' _VALID_URL = r'''(?x)https?:// (?P<domain> - (?:www\.)?(?P<host>dplay\.(?P<country>dk|fi|jp|se|no))| + (?:www\.)?(?P<host>d + (?: + play\.(?P<country>dk|fi|jp|se|no)| + iscoveryplus\.(?P<plus_country>dk|es|fi|it|se|no) + ) + )| (?P<subdomain_country>es|it)\.dplay\.com - )/[^/]+/(?P<id>[^/]+/[^/?#]+)''' + )/[^/]+''' + _PATH_REGEX _TESTS = [{ # non geo restricted, via secure api, unsigned download hls URL @@ -126,58 +134,99 @@ class DPlayIE(InfoExtractor): }, { 'url': 'https://www.dplay.jp/video/gold-rush/24086', 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.no/videoer/i-kongens-klr/sesong-1-episode-7', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.it/videos/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.es/videos/la-fiebre-del-oro/temporada-8-episodio-1', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.fi/videot/shifting-gears-with-aaron-kaufman/episode-16', + 'only_matching': True, }] + def _process_errors(self, e, geo_countries): + info = self._parse_json(e.cause.read().decode('utf-8'), None) + error = info['errors'][0] + error_code = error.get('code') + if error_code == 'access.denied.geoblocked': + self.raise_geo_restricted(countries=geo_countries) + elif error_code in ('access.denied.missingpackage', 'invalid.token'): + raise ExtractorError( + 'This video is only available for registered users. You may want to use --cookies.', expected=True) + raise ExtractorError(info['errors'][0]['detail'], expected=True) + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers['Authorization'] = 'Bearer ' + self._download_json( + disco_base + 'token', display_id, 'Downloading token', + query={ + 'realm': realm, + })['data']['attributes']['token'] + + def _download_video_playback_info(self, disco_base, video_id, headers): + streaming = self._download_json( + disco_base + 'playback/videoPlaybackInfo/' + video_id, + video_id, headers=headers)['data']['attributes']['streaming'] + streaming_list = [] + for format_id, format_dict in streaming.items(): + streaming_list.append({ + 'type': format_id, + 'url': format_dict.get('url'), + }) + return streaming_list + def _get_disco_api_info(self, url, display_id, disco_host, realm, country): geo_countries = [country.upper()] self._initialize_geo_bypass({ 'countries': geo_countries, }) disco_base = 'https://%s/' % disco_host - token = self._download_json( - disco_base + 'token', display_id, 'Downloading token', - query={ - 'realm': realm, - })['data']['attributes']['token'] headers = { 'Referer': url, - 'Authorization': 'Bearer ' + token, } - video = self._download_json( - disco_base + 'content/videos/' + display_id, display_id, - headers=headers, query={ - 'fields[channel]': 'name', - 'fields[image]': 'height,src,width', - 'fields[show]': 'name', - 'fields[tag]': 'name', - 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', - 'include': 'images,primaryChannel,show,tags' - }) + self._update_disco_api_headers(headers, disco_base, display_id, realm) + try: + video = self._download_json( + disco_base + 'content/videos/' + display_id, display_id, + headers=headers, query={ + 'fields[channel]': 'name', + 'fields[image]': 'height,src,width', + 'fields[show]': 'name', + 'fields[tag]': 'name', + 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration', + 'include': 'images,primaryChannel,show,tags' + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + self._process_errors(e, geo_countries) + raise video_id = video['data']['id'] info = video['data']['attributes'] title = info['name'].strip() formats = [] try: - streaming = self._download_json( - disco_base + 'playback/videoPlaybackInfo/' + video_id, - display_id, headers=headers)['data']['attributes']['streaming'] + streaming = self._download_video_playback_info( + disco_base, video_id, headers) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - info = self._parse_json(e.cause.read().decode('utf-8'), display_id) - error = info['errors'][0] - error_code = error.get('code') - if error_code == 'access.denied.geoblocked': - self.raise_geo_restricted(countries=geo_countries) - elif error_code == 'access.denied.missingpackage': - self.raise_login_required() - raise ExtractorError(info['errors'][0]['detail'], expected=True) + self._process_errors(e, geo_countries) raise - for format_id, format_dict in streaming.items(): + for format_dict in streaming: if not isinstance(format_dict, dict): continue format_url = format_dict.get('url') if not format_url: continue + format_id = format_dict.get('type') ext = determine_ext(format_url) if format_id == 'dash' or ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -225,7 +274,7 @@ class DPlayIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'title': title, - 'description': info.get('description'), + 'description': strip_or_none(info.get('description')), 'duration': float_or_none(info.get('videoDuration'), 1000), 'timestamp': unified_timestamp(info.get('publishStart')), 'series': series, @@ -241,7 +290,80 @@ class DPlayIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') domain = mobj.group('domain').lstrip('www.') - country = mobj.group('country') or mobj.group('subdomain_country') - host = 'disco-api.' + domain if domain.startswith('dplay.') else 'eu2-prod.disco-api.com' + country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country') + host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' return self._get_disco_api_info( url, display_id, host, 'dplay' + country, country) + + +class DiscoveryPlusIE(DPlayIE): + _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family', + 'info_dict': { + 'id': '1140794', + 'display_id': 'property-brothers-forever-home/food-and-family', + 'ext': 'mp4', + 'title': 'Food and Family', + 'description': 'The brothers help a Richmond family expand their single-level home.', + 'duration': 2583.113, + 'timestamp': 1609304400, + 'upload_date': '20201230', + 'creator': 'HGTV', + 'series': 'Property Brothers: Forever Home', + 'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'Available for Premium users', + }] + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers['x-disco-client'] = 'WEB:UNKNOWN:dplus_us:15.0.0' + + def _download_video_playback_info(self, disco_base, video_id, headers): + return self._download_json( + disco_base + 'playback/v3/videoPlaybackInfo', + video_id, headers=headers, data=json.dumps({ + 'deviceInfo': { + 'adBlocker': False, + }, + 'videoId': video_id, + 'wisteriaProperties': { + 'platform': 'desktop', + 'product': 'dplus_us', + }, + }).encode('utf-8'))['data']['attributes']['streaming'] + + def _real_extract(self, url): + display_id = self._match_id(url) + return self._get_disco_api_info( + url, display_id, 'us1-prod-direct.discoveryplus.com', 'go', 'us') + + +class HGTVDeIE(DPlayIE): + _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/', + 'info_dict': { + 'id': '151205', + 'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette', + 'ext': 'mp4', + 'title': 'Wer braucht schon eine Toilette', + 'description': 'md5:05b40a27e7aed2c9172de34d459134e2', + 'duration': 1177.024, + 'timestamp': 1595705400, + 'upload_date': '20200725', + 'creator': 'HGTV', + 'series': 'Tiny House - klein, aber oho', + 'season_number': 3, + 'episode_number': 3, + }, + 'params': { + 'format': 'bestvideo', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + return self._get_disco_api_info( + url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de') diff --git a/haruhi_dl/extractor/drtv.py b/haruhi_dl/extractor/drtv.py index 390e79f8c..c0036adb6 100644 --- a/haruhi_dl/extractor/drtv.py +++ b/haruhi_dl/extractor/drtv.py @@ -29,7 +29,7 @@ class DRTVIE(InfoExtractor): https?:// (?: (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*| - (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode)/ + (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/ ) (?P<id>[\da-z_-]+) ''' @@ -111,6 +111,9 @@ class DRTVIE(InfoExtractor): }, { 'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769', 'only_matching': True, + }, { + 'url': 'https://www.dr.dk/drtv/program/jagten_220924', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/haruhi_dl/extractor/egghead.py b/haruhi_dl/extractor/egghead.py index df11dc206..aff9b88c0 100644 --- a/haruhi_dl/extractor/egghead.py +++ b/haruhi_dl/extractor/egghead.py @@ -12,7 +12,14 @@ from ..utils import ( ) -class EggheadCourseIE(InfoExtractor): +class EggheadBaseIE(InfoExtractor): + def _call_api(self, path, video_id, resource, fatal=True): + return self._download_json( + 'https://app.egghead.io/api/v1/' + path, + video_id, 'Downloading %s JSON' % resource, fatal=fatal) + + +class EggheadCourseIE(EggheadBaseIE): IE_DESC = 'egghead.io course' IE_NAME = 'egghead:course' _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)' @@ -28,10 +35,9 @@ class EggheadCourseIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - - lessons = self._download_json( - 'https://egghead.io/api/v1/series/%s/lessons' % playlist_id, - playlist_id, 'Downloading course lessons JSON') + series_path = 'series/' + playlist_id + lessons = self._call_api( + series_path + '/lessons', playlist_id, 'course lessons') entries = [] for lesson in lessons: @@ -44,9 +50,8 @@ class EggheadCourseIE(InfoExtractor): entries.append(self.url_result( lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id)) - course = self._download_json( - 'https://egghead.io/api/v1/series/%s' % playlist_id, - playlist_id, 'Downloading course JSON', fatal=False) or {} + course = self._call_api( + series_path, playlist_id, 'course', False) or {} playlist_id = course.get('id') if playlist_id: @@ -57,7 +62,7 @@ class EggheadCourseIE(InfoExtractor): course.get('description')) -class EggheadLessonIE(InfoExtractor): +class EggheadLessonIE(EggheadBaseIE): IE_DESC = 'egghead.io lesson' IE_NAME = 'egghead:lesson' _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)' @@ -74,7 +79,7 @@ class EggheadLessonIE(InfoExtractor): 'upload_date': '20161209', 'duration': 304, 'view_count': 0, - 'tags': ['javascript', 'free'], + 'tags': 'count:2', }, 'params': { 'skip_download': True, @@ -88,8 +93,8 @@ class EggheadLessonIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - lesson = self._download_json( - 'https://egghead.io/api/v1/lessons/%s' % display_id, display_id) + lesson = self._call_api( + 'lessons/' + display_id, display_id, 'lesson') lesson_id = compat_str(lesson['id']) title = lesson['title'] diff --git a/haruhi_dl/extractor/eporner.py b/haruhi_dl/extractor/eporner.py index fe42821c7..bfecd3a41 100644 --- a/haruhi_dl/extractor/eporner.py +++ b/haruhi_dl/extractor/eporner.py @@ -16,7 +16,7 @@ from ..utils import ( class EpornerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?eporner\.com/(?:hd-porn|embed)/(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?' + _VALID_URL = r'https?://(?:www\.)?eporner\.com/(?:(?:hd-porn|embed)/|video-)(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?' _TESTS = [{ 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', 'md5': '39d486f046212d8e1b911c52ab4691f8', @@ -43,7 +43,10 @@ class EpornerIE(InfoExtractor): 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', 'only_matching': True, }, { - 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', + 'url': 'http://www.eporner.com/embed/3YRUtzMcWn0', + 'only_matching': True, + }, { + 'url': 'https://www.eporner.com/video-FJsA19J3Y3H/one-of-the-greats/', 'only_matching': True, }] @@ -57,7 +60,7 @@ class EpornerIE(InfoExtractor): video_id = self._match_id(urlh.geturl()) hash = self._search_regex( - r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash') + r'hash\s*[:=]\s*["\']([\da-f]{32})', webpage, 'hash') title = self._og_search_title(webpage, default=None) or self._html_search_regex( r'<title>(.+?) - EPORNER', webpage, 'title') @@ -115,8 +118,8 @@ class EpornerIE(InfoExtractor): duration = parse_duration(self._html_search_meta( 'duration', webpage, default=None)) view_count = str_to_int(self._search_regex( - r'id="cinemaviews">\s*([0-9,]+)\s*<small>views', - webpage, 'view count', fatal=False)) + r'id=["\']cinemaviews1["\'][^>]*>\s*([0-9,]+)', + webpage, 'view count', default=None)) return merge_dicts(json_ld, { 'id': video_id, diff --git a/haruhi_dl/extractor/everyonesmixtape.py b/haruhi_dl/extractor/everyonesmixtape.py deleted file mode 100644 index 84a9b750e..000000000 --- a/haruhi_dl/extractor/everyonesmixtape.py +++ /dev/null @@ -1,77 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - sanitized_Request, -) - - -class EveryonesMixtapeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?everyonesmixtape\.com/#/mix/(?P<id>[0-9a-zA-Z]+)(?:/(?P<songnr>[0-9]))?$' - - _TESTS = [{ - 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi/5', - 'info_dict': { - 'id': '5bfseWNmlds', - 'ext': 'mp4', - 'title': "Passion Pit - \"Sleepyhead\" (Official Music Video)", - 'uploader': 'FKR.TV', - 'uploader_id': 'frenchkissrecords', - 'description': "Music video for \"Sleepyhead\" from Passion Pit's debut EP Chunk Of Change.\nBuy on iTunes: https://itunes.apple.com/us/album/chunk-of-change-ep/id300087641\n\nDirected by The Wilderness.\n\nhttp://www.passionpitmusic.com\nhttp://www.frenchkissrecords.com", - 'upload_date': '20081015' - }, - 'params': { - 'skip_download': True, # This is simply YouTube - } - }, { - 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi', - 'info_dict': { - 'id': 'm7m0jJAbMQi', - 'title': 'Driving', - }, - 'playlist_count': 24 - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') - - pllist_url = 'http://everyonesmixtape.com/mixtape.php?a=getMixes&u=-1&linked=%s&explore=' % playlist_id - pllist_req = sanitized_Request(pllist_url) - pllist_req.add_header('X-Requested-With', 'XMLHttpRequest') - - playlist_list = self._download_json( - pllist_req, playlist_id, note='Downloading playlist metadata') - try: - playlist_no = next(playlist['id'] - for playlist in playlist_list - if playlist['code'] == playlist_id) - except StopIteration: - raise ExtractorError('Playlist id not found') - - pl_url = 'http://everyonesmixtape.com/mixtape.php?a=getMix&id=%s&userId=null&code=' % playlist_no - pl_req = sanitized_Request(pl_url) - pl_req.add_header('X-Requested-With', 'XMLHttpRequest') - playlist = self._download_json( - pl_req, playlist_id, note='Downloading playlist info') - - entries = [{ - '_type': 'url', - 'url': t['url'], - 'title': t['title'], - } for t in playlist['tracks']] - - if mobj.group('songnr'): - songnr = int(mobj.group('songnr')) - 1 - return entries[songnr] - - playlist_title = playlist['mixData']['name'] - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': playlist_title, - 'entries': entries, - } diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 412d02955..db6166a93 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -30,7 +30,11 @@ from .adobetv import ( from .adultswim import AdultSwimIE from .aenetworks import ( AENetworksIE, + AENetworksCollectionIE, + AENetworksShowIE, HistoryTopicIE, + HistoryPlayerIE, + BiographyIE, ) from .afreecatv import AfreecaTVIE from .agora import ( @@ -43,8 +47,12 @@ from .airmozilla import AirMozillaIE from .albicla import AlbiclaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE +from .amara import AmaraIE from .amcnetworks import AMCNetworksIE -from .americastestkitchen import AmericasTestKitchenIE +from .americastestkitchen import ( + AmericasTestKitchenIE, + AmericasTestKitchenSeasonIE, +) from .animeondemand import AnimeOnDemandIE from .anvato import AnvatoIE from .aol import AolIE @@ -60,7 +68,9 @@ from .appletrailers import ( AppleTrailersIE, AppleTrailersSectionIE, ) +from .applepodcasts import ApplePodcastsIE from .archiveorg import ArchiveOrgIE +from .arcpublishing import ArcPublishingIE from .arkena import ArkenaIE from .ard import ( ARDBetaMediathekIE, @@ -68,7 +78,7 @@ from .ard import ( ARDMediathekIE, ) from .arte import ( - ArteTVPlus7IE, + ArteTVIE, ArteTVEmbedIE, ArteTVPlaylistIE, ) @@ -98,16 +108,18 @@ from .bbc import ( BBCCoUkPlaylistIE, BBCIE, ) -from .beampro import ( - BeamProLiveIE, - BeamProVodIE, -) from .beeg import BeegIE from .behindkink import BehindKinkIE from .bellmedia import BellMediaIE from .beatport import BeatportIE from .bet import BetIE from .bfi import BFIPlayerIE +from .bfmtv import ( + BFMTVIE, + BFMTVLiveIE, + BFMTVArticleIE, +) +from .bibeltv import BibelTVIE from .bigflix import BigflixIE from .bild import BildIE from .bilibili import ( @@ -130,7 +142,9 @@ from .bleacherreport import ( from .blinkx import BlinkxIE from .bloomberg import BloombergIE from .bokecc import BokeCCIE +from .bongacams import BongaCamsIE from .bostonglobe import BostonGlobeIE +from .box import BoxIE from .bpb import BpbIE from .br import ( BRIE, @@ -159,6 +173,7 @@ from .canvas import ( CanvasIE, CanvasEenIE, VrtNUIE, + DagelijkseKostIE, ) from .carambatv import ( CarambaTVIE, @@ -173,7 +188,10 @@ from .cbc import ( CBCOlympicsIE, ) from .cbs import CBSIE -from .cbslocal import CBSLocalIE +from .cbslocal import ( + CBSLocalIE, + CBSLocalArticleIE, +) from .cbsinteractive import CBSInteractiveIE from .cbsnews import ( CBSNewsEmbedIE, @@ -228,11 +246,8 @@ from .cnn import ( ) from .coub import CoubIE from .comedycentral import ( - ComedyCentralFullEpisodesIE, ComedyCentralIE, - ComedyCentralShortnameIE, ComedyCentralTVIE, - ToshIE, ) from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .commonprotocols import ( @@ -251,6 +266,7 @@ from .crunchyroll import ( ) from .cspan import CSpanIE from .ctsnews import CtsNewsIE +from .ctv import CTVIE from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE from .curiositystream import ( @@ -282,7 +298,11 @@ from .douyutv import ( DouyuShowIE, DouyuTVIE, ) -from .dplay import DPlayIE +from .dplay import ( + DPlayIE, + DiscoveryPlusIE, + HGTVDeIE, +) from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE @@ -344,7 +364,6 @@ from .eurozet import ( EurozetPlayerPodcastIE, EurozetPlayerMusicStreamIE, ) -from .everyonesmixtape import EveryonesMixtapeIE from .expotv import ExpoTVIE from .expressen import ExpressenIE from .extremetube import ExtremeTubeIE @@ -408,6 +427,7 @@ from .frontendmasters import ( FrontendMastersLessonIE, FrontendMastersCourseIE ) +from .fujitv import FujiTVFODPlus7IE from .funimation import FunimationIE from .funk import FunkIE from .funkwhale import ( @@ -419,7 +439,6 @@ from .funkwhale import ( FunkwhaleRadioSHIE, ) from .fusion import FusionIE -from .fxnetworks import FXNetworksIE from .gaia import GaiaIE from .gameinformer import GameInformerIE from .gamespot import GameSpotIE @@ -427,6 +446,7 @@ from .gamestar import GameStarIE from .gaskrank import GaskrankIE from .gazeta import GazetaIE from .gdcvault import GDCVaultIE +from .gedidigital import GediDigitalIE from .generic import GenericIE from .gfycat import GfycatIE from .giantbomb import GiantBombIE @@ -440,7 +460,10 @@ from .go import GoIE from .godtube import GodTubeIE from .golem import GolemIE from .googledrive import GoogleDriveIE -from .googleplus import GooglePlusIE +from .googlepodcasts import ( + GooglePodcastsIE, + GooglePodcastsFeedIE, +) from .googlesearch import GoogleSearchIE from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE @@ -483,8 +506,12 @@ from .hungama import ( from .hypem import HypemIE from .ign import ( IGNIE, - OneUPIE, - PCMagIE, + IGNVideoIE, + IGNArticleIE, +) +from .iheart import ( + IHeartRadioIE, + IHeartRadioPodcastIE, ) from .imdb import ( ImdbIE, @@ -531,13 +558,15 @@ from .joj import JojIE from .jwplatform import JWPlatformIE from .kakao import KakaoIE from .kaltura import KalturaIE -from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE from .ketnet import KetnetIE -from .khanacademy import KhanAcademyIE +from .khanacademy import ( + KhanAcademyIE, + KhanAcademyUnitIE, +) from .kickstarter import KickStarterIE from .kinja import KinjaEmbedIE from .kinopoisk import KinoPoiskIE @@ -638,6 +667,7 @@ from .mastodon import MastodonSHIE from .massengeschmacktv import MassengeschmackTVIE from .matchtv import MatchTVIE from .mdr import MDRIE +from .medaltv import MedalTVIE from .mediaset import MediasetIE from .mediasite import ( MediasiteIE, @@ -658,6 +688,11 @@ from .microsoftvirtualacademy import ( MicrosoftVirtualAcademyIE, MicrosoftVirtualAcademyCourseIE, ) +from .minds import ( + MindsIE, + MindsChannelIE, + MindsGroupIE, +) from .ministrygrid import MinistryGridIE from .minoto import MinotoIE from .miomio import MioMioIE @@ -708,9 +743,15 @@ from .nationalgeographic import ( NationalGeographicTVIE, ) from .naver import NaverIE -from .nba import NBAIE +from .nba import ( + NBAWatchEmbedIE, + NBAWatchIE, + NBAWatchCollectionIE, + NBAEmbedIE, + NBAIE, + NBAChannelIE, +) from .nbc import ( - CSNNEIE, NBCIE, NBCNewsIE, NBCOlympicsIE, @@ -753,8 +794,14 @@ from .nexx import ( NexxIE, NexxEmbedIE, ) -from .nfl import NFLIE -from .nhk import NhkVodIE +from .nfl import ( + NFLIE, + NFLArticleIE, +) +from .nhk import ( + NhkVodIE, + NhkVodProgramIE, +) from .nhl import NHLIE from .nick import ( NickIE, @@ -804,6 +851,7 @@ from .nrk import ( NRKSkoleIE, NRKTVIE, NRKTVDirekteIE, + NRKRadioPodkastIE, NRKTVEpisodeIE, NRKTVEpisodesIE, NRKTVSeasonIE, @@ -816,6 +864,7 @@ from .ntvru import NTVRuIE from .nytimes import ( NYTimesIE, NYTimesArticleIE, + NYTimesCookingIE, ) from .nuvid import NuvidIE from .nzz import NZZIE @@ -879,6 +928,10 @@ from .picarto import ( ) from .piksel import PikselIE from .pinkbike import PinkbikeIE +from .pinterest import ( + PinterestIE, + PinterestCollectionIE, +) from .pladform import PladformIE from .platzi import ( PlatziIE, @@ -1005,6 +1058,7 @@ from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETe from .rtvnh import RTVNHIE from .rtvs import RTVSIE from .ruhd import RUHDIE +from .rumble import RumbleEmbedIE from .rutube import ( RutubeIE, RutubeChannelIE, @@ -1021,6 +1075,7 @@ from .safari import ( SafariApiIE, SafariCourseIE, ) +from .samplefocus import SampleFocusIE from .sapo import SapoIE from .savefrom import SaveFromIE from .sbs import SBSIE @@ -1053,8 +1108,23 @@ from .shared import ( VivoIE, ) from .showroomlive import ShowRoomLiveIE +from .simplecast import ( + SimplecastIE, + SimplecastEpisodeIE, + SimplecastPodcastIE, +) from .sina import SinaIE from .sixplay import SixPlayIE +from .skyit import ( + SkyItPlayerIE, + SkyItVideoIE, + SkyItVideoLiveIE, + SkyItIE, + SkyItAcademyIE, + SkyItArteIE, + CieloTVItIE, + TV8ItIE, +) from .skylinewebcams import SkylineWebcamsIE from .skynewsarabia import ( SkyNewsArabiaIE, @@ -1063,16 +1133,11 @@ from .skynewsarabia import ( from .sky import ( SkyNewsIE, SkySportsIE, + SkySportsNewsIE, ) from .slideshare import SlideshareIE from .slideslive import SlidesLiveIE from .slutload import SlutloadIE -from .smotri import ( - SmotriIE, - SmotriCommunityIE, - SmotriUserIE, - SmotriBroadcastIE, -) from .snotr import SnotrIE from .sohu import SohuIE from .sonyliv import SonyLIVIE @@ -1101,16 +1166,28 @@ from .spankbang import ( SpankBangPlaylistIE, ) from .spankwire import SpankwireIE -from .spiegel import SpiegelIE, SpiegelArticleIE -from .spiegeltv import SpiegeltvIE +from .spiegel import SpiegelIE from .spike import ( BellatorIE, ParamountNetworkIE, ) -from .stitcher import StitcherIE +from .stitcher import ( + StitcherIE, + StitcherShowIE, +) from .sport5 import Sport5IE from .sportbox import SportBoxIE from .sportdeutschland import SportDeutschlandIE +from .spotify import ( + SpotifyIE, + SpotifyShowIE, +) +from .spreaker import ( + SpreakerIE, + SpreakerPageIE, + SpreakerShowIE, + SpreakerShowPageIE, +) from .springboardplatform import SpringboardPlatformIE from .sprout import SproutIE from .srgssr import ( @@ -1120,6 +1197,11 @@ from .srgssr import ( from .srmediathek import SRMediathekIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE +from .storyfire import ( + StoryFireIE, + StoryFireUserIE, + StoryFireSeriesIE, +) from .streamable import StreamableIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE @@ -1145,7 +1227,6 @@ from .tagesschau import ( TagesschauIE, ) from .tass import TassIE -from .tastytrade import TastyTradeIE from .tbs import TBSIE from .tdslifeway import TDSLifewayIE from .teachable import ( @@ -1172,6 +1253,7 @@ from .telequebec import ( TeleQuebecSquatIE, TeleQuebecEmissionIE, TeleQuebecLiveIE, + TeleQuebecVideoIE, ) from .teletask import TeleTaskIE from .telewebion import TelewebionIE @@ -1208,7 +1290,10 @@ from .tnaflix import ( EMPFlixIE, MovieFapIE, ) -from .toggle import ToggleIE +from .toggle import ( + ToggleIE, + MeWatchIE, +) from .tonline import TOnlineIE from .toongoggles import ToonGogglesIE from .toutv import TouTvIE @@ -1219,6 +1304,10 @@ from .transistorfm import ( TransistorFMShareIE, ) from .trilulilu import TriluliluIE +from .trovo import ( + TrovoIE, + TrovoVodIE, +) from .trunews import TruNewsIE from .trutv import TruTVIE from .tubafm import ( @@ -1241,6 +1330,7 @@ from .tv2 import ( TV2IE, TV2ArticleIE, KatsomoIE, + MTVUutisetArticleIE, ) from .tv2dk import ( TV2DKIE, @@ -1249,7 +1339,14 @@ from .tv2dk import ( from .tv2hu import TV2HuIE from .tv4 import TV4IE from .tv5mondeplus import TV5MondePlusIE -from .tva import TVAIE +from .tv5unis import ( + TV5UnisVideoIE, + TV5UnisIE, +) +from .tva import ( + TVAIE, + QubIE, +) from .tvanouvelles import ( TVANouvellesIE, TVANouvellesArticleIE, @@ -1258,6 +1355,7 @@ from .tvc import ( TVCIE, TVCArticleIE, ) +from .tver import TVerIE from .tvigle import TvigleIE from .tvland import TVLandIE from .tvn24 import TVN24IE @@ -1376,7 +1474,6 @@ from .vidme import ( VidmeUserIE, VidmeUserLikesIE, ) -from .vidzi import VidziIE from .vier import VierIE, VierVideosIE from .viewlift import ( ViewLiftIE, @@ -1436,10 +1533,14 @@ from .vrv import ( VRVSeriesIE, ) from .vshare import VShareIE +from .vtm import VTMIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE -from .vvvvid import VVVVIDIE +from .vvvvid import ( + VVVVIDIE, + VVVVIDShowIE, +) from .vyborymos import VyboryMosIE from .vzaar import VzaarIE from .wakanim import WakanimIE @@ -1551,7 +1652,6 @@ from .youtube import ( YoutubeTruncatedURLIE, ) from .zapiks import ZapiksIE -from .zaq1 import Zaq1IE from .zattoo import ( BBVTVIE, EinsUndEinsTVIE, @@ -1572,5 +1672,6 @@ from .zattoo import ( ZattooLiveIE, ) from .zdf import ZDFIE, ZDFChannelIE +from .zhihu import ZhihuIE from .zingmp3 import ZingMp3IE from .zype import ZypeIE diff --git a/haruhi_dl/extractor/facebook.py b/haruhi_dl/extractor/facebook.py index 72781bd80..5dc931b86 100644 --- a/haruhi_dl/extractor/facebook.py +++ b/haruhi_dl/extractor/facebook.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re import socket @@ -8,6 +9,7 @@ from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, compat_http_client, + compat_str, compat_urllib_error, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, @@ -16,14 +18,17 @@ from ..utils import ( clean_html, error_to_compat_str, ExtractorError, + float_or_none, get_element_by_id, int_or_none, js_to_json, limit_length, parse_count, + qualities, sanitized_Request, try_get, urlencode_postdata, + urljoin, ) @@ -39,11 +44,13 @@ class FacebookIE(InfoExtractor): photo\.php| video\.php| video/embed| - story\.php + story\.php| + watch(?:/live)?/? )\?(?:.*?)(?:v|video_id|story_fbid)=| [^/]+/videos/(?:[^/]+/)?| [^/]+/posts/| - groups/[^/]+/permalink/ + groups/[^/]+/permalink/| + watchparty/ )| facebook: ) @@ -54,8 +61,6 @@ class FacebookIE(InfoExtractor): _NETRC_MACHINE = 'facebook' IE_NAME = 'facebook' - _CHROME_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36' - _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s' _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' @@ -72,6 +77,7 @@ class FacebookIE(InfoExtractor): }, 'skip': 'Requires logging in', }, { + # data.video 'url': 'https://www.facebook.com/video.php?v=274175099429670', 'info_dict': { 'id': '274175099429670', @@ -133,6 +139,7 @@ class FacebookIE(InfoExtractor): }, }, { # have 1080P, but only up to 720p in swf params + # data.video.story.attachments[].media 'url': 'https://www.facebook.com/cnn/videos/10155529876156509/', 'md5': '9571fae53d4165bbbadb17a94651dcdc', 'info_dict': { @@ -147,6 +154,7 @@ class FacebookIE(InfoExtractor): }, }, { # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/', 'info_dict': { 'id': '1417995061575415', @@ -174,6 +182,7 @@ class FacebookIE(InfoExtractor): 'skip_download': True, }, }, { + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/', 'info_dict': { 'id': '1396382447100162', @@ -193,18 +202,23 @@ class FacebookIE(InfoExtractor): 'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf', 'only_matching': True, }, { + # data.mediaset.currMedia.edges 'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater', 'only_matching': True, }, { + # data.video.story.attachments[].media 'url': 'facebook:544765982287235', 'only_matching': True, }, { + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', 'only_matching': True, }, { + # data.video.creation_story.attachments[].media 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', 'only_matching': True, }, { + # data.video 'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670', 'only_matching': True, }, { @@ -212,6 +226,7 @@ class FacebookIE(InfoExtractor): 'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/', 'only_matching': True, }, { + # data.video 'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/', 'info_dict': { 'id': '359649331226507', @@ -222,7 +237,64 @@ class FacebookIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media + 'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/', + 'info_dict': { + 'id': '106560053808006', + }, + 'playlist_count': 2, + }, { + # data.video.story.attachments[].media + 'url': 'https://www.facebook.com/watch/?v=647537299265662', + 'only_matching': True, + }, { + # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media + 'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271', + 'info_dict': { + 'id': '10157667649866271', + }, + 'playlist_count': 3, + }, { + # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media + 'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330', + 'info_dict': { + 'id': '117576630041613', + 'ext': 'mp4', + # TODO: title can be extracted from video page + 'title': 'Facebook video #117576630041613', + 'uploader_id': '189393014416438', + 'upload_date': '20201123', + 'timestamp': 1606162592, + }, + 'skip': 'Requires logging in', + }, { + # node.comet_sections.content.story.attached_story.attachments.style_type_renderer.attachment.media + 'url': 'https://www.facebook.com/groups/ateistiskselskab/permalink/10154930137678856/', + 'info_dict': { + 'id': '211567722618337', + 'ext': 'mp4', + 'title': 'Facebook video #211567722618337', + 'uploader_id': '127875227654254', + 'upload_date': '20161122', + 'timestamp': 1479793574, + }, + }, { + # data.video.creation_story.attachments[].media + 'url': 'https://www.facebook.com/watch/live/?v=1823658634322275', + 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/watchparty/211641140192478', + 'info_dict': { + 'id': '211641140192478', + }, + 'playlist_count': 1, + 'skip': 'Requires logging in', }] + _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)' + _api_config = { + 'graphURI': '/api/graphql/' + } @staticmethod def _extract_urls(webpage, **kwargs): @@ -305,23 +377,24 @@ class FacebookIE(InfoExtractor): def _real_initialize(self): self._login() - def _extract_from_url(self, url, video_id, fatal_if_no_video=True): - req = sanitized_Request(url) - req.add_header('User-Agent', self._CHROME_USER_AGENT) - webpage = self._download_webpage(req, video_id) + def _extract_from_url(self, url, video_id): + webpage = self._download_webpage( + url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) video_data = None def extract_video_data(instances): + video_data = [] for item in instances: - if item[1][0] == 'VideoConfig': + if try_get(item, lambda x: x[1][0]) == 'VideoConfig': video_item = item[2][0] if video_item.get('video_id'): - return video_item['videoData'] + video_data.append(video_item['videoData']) + return video_data server_js_data = self._parse_json(self._search_regex( - r'handleServerJS\(({.+})(?:\);|,")', webpage, - 'server js data', default='{}'), video_id, fatal=False) + [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'], + webpage, 'server js data', default='{}'), video_id, fatal=False) if server_js_data: video_data = extract_video_data(server_js_data.get('instances', [])) @@ -331,17 +404,118 @@ class FacebookIE(InfoExtractor): return extract_video_data(try_get( js_data, lambda x: x['jsmods']['instances'], list) or []) + def extract_dash_manifest(video, formats): + dash_manifest = video.get('dash_manifest') + if dash_manifest: + formats.extend(self._parse_mpd_formats( + compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) + + def process_formats(formats): + # Downloads with browser's User-Agent are rate limited. Working around + # with non-browser User-Agent. + for f in formats: + f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' + + self._sort_formats(formats) + + def extract_relay_data(_filter): + return self._parse_json(self._search_regex( + r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter, + webpage, 'replay data', default='{}'), video_id, fatal=False) or {} + + def extract_relay_prefetched_data(_filter): + replay_data = extract_relay_data(_filter) + for require in (replay_data.get('require') or []): + if require[0] == 'RelayPrefetchedStreamCache': + return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} + if not video_data: - server_js_data = self._parse_json( - self._search_regex( - r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)', - webpage, 'js data', default='{}'), - video_id, transform_source=js_to_json, fatal=False) + server_js_data = self._parse_json(self._search_regex([ + r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX, + r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"%s".*?})\);' % self._SUPPORTED_PAGLETS_REGEX + ], webpage, 'js data', default='{}'), video_id, js_to_json, False) video_data = extract_from_jsmods_instances(server_js_data) if not video_data: - if not fatal_if_no_video: - return webpage, False + data = extract_relay_prefetched_data( + r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"') + if data: + entries = [] + + def parse_graphql_video(video): + formats = [] + q = qualities(['sd', 'hd']) + for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]: + playable_url = video.get('playable_url' + suffix) + if not playable_url: + continue + formats.append({ + 'format_id': format_id, + 'quality': q(format_id), + 'url': playable_url, + }) + extract_dash_manifest(video, formats) + process_formats(formats) + v_id = video.get('videoId') or video.get('id') or video_id + info = { + 'id': v_id, + 'formats': formats, + 'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']), + 'uploader_id': try_get(video, lambda x: x['owner']['id']), + 'timestamp': int_or_none(video.get('publish_time')), + 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), + } + description = try_get(video, lambda x: x['savable_description']['text']) + title = video.get('name') + if title: + info.update({ + 'title': title, + 'description': description, + }) + else: + info['title'] = description or 'Facebook video #%s' % v_id + entries.append(info) + + def parse_attachment(attachment, key='media'): + media = attachment.get(key) or {} + if media.get('__typename') == 'Video': + return parse_graphql_video(media) + + nodes = data.get('nodes') or [] + node = data.get('node') or {} + if not nodes and node: + nodes.append(node) + for node in nodes: + story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {} + attachments = try_get(story, [ + lambda x: x['attached_story']['attachments'], + lambda x: x['attachments'] + ], list) or [] + for attachment in attachments: + attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict) + ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or [] + for n in ns: + parse_attachment(n) + parse_attachment(attachment) + + edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or [] + for edge in edges: + parse_attachment(edge, key='node') + + video = data.get('video') or {} + if video: + attachments = try_get(video, [ + lambda x: x['story']['attachments'], + lambda x: x['creation_story']['attachments'] + ], list) or [] + for attachment in attachments: + parse_attachment(attachment) + if not entries: + parse_graphql_video(video) + + return self.playlist_result(entries, video_id) + + if not video_data: m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage) if m_msg is not None: raise ExtractorError( @@ -350,6 +524,43 @@ class FacebookIE(InfoExtractor): elif '>You must log in to continue' in webpage: self.raise_login_required() + if not video_data and '/watchparty/' in url: + post_data = { + 'doc_id': 3731964053542869, + 'variables': json.dumps({ + 'livingRoomID': video_id, + }), + } + + prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{') + if prefetched_data: + lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict) + if lsd: + post_data[lsd['name']] = lsd['value'] + + relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,') + for define in (relay_data.get('define') or []): + if define[0] == 'RelayAPIConfigDefaults': + self._api_config = define[2] + + living_room = self._download_json( + urljoin(url, self._api_config['graphURI']), video_id, + data=urlencode_postdata(post_data))['data']['living_room'] + + entries = [] + for edge in (try_get(living_room, lambda x: x['recap']['watched_content']['edges']) or []): + video = try_get(edge, lambda x: x['node']['video']) or {} + v_id = video.get('id') + if not v_id: + continue + v_id = compat_str(v_id) + entries.append(self.url_result( + self._VIDEO_PAGE_TEMPLATE % v_id, + self.ie_key(), v_id, video.get('name'))) + + return self.playlist_result(entries, video_id) + + if not video_data: # Video info not in first request, do a secondary request using # tahoe player specific URL tahoe_data = self._download_webpage( @@ -379,8 +590,19 @@ class FacebookIE(InfoExtractor): if not video_data: raise ExtractorError('Cannot parse data') - subtitles = {} + if len(video_data) > 1: + entries = [] + for v in video_data: + video_url = v[0].get('video_url') + if not video_url: + continue + entries.append(self.url_result(urljoin( + url, video_url), self.ie_key(), v[0].get('video_id'))) + return self.playlist_result(entries, video_id) + video_data = video_data[0] + formats = [] + subtitles = {} for f in video_data: format_id = f['stream_type'] if f and isinstance(f, dict): @@ -399,22 +621,14 @@ class FacebookIE(InfoExtractor): 'url': src, 'preference': preference, }) - dash_manifest = f[0].get('dash_manifest') - if dash_manifest: - formats.extend(self._parse_mpd_formats( - compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) + extract_dash_manifest(f[0], formats) subtitles_src = f[0].get('subtitles_src') if subtitles_src: subtitles.setdefault('en', []).append({'url': subtitles_src}) if not formats: raise ExtractorError('Cannot find video formats') - # Downloads with browser's User-Agent are rate limited. Working around - # with non-browser User-Agent. - for f in formats: - f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' - - self._sort_formats(formats) + process_formats(formats) video_title = self._html_search_regex( r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, @@ -454,35 +668,13 @@ class FacebookIE(InfoExtractor): 'subtitles': subtitles, } - return webpage, info_dict + return info_dict def _real_extract(self, url): video_id = self._match_id(url) real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url - webpage, info_dict = self._extract_from_url(real_url, video_id, fatal_if_no_video=False) - - if info_dict: - return info_dict - - if '/posts/' in url: - video_id_json = self._search_regex( - r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])', webpage, 'video ids', group='ids', - default='') - if video_id_json: - entries = [ - self.url_result('facebook:%s' % vid, FacebookIE.ie_key()) - for vid in self._parse_json(video_id_json, video_id)] - return self.playlist_result(entries, video_id) - - # Single Video? - video_id = self._search_regex(r'video_id:\s*"([0-9]+)"', webpage, 'single video id') - return self.url_result('facebook:%s' % video_id, FacebookIE.ie_key()) - else: - _, info_dict = self._extract_from_url( - self._VIDEO_PAGE_TEMPLATE % video_id, - video_id, fatal_if_no_video=True) - return info_dict + return self._extract_from_url(real_url, video_id) class FacebookPluginsVideoIE(InfoExtractor): diff --git a/haruhi_dl/extractor/franceculture.py b/haruhi_dl/extractor/franceculture.py index 306b45fc9..14f4cb489 100644 --- a/haruhi_dl/extractor/franceculture.py +++ b/haruhi_dl/extractor/franceculture.py @@ -11,7 +11,7 @@ from ..utils import ( class FranceCultureIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', 'info_dict': { 'id': 'rendez-vous-au-pays-des-geeks', @@ -20,10 +20,14 @@ class FranceCultureIE(InfoExtractor): 'title': 'Rendez-vous au pays des geeks', 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140301', - 'timestamp': 1393642916, + 'timestamp': 1393700400, 'vcodec': 'none', } - } + }, { + # no thumbnail + 'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) @@ -36,19 +40,19 @@ class FranceCultureIE(InfoExtractor): </h1>| <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*> ).*? - (<button[^>]+data-asset-source="[^"]+"[^>]+>) + (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>) ''', webpage, 'video data')) - video_url = video_data['data-asset-source'] - title = video_data.get('data-asset-title') or self._og_search_title(webpage) + video_url = video_data.get('data-url') or video_data['data-asset-source'] + title = video_data.get('data-asset-title') or video_data.get('data-diffusion-title') or self._og_search_title(webpage) description = self._html_search_regex( r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>', webpage, 'description', default=None) thumbnail = self._search_regex( r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"', - webpage, 'thumbnail', fatal=False) + webpage, 'thumbnail', default=None) uploader = self._html_search_regex( r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None) @@ -64,6 +68,6 @@ class FranceCultureIE(InfoExtractor): 'ext': ext, 'vcodec': 'none' if ext == 'mp3' else None, 'uploader': uploader, - 'timestamp': int_or_none(video_data.get('data-asset-created-date')), + 'timestamp': int_or_none(video_data.get('data-start-time')) or int_or_none(video_data.get('data-asset-created-date')), 'duration': int_or_none(video_data.get('data-duration')), } diff --git a/haruhi_dl/extractor/franceinter.py b/haruhi_dl/extractor/franceinter.py index 05806895c..ae822a50e 100644 --- a/haruhi_dl/extractor/franceinter.py +++ b/haruhi_dl/extractor/franceinter.py @@ -16,6 +16,7 @@ class FranceInterIE(InfoExtractor): 'ext': 'mp3', 'title': 'Affaire Cahuzac : le contentieux du compte en Suisse', 'description': 'md5:401969c5d318c061f86bda1fa359292b', + 'thumbnail': r're:^https?://.*\.jpg', 'upload_date': '20160907', }, } @@ -31,6 +32,7 @@ class FranceInterIE(InfoExtractor): title = self._og_search_title(webpage) description = self._og_search_description(webpage) + thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage) upload_date_str = self._search_regex( r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<', @@ -48,6 +50,7 @@ class FranceInterIE(InfoExtractor): 'id': video_id, 'title': title, 'description': description, + 'thumbnail': thumbnail, 'upload_date': upload_date, 'formats': [{ 'url': video_url, diff --git a/haruhi_dl/extractor/francetv.py b/haruhi_dl/extractor/francetv.py index 8598576e5..3cb17751e 100644 --- a/haruhi_dl/extractor/francetv.py +++ b/haruhi_dl/extractor/francetv.py @@ -17,6 +17,7 @@ from ..utils import ( parse_duration, try_get, url_or_none, + urljoin, ) from .dailymotion import DailymotionIE @@ -128,18 +129,38 @@ class FranceTVIE(InfoExtractor): is_live = None - formats = [] - for video in info['videos']: - if video['statut'] != 'ONLINE': + videos = [] + + for video in (info.get('videos') or []): + if video.get('statut') != 'ONLINE': continue - video_url = video['url'] + if not video.get('url'): + continue + videos.append(video) + + if not videos: + for device_type in ['desktop', 'mobile']: + fallback_info = self._download_json( + 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, + video_id, 'Downloading fallback %s video JSON' % device_type, query={ + 'device_type': device_type, + 'browser': 'chrome', + }, fatal=False) + + if fallback_info and fallback_info.get('video'): + videos.append(fallback_info['video']) + + formats = [] + for video in videos: + video_url = video.get('url') if not video_url: continue if is_live is None: is_live = (try_get( - video, lambda x: x['plages_ouverture'][0]['direct'], - bool) is True) or '/live.francetv.fr/' in video_url - format_id = video['format'] + video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True + or video.get('is_live') is True + or '/live.francetv.fr/' in video_url) + format_id = video.get('format') ext = determine_ext(video_url) if ext == 'f4m': if georestricted: @@ -154,6 +175,9 @@ class FranceTVIE(InfoExtractor): sign(video_url, format_id), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False)) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, @@ -166,6 +190,7 @@ class FranceTVIE(InfoExtractor): 'url': video_url, 'format_id': format_id, }) + self._sort_formats(formats) title = info['titre'] @@ -185,10 +210,10 @@ class FranceTVIE(InfoExtractor): return { 'id': video_id, 'title': self._live_title(title) if is_live else title, - 'description': clean_html(info['synopsis']), - 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']), - 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), - 'timestamp': int_or_none(info['diffusion']['timestamp']), + 'description': clean_html(info.get('synopsis')), + 'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', info.get('image')), + 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')), + 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])), 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, diff --git a/haruhi_dl/extractor/fujitv.py b/haruhi_dl/extractor/fujitv.py new file mode 100644 index 000000000..39685e075 --- /dev/null +++ b/haruhi_dl/extractor/fujitv.py @@ -0,0 +1,35 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class FujiTVFODPlus7IE(InfoExtractor): + _VALID_URL = r'https?://i\.fod\.fujitv\.co\.jp/plus7/web/[0-9a-z]{4}/(?P<id>[0-9a-z]+)' + _BASE_URL = 'http://i.fod.fujitv.co.jp/' + _BITRATE_MAP = { + 300: (320, 180), + 800: (640, 360), + 1200: (1280, 720), + 2000: (1280, 720), + } + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = self._extract_m3u8_formats( + self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id) + for f in formats: + wh = self._BITRATE_MAP.get(f.get('tbr')) + if wh: + f.update({ + 'width': wh[0], + 'height': wh[1], + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_id, + 'formats': formats, + 'thumbnail': self._BASE_URL + 'pc/image/wbtn/wbtn_%s.jpg' % video_id, + } diff --git a/haruhi_dl/extractor/fxnetworks.py b/haruhi_dl/extractor/fxnetworks.py deleted file mode 100644 index 00e67426b..000000000 --- a/haruhi_dl/extractor/fxnetworks.py +++ /dev/null @@ -1,77 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .adobepass import AdobePassIE -from ..utils import ( - extract_attributes, - int_or_none, - parse_age_limit, - smuggle_url, - update_url_query, -) - - -class FXNetworksIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.fxnetworks.com/video/1032565827847', - 'md5': '8d99b97b4aa7a202f55b6ed47ea7e703', - 'info_dict': { - 'id': 'dRzwHC_MMqIv', - 'ext': 'mp4', - 'title': 'First Look: Better Things - Season 2', - 'description': 'Because real life is like a fart. Watch this FIRST LOOK to see what inspired the new season of Better Things.', - 'age_limit': 14, - 'uploader': 'NEWA-FNG-FX', - 'upload_date': '20170825', - 'timestamp': 1503686274, - 'episode_number': 0, - 'season_number': 2, - 'series': 'Better Things', - }, - 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.simpsonsworld.com/video/716094019682', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - if 'The content you are trying to access is not available in your region.' in webpage: - self.raise_geo_restricted() - video_data = extract_attributes(self._search_regex( - r'(<a.+?rel="https?://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data')) - player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None) - release_url = video_data['rel'] - title = video_data['data-title'] - rating = video_data.get('data-rating') - query = { - 'mbr': 'true', - } - if player_type == 'movies': - query.update({ - 'manifest': 'm3u', - }) - else: - query.update({ - 'switch': 'http', - }) - if video_data.get('data-req-auth') == '1': - resource = self._get_mvpd_resource( - video_data['data-channel'], title, - video_data.get('data-guid'), rating) - query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource) - - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': title, - 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), - 'series': video_data.get('data-show-title'), - 'episode_number': int_or_none(video_data.get('data-episode')), - 'season_number': int_or_none(video_data.get('data-season')), - 'thumbnail': video_data.get('data-large-thumb'), - 'age_limit': parse_age_limit(rating), - 'ie_key': 'ThePlatform', - } diff --git a/haruhi_dl/extractor/gamespot.py b/haruhi_dl/extractor/gamespot.py index 4236a5ed8..7a1beae3c 100644 --- a/haruhi_dl/extractor/gamespot.py +++ b/haruhi_dl/extractor/gamespot.py @@ -1,16 +1,7 @@ from __future__ import unicode_literals -import re - from .once import OnceIE -from ..compat import ( - compat_urllib_parse_unquote, -) -from ..utils import ( - unescapeHTML, - url_basename, - dict_get, -) +from ..compat import compat_urllib_parse_unquote class GameSpotIE(OnceIE): @@ -24,17 +15,16 @@ class GameSpotIE(OnceIE): 'title': 'Arma 3 - Community Guide: SITREP I', 'description': 'Check out this video where some of the basics of Arma 3 is explained.', }, + 'skip': 'manifest URL give HTTP Error 404: Not Found', }, { 'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/', + 'md5': '173ea87ad762cf5d3bf6163dceb255a6', 'info_dict': { 'id': 'gs-2300-6424837', 'ext': 'mp4', 'title': 'Now Playing - The Witcher 3: Wild Hunt', 'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.', }, - 'params': { - 'skip_download': True, # m3u8 downloads - }, }, { 'url': 'https://www.gamespot.com/videos/embed/6439218/', 'only_matching': True, @@ -49,90 +39,40 @@ class GameSpotIE(OnceIE): def _real_extract(self, url): page_id = self._match_id(url) webpage = self._download_webpage(url, page_id) - data_video_json = self._search_regex( - r'data-video=["\'](.*?)["\']', webpage, 'data video') - data_video = self._parse_json(unescapeHTML(data_video_json), page_id) + data_video = self._parse_json(self._html_search_regex( + r'data-video=(["\'])({.*?})\1', webpage, + 'video data', group=2), page_id) + title = compat_urllib_parse_unquote(data_video['title']) streams = data_video['videoStreams'] - - manifest_url = None formats = [] - f4m_url = streams.get('f4m_stream') - if f4m_url: - manifest_url = f4m_url - formats.extend(self._extract_f4m_formats( - f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False)) - m3u8_url = dict_get(streams, ('m3u8_stream', 'adaptive_stream')) + + m3u8_url = streams.get('adaptive_stream') if m3u8_url: - manifest_url = m3u8_url m3u8_formats = self._extract_m3u8_formats( m3u8_url, page_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - formats.extend(m3u8_formats) - progressive_url = dict_get( - streams, ('progressive_hd', 'progressive_high', 'progressive_low', 'other_lr')) - if progressive_url and manifest_url: - qualities_basename = self._search_regex( - r'/([^/]+)\.csmil/', - manifest_url, 'qualities basename', default=None) - if qualities_basename: - QUALITIES_RE = r'((,\d+)+,?)' - qualities = self._search_regex( - QUALITIES_RE, qualities_basename, - 'qualities', default=None) - if qualities: - qualities = list(map(lambda q: int(q), qualities.strip(',').split(','))) - qualities.sort() - http_template = re.sub(QUALITIES_RE, r'%d', qualities_basename) - http_url_basename = url_basename(progressive_url) - if m3u8_formats: - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none', m3u8_formats)) - if len(qualities) == len(m3u8_formats): - for q, m3u8_format in zip(qualities, m3u8_formats): - f = m3u8_format.copy() - f.update({ - 'url': progressive_url.replace( - http_url_basename, http_template % q), - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - for q in qualities: - formats.append({ - 'url': progressive_url.replace( - http_url_basename, http_template % q), - 'ext': 'mp4', - 'format_id': 'http-%d' % q, - 'tbr': q, - }) + for f in m3u8_formats: + formats.append(f) + http_f = f.copy() + del http_f['manifest_url'] + http_f.update({ + 'format_id': f['format_id'].replace('hls-', 'http-'), + 'protocol': 'http', + 'url': f['url'].replace('.m3u8', '.mp4'), + }) + formats.append(http_f) - onceux_json = self._search_regex( - r'data-onceux-options=["\'](.*?)["\']', webpage, 'data video', default=None) - if onceux_json: - onceux_url = self._parse_json(unescapeHTML(onceux_json), page_id).get('metadataUri') - if onceux_url: - formats.extend(self._extract_once_formats(re.sub( - r'https?://[^/]+', 'http://once.unicornmedia.com', onceux_url), - http_formats_preference=-1)) + mpd_url = streams.get('adaptive_dash') + if mpd_url: + formats.extend(self._extract_mpd_formats( + mpd_url, page_id, mpd_id='dash', fatal=False)) - if not formats: - for quality in ['sd', 'hd']: - # It's actually a link to a flv file - flv_url = streams.get('f4m_{0}'.format(quality)) - if flv_url is not None: - formats.append({ - 'url': flv_url, - 'ext': 'flv', - 'format_id': quality, - }) self._sort_formats(formats) return { - 'id': data_video['guid'], + 'id': data_video.get('guid') or page_id, 'display_id': page_id, - 'title': compat_urllib_parse_unquote(data_video['title']), + 'title': title, 'formats': formats, 'description': self._html_search_meta('description', webpage), 'thumbnail': self._og_search_thumbnail(webpage), diff --git a/haruhi_dl/extractor/gedidigital.py b/haruhi_dl/extractor/gedidigital.py new file mode 100644 index 000000000..6c4153b40 --- /dev/null +++ b/haruhi_dl/extractor/gedidigital.py @@ -0,0 +1,161 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, +) + + +class GediDigitalIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://video\. + (?: + (?: + (?:espresso\.)?repubblica + |lastampa + |ilsecoloxix + )| + (?: + iltirreno + |messaggeroveneto + |ilpiccolo + |gazzettadimantova + |mattinopadova + |laprovinciapavese + |tribunatreviso + |nuovavenezia + |gazzettadimodena + |lanuovaferrara + |corrierealpi + |lasentinella + )\.gelocal + )\.it(?:/[^/]+){2,3}?/(?P<id>\d+)(?:[/?&#]|$)''' + _TESTS = [{ + 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', + 'md5': '84658d7fb9e55a6e57ecc77b73137494', + 'info_dict': { + 'id': '121559', + 'ext': 'mp4', + 'title': 'Il paradosso delle Regionali: ecco perché la Lega vince ma sembra aver perso', + 'description': 'md5:de7f4d6eaaaf36c153b599b10f8ce7ca', + 'thumbnail': r're:^https://www\.repstatic\.it/video/photo/.+?-thumb-full-.+?\.jpg$', + 'duration': 125, + }, + }, { + 'url': 'https://video.espresso.repubblica.it/embed/tutti-i-video/01-ted-villa/14772/14870&width=640&height=360', + 'only_matching': True, + }, { + 'url': 'https://video.repubblica.it/motori/record-della-pista-a-spa-francorchamps-la-pagani-huayra-roadster-bc-stupisce/367415/367963', + 'only_matching': True, + }, { + 'url': 'https://video.ilsecoloxix.it/sport/cassani-e-i-brividi-azzurri-ai-mondiali-di-imola-qui-mi-sono-innamorato-del-ciclismo-da-ragazzino-incredibile-tornarci-da-ct/66184/66267', + 'only_matching': True, + }, { + 'url': 'https://video.iltirreno.gelocal.it/sport/dentro-la-notizia-ferrari-cosa-succede-a-maranello/141059/142723', + 'only_matching': True, + }, { + 'url': 'https://video.messaggeroveneto.gelocal.it/locale/maria-giovanna-elmi-covid-vaccino/138155/139268', + 'only_matching': True, + }, { + 'url': 'https://video.ilpiccolo.gelocal.it/dossier/big-john/dinosauro-big-john-al-via-le-visite-guidate-a-trieste/135226/135751', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimantova.gelocal.it/locale/dal-ponte-visconteo-di-valeggio-l-and-8217sos-dei-ristoratori-aprire-anche-a-cena/137310/137818', + 'only_matching': True, + }, { + 'url': 'https://video.mattinopadova.gelocal.it/dossier/coronavirus-in-veneto/covid-a-vo-un-anno-dopo-un-cuore-tricolore-per-non-dimenticare/138402/138964', + 'only_matching': True, + }, { + 'url': 'https://video.laprovinciapavese.gelocal.it/locale/mede-zona-rossa-via-alle-vaccinazioni-per-gli-over-80/137545/138120', + 'only_matching': True, + }, { + 'url': 'https://video.tribunatreviso.gelocal.it/dossier/coronavirus-in-veneto/ecco-le-prima-vaccinazioni-di-massa-nella-marca/134485/135024', + 'only_matching': True, + }, { + 'url': 'https://video.nuovavenezia.gelocal.it/locale/camion-troppo-alto-per-il-ponte-ferroviario-perde-il-carico/135734/136266', + 'only_matching': True, + }, { + 'url': 'https://video.gazzettadimodena.gelocal.it/locale/modena-scoperta-la-proteina-che-predice-il-livello-di-gravita-del-covid/139109/139796', + 'only_matching': True, + }, { + 'url': 'https://video.lanuovaferrara.gelocal.it/locale/due-bombole-di-gpl-aperte-e-abbandonate-i-vigili-bruciano-il-gas/134391/134957', + 'only_matching': True, + }, { + 'url': 'https://video.corrierealpi.gelocal.it/dossier/cortina-2021-i-mondiali-di-sci-alpino/mondiali-di-sci-il-timelapse-sulla-splendida-olympia/133760/134331', + 'only_matching': True, + }, { + 'url': 'https://video.lasentinella.gelocal.it/locale/vestigne-centra-un-auto-e-si-ribalta/138931/139466', + 'only_matching': True, + }, { + 'url': 'https://video.espresso.repubblica.it/tutti-i-video/01-ted-villa/14772', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + title = self._html_search_meta( + ['twitter:title', 'og:title'], webpage, fatal=True) + player_data = re.findall( + r"PlayerFactory\.setParam\('(?P<type>format|param)',\s*'(?P<name>[^']+)',\s*'(?P<val>[^']+)'\);", + webpage) + + formats = [] + duration = thumb = None + for t, n, v in player_data: + if t == 'format': + if n in ('video-hds-vod-ec', 'video-hls-vod-ec', 'video-viralize', 'video-youtube-pfp'): + continue + elif n.endswith('-vod-ak'): + formats.extend(self._extract_akamai_formats( + v, video_id, {'http': 'media.gedidigital.it'})) + else: + ext = determine_ext(v) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + v, video_id, 'mp4', 'm3u8_native', m3u8_id=n, fatal=False)) + continue + f = { + 'format_id': n, + 'url': v, + } + if ext == 'mp3': + abr = int_or_none(self._search_regex( + r'-mp3-audio-(\d+)', v, 'abr', default=None)) + f.update({ + 'abr': abr, + 'tbr': abr, + 'vcodec': 'none' + }) + else: + mobj = re.match(r'^video-rrtv-(\d+)(?:-(\d+))?$', n) + if mobj: + f.update({ + 'height': int(mobj.group(1)), + 'vbr': int_or_none(mobj.group(2)), + }) + if not f.get('vbr'): + f['vbr'] = int_or_none(self._search_regex( + r'-video-rrtv-(\d+)', v, 'abr', default=None)) + formats.append(f) + elif t == 'param': + if n in ['image_full', 'image']: + thumb = v + elif n == 'videoDuration': + duration = int_or_none(v) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': self._html_search_meta( + ['twitter:description', 'og:description', 'description'], webpage), + 'thumbnail': thumb or self._og_search_thumbnail(webpage), + 'formats': formats, + 'duration': duration, + } diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index c81247dd0..c2c930ac5 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -20,19 +20,24 @@ from ..utils import ( ExtractorError, float_or_none, HEADRequest, + int_or_none, is_html, js_to_json, KNOWN_EXTENSIONS, merge_dicts, mimetype2ext, orderedSet, + parse_duration, sanitized_Request, smuggle_url, unescapeHTML, - unified_strdate, + unified_timestamp, unsmuggle_url, UnsupportedError, + url_or_none, + xpath_attr, xpath_text, + xpath_with_ns, ) from .commonprotocols import RtmpIE from .brightcove import ( @@ -48,7 +53,6 @@ from .ooyala import OoyalaIE from .rutv import RUTVIE from .tvc import TVCIE from .sportbox import SportBoxIE -from .smotri import SmotriIE from .myvi import MyviIE from .condenast import CondeNastIE from .udn import UDNEmbedIE @@ -63,7 +67,10 @@ from .tube8 import Tube8IE from .mofosex import MofosexEmbedIE from .spankwire import SpankwireIE from .youporn import YouPornIE -from .vimeo import VimeoIE +from .vimeo import ( + VimeoIE, + VHXEmbedIE, +) from .dailymotion import DailymotionIE from .dailymail import DailyMailIE from .onionstudios import OnionStudiosIE @@ -91,6 +98,7 @@ from .piksel import PikselIE from .videa import VideaIE from .twentymin import TwentyMinutenIE from .ustream import UstreamIE +from .arte import ArteTVEmbedIE from .videopress import VideoPressIE from .rutube import RutubeIE from .limelight import LimelightBaseIE @@ -125,6 +133,9 @@ from .rtlnl import RtlNlIE from .xnews import XLinkIE from .libsyn import LibsynIE from .pulsembed import PulsEmbedIE +from .arcpublishing import ArcPublishingIE +from .medialaan import MedialaanIE +from .simplecast import SimplecastIE class GenericIE(InfoExtractor): @@ -203,11 +214,48 @@ class GenericIE(InfoExtractor): { 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', 'info_dict': { - 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - 'ext': 'm4v', - 'upload_date': '20150228', - 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', - } + 'id': 'http://podcastfeeds.nbcnews.com/nbcnews/video/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', + 'title': 'MSNBC Rachel Maddow (video)', + 'description': 're:.*her unique approach to storytelling.*', + }, + 'playlist': [{ + 'info_dict': { + 'ext': 'mov', + 'id': 'pdv_maddow_netcast_mov-12-04-2020-224335', + 'title': 're:MSNBC Rachel Maddow', + 'description': 're:.*her unique approach to storytelling.*', + 'timestamp': int, + 'upload_date': compat_str, + 'duration': float, + }, + }], + }, + # RSS feed with item with description and thumbnails + { + 'url': 'https://anchor.fm/s/dd00e14/podcast/rss', + 'info_dict': { + 'id': 'https://anchor.fm/s/dd00e14/podcast/rss', + 'title': 're:.*100% Hydrogen.*', + 'description': 're:.*In this episode.*', + }, + 'playlist': [{ + 'info_dict': { + 'ext': 'm4a', + 'id': 'c1c879525ce2cb640b344507e682c36d', + 'title': 're:Hydrogen!', + 'description': 're:.*In this episode we are going.*', + 'timestamp': 1567977776, + 'upload_date': '20190908', + 'duration': 459, + 'thumbnail': r're:^https?://.*\.jpg$', + 'episode_number': 1, + 'season_number': 1, + 'age_limit': 0, + }, + }], + 'params': { + 'skip_download': True, + }, }, # RSS feed with enclosures and unsupported link URLs { @@ -2111,23 +2159,23 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, - { - # Zype embed - 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', - 'info_dict': { - 'id': '5b400b834b32992a310622b9', - 'ext': 'mp4', - 'title': 'Smoky Barbecue Favorites', - 'thumbnail': r're:^https?://.*\.jpe?g', - 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', - 'upload_date': '20170909', - 'timestamp': 1504915200, - }, - 'add_ie': [ZypeIE.ie_key()], - 'params': { - 'skip_download': True, - }, - }, + # { + # # Zype embed + # 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', + # 'info_dict': { + # 'id': '5b400b834b32992a310622b9', + # 'ext': 'mp4', + # 'title': 'Smoky Barbecue Favorites', + # 'thumbnail': r're:^https?://.*\.jpe?g', + # 'description': 'md5:5ff01e76316bd8d46508af26dc86023b', + # 'upload_date': '20170909', + # 'timestamp': 1504915200, + # }, + # 'add_ie': [ZypeIE.ie_key()], + # 'params': { + # 'skip_download': True, + # }, + # }, { # videojs embed 'url': 'https://video.sibnet.ru/shell.php?videoid=3422904', @@ -2205,6 +2253,54 @@ class GenericIE(InfoExtractor): # 'force_generic_extractor': True, # }, # } + { + # VHX Embed + 'url': 'https://demo.vhx.tv/category-c/videos/file-example-mp4-480-1-5mg-copy', + 'info_dict': { + 'id': '858208', + 'ext': 'mp4', + 'title': 'Untitled', + 'uploader_id': 'user80538407', + 'uploader': 'OTT Videos', + }, + }, + { + # ArcPublishing PoWa video player + 'url': 'https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/', + 'md5': 'b03b2fac8680e1e5a7cc81a5c27e71b3', + 'info_dict': { + 'id': '8c99cb6e-b29c-4bc9-9173-7bf9979225ab', + 'ext': 'mp4', + 'title': 'Senate candidates wave to voters on Anchorage streets', + 'description': 'md5:91f51a6511f090617353dc720318b20e', + 'timestamp': 1604378735, + 'upload_date': '20201103', + 'duration': 1581, + }, + }, + { + # MyChannels SDK embed + # https://www.24kitchen.nl/populair/deskundige-dit-waarom-sommigen-gevoelig-zijn-voor-voedselallergieen + 'url': 'https://www.demorgen.be/nieuws/burgemeester-rotterdam-richt-zich-in-videoboodschap-tot-relschoppers-voelt-het-goed~b0bcfd741/', + 'md5': '90c0699c37006ef18e198c032d81739c', + 'info_dict': { + 'id': '194165', + 'ext': 'mp4', + 'title': 'Burgemeester Aboutaleb spreekt relschoppers toe', + 'timestamp': 1611740340, + 'upload_date': '20210127', + 'duration': 159, + }, + }, + { + # Simplecast player embed + 'url': 'https://www.bio.org/podcast', + 'info_dict': { + 'id': 'podcast', + 'title': 'I AM BIO Podcast | BIO', + }, + 'playlist_mincount': 52, + }, ] def report_following_redirect(self, new_url): @@ -2216,6 +2312,10 @@ class GenericIE(InfoExtractor): playlist_desc_el = doc.find('./channel/description') playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text + NS_MAP = { + 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', + } + entries = [] for it in doc.findall('./channel/item'): next_url = None @@ -2231,10 +2331,33 @@ class GenericIE(InfoExtractor): if not next_url: continue + def itunes(key): + return xpath_text( + it, xpath_with_ns('./itunes:%s' % key, NS_MAP), + default=None) + + duration = itunes('duration') + explicit = (itunes('explicit') or '').lower() + if explicit in ('true', 'yes'): + age_limit = 18 + elif explicit in ('false', 'no'): + age_limit = 0 + else: + age_limit = None + entries.append({ '_type': 'url_transparent', 'url': next_url, 'title': it.find('title').text, + 'description': xpath_text(it, 'description', default=None), + 'timestamp': unified_timestamp( + xpath_text(it, 'pubDate', default=None)), + 'duration': int_or_none(duration) or parse_duration(duration), + 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')), + 'episode': itunes('title'), + 'episode_number': int_or_none(itunes('episode')), + 'season_number': int_or_none(itunes('season')), + 'age_limit': age_limit, }) return { @@ -2354,7 +2477,7 @@ class GenericIE(InfoExtractor): info_dict = { 'id': video_id, 'title': self._generic_title(url), - 'upload_date': unified_strdate(head_response.headers.get('Last-Modified')) + 'timestamp': unified_timestamp(head_response.headers.get('Last-Modified')) } # Check for direct link to a video @@ -2417,6 +2540,9 @@ class GenericIE(InfoExtractor): webpage = self._webpage_read_content( full_response, url, video_id, prefix=first_bytes) + if '<title>DPG Media Privacy Gate' in webpage: + webpage = self._download_webpage(url, video_id) + self.report_extraction(video_id) # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest? @@ -2592,6 +2718,10 @@ class GenericIE(InfoExtractor): SVTIE, XLinkIE, LibsynIE, + VHXEmbedIE, + ArcPublishingIE, + MedialaanIE, + SimplecastIE, ): try: ie_key = embie.ie_key() @@ -2751,11 +2881,9 @@ class GenericIE(InfoExtractor): return self.url_result(ustream_url, UstreamIE.ie_key()) # Look for embedded arte.tv player - mobj = re.search( - r'<(?:script|iframe) [^>]*?src="(?Phttp://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'ArteTVEmbed') + arte_urls = ArteTVEmbedIE._extract_urls(webpage) + if arte_urls: + return self.playlist_from_matches(arte_urls, video_id, video_title) # Look for embedded francetv player mobj = re.search( @@ -2764,11 +2892,6 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url')) - # Look for embedded smotri.com player - smotri_url = SmotriIE._extract_url(webpage) - if smotri_url: - return self.url_result(smotri_url, 'Smotri') - # Look for embedded Myvi.ru player myvi_url = MyviIE._extract_url(webpage) if myvi_url: diff --git a/haruhi_dl/extractor/go.py b/haruhi_dl/extractor/go.py index 03cfba91f..0d731e90a 100644 --- a/haruhi_dl/extractor/go.py +++ b/haruhi_dl/extractor/go.py @@ -38,13 +38,17 @@ class GoIE(AdobePassIE): 'disneynow': { 'brand': '011', 'resource_id': 'Disney', - } + }, + 'fxnow.fxnetworks': { + 'brand': '025', + 'requestor_id': 'dtci', + }, } _VALID_URL = r'''(?x) https?:// (?: (?:(?P%s)\.)?go| - (?Pabc|freeform|disneynow) + (?Pabc|freeform|disneynow|fxnow\.fxnetworks) )\.com/ (?: (?:[^/]+/)*(?P[Vv][Dd][Kk][Aa]\w+)| @@ -99,6 +103,19 @@ class GoIE(AdobePassIE): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'https://fxnow.fxnetworks.com/shows/better-things/video/vdka12782841', + 'info_dict': { + 'id': 'VDKA12782841', + 'ext': 'mp4', + 'title': 'First Look: Better Things - Season 2', + 'description': 'md5:fa73584a95761c605d9d54904e35b407', + }, + 'params': { + 'geo_bypass_ip_block': '3.244.239.0/24', + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', 'only_matching': True, diff --git a/haruhi_dl/extractor/googleplus.py b/haruhi_dl/extractor/googleplus.py deleted file mode 100644 index 6b927bb44..000000000 --- a/haruhi_dl/extractor/googleplus.py +++ /dev/null @@ -1,73 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import codecs - -from .common import InfoExtractor -from ..utils import unified_strdate - - -class GooglePlusIE(InfoExtractor): - IE_DESC = 'Google Plus' - _VALID_URL = r'https?://plus\.google\.com/(?:[^/]+/)*?posts/(?P\w+)' - IE_NAME = 'plus.google' - _TEST = { - 'url': 'https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH', - 'info_dict': { - 'id': 'ZButuJc6CtH', - 'ext': 'flv', - 'title': '嘆きの天使 降臨', - 'upload_date': '20120613', - 'uploader': '井上ヨシマサ', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - # Step 1, Retrieve post webpage to extract further information - webpage = self._download_webpage(url, video_id, 'Downloading entry webpage') - - title = self._og_search_description(webpage).splitlines()[0] - upload_date = unified_strdate(self._html_search_regex( - r'''(?x) - ([0-9]{4}-[0-9]{2}-[0-9]{2})''', - webpage, 'upload date', fatal=False, flags=re.VERBOSE)) - uploader = self._html_search_regex( - r'rel="author".*?>(.*?)', webpage, 'uploader', fatal=False) - - # Step 2, Simulate clicking the image box to launch video - DOMAIN = 'https://plus.google.com/' - video_page = self._search_regex( - r'[^/]+)/episode/(?P[^/?&#]+)' + _TEST = { + 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA/episode/MzBlNWRlN2UtOWE4Yy00ODcwLTk2M2MtM2JlMmUyNmViOTRh', + 'md5': 'fa56b2ee8bd0703e27e42d4b104c4766', + 'info_dict': { + 'id': '30e5de7e-9a8c-4870-963c-3be2e26eb94a', + 'ext': 'mp3', + 'title': 'WWDTM New Year 2021', + 'description': 'We say goodbye to 2020 with Christine Baranksi, Doug Jones, Jonna Mendez, and Kellee Edwards.', + 'upload_date': '20210102', + 'timestamp': 1609606800, + 'duration': 2901, + 'series': "Wait Wait... Don't Tell Me!", + } + } + + def _real_extract(self, url): + b64_feed_url, b64_guid = re.match(self._VALID_URL, url).groups() + episode = self._batch_execute( + 'oNjqVe', b64_guid, [b64_feed_url, b64_guid])[1] + return self._extract_episode(episode) + + +class GooglePodcastsFeedIE(GooglePodcastsBaseIE): + IE_NAME = 'google:podcasts:feed' + _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P[^/?&#]+)/?(?:[?#&]|$)' + _TEST = { + 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA', + 'info_dict': { + 'title': "Wait Wait... Don't Tell Me!", + 'description': "NPR's weekly current events quiz. Have a laugh and test your news knowledge while figuring out what's real and what we've made up.", + }, + 'playlist_mincount': 20, + } + + def _real_extract(self, url): + b64_feed_url = self._match_id(url) + data = self._batch_execute('ncqJEe', b64_feed_url, [b64_feed_url]) + + entries = [] + for episode in (try_get(data, lambda x: x[1][0]) or []): + entries.append(self._extract_episode(episode)) + + feed = try_get(data, lambda x: x[3]) or [] + return self.playlist_result( + entries, playlist_title=try_get(feed, lambda x: x[0]), + playlist_description=try_get(feed, lambda x: x[2])) diff --git a/haruhi_dl/extractor/hotstar.py b/haruhi_dl/extractor/hotstar.py index f97eefa3d..1620822b6 100644 --- a/haruhi_dl/extractor/hotstar.py +++ b/haruhi_dl/extractor/hotstar.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import hashlib import hmac +import json import re import time import uuid @@ -25,43 +26,50 @@ from ..utils import ( class HotStarBaseIE(InfoExtractor): _AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee' - def _call_api_impl(self, path, video_id, query): + def _call_api_impl(self, path, video_id, headers, query, data=None): st = int(time.time()) exp = st + 6000 auth = 'st=%d~exp=%d~acl=/*' % (st, exp) auth += '~hmac=' + hmac.new(self._AKAMAI_ENCRYPTION_KEY, auth.encode(), hashlib.sha256).hexdigest() - response = self._download_json( - 'https://api.hotstar.com/' + path, video_id, headers={ - 'hotstarauth': auth, - 'x-country-code': 'IN', - 'x-platform-code': 'JIO', - }, query=query) + h = {'hotstarauth': auth} + h.update(headers) + return self._download_json( + 'https://api.hotstar.com/' + path, + video_id, headers=h, query=query, data=data) + + def _call_api(self, path, video_id, query_name='contentId'): + response = self._call_api_impl(path, video_id, { + 'x-country-code': 'IN', + 'x-platform-code': 'JIO', + }, { + query_name: video_id, + 'tas': 10000, + }) if response['statusCode'] != 'OK': raise ExtractorError( response['body']['message'], expected=True) return response['body']['results'] - def _call_api(self, path, video_id, query_name='contentId'): - return self._call_api_impl(path, video_id, { - query_name: video_id, - 'tas': 10000, - }) - - def _call_api_v2(self, path, video_id): - return self._call_api_impl( - '%s/in/contents/%s' % (path, video_id), video_id, { - 'desiredConfig': 'encryption:plain;ladder:phone,tv;package:hls,dash', - 'client': 'mweb', - 'clientVersion': '6.18.0', - 'deviceId': compat_str(uuid.uuid4()), - 'osName': 'Windows', - 'osVersion': '10', - }) + def _call_api_v2(self, path, video_id, headers, query=None, data=None): + h = {'X-Request-Id': compat_str(uuid.uuid4())} + h.update(headers) + try: + return self._call_api_impl( + path, video_id, h, query, data) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + if e.cause.code == 402: + self.raise_login_required() + message = self._parse_json(e.cause.read().decode(), video_id)['message'] + if message in ('Content not available in region', 'Country is not supported'): + raise self.raise_geo_restricted(message) + raise ExtractorError(message) + raise e class HotStarIE(HotStarBaseIE): IE_NAME = 'hotstar' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P\d{10})' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+[/-])?(?P\d{10})' _TESTS = [{ # contentData 'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273', @@ -92,8 +100,13 @@ class HotStarIE(HotStarBaseIE): # only available via api v2 'url': 'https://www.hotstar.com/tv/ek-bhram-sarvagun-sampanna/s-2116/janhvi-targets-suman/1000234847', 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/tv/start-music/1260005217/cooks-vs-comalis/1100039717', + 'only_matching': True, }] _GEO_BYPASS = False + _DEVICE_ID = None + _USER_TOKEN = None def _real_extract(self, url): video_id = self._match_id(url) @@ -121,7 +134,30 @@ class HotStarIE(HotStarBaseIE): headers = {'Referer': url} formats = [] geo_restricted = False - playback_sets = self._call_api_v2('h/v2/play', video_id)['playBackSets'] + + if not self._USER_TOKEN: + self._DEVICE_ID = compat_str(uuid.uuid4()) + self._USER_TOKEN = self._call_api_v2('um/v3/users', video_id, { + 'X-HS-Platform': 'PCTV', + 'Content-Type': 'application/json', + }, data=json.dumps({ + 'device_ids': [{ + 'id': self._DEVICE_ID, + 'type': 'device_id', + }], + }).encode())['user_identity'] + + playback_sets = self._call_api_v2( + 'play/v2/playback/content/' + video_id, video_id, { + 'X-HS-Platform': 'web', + 'X-HS-AppVersion': '6.99.1', + 'X-HS-UserToken': self._USER_TOKEN, + }, query={ + 'device-id': self._DEVICE_ID, + 'desired-config': 'encryption:plain', + 'os-name': 'Windows', + 'os-version': '10', + })['data']['playBackSets'] for playback_set in playback_sets: if not isinstance(playback_set, dict): continue @@ -163,19 +199,22 @@ class HotStarIE(HotStarBaseIE): for f in formats: f.setdefault('http_headers', {}).update(headers) + image = try_get(video_data, lambda x: x['image']['h'], compat_str) + return { 'id': video_id, 'title': title, + 'thumbnail': 'https://img1.hotstarext.com/image/upload/' + image if image else None, 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')), 'formats': formats, 'channel': video_data.get('channelName'), - 'channel_id': video_data.get('channelId'), + 'channel_id': str_or_none(video_data.get('channelId')), 'series': video_data.get('showName'), 'season': video_data.get('seasonName'), 'season_number': int_or_none(video_data.get('seasonNo')), - 'season_id': video_data.get('seasonId'), + 'season_id': str_or_none(video_data.get('seasonId')), 'episode': title, 'episode_number': int_or_none(video_data.get('episodeNo')), } @@ -183,7 +222,7 @@ class HotStarIE(HotStarBaseIE): class HotStarPlaylistIE(HotStarBaseIE): IE_NAME = 'hotstar:playlist' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com/tv/[^/]+/s-\w+/list/[^/]+/t-(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:[a-z]{2}/)?tv/[^/]+/s-\w+/list/[^/]+/t-(?P\w+)' _TESTS = [{ 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26', 'info_dict': { @@ -193,6 +232,9 @@ class HotStarPlaylistIE(HotStarBaseIE): }, { 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480', 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/us/tv/masterchef-india/s-830/list/episodes/t-1_2_830', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/haruhi_dl/extractor/ign.py b/haruhi_dl/extractor/ign.py index a96ea8010..0d9f50ed2 100644 --- a/haruhi_dl/extractor/ign.py +++ b/haruhi_dl/extractor/ign.py @@ -3,230 +3,255 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( + HEADRequest, + determine_ext, int_or_none, parse_iso8601, + strip_or_none, + try_get, ) -class IGNIE(InfoExtractor): +class IGNBaseIE(InfoExtractor): + def _call_api(self, slug): + return self._download_json( + 'http://apis.ign.com/{0}/v3/{0}s/slug/{1}'.format(self._PAGE_TYPE, slug), slug) + + +class IGNIE(IGNBaseIE): """ Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. Some videos of it.ign.com are also supported """ - _VALID_URL = r'https?://.+?\.ign\.com/(?:[^/]+/)?(?Pvideos|show_videos|articles|feature|(?:[^/]+/\d+/video))(/.+)?/(?P.+)' + _VALID_URL = r'https?://(?:.+?\.ign|www\.pcmag)\.com/videos/(?:\d{4}/\d{2}/\d{2}/)?(?P[^/?&#]+)' IE_NAME = 'ign.com' + _PAGE_TYPE = 'video' - _API_URL_TEMPLATE = 'http://apis.ign.com/video/v3/videos/%s' - _EMBED_RE = r']+?["\']((?:https?:)?//.+?\.ign\.com.+?/embed.+?)["\']' - - _TESTS = [ - { - 'url': 'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', - 'md5': 'febda82c4bafecd2d44b6e1a18a595f8', - 'info_dict': { - 'id': '8f862beef863986b2785559b9e1aa599', - 'ext': 'mp4', - 'title': 'The Last of Us Review', - 'description': 'md5:c8946d4260a4d43a00d5ae8ed998870c', - 'timestamp': 1370440800, - 'upload_date': '20130605', - 'uploader_id': 'cberidon@ign.com', - } - }, - { - 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', - 'info_dict': { - 'id': '100-little-things-in-gta-5-that-will-blow-your-mind', - }, - 'playlist': [ - { - 'info_dict': { - 'id': '5ebbd138523268b93c9141af17bec937', - 'ext': 'mp4', - 'title': 'GTA 5 Video Review', - 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', - 'timestamp': 1379339880, - 'upload_date': '20130916', - 'uploader_id': 'danieljkrupa@gmail.com', - }, - }, - { - 'info_dict': { - 'id': '638672ee848ae4ff108df2a296418ee2', - 'ext': 'mp4', - 'title': '26 Twisted Moments from GTA 5 in Slow Motion', - 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', - 'timestamp': 1386878820, - 'upload_date': '20131212', - 'uploader_id': 'togilvie@ign.com', - }, - }, - ], - 'params': { - 'skip_download': True, - }, - }, - { - 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', - 'md5': '618fedb9c901fd086f6f093564ef8558', - 'info_dict': { - 'id': '078fdd005f6d3c02f63d795faa1b984f', - 'ext': 'mp4', - 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', - 'description': 'Brian and Jared explore Michel Ancel\'s captivating new preview.', - 'timestamp': 1408047180, - 'upload_date': '20140814', - 'uploader_id': 'jamesduggan1990@gmail.com', - }, - }, - { - 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', - 'only_matching': True, - }, - { - 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', - 'only_matching': True, - }, - { - # videoId pattern - 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', - 'only_matching': True, - }, - ] - - def _find_video_id(self, webpage): - res_id = [ - r'"video_id"\s*:\s*"(.*?)"', - r'class="hero-poster[^"]*?"[^>]*id="(.+?)"', - r'data-video-id="(.+?)"', - r']*value="[^"]*?url=(https?://www\.ign\.com/videos/.*?)["&]', - webpage) - if multiple_urls: - entries = [self.url_result(u, ie='IGN') for u in multiple_urls] - return { - '_type': 'playlist', - 'id': name_or_id, - 'entries': entries, - } - - video_id = self._find_video_id(webpage) - if not video_id: - return self.url_result(self._search_regex( - self._EMBED_RE, webpage, 'embed url')) - return self._get_video_info(video_id) - - def _get_video_info(self, video_id): - api_data = self._download_json( - self._API_URL_TEMPLATE % video_id, video_id) + display_id = self._match_id(url) + video = self._call_api(display_id) + video_id = video['videoId'] + metadata = video['metadata'] + title = metadata.get('longTitle') or metadata.get('title') or metadata['name'] formats = [] - m3u8_url = api_data['refs'].get('m3uUrl') + refs = video.get('refs') or {} + + m3u8_url = refs.get('m3uUrl') if m3u8_url: formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - f4m_url = api_data['refs'].get('f4mUrl') + + f4m_url = refs.get('f4mUrl') if f4m_url: formats.extend(self._extract_f4m_formats( f4m_url, video_id, f4m_id='hds', fatal=False)) - for asset in api_data['assets']: + + for asset in (video.get('assets') or []): + asset_url = asset.get('url') + if not asset_url: + continue formats.append({ - 'url': asset['url'], - 'tbr': asset.get('actual_bitrate_kbps'), - 'fps': asset.get('frame_rate'), + 'url': asset_url, + 'tbr': int_or_none(asset.get('bitrate'), 1000), + 'fps': int_or_none(asset.get('frame_rate')), 'height': int_or_none(asset.get('height')), 'width': int_or_none(asset.get('width')), }) + + mezzanine_url = try_get(video, lambda x: x['system']['mezzanineUrl']) + if mezzanine_url: + formats.append({ + 'ext': determine_ext(mezzanine_url, 'mp4'), + 'format_id': 'mezzanine', + 'preference': 1, + 'url': mezzanine_url, + }) + self._sort_formats(formats) - thumbnails = [{ - 'url': thumbnail['url'] - } for thumbnail in api_data.get('thumbnails', [])] + thumbnails = [] + for thumbnail in (video.get('thumbnails') or []): + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + }) - metadata = api_data['metadata'] + tags = [] + for tag in (video.get('tags') or []): + display_name = tag.get('displayName') + if not display_name: + continue + tags.append(display_name) return { - 'id': api_data.get('videoId') or video_id, - 'title': metadata.get('longTitle') or metadata.get('name') or metadata.get['title'], - 'description': metadata.get('description'), + 'id': video_id, + 'title': title, + 'description': strip_or_none(metadata.get('description')), 'timestamp': parse_iso8601(metadata.get('publishDate')), 'duration': int_or_none(metadata.get('duration')), - 'display_id': metadata.get('slug') or video_id, - 'uploader_id': metadata.get('creator'), + 'display_id': display_id, 'thumbnails': thumbnails, 'formats': formats, + 'tags': tags, } -class OneUPIE(IGNIE): - _VALID_URL = r'https?://gamevideos\.1up\.com/(?Pvideo)/id/(?P.+)\.html' - IE_NAME = '1up.com' - +class IGNVideoIE(InfoExtractor): + _VALID_URL = r'https?://.+?\.ign\.com/(?:[a-z]{2}/)?[^/]+/(?P\d+)/(?:video|trailer)/' _TESTS = [{ - 'url': 'http://gamevideos.1up.com/video/id/34976.html', - 'md5': 'c9cc69e07acb675c31a16719f909e347', + 'url': 'http://me.ign.com/en/videos/112203/video/how-hitman-aims-to-be-different-than-every-other-s', + 'md5': 'dd9aca7ed2657c4e118d8b261e5e9de1', 'info_dict': { - 'id': '34976', + 'id': 'e9be7ea899a9bbfc0674accc22a36cc8', 'ext': 'mp4', - 'title': 'Sniper Elite V2 - Trailer', - 'description': 'md5:bf0516c5ee32a3217aa703e9b1bc7826', - 'timestamp': 1313099220, - 'upload_date': '20110811', - 'uploader_id': 'IGN', + 'title': 'How Hitman Aims to Be Different Than Every Other Stealth Game - NYCC 2015', + 'description': 'Taking out assassination targets in Hitman has never been more stylish.', + 'timestamp': 1444665600, + 'upload_date': '20151012', } + }, { + 'url': 'http://me.ign.com/ar/angry-birds-2/106533/video/lrd-ldyy-lwl-lfylm-angry-birds', + 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://me.ign.com/ar/ratchet-clank-rift-apart/144327/trailer/embed', + 'only_matching': True, + }, { + # Twitter embed + 'url': 'http://adria.ign.com/sherlock-season-4/9687/trailer/embed', + 'only_matching': True, + }, { + # Vimeo embed + 'url': 'https://kr.ign.com/bic-2018/3307/trailer/embed', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - result = super(OneUPIE, self)._real_extract(url) - result['id'] = mobj.group('name_or_id') - return result + video_id = self._match_id(url) + req = HEADRequest(url.rsplit('/', 1)[0] + '/embed') + url = self._request_webpage(req, video_id).geturl() + ign_url = compat_parse_qs( + compat_urllib_parse_urlparse(url).query).get('url', [None])[0] + if ign_url: + return self.url_result(ign_url, IGNIE.ie_key()) + return self.url_result(url) -class PCMagIE(IGNIE): - _VALID_URL = r'https?://(?:www\.)?pcmag\.com/(?Pvideos|article2)(/.+)?/(?P.+)' - IE_NAME = 'pcmag' - - _EMBED_RE = r'iframe\.setAttribute\("src",\s*__util.objToUrlString\("http://widgets\.ign\.com/video/embed/content\.html?[^"]*url=([^"]+)["&]' - +class IGNArticleIE(IGNBaseIE): + _VALID_URL = r'https?://.+?\.ign\.com/(?:articles(?:/\d{4}/\d{2}/\d{2})?|(?:[a-z]{2}/)?feature/\d+)/(?P[^/?&#]+)' + _PAGE_TYPE = 'article' _TESTS = [{ - 'url': 'http://www.pcmag.com/videos/2015/01/06/010615-whats-new-now-is-gogo-snooping-on-your-data', - 'md5': '212d6154fd0361a2781075f1febbe9ad', + 'url': 'http://me.ign.com/en/feature/15775/100-little-things-in-gta-5-that-will-blow-your-mind', 'info_dict': { - 'id': 'ee10d774b508c9b8ec07e763b9125b91', - 'ext': 'mp4', - 'title': '010615_What\'s New Now: Is GoGo Snooping on Your Data?', - 'description': 'md5:a7071ae64d2f68cc821c729d4ded6bb3', - 'timestamp': 1420571160, - 'upload_date': '20150106', - 'uploader_id': 'cozzipix@gmail.com', - } + 'id': '524497489e4e8ff5848ece34', + 'title': '100 Little Things in GTA 5 That Will Blow Your Mind', + }, + 'playlist': [ + { + 'info_dict': { + 'id': '5ebbd138523268b93c9141af17bec937', + 'ext': 'mp4', + 'title': 'GTA 5 Video Review', + 'description': 'Rockstar drops the mic on this generation of games. Watch our review of the masterly Grand Theft Auto V.', + 'timestamp': 1379339880, + 'upload_date': '20130916', + }, + }, + { + 'info_dict': { + 'id': '638672ee848ae4ff108df2a296418ee2', + 'ext': 'mp4', + 'title': '26 Twisted Moments from GTA 5 in Slow Motion', + 'description': 'The twisted beauty of GTA 5 in stunning slow motion.', + 'timestamp': 1386878820, + 'upload_date': '20131212', + }, + }, + ], + 'params': { + 'playlist_items': '2-3', + 'skip_download': True, + }, }, { - 'url': 'http://www.pcmag.com/article2/0,2817,2470156,00.asp', - 'md5': '94130c1ca07ba0adb6088350681f16c1', + 'url': 'http://www.ign.com/articles/2014/08/15/rewind-theater-wild-trailer-gamescom-2014?watch', 'info_dict': { - 'id': '042e560ba94823d43afcb12ddf7142ca', - 'ext': 'mp4', - 'title': 'HTC\'s Weird New Re Camera - What\'s New Now', - 'description': 'md5:53433c45df96d2ea5d0fda18be2ca908', - 'timestamp': 1412953920, - 'upload_date': '20141010', - 'uploader_id': 'chris_snyder@pcmag.com', - } + 'id': '53ee806780a81ec46e0790f8', + 'title': 'Rewind Theater - Wild Trailer Gamescom 2014', + }, + 'playlist_count': 2, + }, { + # videoId pattern + 'url': 'http://www.ign.com/articles/2017/06/08/new-ducktales-short-donalds-birthday-doesnt-go-as-planned', + 'only_matching': True, + }, { + # Youtube embed + 'url': 'https://www.ign.com/articles/2021-mvp-named-in-puppy-bowl-xvii', + 'only_matching': True, + }, { + # IMDB embed + 'url': 'https://www.ign.com/articles/2014/08/07/sons-of-anarchy-final-season-trailer', + 'only_matching': True, + }, { + # Facebook embed + 'url': 'https://www.ign.com/articles/2017/09/20/marvels-the-punisher-watch-the-new-trailer-for-the-netflix-series', + 'only_matching': True, + }, { + # Brightcove embed + 'url': 'https://www.ign.com/articles/2016/01/16/supergirl-goes-flying-with-martian-manhunter-in-new-clip', + 'only_matching': True, }] + + def _real_extract(self, url): + display_id = self._match_id(url) + article = self._call_api(display_id) + + def entries(): + media_url = try_get(article, lambda x: x['mediaRelations'][0]['media']['metadata']['url']) + if media_url: + yield self.url_result(media_url, IGNIE.ie_key()) + for content in (article.get('content') or []): + for video_url in re.findall(r'(?:\[(?:ignvideo\s+url|youtube\s+clip_id)|]+src)="([^"]+)"', content): + yield self.url_result(video_url) + + return self.playlist_result( + entries(), article.get('articleId'), + strip_or_none(try_get(article, lambda x: x['metadata']['headline']))) diff --git a/haruhi_dl/extractor/iheart.py b/haruhi_dl/extractor/iheart.py new file mode 100644 index 000000000..b54c05eeb --- /dev/null +++ b/haruhi_dl/extractor/iheart.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + clean_podcast_url, + int_or_none, + str_or_none, +) + + +class IHeartRadioBaseIE(InfoExtractor): + def _call_api(self, path, video_id, fatal=True, query=None): + return self._download_json( + 'https://api.iheart.com/api/v3/podcast/' + path, + video_id, fatal=fatal, query=query) + + def _extract_episode(self, episode): + return { + 'thumbnail': episode.get('imageUrl'), + 'description': clean_html(episode.get('description')), + 'timestamp': int_or_none(episode.get('startDate'), 1000), + 'duration': int_or_none(episode.get('duration')), + } + + +class IHeartRadioIE(IHeartRadioBaseIE): + IENAME = 'iheartradio' + _VALID_URL = r'(?:https?://(?:www\.)?iheart\.com/podcast/[^/]+/episode/(?P[^/?&#]+)-|iheartradio:)(?P\d+)' + _TEST = { + 'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true', + 'md5': 'c8609c92c8688dcb69d8541042b8abca', + 'info_dict': { + 'id': '70346499', + 'ext': 'mp3', + 'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus', + 'description': 'md5:96cc7297b3a5a9ebae28643801c96fae', + 'timestamp': 1597741200, + 'upload_date': '20200818', + } + } + + def _real_extract(self, url): + episode_id = self._match_id(url) + episode = self._call_api( + 'episodes/' + episode_id, episode_id)['episode'] + info = self._extract_episode(episode) + info.update({ + 'id': episode_id, + 'title': episode['title'], + 'url': clean_podcast_url(episode['mediaUrl']), + }) + return info + + +class IHeartRadioPodcastIE(IHeartRadioBaseIE): + IE_NAME = 'iheartradio:podcast' + _VALID_URL = r'https?://(?:www\.)?iheart(?:podcastnetwork)?\.com/podcast/[^/?&#]+-(?P\d+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://www.iheart.com/podcast/1119-it-could-happen-here-30717896/', + 'info_dict': { + 'id': '30717896', + 'title': 'It Could Happen Here', + 'description': 'md5:5842117412a967eb0b01f8088eb663e2', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://www.iheartpodcastnetwork.com/podcast/105-stuff-you-should-know-26940277', + 'only_matching': True, + }] + + def _real_extract(self, url): + podcast_id = self._match_id(url) + path = 'podcasts/' + podcast_id + episodes = self._call_api( + path + '/episodes', podcast_id, query={'limit': 1000000000})['data'] + + entries = [] + for episode in episodes: + episode_id = str_or_none(episode.get('id')) + if not episode_id: + continue + info = self._extract_episode(episode) + info.update({ + '_type': 'url', + 'id': episode_id, + 'title': episode.get('title'), + 'url': 'iheartradio:' + episode_id, + 'ie_key': IHeartRadioIE.ie_key(), + }) + entries.append(info) + + podcast = self._call_api(path, podcast_id, False) or {} + + return self.playlist_result( + entries, podcast_id, podcast.get('title'), podcast.get('description')) diff --git a/haruhi_dl/extractor/ina.py b/haruhi_dl/extractor/ina.py index 12695af27..b3b2683cb 100644 --- a/haruhi_dl/extractor/ina.py +++ b/haruhi_dl/extractor/ina.py @@ -12,7 +12,7 @@ from ..utils import ( class InaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ina\.fr/(?:video|audio)/(?P[A-Z0-9_]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?ina\.fr/(?:video|audio)/(?P[A-Z0-9_]+)' _TESTS = [{ 'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', 'md5': 'a667021bf2b41f8dc6049479d9bb38a3', @@ -31,6 +31,9 @@ class InaIE(InfoExtractor): }, { 'url': 'https://www.ina.fr/video/P16173408-video.html', 'only_matching': True, + }, { + 'url': 'http://m.ina.fr/video/I12055569', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/haruhi_dl/extractor/infoq.py b/haruhi_dl/extractor/infoq.py index 18249cf9b..0a70a1fb4 100644 --- a/haruhi_dl/extractor/infoq.py +++ b/haruhi_dl/extractor/infoq.py @@ -54,7 +54,7 @@ class InfoQIE(BokeCCBaseIE): def _extract_rtmp_video(self, webpage): # The server URL is hardcoded - video_url = 'rtmpe://video.infoq.com/cfx/st/' + video_url = 'rtmpe://videof.infoq.com/cfx/st/' # Extract video URL encoded_id = self._search_regex( @@ -86,17 +86,18 @@ class InfoQIE(BokeCCBaseIE): return [{ 'format_id': 'http_video', 'url': http_video_url, + 'http_headers': {'Referer': 'https://www.infoq.com/'}, }] def _extract_http_audio(self, webpage, video_id): - fields = self._hidden_inputs(webpage) + fields = self._form_hidden_inputs('mp3Form', webpage) http_audio_url = fields.get('filename') if not http_audio_url: return [] # base URL is found in the Location header in the response returned by # GET https://www.infoq.com/mp3download.action?filename=... when logged in. - http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url) + http_audio_url = compat_urlparse.urljoin('http://ress.infoq.com/downloads/mp3downloads/', http_audio_url) http_audio_url = update_url_query(http_audio_url, self._extract_cf_auth(webpage)) # audio file seem to be missing some times even if there is a download link diff --git a/haruhi_dl/extractor/instagram.py b/haruhi_dl/extractor/instagram.py index b061850a1..27ea97f56 100644 --- a/haruhi_dl/extractor/instagram.py +++ b/haruhi_dl/extractor/instagram.py @@ -22,7 +22,7 @@ from ..utils import ( class InstagramIE(InfoExtractor): - _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/(?:p|tv)/(?P[^/?#&]+))' + _VALID_URL = r'(?Phttps?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P[^/?#&]+))' _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -35,7 +35,7 @@ class InstagramIE(InfoExtractor): 'timestamp': 1371748545, 'upload_date': '20130620', 'uploader_id': 'naomipq', - 'uploader': 'Naomi Leonor Phan-Quang', + 'uploader': 'B E A U T Y F O R A S H E S', 'like_count': int, 'comment_count': int, 'comments': list, @@ -95,6 +95,9 @@ class InstagramIE(InfoExtractor): }, { 'url': 'https://www.instagram.com/tv/aye83DjauH/', 'only_matching': True, + }, { + 'url': 'https://www.instagram.com/reel/CDUMkliABpa/', + 'only_matching': True, }] @staticmethod @@ -122,9 +125,9 @@ class InstagramIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - (video_url, description, thumbnail, timestamp, uploader, + (media, video_url, description, thumbnail, timestamp, uploader, uploader_id, like_count, comment_count, comments, height, - width) = [None] * 11 + width) = [None] * 12 shared_data = self._parse_json( self._search_regex( @@ -137,59 +140,77 @@ class InstagramIE(InfoExtractor): (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], lambda x: x['entry_data']['PostPage'][0]['media']), dict) - if media: - video_url = media.get('video_url') - height = int_or_none(media.get('dimensions', {}).get('height')) - width = int_or_none(media.get('dimensions', {}).get('width')) - description = try_get( - media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) or media.get('caption') - thumbnail = media.get('display_src') - timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) - uploader = media.get('owner', {}).get('full_name') - uploader_id = media.get('owner', {}).get('username') + # _sharedData.entry_data.PostPage is empty when authenticated (see + # https://github.com/hdl-org/haruhi-dl/pull/22880) + if not media: + additional_data = self._parse_json( + self._search_regex( + r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;', + webpage, 'additional data', default='{}'), + video_id, fatal=False) + if additional_data: + media = try_get( + additional_data, lambda x: x['graphql']['shortcode_media'], + dict) + if media: + video_url = media.get('video_url') + height = int_or_none(media.get('dimensions', {}).get('height')) + width = int_or_none(media.get('dimensions', {}).get('width')) + description = try_get( + media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], + compat_str) or media.get('caption') + thumbnail = media.get('display_src') or media.get('display_url') + timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) + uploader = media.get('owner', {}).get('full_name') + uploader_id = media.get('owner', {}).get('username') - def get_count(key, kind): - return int_or_none(try_get( + def get_count(keys, kind): + if not isinstance(keys, (list, tuple)): + keys = [keys] + for key in keys: + count = int_or_none(try_get( media, (lambda x: x['edge_media_%s' % key]['count'], lambda x: x['%ss' % kind]['count']))) - like_count = get_count('preview_like', 'like') - comment_count = get_count('to_comment', 'comment') + if count is not None: + return count + like_count = get_count('preview_like', 'like') + comment_count = get_count( + ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment') - comments = [{ - 'author': comment.get('user', {}).get('username'), - 'author_id': comment.get('user', {}).get('id'), - 'id': comment.get('id'), - 'text': comment.get('text'), - 'timestamp': int_or_none(comment.get('created_at')), - } for comment in media.get( - 'comments', {}).get('nodes', []) if comment.get('text')] - if not video_url: - edges = try_get( - media, lambda x: x['edge_sidecar_to_children']['edges'], - list) or [] - if edges: - entries = [] - for edge_num, edge in enumerate(edges, start=1): - node = try_get(edge, lambda x: x['node'], dict) - if not node: - continue - node_video_url = url_or_none(node.get('video_url')) - if not node_video_url: - continue - entries.append({ - 'id': node.get('shortcode') or node['id'], - 'title': 'Video %d' % edge_num, - 'url': node_video_url, - 'thumbnail': node.get('display_url'), - 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), - 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), - 'view_count': int_or_none(node.get('video_view_count')), - }) - return self.playlist_result( - entries, video_id, - 'Post by %s' % uploader_id if uploader_id else None, - description) + comments = [{ + 'author': comment.get('user', {}).get('username'), + 'author_id': comment.get('user', {}).get('id'), + 'id': comment.get('id'), + 'text': comment.get('text'), + 'timestamp': int_or_none(comment.get('created_at')), + } for comment in media.get( + 'comments', {}).get('nodes', []) if comment.get('text')] + if not video_url: + edges = try_get( + media, lambda x: x['edge_sidecar_to_children']['edges'], + list) or [] + if edges: + entries = [] + for edge_num, edge in enumerate(edges, start=1): + node = try_get(edge, lambda x: x['node'], dict) + if not node: + continue + node_video_url = url_or_none(node.get('video_url')) + if not node_video_url: + continue + entries.append({ + 'id': node.get('shortcode') or node['id'], + 'title': 'Video %d' % edge_num, + 'url': node_video_url, + 'thumbnail': node.get('display_url'), + 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), + 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), + 'view_count': int_or_none(node.get('video_view_count')), + }) + return self.playlist_result( + entries, video_id, + 'Post by %s' % uploader_id if uploader_id else None, + description) if not video_url: video_url = self._og_search_video_url(webpage, secure=False) diff --git a/haruhi_dl/extractor/itv.py b/haruhi_dl/extractor/itv.py index ad2f4eca5..e86c40b42 100644 --- a/haruhi_dl/extractor/itv.py +++ b/haruhi_dl/extractor/itv.py @@ -1,29 +1,21 @@ # coding: utf-8 from __future__ import unicode_literals -import uuid -import xml.etree.ElementTree as etree import json import re from .common import InfoExtractor from .brightcove import BrightcoveNewIE -from ..compat import ( - compat_str, - compat_etree_register_namespace, -) from ..utils import ( + clean_html, determine_ext, - ExtractorError, extract_attributes, - int_or_none, + get_element_by_class, + JSON_LD_RE, merge_dicts, parse_duration, smuggle_url, url_or_none, - xpath_with_ns, - xpath_element, - xpath_text, ) @@ -31,14 +23,18 @@ class ITVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P[0-9a-zA-Z]+)' _GEO_COUNTRIES = ['GB'] _TESTS = [{ - 'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053', + 'url': 'https://www.itv.com/hub/liar/2a4547a0012', 'info_dict': { - 'id': '2a2936a0053', - 'ext': 'flv', - 'title': 'Home Movie', + 'id': '2a4547a0012', + 'ext': 'mp4', + 'title': 'Liar - Series 2 - Episode 6', + 'description': 'md5:d0f91536569dec79ea184f0a44cca089', + 'series': 'Liar', + 'season_number': 2, + 'episode_number': 6, }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, }, { @@ -61,220 +57,97 @@ class ITVIE(InfoExtractor): params = extract_attributes(self._search_regex( r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params')) - ns_map = { - 'soapenv': 'http://schemas.xmlsoap.org/soap/envelope/', - 'tem': 'http://tempuri.org/', - 'itv': 'http://schemas.datacontract.org/2004/07/Itv.BB.Mercury.Common.Types', - 'com': 'http://schemas.itv.com/2009/05/Common', - } - for ns, full_ns in ns_map.items(): - compat_etree_register_namespace(ns, full_ns) - - def _add_ns(name): - return xpath_with_ns(name, ns_map) - - def _add_sub_element(element, name): - return etree.SubElement(element, _add_ns(name)) - - production_id = ( - params.get('data-video-autoplay-id') - or '%s#001' % ( - params.get('data-video-episode-id') - or video_id.replace('a', '/'))) - - req_env = etree.Element(_add_ns('soapenv:Envelope')) - _add_sub_element(req_env, 'soapenv:Header') - body = _add_sub_element(req_env, 'soapenv:Body') - get_playlist = _add_sub_element(body, ('tem:GetPlaylist')) - request = _add_sub_element(get_playlist, 'tem:request') - _add_sub_element(request, 'itv:ProductionId').text = production_id - _add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper() - vodcrid = _add_sub_element(request, 'itv:Vodcrid') - _add_sub_element(vodcrid, 'com:Id') - _add_sub_element(request, 'itv:Partition') - user_info = _add_sub_element(get_playlist, 'tem:userInfo') - _add_sub_element(user_info, 'itv:Broadcaster').text = 'Itv' - _add_sub_element(user_info, 'itv:DM') - _add_sub_element(user_info, 'itv:RevenueScienceValue') - _add_sub_element(user_info, 'itv:SessionId') - _add_sub_element(user_info, 'itv:SsoToken') - _add_sub_element(user_info, 'itv:UserToken') - site_info = _add_sub_element(get_playlist, 'tem:siteInfo') - _add_sub_element(site_info, 'itv:AdvertisingRestriction').text = 'None' - _add_sub_element(site_info, 'itv:AdvertisingSite').text = 'ITV' - _add_sub_element(site_info, 'itv:AdvertisingType').text = 'Any' - _add_sub_element(site_info, 'itv:Area').text = 'ITVPLAYER.VIDEO' - _add_sub_element(site_info, 'itv:Category') - _add_sub_element(site_info, 'itv:Platform').text = 'DotCom' - _add_sub_element(site_info, 'itv:Site').text = 'ItvCom' - device_info = _add_sub_element(get_playlist, 'tem:deviceInfo') - _add_sub_element(device_info, 'itv:ScreenSize').text = 'Big' - player_info = _add_sub_element(get_playlist, 'tem:playerInfo') - _add_sub_element(player_info, 'itv:Version').text = '2' - + ios_playlist_url = params.get('data-video-playlist') or params['data-video-id'] + hmac = params['data-video-hmac'] headers = self.geo_verification_headers() headers.update({ - 'Content-Type': 'text/xml; charset=utf-8', - 'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist', + 'Accept': 'application/vnd.itv.vod.playlist.v2+json', + 'Content-Type': 'application/json', + 'hmac': hmac.upper(), }) + ios_playlist = self._download_json( + ios_playlist_url, video_id, data=json.dumps({ + 'user': { + 'itvUserId': '', + 'entitlements': [], + 'token': '' + }, + 'device': { + 'manufacturer': 'Safari', + 'model': '5', + 'os': { + 'name': 'Windows NT', + 'version': '6.1', + 'type': 'desktop' + } + }, + 'client': { + 'version': '4.1', + 'id': 'browser' + }, + 'variantAvailability': { + 'featureset': { + 'min': ['hls', 'aes', 'outband-webvtt'], + 'max': ['hls', 'aes', 'outband-webvtt'] + }, + 'platformTag': 'dotcom' + } + }).encode(), headers=headers) + video_data = ios_playlist['Playlist']['Video'] + ios_base_url = video_data.get('Base') - info = self._search_json_ld(webpage, video_id, default={}) formats = [] - subtitles = {} - - def extract_subtitle(sub_url): - ext = determine_ext(sub_url, 'ttml') - subtitles.setdefault('en', []).append({ - 'url': sub_url, - 'ext': 'ttml' if ext == 'xml' else ext, - }) - - resp_env = self._download_xml( - params['data-playlist-url'], video_id, - headers=headers, data=etree.tostring(req_env), fatal=False) - if resp_env: - playlist = xpath_element(resp_env, './/Playlist') - if playlist is None: - fault_code = xpath_text(resp_env, './/faultcode') - fault_string = xpath_text(resp_env, './/faultstring') - if fault_code == 'InvalidGeoRegion': - self.raise_geo_restricted( - msg=fault_string, countries=self._GEO_COUNTRIES) - elif fault_code not in ( - 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'): - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, fault_string), expected=True) - info.update({ - 'title': self._og_search_title(webpage), - 'episode_title': params.get('data-video-episode'), - 'series': params.get('data-video-title'), - }) + for media_file in (video_data.get('MediaFiles') or []): + href = media_file.get('Href') + if not href: + continue + if ios_base_url: + href = ios_base_url + href + ext = determine_ext(href) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + href, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) else: - title = xpath_text(playlist, 'EpisodeTitle', default=None) - info.update({ - 'title': title, - 'episode_title': title, - 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), - 'series': xpath_text(playlist, 'ProgrammeTitle'), - 'duration': parse_duration(xpath_text(playlist, 'Duration')), + formats.append({ + 'url': href, }) - video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True) - media_files = xpath_element(video_element, 'MediaFiles', fatal=True) - rtmp_url = media_files.attrib['base'] - - for media_file in media_files.findall('MediaFile'): - play_path = xpath_text(media_file, 'URL') - if not play_path: - continue - tbr = int_or_none(media_file.get('bitrate'), 1000) - f = { - 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), - 'play_path': play_path, - # Providing this swfVfy allows to avoid truncated downloads - 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', - 'page_url': url, - 'tbr': tbr, - 'ext': 'flv', - } - app = self._search_regex( - 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) - if app: - f.update({ - 'url': rtmp_url.split('?', 1)[0], - 'app': app, - }) - else: - f['url'] = rtmp_url - formats.append(f) - - for caption_url in video_element.findall('ClosedCaptioningURIs/URL'): - if caption_url.text: - extract_subtitle(caption_url.text) - - ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id') - hmac = params.get('data-video-hmac') - if ios_playlist_url and hmac and re.match(r'https?://', ios_playlist_url): - headers = self.geo_verification_headers() - headers.update({ - 'Accept': 'application/vnd.itv.vod.playlist.v2+json', - 'Content-Type': 'application/json', - 'hmac': hmac.upper(), - }) - ios_playlist = self._download_json( - ios_playlist_url, video_id, data=json.dumps({ - 'user': { - 'itvUserId': '', - 'entitlements': [], - 'token': '' - }, - 'device': { - 'manufacturer': 'Safari', - 'model': '5', - 'os': { - 'name': 'Windows NT', - 'version': '6.1', - 'type': 'desktop' - } - }, - 'client': { - 'version': '4.1', - 'id': 'browser' - }, - 'variantAvailability': { - 'featureset': { - 'min': ['hls', 'aes', 'outband-webvtt'], - 'max': ['hls', 'aes', 'outband-webvtt'] - }, - 'platformTag': 'dotcom' - } - }).encode(), headers=headers, fatal=False) - if ios_playlist: - video_data = ios_playlist.get('Playlist', {}).get('Video', {}) - ios_base_url = video_data.get('Base') - for media_file in video_data.get('MediaFiles', []): - href = media_file.get('Href') - if not href: - continue - if ios_base_url: - href = ios_base_url + href - ext = determine_ext(href) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - href, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'url': href, - }) - subs = video_data.get('Subtitles') - if isinstance(subs, list): - for sub in subs: - if not isinstance(sub, dict): - continue - href = url_or_none(sub.get('Href')) - if href: - extract_subtitle(href) - if not info.get('duration'): - info['duration'] = parse_duration(video_data.get('Duration')) - self._sort_formats(formats) - info.update({ + subtitles = {} + subs = video_data.get('Subtitles') or [] + for sub in subs: + if not isinstance(sub, dict): + continue + href = url_or_none(sub.get('Href')) + if not href: + continue + subtitles.setdefault('en', []).append({ + 'url': href, + 'ext': determine_ext(href, 'vtt'), + }) + + info = self._search_json_ld(webpage, video_id, default={}) + if not info: + json_ld = self._parse_json(self._search_regex( + JSON_LD_RE, webpage, 'JSON-LD', '{}', + group='json_ld'), video_id, fatal=False) + if json_ld and json_ld.get('@type') == 'BreadcrumbList': + for ile in (json_ld.get('itemListElement:') or []): + item = ile.get('item:') or {} + if item.get('@type') == 'TVEpisode': + item['@context'] = 'http://schema.org' + info = self._json_ld(item, video_id, fatal=False) or {} + break + + return merge_dicts({ 'id': video_id, + 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage), 'formats': formats, 'subtitles': subtitles, - }) - - webpage_info = self._search_json_ld(webpage, video_id, default={}) - if not webpage_info.get('title'): - webpage_info['title'] = self._html_search_regex( - r'(?s)]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<', - webpage, 'title', default=None) or self._og_search_title( - webpage, default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', - default=None) or webpage_info['episode'] - - return merge_dicts(info, webpage_info) + 'duration': parse_duration(video_data.get('Duration')), + 'description': clean_html(get_element_by_class('episode-info__synopsis', webpage)), + }, info) class ITVBTCCIE(InfoExtractor): diff --git a/haruhi_dl/extractor/kakao.py b/haruhi_dl/extractor/kakao.py index 32935bb28..31ce7a85c 100644 --- a/haruhi_dl/extractor/kakao.py +++ b/haruhi_dl/extractor/kakao.py @@ -3,10 +3,13 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, int_or_none, + str_or_none, strip_or_none, + try_get, unified_timestamp, update_url_query, ) @@ -23,7 +26,7 @@ class KakaoIE(InfoExtractor): 'id': '301965083', 'ext': 'mp4', 'title': '乃木坂46 バナナマン 「3期生紹介コーナーが始動!顔高低差GPも!」 『乃木坂工事中』', - 'uploader_id': 2671005, + 'uploader_id': '2671005', 'uploader': '그랑그랑이', 'timestamp': 1488160199, 'upload_date': '20170227', @@ -36,11 +39,15 @@ class KakaoIE(InfoExtractor): 'ext': 'mp4', 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회', 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)', - 'uploader_id': 2653210, + 'uploader_id': '2653210', 'uploader': '쇼! 음악중심', 'timestamp': 1485684628, 'upload_date': '20170129', } + }, { + # geo restricted + 'url': 'https://tv.kakao.com/channel/3643855/cliplink/412069491', + 'only_matching': True, }] def _real_extract(self, url): @@ -68,8 +75,7 @@ class KakaoIE(InfoExtractor): 'fields': ','.join([ '-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title', 'description', 'channelId', 'createTime', 'duration', 'playCount', - 'likeCount', 'commentCount', 'tagList', 'channel', 'name', - 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault', + 'likeCount', 'commentCount', 'tagList', 'channel', 'name', 'thumbnailUrl', 'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label']) } @@ -82,24 +88,28 @@ class KakaoIE(InfoExtractor): title = clip.get('title') or clip_link.get('displayTitle') - query['tid'] = impress.get('tid', '') + query.update({ + 'fields': '-*,code,message,url', + 'tid': impress.get('tid') or '', + }) formats = [] - for fmt in clip.get('videoOutputList', []): + for fmt in (clip.get('videoOutputList') or []): try: profile_name = fmt['profile'] if profile_name == 'AUDIO': continue - query.update({ - 'profile': profile_name, - 'fields': '-*,url', - }) - fmt_url_json = self._download_json( - api_base + 'raw/videolocation', display_id, - 'Downloading video URL for profile %s' % profile_name, - query=query, headers=player_header, fatal=False) - - if fmt_url_json is None: + query['profile'] = profile_name + try: + fmt_url_json = self._download_json( + api_base + 'raw/videolocation', display_id, + 'Downloading video URL for profile %s' % profile_name, + query=query, headers=player_header) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + resp = self._parse_json(e.cause.read().decode(), video_id) + if resp.get('code') == 'GeoBlocked': + self.raise_geo_restricted() continue fmt_url = fmt_url_json['url'] @@ -116,27 +126,13 @@ class KakaoIE(InfoExtractor): pass self._sort_formats(formats) - thumbs = [] - for thumb in clip.get('clipChapterThumbnailList', []): - thumbs.append({ - 'url': thumb.get('thumbnailUrl'), - 'id': compat_str(thumb.get('timeInSec')), - 'preference': -1 if thumb.get('isDefault') else 0 - }) - top_thumbnail = clip.get('thumbnailUrl') - if top_thumbnail: - thumbs.append({ - 'url': top_thumbnail, - 'preference': 10, - }) - return { 'id': display_id, 'title': title, 'description': strip_or_none(clip.get('description')), - 'uploader': clip_link.get('channel', {}).get('name'), - 'uploader_id': clip_link.get('channelId'), - 'thumbnails': thumbs, + 'uploader': try_get(clip_link, lambda x: x['channel']['name']), + 'uploader_id': str_or_none(clip_link.get('channelId')), + 'thumbnail': clip.get('thumbnailUrl'), 'timestamp': unified_timestamp(clip_link.get('createTime')), 'duration': int_or_none(clip.get('duration')), 'view_count': int_or_none(clip.get('playCount')), diff --git a/haruhi_dl/extractor/kanalplay.py b/haruhi_dl/extractor/kanalplay.py deleted file mode 100644 index 6c3498c67..000000000 --- a/haruhi_dl/extractor/kanalplay.py +++ /dev/null @@ -1,97 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - srt_subtitles_timecode, -) - - -class KanalPlayIE(InfoExtractor): - IE_DESC = 'Kanal 5/9/11 Play' - _VALID_URL = r'https?://(?:www\.)?kanal(?P5|9|11)play\.se/(?:#!/)?(?:play/)?program/\d+/video/(?P\d+)' - _TESTS = [{ - 'url': 'http://www.kanal5play.se/#!/play/program/3060212363/video/3270012277', - 'info_dict': { - 'id': '3270012277', - 'ext': 'flv', - 'title': 'Saknar både dusch och avlopp', - 'description': 'md5:6023a95832a06059832ae93bc3c7efb7', - 'duration': 2636.36, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.kanal9play.se/#!/play/program/335032/video/246042', - 'only_matching': True, - }, { - 'url': 'http://www.kanal11play.se/#!/play/program/232835958/video/367135199', - 'only_matching': True, - }] - - def _fix_subtitles(self, subs): - return '\r\n\r\n'.join( - '%s\r\n%s --> %s\r\n%s' - % ( - num, - srt_subtitles_timecode(item['startMillis'] / 1000.0), - srt_subtitles_timecode(item['endMillis'] / 1000.0), - item['text'], - ) for num, item in enumerate(subs, 1)) - - def _get_subtitles(self, channel_id, video_id): - subs = self._download_json( - 'http://www.kanal%splay.se/api/subtitles/%s' % (channel_id, video_id), - video_id, 'Downloading subtitles JSON', fatal=False) - return {'sv': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} if subs else {} - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - channel_id = mobj.group('channel_id') - - video = self._download_json( - 'http://www.kanal%splay.se/api/getVideo?format=FLASH&videoId=%s' % (channel_id, video_id), - video_id) - - reasons_for_no_streams = video.get('reasonsForNoStreams') - if reasons_for_no_streams: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, '\n'.join(reasons_for_no_streams)), - expected=True) - - title = video['title'] - description = video.get('description') - duration = float_or_none(video.get('length'), 1000) - thumbnail = video.get('posterUrl') - - stream_base_url = video['streamBaseUrl'] - - formats = [{ - 'url': stream_base_url, - 'play_path': stream['source'], - 'ext': 'flv', - 'tbr': float_or_none(stream.get('bitrate'), 1000), - 'rtmp_real_time': True, - } for stream in video['streams']] - self._sort_formats(formats) - - subtitles = {} - if video.get('hasSubtitle'): - subtitles = self.extract_subtitles(channel_id, video_id) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/haruhi_dl/extractor/ketnet.py b/haruhi_dl/extractor/ketnet.py index 93a98e1e0..e0599d02f 100644 --- a/haruhi_dl/extractor/ketnet.py +++ b/haruhi_dl/extractor/ketnet.py @@ -2,92 +2,71 @@ from __future__ import unicode_literals from .canvas import CanvasIE from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote +from ..utils import ( + int_or_none, + parse_iso8601, +) class KetnetIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?:[^/]+/)*(?P[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ - 'url': 'https://www.ketnet.be/kijken/zomerse-filmpjes', - 'md5': '6bdeb65998930251bbd1c510750edba9', + 'url': 'https://www.ketnet.be/kijken/n/nachtwacht/3/nachtwacht-s3a1-de-greystook', + 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', 'info_dict': { - 'id': 'zomerse-filmpjes', + 'id': 'pbs-pub-aef8b526-115e-4006-aa24-e59ff6c6ef6f$vid-ddb815bf-c8e7-467b-8879-6bad7a32cebd', 'ext': 'mp4', - 'title': 'Gluur mee op de filmset en op Pennenzakkenrock', - 'description': 'Gluur mee met Ghost Rockers op de filmset', + 'title': 'Nachtwacht - Reeks 3: Aflevering 1', + 'description': 'De Nachtwacht krijgt te maken met een parasiet', 'thumbnail': r're:^https?://.*\.jpg$', - } - }, { - # mzid in playerConfig instead of sources - 'url': 'https://www.ketnet.be/kijken/nachtwacht/de-greystook', - 'md5': '90139b746a0a9bd7bb631283f6e2a64e', - 'info_dict': { - 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'ext': 'flv', - 'title': 'Nachtwacht: De Greystook', - 'description': 'md5:1db3f5dc4c7109c821261e7512975be7', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.03, + 'duration': 1468.02, + 'timestamp': 1609225200, + 'upload_date': '20201229', + 'series': 'Nachtwacht', + 'season': 'Reeks 3', + 'episode': 'De Greystook', + 'episode_number': 1, }, 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], }, { - 'url': 'https://www.ketnet.be/kijken/karrewiet/uitzending-8-september-2016', - 'only_matching': True, - }, { - 'url': 'https://www.ketnet.be/achter-de-schermen/sien-repeteert-voor-stars-for-life', - 'only_matching': True, - }, { - # mzsource, geo restricted to Belgium - 'url': 'https://www.ketnet.be/kijken/nachtwacht/de-bermadoe', + 'url': 'https://www.ketnet.be/themas/karrewiet/jaaroverzicht-20200/karrewiet-het-jaar-van-black-mamba', 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + video = self._download_json( + 'https://senior-bff.ketnet.be/graphql', display_id, query={ + 'query': '''{ + video(id: "content/ketnet/nl/%s.model.json") { + description + episodeNr + imageUrl + mediaReference + programTitle + publicationDate + seasonTitle + subtitleVideodetail + titleVideodetail + } +}''' % display_id, + })['data']['video'] - config = self._parse_json( - self._search_regex( - r'(?s)playerConfig\s*=\s*({.+?})\s*;', webpage, - 'player config'), - video_id) - - mzid = config.get('mzid') - if mzid: - return self.url_result( - 'https://mediazone.vrt.be/api/v1/ketnet/assets/%s' % mzid, - CanvasIE.ie_key(), video_id=mzid) - - title = config['title'] - - formats = [] - for source_key in ('', 'mz'): - source = config.get('%ssource' % source_key) - if not isinstance(source, dict): - continue - for format_id, format_url in source.items(): - if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id=format_id, - fatal=False)) - elif format_id == 'hds': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_id, fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) + mz_id = compat_urllib_parse_unquote(video['mediaReference']) return { - 'id': video_id, - 'title': title, - 'description': config.get('description'), - 'thumbnail': config.get('image'), - 'series': config.get('program'), - 'episode': config.get('episode'), - 'formats': formats, + '_type': 'url_transparent', + 'id': mz_id, + 'title': video['titleVideodetail'], + 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/' + mz_id, + 'thumbnail': video.get('imageUrl'), + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('publicationDate')), + 'series': video.get('programTitle'), + 'season': video.get('seasonTitle'), + 'episode': video.get('subtitleVideodetail'), + 'episode_number': int_or_none(video.get('episodeNr')), + 'ie_key': CanvasIE.ie_key(), } diff --git a/haruhi_dl/extractor/khanacademy.py b/haruhi_dl/extractor/khanacademy.py index 61739efa7..87e520378 100644 --- a/haruhi_dl/extractor/khanacademy.py +++ b/haruhi_dl/extractor/khanacademy.py @@ -1,82 +1,107 @@ from __future__ import unicode_literals -import re +import json from .common import InfoExtractor from ..utils import ( - unified_strdate, + int_or_none, + parse_iso8601, + try_get, ) -class KhanAcademyIE(InfoExtractor): - _VALID_URL = r'^https?://(?:(?:www|api)\.)?khanacademy\.org/(?P[^/]+)/(?:[^/]+/){,2}(?P[^?#/]+)(?:$|[?#])' - IE_NAME = 'KhanAcademy' +class KhanAcademyBaseIE(InfoExtractor): + _VALID_URL_TEMPL = r'https?://(?:www\.)?khanacademy\.org/(?P(?:[^/]+/){%s}%s[^?#/&]+)' - _TESTS = [{ - 'url': 'http://www.khanacademy.org/video/one-time-pad', - 'md5': '7b391cce85e758fb94f763ddc1bbb979', + def _parse_video(self, video): + return { + '_type': 'url_transparent', + 'url': video['youtubeId'], + 'id': video.get('slug'), + 'title': video.get('title'), + 'thumbnail': video.get('imageUrl') or video.get('thumbnailUrl'), + 'duration': int_or_none(video.get('duration')), + 'description': video.get('description'), + 'ie_key': 'Youtube', + } + + def _real_extract(self, url): + display_id = self._match_id(url) + component_props = self._parse_json(self._download_json( + 'https://www.khanacademy.org/api/internal/graphql', + display_id, query={ + 'hash': 1604303425, + 'variables': json.dumps({ + 'path': display_id, + 'queryParams': '', + }), + })['data']['contentJson'], display_id)['componentProps'] + return self._parse_component_props(component_props) + + +class KhanAcademyIE(KhanAcademyBaseIE): + IE_NAME = 'khanacademy' + _VALID_URL = KhanAcademyBaseIE._VALID_URL_TEMPL % ('4', 'v/') + _TEST = { + 'url': 'https://www.khanacademy.org/computing/computer-science/cryptography/crypt/v/one-time-pad', + 'md5': '9c84b7b06f9ebb80d22a5c8dedefb9a0', 'info_dict': { - 'id': 'one-time-pad', - 'ext': 'webm', + 'id': 'FlIG3TvQCBQ', + 'ext': 'mp4', 'title': 'The one-time pad', 'description': 'The perfect cipher', 'duration': 176, 'uploader': 'Brit Cruise', 'uploader_id': 'khanacademy', 'upload_date': '20120411', + 'timestamp': 1334170113, + 'license': 'cc-by-nc-sa', }, 'add_ie': ['Youtube'], - }, { - 'url': 'https://www.khanacademy.org/math/applied-math/cryptography', + } + + def _parse_component_props(self, component_props): + video = component_props['tutorialPageData']['contentModel'] + info = self._parse_video(video) + author_names = video.get('authorNames') + info.update({ + 'uploader': ', '.join(author_names) if author_names else None, + 'timestamp': parse_iso8601(video.get('dateAdded')), + 'license': video.get('kaUserLicense'), + }) + return info + + +class KhanAcademyUnitIE(KhanAcademyBaseIE): + IE_NAME = 'khanacademy:unit' + _VALID_URL = (KhanAcademyBaseIE._VALID_URL_TEMPL % ('2', '')) + '/?(?:[?#&]|$)' + _TEST = { + 'url': 'https://www.khanacademy.org/computing/computer-science/cryptography', 'info_dict': { 'id': 'cryptography', - 'title': 'Journey into cryptography', + 'title': 'Cryptography', 'description': 'How have humans protected their secret messages through history? What has changed today?', }, - 'playlist_mincount': 3, - }] + 'playlist_mincount': 31, + } - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('id') + def _parse_component_props(self, component_props): + curation = component_props['curation'] - if m.group('key') == 'video': - data = self._download_json( - 'http://api.khanacademy.org/api/v1/videos/' + video_id, - video_id, 'Downloading video info') - - upload_date = unified_strdate(data['date_added']) - uploader = ', '.join(data['author_names']) - return { - '_type': 'url_transparent', - 'url': data['url'], - 'id': video_id, - 'title': data['title'], - 'thumbnail': data['image_url'], - 'duration': data['duration'], - 'description': data['description'], - 'uploader': uploader, - 'upload_date': upload_date, + entries = [] + tutorials = try_get(curation, lambda x: x['tabs'][0]['modules'][0]['tutorials'], list) or [] + for tutorial_number, tutorial in enumerate(tutorials, 1): + chapter_info = { + 'chapter': tutorial.get('title'), + 'chapter_number': tutorial_number, + 'chapter_id': tutorial.get('id'), } - else: - # topic - data = self._download_json( - 'http://api.khanacademy.org/api/v1/topic/' + video_id, - video_id, 'Downloading topic info') + for content_item in (tutorial.get('contentItems') or []): + if content_item.get('kind') == 'Video': + info = self._parse_video(content_item) + info.update(chapter_info) + entries.append(info) - entries = [ - { - '_type': 'url', - 'url': c['url'], - 'id': c['id'], - 'title': c['title'], - } - for c in data['children'] if c['kind'] in ('Video', 'Topic')] - - return { - '_type': 'playlist', - 'id': video_id, - 'title': data['title'], - 'description': data['description'], - 'entries': entries, - } + return self.playlist_result( + entries, curation.get('unit'), curation.get('title'), + curation.get('description')) diff --git a/haruhi_dl/extractor/linuxacademy.py b/haruhi_dl/extractor/linuxacademy.py index 23ca965d9..7ec4a6557 100644 --- a/haruhi_dl/extractor/linuxacademy.py +++ b/haruhi_dl/extractor/linuxacademy.py @@ -8,11 +8,15 @@ from .common import InfoExtractor from ..compat import ( compat_b64decode, compat_HTTPError, + compat_str, ) from ..utils import ( + clean_html, ExtractorError, - orderedSet, - unescapeHTML, + js_to_json, + parse_duration, + try_get, + unified_timestamp, urlencode_postdata, urljoin, ) @@ -28,11 +32,15 @@ class LinuxAcademyIE(InfoExtractor): ) ''' _TESTS = [{ - 'url': 'https://linuxacademy.com/cp/courses/lesson/course/1498/lesson/2/module/154', + 'url': 'https://linuxacademy.com/cp/courses/lesson/course/7971/lesson/2/module/675', 'info_dict': { - 'id': '1498-2', + 'id': '7971-2', 'ext': 'mp4', - 'title': "Introduction to the Practitioner's Brief", + 'title': 'What Is Data Science', + 'description': 'md5:c574a3c20607144fb36cb65bdde76c99', + 'timestamp': 1607387907, + 'upload_date': '20201208', + 'duration': 304, }, 'params': { 'skip_download': True, @@ -46,7 +54,8 @@ class LinuxAcademyIE(InfoExtractor): 'info_dict': { 'id': '154', 'title': 'AWS Certified Cloud Practitioner', - 'description': 'md5:039db7e60e4aac9cf43630e0a75fa834', + 'description': 'md5:a68a299ca9bb98d41cca5abc4d4ce22c', + 'duration': 28835, }, 'playlist_count': 41, 'skip': 'Requires Linux Academy account credentials', @@ -74,6 +83,7 @@ class LinuxAcademyIE(InfoExtractor): self._AUTHORIZE_URL, None, 'Downloading authorize page', query={ 'client_id': self._CLIENT_ID, 'response_type': 'token id_token', + 'response_mode': 'web_message', 'redirect_uri': self._ORIGIN_URL, 'scope': 'openid email user_impersonation profile', 'audience': self._ORIGIN_URL, @@ -129,7 +139,13 @@ class LinuxAcademyIE(InfoExtractor): access_token = self._search_regex( r'access_token=([^=&]+)', urlh.geturl(), - 'access token') + 'access token', default=None) + if not access_token: + access_token = self._parse_json( + self._search_regex( + r'authorizationResponse\s*=\s*({.+?})\s*;', callback_page, + 'authorization response'), None, + transform_source=js_to_json)['response']['access_token'] self._download_webpage( 'https://linuxacademy.com/cp/login/tokenValidateLogin/token/%s' @@ -144,30 +160,84 @@ class LinuxAcademyIE(InfoExtractor): # course path if course_id: - entries = [ - self.url_result( - urljoin(url, lesson_url), ie=LinuxAcademyIE.ie_key()) - for lesson_url in orderedSet(re.findall( - r']+\bhref=["\'](/cp/courses/lesson/course/\d+/lesson/\d+/module/\d+)', - webpage))] - title = unescapeHTML(self._html_search_regex( - (r'class=["\']course-title["\'][^>]*>(?P[^<]+)', - r'var\s+title\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), - webpage, 'title', default=None, group='value')) - description = unescapeHTML(self._html_search_regex( - r'var\s+description\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'description', default=None, group='value')) - return self.playlist_result(entries, course_id, title, description) + module = self._parse_json( + self._search_regex( + r'window\.module\s*=\s*({.+?})\s*;', webpage, 'module'), + item_id) + entries = [] + chapter_number = None + chapter = None + chapter_id = None + for item in module['items']: + if not isinstance(item, dict): + continue + + def type_field(key): + return (try_get(item, lambda x: x['type'][key], compat_str) or '').lower() + type_fields = (type_field('name'), type_field('slug')) + # Move to next module section + if 'section' in type_fields: + chapter = item.get('course_name') + chapter_id = item.get('course_module') + chapter_number = 1 if not chapter_number else chapter_number + 1 + continue + # Skip non-lessons + if 'lesson' not in type_fields: + continue + lesson_url = urljoin(url, item.get('url')) + if not lesson_url: + continue + title = item.get('title') or item.get('lesson_name') + description = item.get('md_desc') or clean_html(item.get('description')) or clean_html(item.get('text')) + entries.append({ + '_type': 'url_transparent', + 'url': lesson_url, + 'ie_key': LinuxAcademyIE.ie_key(), + 'title': title, + 'description': description, + 'timestamp': unified_timestamp(item.get('date')) or unified_timestamp(item.get('created_on')), + 'duration': parse_duration(item.get('duration')), + 'chapter': chapter, + 'chapter_id': chapter_id, + 'chapter_number': chapter_number, + }) + return { + '_type': 'playlist', + 'entries': entries, + 'id': course_id, + 'title': module.get('title'), + 'description': module.get('md_desc') or clean_html(module.get('desc')), + 'duration': parse_duration(module.get('duration')), + } # single video path - info = self._extract_jwplayer_data( - webpage, item_id, require_title=False, m3u8_id='hls',) - title = self._search_regex( - (r'>Lecture\s*:\s*(?P[^<]+)', - r'lessonName\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), webpage, - 'title', group='value') - info.update({ + m3u8_url = self._parse_json( + self._search_regex( + r'player\.playlist\s*=\s*(\[.+?\])\s*;', webpage, 'playlist'), + item_id)[0]['file'] + formats = self._extract_m3u8_formats( + m3u8_url, item_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + info = { 'id': item_id, - 'title': title, - }) + 'formats': formats, + } + lesson = self._parse_json( + self._search_regex( + (r'window\.lesson\s*=\s*({.+?})\s*;', + r'player\.lesson\s*=\s*({.+?})\s*;'), + webpage, 'lesson', default='{}'), item_id, fatal=False) + if lesson: + info.update({ + 'title': lesson.get('lesson_name'), + 'description': lesson.get('md_desc') or clean_html(lesson.get('desc')), + 'timestamp': unified_timestamp(lesson.get('date')) or unified_timestamp(lesson.get('created_on')), + 'duration': parse_duration(lesson.get('duration')), + }) + if not info.get('title'): + info['title'] = self._search_regex( + (r'>Lecture\s*:\s*(?P[^<]+)', + r'lessonName\s*=\s*(["\'])(?P(?:(?!\1).)+)\1'), webpage, + 'title', group='value') return info diff --git a/haruhi_dl/extractor/lrt.py b/haruhi_dl/extractor/lrt.py index f5c997ef4..89d549858 100644 --- a/haruhi_dl/extractor/lrt.py +++ b/haruhi_dl/extractor/lrt.py @@ -5,28 +5,26 @@ import re from .common import InfoExtractor from ..utils import ( - determine_ext, - int_or_none, - parse_duration, - remove_end, + clean_html, + merge_dicts, ) class LRTIE(InfoExtractor): IE_NAME = 'lrt.lt' - _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P/mediateka/irasas/(?P[0-9]+))' _TESTS = [{ # m3u8 download - 'url': 'http://www.lrt.lt/mediateka/irasas/54391/', - 'md5': 'fe44cf7e4ab3198055f2c598fc175cb0', + 'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene', + 'md5': '85cb2bb530f31d91a9c65b479516ade4', 'info_dict': { - 'id': '54391', + 'id': '2000127261', 'ext': 'mp4', - 'title': 'Septynios Kauno dienos', - 'description': 'md5:24d84534c7dc76581e59f5689462411a', - 'duration': 1783, - 'view_count': int, - 'like_count': int, + 'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė', + 'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa', + 'duration': 3035, + 'timestamp': 1604079000, + 'upload_date': '20201030', }, }, { # direct mp3 download @@ -43,52 +41,35 @@ class LRTIE(InfoExtractor): }, }] + def _extract_js_var(self, webpage, var_name, default): + return self._search_regex( + r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name, + webpage, var_name.replace('_', ' '), default, group=2) + def _real_extract(self, url): - video_id = self._match_id(url) + path, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' - LRT') + media_url = self._extract_js_var(webpage, 'main_url', path) + media = self._download_json(self._extract_js_var( + webpage, 'media_info_url', + 'https://www.lrt.lt/servisai/stream_url/vod/media_info/'), + video_id, query={'url': media_url}) + jw_data = self._parse_jwplayer_data( + media['playlist_item'], video_id, base_url=url) - formats = [] - for _, file_url in re.findall( - r'file\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage): - ext = determine_ext(file_url) - if ext not in ('m3u8', 'mp3'): + json_ld_data = self._search_json_ld(webpage, video_id) + + tags = [] + for tag in (media.get('tags') or []): + tag_name = tag.get('name') + if not tag_name: continue - # mp3 served as m3u8 produces stuttered media file - if ext == 'm3u8' and '.mp3' in file_url: - continue - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - file_url, video_id, 'mp4', entry_protocol='m3u8_native', - fatal=False)) - elif ext == 'mp3': - formats.append({ - 'url': file_url, - 'vcodec': 'none', - }) - self._sort_formats(formats) + tags.append(tag_name) - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description(webpage) - duration = parse_duration(self._search_regex( - r'var\s+record_len\s*=\s*(["\'])(?P[0-9]+:[0-9]+:[0-9]+)\1', - webpage, 'duration', default=None, group='duration')) - - view_count = int_or_none(self._html_search_regex( - r']+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P.+?)', - webpage, 'view count', fatal=False, group='count')) - like_count = int_or_none(self._search_regex( - r']+id=(["\'])flikesCount.*?\1>(?P\d+)<', - webpage, 'like count', fatal=False, group='count')) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, + clean_info = { + 'description': clean_html(media.get('content')), + 'tags': tags, } + + return merge_dicts(clean_info, jw_data, json_ld_data) diff --git a/haruhi_dl/extractor/malltv.py b/haruhi_dl/extractor/malltv.py index 6f4fd927f..fadfd9338 100644 --- a/haruhi_dl/extractor/malltv.py +++ b/haruhi_dl/extractor/malltv.py @@ -1,10 +1,16 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import merge_dicts +from ..utils import ( + clean_html, + dict_get, + float_or_none, + int_or_none, + merge_dicts, + parse_duration, + try_get, +) class MallTVIE(InfoExtractor): @@ -17,7 +23,7 @@ class MallTVIE(InfoExtractor): 'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', 'ext': 'mp4', 'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?', - 'description': 'md5:25fc0ec42a72ba602b602c683fa29deb', + 'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35', 'duration': 216, 'timestamp': 1538870400, 'upload_date': '20181007', @@ -37,20 +43,46 @@ class MallTVIE(InfoExtractor): webpage = self._download_webpage( url, display_id, headers=self.geo_verification_headers()) - SOURCE_RE = r'(]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P[\da-z]+)/index)\b' + video = self._parse_json(self._search_regex( + r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);', + webpage, 'video object'), display_id) + video_source = video['VideoSource'] video_id = self._search_regex( - SOURCE_RE, webpage, 'video id', group='id') + r'/([\da-z]+)/index\b', video_source, 'video id') - media = self._parse_html5_media_entries( - url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id, - m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0] + formats = self._extract_m3u8_formats( + video_source + '.m3u8', video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + + subtitles = {} + for s in (video.get('Subtitles') or {}): + s_url = s.get('Url') + if not s_url: + continue + subtitles.setdefault(s.get('Language') or 'cz', []).append({ + 'url': s_url, + }) + + entity_counts = video.get('EntityCounts') or {} + + def get_count(k): + v = entity_counts.get(k + 's') or {} + return int_or_none(dict_get(v, ('Count', 'StrCount'))) info = self._search_json_ld(webpage, video_id, default={}) - return merge_dicts(media, info, { + return merge_dicts({ 'id': video_id, 'display_id': display_id, - 'title': self._og_search_title(webpage, default=None) or display_id, - 'description': self._og_search_description(webpage, default=None), - 'thumbnail': self._og_search_thumbnail(webpage, default=None), - }) + 'title': video.get('Title'), + 'description': clean_html(video.get('Description')), + 'thumbnail': video.get('ThumbnailUrl'), + 'formats': formats, + 'subtitles': subtitles, + 'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')), + 'view_count': get_count('View'), + 'like_count': get_count('Like'), + 'dislike_count': get_count('Dislike'), + 'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])), + 'comment_count': get_count('Comment'), + }, info) diff --git a/haruhi_dl/extractor/mdr.py b/haruhi_dl/extractor/mdr.py index 322e5b45a..dc6aa9819 100644 --- a/haruhi_dl/extractor/mdr.py +++ b/haruhi_dl/extractor/mdr.py @@ -2,12 +2,16 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( determine_ext, int_or_none, parse_duration, parse_iso8601, + url_or_none, xpath_text, ) @@ -16,6 +20,8 @@ class MDRIE(InfoExtractor): IE_DESC = 'MDR.DE and KiKA' _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z-]+-?(?P\d+)(?:_.+?)?\.html' + _GEO_COUNTRIES = ['DE'] + _TESTS = [{ # MDR regularly deletes its videos 'url': 'http://www.mdr.de/fakt/video189002.html', @@ -66,6 +72,22 @@ class MDRIE(InfoExtractor): 'duration': 3239, 'uploader': 'MITTELDEUTSCHER RUNDFUNK', }, + }, { + # empty bitrateVideo and bitrateAudio + 'url': 'https://www.kika.de/filme/sendung128372_zc-572e3f45_zs-1d9fb70e.html', + 'info_dict': { + 'id': '128372', + 'ext': 'mp4', + 'title': 'Der kleine Wichtel kehrt zurück', + 'description': 'md5:f77fafdff90f7aa1e9dca14f662c052a', + 'duration': 4876, + 'timestamp': 1607823300, + 'upload_date': '20201213', + 'uploader': 'ZDF', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', 'only_matching': True, @@ -91,10 +113,13 @@ class MDRIE(InfoExtractor): title = xpath_text(doc, ['./title', './broadcast/broadcastName'], 'title', fatal=True) + type_ = xpath_text(doc, './type', default=None) + formats = [] processed_urls = [] for asset in doc.findall('./assets/asset'): for source in ( + 'download', 'progressiveDownload', 'dynamicHttpStreamingRedirector', 'adaptiveHttpStreamingRedirector'): @@ -102,63 +127,49 @@ class MDRIE(InfoExtractor): if url_el is None: continue - video_url = url_el.text - if video_url in processed_urls: + video_url = url_or_none(url_el.text) + if not video_url or video_url in processed_urls: continue processed_urls.append(video_url) - vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) - abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) - - ext = determine_ext(url_el.text) + ext = determine_ext(video_url) if ext == 'm3u8': - url_formats = self._extract_m3u8_formats( + formats.extend(self._extract_m3u8_formats( video_url, video_id, 'mp4', entry_protocol='m3u8_native', - preference=0, m3u8_id='HLS', fatal=False) + preference=0, m3u8_id='HLS', fatal=False)) elif ext == 'f4m': - url_formats = self._extract_f4m_formats( + formats.extend(self._extract_f4m_formats( video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id, - preference=0, f4m_id='HDS', fatal=False) + preference=0, f4m_id='HDS', fatal=False)) else: media_type = xpath_text(asset, './mediaType', 'media type', default='MP4') vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000) abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) filesize = int_or_none(xpath_text(asset, './fileSize', 'file size')) + format_id = [media_type] + if vbr or abr: + format_id.append(compat_str(vbr or abr)) + f = { 'url': video_url, - 'format_id': '%s-%d' % (media_type, vbr or abr), + 'format_id': '-'.join(format_id), 'filesize': filesize, 'abr': abr, - 'preference': 1, + 'vbr': vbr, } if vbr: - width = int_or_none(xpath_text(asset, './frameWidth', 'width')) - height = int_or_none(xpath_text(asset, './frameHeight', 'height')) f.update({ - 'vbr': vbr, - 'width': width, - 'height': height, + 'width': int_or_none(xpath_text(asset, './frameWidth', 'width')), + 'height': int_or_none(xpath_text(asset, './frameHeight', 'height')), }) - url_formats = [f] + if type_ == 'audio': + f['vcodec'] = 'none' - if not url_formats: - continue - - if not vbr: - for f in url_formats: - abr = f.get('tbr') or abr - if 'tbr' in f: - del f['tbr'] - f.update({ - 'abr': abr, - 'vcodec': 'none', - }) - - formats.extend(url_formats) + formats.append(f) self._sort_formats(formats) diff --git a/haruhi_dl/extractor/medaltv.py b/haruhi_dl/extractor/medaltv.py new file mode 100644 index 000000000..1603b55f6 --- /dev/null +++ b/haruhi_dl/extractor/medaltv.py @@ -0,0 +1,131 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + str_or_none, + try_get, +) + + +class MedalTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://medal.tv/clips/34934644/3Is9zyGMoBMr', + 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', + 'info_dict': { + 'id': '34934644', + 'ext': 'mp4', + 'title': 'Quad Cold', + 'description': 'Medal,https://medal.tv/desktop/', + 'uploader': 'MowgliSB', + 'timestamp': 1603165266, + 'upload_date': '20201020', + 'uploader_id': 10619174, + } + }, { + 'url': 'https://medal.tv/clips/36787208', + 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', + 'info_dict': { + 'id': '36787208', + 'ext': 'mp4', + 'title': 'u tk me i tk u bigger', + 'description': 'Medal,https://medal.tv/desktop/', + 'uploader': 'Mimicc', + 'timestamp': 1605580939, + 'upload_date': '20201117', + 'uploader_id': 5156321, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + hydration_data = self._parse_json(self._search_regex( + r']*>\s*(?:var\s*)?hydrationData\s*=\s*({.+?})\s*', + webpage, 'hydration data', default='{}'), video_id) + + clip = try_get( + hydration_data, lambda x: x['clips'][video_id], dict) or {} + if not clip: + raise ExtractorError( + 'Could not find video information.', video_id=video_id) + + title = clip['contentTitle'] + + source_width = int_or_none(clip.get('sourceWidth')) + source_height = int_or_none(clip.get('sourceHeight')) + + aspect_ratio = source_width / source_height if source_width and source_height else 16 / 9 + + def add_item(container, item_url, height, id_key='format_id', item_id=None): + item_id = item_id or '%dp' % height + if item_id not in item_url: + return + width = int(round(aspect_ratio * height)) + container.append({ + 'url': item_url, + id_key: item_id, + 'width': width, + 'height': height + }) + + formats = [] + thumbnails = [] + for k, v in clip.items(): + if not (v and isinstance(v, compat_str)): + continue + mobj = re.match(r'(contentUrl|thumbnail)(?:(\d+)p)?$', k) + if not mobj: + continue + prefix = mobj.group(1) + height = int_or_none(mobj.group(2)) + if prefix == 'contentUrl': + add_item( + formats, v, height or source_height, + item_id=None if height else 'source') + elif prefix == 'thumbnail': + add_item(thumbnails, v, height, 'id') + + error = clip.get('error') + if not formats and error: + if error == 404: + raise ExtractorError( + 'That clip does not exist.', + expected=True, video_id=video_id) + else: + raise ExtractorError( + 'An unknown error occurred ({0}).'.format(error), + video_id=video_id) + + self._sort_formats(formats) + + # Necessary because the id of the author is not known in advance. + # Won't raise an issue if no profile can be found as this is optional. + author = try_get( + hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {} + author_id = str_or_none(author.get('id')) + author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': clip.get('contentDescription'), + 'uploader': author.get('displayName'), + 'timestamp': float_or_none(clip.get('created'), 1000), + 'uploader_id': author_id, + 'uploader_url': author_url, + 'duration': int_or_none(clip.get('videoLengthSeconds')), + 'view_count': int_or_none(clip.get('views')), + 'like_count': int_or_none(clip.get('likes')), + 'comment_count': int_or_none(clip.get('comments')), + } diff --git a/haruhi_dl/extractor/medialaan.py b/haruhi_dl/extractor/medialaan.py index 50d5db802..469212ec6 100644 --- a/haruhi_dl/extractor/medialaan.py +++ b/haruhi_dl/extractor/medialaan.py @@ -2,268 +2,113 @@ from __future__ import unicode_literals import re -from .gigya import GigyaBaseIE - -from ..compat import compat_str +from .common import InfoExtractor from ..utils import ( + extract_attributes, int_or_none, - parse_duration, - try_get, - unified_timestamp, + mimetype2ext, + parse_iso8601, ) -class MedialaanIE(GigyaBaseIE): +class MedialaanIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// - (?:www\.|nieuws\.)? (?: - (?Pvtm|q2|vtmkzoom)\.be/ - (?: - video(?:/[^/]+/id/|/?\?.*?\baid=)| - (?:[^/]+/)* - ) + (?:embed\.)?mychannels.video/embed/| + embed\.mychannels\.video/(?:s(?:dk|cript)/)?production/| + (?:www\.)?(?: + (?: + 7sur7| + demorgen| + hln| + joe| + qmusic + )\.be| + (?: + [abe]d| + bndestem| + destentor| + gelderlander| + pzc| + tubantia| + volkskrant + )\.nl + )/video/(?:[^/]+/)*[^/?&#]+~p ) - (?P[^/?#&]+) + (?P\d+) ''' - _NETRC_MACHINE = 'medialaan' - _APIKEY = '3_HZ0FtkMW_gOyKlqQzW5_0FHRC7Nd5XpXJZcDdXY4pk5eES2ZWmejRW5egwVm4ug-' - _SITE_TO_APP_ID = { - 'vtm': 'vtm_watch', - 'q2': 'q2', - 'vtmkzoom': 'vtmkzoom', - } _TESTS = [{ - # vod - 'url': 'http://vtm.be/video/volledige-afleveringen/id/vtm_20170219_VM0678361_vtmwatch', + 'url': 'https://www.bndestem.nl/video/de-terugkeer-van-ally-de-aap-en-wie-vertrekt-er-nog-bij-nac~p193993', 'info_dict': { - 'id': 'vtm_20170219_VM0678361_vtmwatch', + 'id': '193993', 'ext': 'mp4', - 'title': 'Allemaal Chris afl. 6', - 'description': 'md5:4be86427521e7b07e0adb0c9c554ddb2', - 'timestamp': 1487533280, - 'upload_date': '20170219', - 'duration': 2562, - 'series': 'Allemaal Chris', - 'season': 'Allemaal Chris', - 'season_number': 1, - 'season_id': '256936078124527', - 'episode': 'Allemaal Chris afl. 6', - 'episode_number': 6, - 'episode_id': '256936078591527', + 'title': 'De terugkeer van Ally de Aap en wie vertrekt er nog bij NAC?', + 'timestamp': 1611663540, + 'upload_date': '20210126', + 'duration': 238, }, 'params': { 'skip_download': True, }, - 'skip': 'Requires account credentials', }, { - # clip - 'url': 'http://vtm.be/video?aid=168332', - 'info_dict': { - 'id': '168332', - 'ext': 'mp4', - 'title': '"Veronique liegt!"', - 'description': 'md5:1385e2b743923afe54ba4adc38476155', - 'timestamp': 1489002029, - 'upload_date': '20170308', - 'duration': 96, - }, - }, { - # vod - 'url': 'http://vtm.be/video/volledige-afleveringen/id/257107153551000', + 'url': 'https://www.gelderlander.nl/video/kanalen/degelderlander~c320/series/snel-nieuws~s984/noodbevel-in-doetinchem-politie-stuurt-mensen-centrum-uit~p194093', 'only_matching': True, }, { - # vod - 'url': 'http://vtm.be/video?aid=163157', + 'url': 'https://embed.mychannels.video/sdk/production/193993?options=TFTFF_default', 'only_matching': True, }, { - # vod - 'url': 'http://www.q2.be/video/volledige-afleveringen/id/2be_20170301_VM0684442_q2', + 'url': 'https://embed.mychannels.video/script/production/193993', 'only_matching': True, }, { - # clip - 'url': 'http://vtmkzoom.be/k3-dansstudio/een-nieuw-seizoen-van-k3-dansstudio', + 'url': 'https://embed.mychannels.video/production/193993', 'only_matching': True, }, { - # http/s redirect - 'url': 'https://vtmkzoom.be/video?aid=45724', - 'info_dict': { - 'id': '257136373657000', - 'ext': 'mp4', - 'title': 'K3 Dansstudio Ushuaia afl.6', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Requires account credentials', + 'url': 'https://mychannels.video/embed/193993', + 'only_matching': True, }, { - # nieuws.vtm.be - 'url': 'https://nieuws.vtm.be/stadion/stadion/genk-nog-moeilijk-programma', + 'url': 'https://embed.mychannels.video/embed/193993', 'only_matching': True, }] - def _real_initialize(self): - self._logged_in = False - - def _login(self): - username, password = self._get_login_info() - if username is None: - self.raise_login_required() - - auth_data = { - 'APIKey': self._APIKEY, - 'sdk': 'js_6.1', - 'format': 'json', - 'loginID': username, - 'password': password, - } - - auth_info = self._gigya_login(auth_data) - - self._uid = auth_info['UID'] - self._uid_signature = auth_info['UIDSignature'] - self._signature_timestamp = auth_info['signatureTimestamp'] - - self._logged_in = True + @staticmethod + def _extract_urls(webpage, **kw): + entries = [] + for element in re.findall(r'(]+data-mychannels-type="video"[^>]*>)', webpage): + mychannels_id = extract_attributes(element).get('data-mychannels-id') + if mychannels_id: + entries.append('https://mychannels.video/embed/' + mychannels_id) + return entries def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, site_id = mobj.group('id', 'site_id') + production_id = self._match_id(url) + production = self._download_json( + 'https://embed.mychannels.video/sdk/production/' + production_id, + production_id, query={'options': 'UUUU_default'})['productions'][0] + title = production['title'] - webpage = self._download_webpage(url, video_id) - - config = self._parse_json( - self._search_regex( - r'videoJSConfig\s*=\s*JSON\.parse\(\'({.+?})\'\);', - webpage, 'config', default='{}'), video_id, - transform_source=lambda s: s.replace( - '\\\\', '\\').replace(r'\"', '"').replace(r"\'", "'")) - - vod_id = config.get('vodId') or self._search_regex( - (r'\\"vodId\\"\s*:\s*\\"(.+?)\\"', - r'"vodId"\s*:\s*"(.+?)"', - r'<[^>]+id=["\']vod-(\d+)'), - webpage, 'video_id', default=None) - - # clip, no authentication required - if not vod_id: - player = self._parse_json( - self._search_regex( - r'vmmaplayer\(({.+?})\);', webpage, 'vmma player', - default=''), - video_id, transform_source=lambda s: '[%s]' % s, fatal=False) - if player: - video = player[-1] - if video['videoUrl'] in ('http', 'https'): - return self.url_result(video['url'], MedialaanIE.ie_key()) - info = { - 'id': video_id, - 'url': video['videoUrl'], - 'title': video['title'], - 'thumbnail': video.get('imageUrl'), - 'timestamp': int_or_none(video.get('createdDate')), - 'duration': int_or_none(video.get('duration')), - } + formats = [] + for source in (production.get('sources') or []): + src = source.get('src') + if not src: + continue + ext = mimetype2ext(source.get('type')) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, production_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) else: - info = self._parse_html5_media_entries( - url, webpage, video_id, m3u8_id='hls')[0] - info.update({ - 'id': video_id, - 'title': self._html_search_meta('description', webpage), - 'duration': parse_duration(self._html_search_meta('duration', webpage)), + formats.append({ + 'ext': ext, + 'url': src, }) - # vod, authentication required - else: - if not self._logged_in: - self._login() + self._sort_formats(formats) - settings = self._parse_json( - self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings', default='{}'), - video_id) - - def get(container, item): - return try_get( - settings, lambda x: x[container][item], - compat_str) or self._search_regex( - r'"%s"\s*:\s*"([^"]+)' % item, webpage, item, - default=None) - - app_id = get('vod', 'app_id') or self._SITE_TO_APP_ID.get(site_id, 'vtm_watch') - sso = get('vod', 'gigyaDatabase') or 'vtm-sso' - - data = self._download_json( - 'http://vod.medialaan.io/api/1.0/item/%s/video' % vod_id, - video_id, query={ - 'app_id': app_id, - 'user_network': sso, - 'UID': self._uid, - 'UIDSignature': self._uid_signature, - 'signatureTimestamp': self._signature_timestamp, - }) - - formats = self._extract_m3u8_formats( - data['response']['uri'], video_id, entry_protocol='m3u8_native', - ext='mp4', m3u8_id='hls') - - self._sort_formats(formats) - - info = { - 'id': vod_id, - 'formats': formats, - } - - api_key = get('vod', 'apiKey') - channel = get('medialaanGigya', 'channel') - - if api_key: - videos = self._download_json( - 'http://vod.medialaan.io/vod/v2/videos', video_id, fatal=False, - query={ - 'channels': channel, - 'ids': vod_id, - 'limit': 1, - 'apikey': api_key, - }) - if videos: - video = try_get( - videos, lambda x: x['response']['videos'][0], dict) - if video: - def get(container, item, expected_type=None): - return try_get( - video, lambda x: x[container][item], expected_type) - - def get_string(container, item): - return get(container, item, compat_str) - - info.update({ - 'series': get_string('program', 'title'), - 'season': get_string('season', 'title'), - 'season_number': int_or_none(get('season', 'number')), - 'season_id': get_string('season', 'id'), - 'episode': get_string('episode', 'title'), - 'episode_number': int_or_none(get('episode', 'number')), - 'episode_id': get_string('episode', 'id'), - 'duration': int_or_none( - video.get('duration')) or int_or_none( - video.get('durationMillis'), scale=1000), - 'title': get_string('episode', 'title'), - 'description': get_string('episode', 'text'), - 'timestamp': unified_timestamp(get_string( - 'publication', 'begin')), - }) - - if not info.get('title'): - info['title'] = try_get( - config, lambda x: x['videoConfig']['title'], - compat_str) or self._html_search_regex( - r'\\"title\\"\s*:\s*\\"(.+?)\\"', webpage, 'title', - default=None) or self._og_search_title(webpage) - - if not info.get('description'): - info['description'] = self._html_search_regex( - r']+class="field-item\s+even">\s*

(.+?)

', - webpage, 'description', default=None) - - return info + return { + 'id': production_id, + 'title': title, + 'formats': formats, + 'thumbnail': production.get('posterUrl'), + 'timestamp': parse_iso8601(production.get('publicationDate'), ' '), + 'duration': int_or_none(production.get('duration')) or None, + } diff --git a/haruhi_dl/extractor/mediaset.py b/haruhi_dl/extractor/mediaset.py index 933df1495..2c16fc9e2 100644 --- a/haruhi_dl/extractor/mediaset.py +++ b/haruhi_dl/extractor/mediaset.py @@ -23,7 +23,7 @@ class MediasetIE(ThePlatformBaseIE): https?:// (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ (?: - (?:video|on-demand)/(?:[^/]+/)+[^/]+_| + (?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_| player/index\.html\?.*?\bprogramGuid= ) )(?P[0-9A-Z]{16,}) @@ -88,6 +88,9 @@ class MediasetIE(ThePlatformBaseIE): }, { 'url': 'https://www.mediasetplay.mediaset.it/video/grandefratellovip/benedetta-una-doccia-gelata_F309344401044C135', 'only_matching': True, + }, { + 'url': 'https://www.mediasetplay.mediaset.it/movie/herculeslaleggendahainizio/hercules-la-leggenda-ha-inizio_F305927501000102', + 'only_matching': True, }] @staticmethod diff --git a/haruhi_dl/extractor/mgtv.py b/haruhi_dl/extractor/mgtv.py index 71fc3ec56..cab3aa045 100644 --- a/haruhi_dl/extractor/mgtv.py +++ b/haruhi_dl/extractor/mgtv.py @@ -17,9 +17,8 @@ from ..utils import ( class MGTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P\d+)\.html' + _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P\d+)\.html' IE_DESC = '芒果TV' - _GEO_COUNTRIES = ['CN'] _TESTS = [{ 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', @@ -34,14 +33,18 @@ class MGTVIE(InfoExtractor): }, { 'url': 'http://www.mgtv.com/b/301817/3826653.html', 'only_matching': True, + }, { + 'url': 'https://w.mgtv.com/b/301817/3826653.html', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1] try: api_data = self._download_json( 'https://pcweb.api.mgtv.com/player/video', video_id, query={ - 'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1], + 'tk2': tk2, 'video_id': video_id, }, headers=self.geo_verification_headers())['data'] except ExtractorError as e: @@ -56,6 +59,7 @@ class MGTVIE(InfoExtractor): stream_data = self._download_json( 'https://pcweb.api.mgtv.com/player/getSource', video_id, query={ 'pm2': api_data['atc']['pm2'], + 'tk2': tk2, 'video_id': video_id, }, headers=self.geo_verification_headers())['data'] stream_domain = stream_data['stream_domain'][0] diff --git a/haruhi_dl/extractor/minds.py b/haruhi_dl/extractor/minds.py new file mode 100644 index 000000000..8e9f0f825 --- /dev/null +++ b/haruhi_dl/extractor/minds.py @@ -0,0 +1,196 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + clean_html, + int_or_none, + str_or_none, + strip_or_none, +) + + +class MindsBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?minds\.com/' + + def _call_api(self, path, video_id, resource, query=None): + api_url = 'https://www.minds.com/api/' + path + token = self._get_cookies(api_url).get('XSRF-TOKEN') + return self._download_json( + api_url, video_id, 'Downloading %s JSON metadata' % resource, headers={ + 'Referer': 'https://www.minds.com/', + 'X-XSRF-TOKEN': token.value if token else '', + }, query=query) + + +class MindsIE(MindsBaseIE): + IE_NAME = 'minds' + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?:media|newsfeed|archive/view)/(?P[0-9]+)' + _TESTS = [{ + 'url': 'https://www.minds.com/media/100000000000086822', + 'md5': '215a658184a419764852239d4970b045', + 'info_dict': { + 'id': '100000000000086822', + 'ext': 'mp4', + 'title': 'Minds intro sequence', + 'thumbnail': r're:https?://.+\.png', + 'uploader_id': 'ottman', + 'upload_date': '20130524', + 'timestamp': 1369404826, + 'uploader': 'Bill Ottman', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'tags': ['animation'], + 'comment_count': int, + 'license': 'attribution-cc', + }, + }, { + # entity.type == 'activity' and empty title + 'url': 'https://www.minds.com/newsfeed/798025111988506624', + 'md5': 'b2733a74af78d7fd3f541c4cbbaa5950', + 'info_dict': { + 'id': '798022190320226304', + 'ext': 'mp4', + 'title': '798022190320226304', + 'uploader': 'ColinFlaherty', + 'upload_date': '20180111', + 'timestamp': 1515639316, + 'uploader_id': 'ColinFlaherty', + }, + }, { + 'url': 'https://www.minds.com/archive/view/715172106794442752', + 'only_matching': True, + }, { + # youtube perma_url + 'url': 'https://www.minds.com/newsfeed/1197131838022602752', + 'only_matching': True, + }] + + def _real_extract(self, url): + entity_id = self._match_id(url) + entity = self._call_api( + 'v1/entities/entity/' + entity_id, entity_id, 'entity')['entity'] + if entity.get('type') == 'activity': + if entity.get('custom_type') == 'video': + video_id = entity['entity_guid'] + else: + return self.url_result(entity['perma_url']) + else: + assert(entity['subtype'] == 'video') + video_id = entity_id + # 1080p and webm formats available only on the sources array + video = self._call_api( + 'v2/media/video/' + video_id, video_id, 'video') + + formats = [] + for source in (video.get('sources') or []): + src = source.get('src') + if not src: + continue + formats.append({ + 'format_id': source.get('label'), + 'height': int_or_none(source.get('size')), + 'url': src, + }) + self._sort_formats(formats) + + entity = video.get('entity') or entity + owner = entity.get('ownerObj') or {} + uploader_id = owner.get('username') + + tags = entity.get('tags') + if tags and isinstance(tags, compat_str): + tags = [tags] + + thumbnail = None + poster = video.get('poster') or entity.get('thumbnail_src') + if poster: + urlh = self._request_webpage(poster, video_id, fatal=False) + if urlh: + thumbnail = urlh.geturl() + + return { + 'id': video_id, + 'title': entity.get('title') or video_id, + 'formats': formats, + 'description': clean_html(entity.get('description')) or None, + 'license': str_or_none(entity.get('license')), + 'timestamp': int_or_none(entity.get('time_created')), + 'uploader': strip_or_none(owner.get('name')), + 'uploader_id': uploader_id, + 'uploader_url': 'https://www.minds.com/' + uploader_id if uploader_id else None, + 'view_count': int_or_none(entity.get('play:count')), + 'like_count': int_or_none(entity.get('thumbs:up:count')), + 'dislike_count': int_or_none(entity.get('thumbs:down:count')), + 'tags': tags, + 'comment_count': int_or_none(entity.get('comments:count')), + 'thumbnail': thumbnail, + } + + +class MindsFeedBaseIE(MindsBaseIE): + _PAGE_SIZE = 150 + + def _entries(self, feed_id): + query = {'limit': self._PAGE_SIZE, 'sync': 1} + i = 1 + while True: + data = self._call_api( + 'v2/feeds/container/%s/videos' % feed_id, + feed_id, 'page %s' % i, query) + entities = data.get('entities') or [] + for entity in entities: + guid = entity.get('guid') + if not guid: + continue + yield self.url_result( + 'https://www.minds.com/newsfeed/' + guid, + MindsIE.ie_key(), guid) + query['from_timestamp'] = data['load-next'] + if not (query['from_timestamp'] and len(entities) == self._PAGE_SIZE): + break + i += 1 + + def _real_extract(self, url): + feed_id = self._match_id(url) + feed = self._call_api( + 'v1/%s/%s' % (self._FEED_PATH, feed_id), + feed_id, self._FEED_TYPE)[self._FEED_TYPE] + + return self.playlist_result( + self._entries(feed['guid']), feed_id, + strip_or_none(feed.get('name')), + feed.get('briefdescription')) + + +class MindsChannelIE(MindsFeedBaseIE): + _FEED_TYPE = 'channel' + IE_NAME = 'minds:' + _FEED_TYPE + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'(?!(?:newsfeed|media|api|archive|groups)/)(?P[^/?&#]+)' + _FEED_PATH = 'channel' + _TEST = { + 'url': 'https://www.minds.com/ottman', + 'info_dict': { + 'id': 'ottman', + 'title': 'Bill Ottman', + 'description': 'Co-creator & CEO @minds', + }, + 'playlist_mincount': 54, + } + + +class MindsGroupIE(MindsFeedBaseIE): + _FEED_TYPE = 'group' + IE_NAME = 'minds:' + _FEED_TYPE + _VALID_URL = MindsBaseIE._VALID_URL_BASE + r'groups/profile/(?P[0-9]+)' + _FEED_PATH = 'groups/group' + _TEST = { + 'url': 'https://www.minds.com/groups/profile/785582576369672204/feed/videos', + 'info_dict': { + 'id': '785582576369672204', + 'title': 'Cooking Videos', + }, + 'playlist_mincount': 1, + } diff --git a/haruhi_dl/extractor/mitele.py b/haruhi_dl/extractor/mitele.py index ad9da9612..b5937233b 100644 --- a/haruhi_dl/extractor/mitele.py +++ b/haruhi_dl/extractor/mitele.py @@ -1,15 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor +from .telecinco import TelecincoIE from ..utils import ( int_or_none, parse_iso8601, - smuggle_url, ) -class MiTeleIE(InfoExtractor): +class MiTeleIE(TelecincoIE): IE_DESC = 'mitele.es' _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P[^/]+)/player' @@ -31,7 +30,6 @@ class MiTeleIE(InfoExtractor): 'timestamp': 1471209401, 'upload_date': '20160814', }, - 'add_ie': ['Ooyala'], }, { # no explicit title 'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player', @@ -54,7 +52,6 @@ class MiTeleIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': ['Ooyala'], }, { 'url': 'http://www.mitele.es/series-online/la-que-se-avecina/57aac5c1c915da951a8b45ed/player', 'only_matching': True, @@ -70,16 +67,11 @@ class MiTeleIE(InfoExtractor): r'window\.\$REACTBASE_STATE\.prePlayer_mtweb\s*=\s*({.+})', webpage, 'Pre Player'), display_id)['prePlayer'] title = pre_player['title'] - video = pre_player['video'] - video_id = video['dataMediaId'] + video_info = self._parse_content(pre_player['video'], url) content = pre_player.get('content') or {} info = content.get('info') or {} - return { - '_type': 'url_transparent', - # for some reason only HLS is supported - 'url': smuggle_url('ooyala:' + video_id, {'supportedformats': 'm3u8,dash'}), - 'id': video_id, + video_info.update({ 'title': title, 'description': info.get('synopsis'), 'series': content.get('title'), @@ -87,7 +79,7 @@ class MiTeleIE(InfoExtractor): 'episode': content.get('subtitle'), 'episode_number': int_or_none(info.get('episode_number')), 'duration': int_or_none(info.get('duration')), - 'thumbnail': video.get('dataPoster'), 'age_limit': int_or_none(info.get('rating')), 'timestamp': parse_iso8601(pre_player.get('publishedTime')), - } + }) + return video_info diff --git a/haruhi_dl/extractor/mixcloud.py b/haruhi_dl/extractor/mixcloud.py index 9759560f1..69319857d 100644 --- a/haruhi_dl/extractor/mixcloud.py +++ b/haruhi_dl/extractor/mixcloud.py @@ -251,8 +251,11 @@ class MixcloudPlaylistBaseIE(MixcloudBaseIE): cloudcast_url = cloudcast.get('url') if not cloudcast_url: continue + slug = try_get(cloudcast, lambda x: x['slug'], compat_str) + owner_username = try_get(cloudcast, lambda x: x['owner']['username'], compat_str) + video_id = '%s_%s' % (owner_username, slug) if slug and owner_username else None entries.append(self.url_result( - cloudcast_url, MixcloudIE.ie_key(), cloudcast.get('slug'))) + cloudcast_url, MixcloudIE.ie_key(), video_id)) page_info = items['pageInfo'] has_next_page = page_info['hasNextPage'] @@ -321,7 +324,8 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): _DESCRIPTION_KEY = 'biog' _ROOT_TYPE = 'user' _NODE_TEMPLATE = '''slug - url''' + url + owner { username }''' def _get_playlist_title(self, title, slug): return '%s (%s)' % (title, slug) @@ -345,6 +349,7 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): _NODE_TEMPLATE = '''cloudcast { slug url + owner { username } }''' def _get_cloudcast(self, node): diff --git a/haruhi_dl/extractor/motherless.py b/haruhi_dl/extractor/motherless.py index b1615b4d8..ef1e081f2 100644 --- a/haruhi_dl/extractor/motherless.py +++ b/haruhi_dl/extractor/motherless.py @@ -61,6 +61,23 @@ class MotherlessIE(InfoExtractor): # no keywords 'url': 'http://motherless.com/8B4BBC1', 'only_matching': True, + }, { + # see https://motherless.com/videos/recent for recent videos with + # uploaded date in "ago" format + 'url': 'https://motherless.com/3C3E2CF', + 'info_dict': { + 'id': '3C3E2CF', + 'ext': 'mp4', + 'title': 'a/ Hot Teens', + 'categories': list, + 'upload_date': '20210104', + 'uploader_id': 'yonbiw', + 'thumbnail': r're:https?://.*\.jpg', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -85,20 +102,28 @@ class MotherlessIE(InfoExtractor): or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( - (r'>(\d+)\s+Views<', r'Views\s+([^<]+)<'), + (r'>([\d,.]+)\s+Views<', r'Views\s+([^<]+)<'), webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( - (r'>(\d+)\s+Favorites<', r'Favorited\s+([^<]+)<'), + (r'>([\d,.]+)\s+Favorites<', + r'Favorited\s+([^<]+)<'), webpage, 'like count', fatal=False)) - upload_date = self._html_search_regex( - (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', - r'Uploaded\s+([^<]+)<'), webpage, 'upload date') - if 'Ago' in upload_date: - days = int(re.search(r'([0-9]+)', upload_date).group(1)) - upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') - else: - upload_date = unified_strdate(upload_date) + upload_date = unified_strdate(self._search_regex( + r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage, + 'upload date', default=None)) + if not upload_date: + uploaded_ago = self._search_regex( + r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago', + default=None) + if uploaded_ago: + delta = int(uploaded_ago[:-1]) + _AGO_UNITS = { + 'h': 'hours', + 'd': 'days', + } + kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} + upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') comment_count = webpage.count('class="media-comment-contents"') uploader_id = self._html_search_regex( diff --git a/haruhi_dl/extractor/mtv.py b/haruhi_dl/extractor/mtv.py index fedd5f46b..f5e30d22d 100644 --- a/haruhi_dl/extractor/mtv.py +++ b/haruhi_dl/extractor/mtv.py @@ -253,6 +253,10 @@ class MTVServicesInfoExtractor(InfoExtractor): return try_get(feed, lambda x: x['result']['data']['id'], compat_str) + @staticmethod + def _extract_child_with_type(parent, t): + return next(c for c in parent['children'] if c.get('type') == t) + def _extract_mgid(self, webpage): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf @@ -278,6 +282,13 @@ class MTVServicesInfoExtractor(InfoExtractor): if not mgid: mgid = self._extract_triforce_mgid(webpage) + if not mgid: + data = self._parse_json(self._search_regex( + r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) + main_container = self._extract_child_with_type(data, 'MainContainer') + video_player = self._extract_child_with_type(main_container, 'VideoPlayer') + mgid = video_player['props']['media']['video']['config']['uri'] + return mgid def _real_extract(self, url): diff --git a/haruhi_dl/extractor/nba.py b/haruhi_dl/extractor/nba.py index be295a7a3..fbc7adaf4 100644 --- a/haruhi_dl/extractor/nba.py +++ b/haruhi_dl/extractor/nba.py @@ -5,33 +5,137 @@ import re from .turner import TurnerBaseIE from ..compat import ( - compat_urllib_parse_urlencode, - compat_urlparse, + compat_parse_qs, + compat_str, + compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, ) from ..utils import ( + int_or_none, + merge_dicts, OnDemandPagedList, - remove_start, + parse_duration, + parse_iso8601, + try_get, + update_url_query, + urljoin, ) -class NBAIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:watch\.|www\.)?nba\.com/(?P(?:[^/]+/)+(?P[^?]*?))/?(?:/index\.html)?(?:\?.*)?$' +class NBACVPBaseIE(TurnerBaseIE): + def _extract_nba_cvp_info(self, path, video_id, fatal=False): + return self._extract_cvp_info( + 'http://secure.nba.com/%s' % path, video_id, { + 'default': { + 'media_src': 'http://nba.cdn.turner.com/nba/big', + }, + 'm3u8': { + 'media_src': 'http://nbavod-f.akamaihd.net', + }, + }, fatal=fatal) + + +class NBAWatchBaseIE(NBACVPBaseIE): + _VALID_URL_BASE = r'https?://(?:(?:www\.)?nba\.com(?:/watch)?|watch\.nba\.com)/' + + def _extract_video(self, filter_key, filter_value): + video = self._download_json( + 'https://neulionscnbav2-a.akamaihd.net/solr/nbad_program/usersearch', + filter_value, query={ + 'fl': 'description,image,name,pid,releaseDate,runtime,tags,seoName', + 'q': filter_key + ':' + filter_value, + 'wt': 'json', + })['response']['docs'][0] + + video_id = str(video['pid']) + title = video['name'] + + formats = [] + m3u8_url = (self._download_json( + 'https://watch.nba.com/service/publishpoint', video_id, query={ + 'type': 'video', + 'format': 'json', + 'id': video_id, + }, headers={ + 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1', + }, fatal=False) or {}).get('path') + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats( + re.sub(r'_(?:pc|iphone)\.', '.', m3u8_url), video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + for f in m3u8_formats: + http_f = f.copy() + http_f.update({ + 'format_id': http_f['format_id'].replace('hls-', 'http-'), + 'protocol': 'http', + 'url': http_f['url'].replace('.m3u8', ''), + }) + formats.append(http_f) + + info = { + 'id': video_id, + 'title': title, + 'thumbnail': urljoin('https://nbadsdmt.akamaized.net/media/nba/nba/thumbs/', video.get('image')), + 'description': video.get('description'), + 'duration': int_or_none(video.get('runtime')), + 'timestamp': parse_iso8601(video.get('releaseDate')), + 'tags': video.get('tags'), + } + + seo_name = video.get('seoName') + if seo_name and re.search(r'\d{4}/\d{2}/\d{2}/', seo_name): + base_path = '' + if seo_name.startswith('teams/'): + base_path += seo_name.split('/')[1] + '/' + base_path += 'video/' + cvp_info = self._extract_nba_cvp_info( + base_path + seo_name + '.xml', video_id, False) + if cvp_info: + formats.extend(cvp_info['formats']) + info = merge_dicts(info, cvp_info) + + self._sort_formats(formats) + info['formats'] = formats + return info + + +class NBAWatchEmbedIE(NBAWatchBaseIE): + IENAME = 'nba:watch:embed' + _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'embed\?.*?\bid=(?P\d+)' + _TESTS = [{ + 'url': 'http://watch.nba.com/embed?id=659395', + 'md5': 'b7e3f9946595f4ca0a13903ce5edd120', + 'info_dict': { + 'id': '659395', + 'ext': 'mp4', + 'title': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017', + 'description': 'Mix clip: More than 7 points of Joe Ingles, Luc Mbah a Moute, Blake Griffin and 6 more in Utah Jazz vs. the Clippers, 4/15/2017', + 'timestamp': 1492228800, + 'upload_date': '20170415', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._extract_video('pid', video_id) + + +class NBAWatchIE(NBAWatchBaseIE): + IE_NAME = 'nba:watch' + _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'(?:nba/)?video/(?P.+?(?=/index\.html)|(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ 'url': 'http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html', - 'md5': '9e7729d3010a9c71506fd1248f74e4f4', + 'md5': '9d902940d2a127af3f7f9d2f3dc79c96', 'info_dict': { - 'id': '0021200253-okc-bkn-recap', + 'id': '70946', 'ext': 'mp4', 'title': 'Thunder vs. Nets', 'description': 'Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.', 'duration': 181, - 'timestamp': 1354638466, + 'timestamp': 1354597200, 'upload_date': '20121204', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, }, { 'url': 'http://www.nba.com/video/games/hornets/2014/12/05/0021400276-nyk-cha-play5.nba/', 'only_matching': True, @@ -39,116 +143,286 @@ class NBAIE(TurnerBaseIE): 'url': 'http://watch.nba.com/video/channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', 'md5': 'b2b39b81cf28615ae0c3360a3f9668c4', 'info_dict': { - 'id': 'channels/playoffs/2015/05/20/0041400301-cle-atl-recap.nba', + 'id': '330865', 'ext': 'mp4', 'title': 'Hawks vs. Cavaliers Game 1', 'description': 'md5:8094c3498d35a9bd6b1a8c396a071b4d', 'duration': 228, - 'timestamp': 1432134543, - 'upload_date': '20150520', - }, - 'expected_warnings': ['Unable to download f4m manifest'], - }, { - 'url': 'http://www.nba.com/clippers/news/doc-rivers-were-not-trading-blake', - 'info_dict': { - 'id': 'teams/clippers/2016/02/17/1455672027478-Doc_Feb16_720.mov-297324', - 'ext': 'mp4', - 'title': 'Practice: Doc Rivers - 2/16/16', - 'description': 'Head Coach Doc Rivers addresses the media following practice.', - 'upload_date': '20160216', - 'timestamp': 1455672000, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest'], - }, { - 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', - 'info_dict': { - 'id': 'timberwolves', - 'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins', - }, - 'playlist_count': 30, - 'params': { - # Download the whole playlist takes too long time - 'playlist_items': '1-30', + 'timestamp': 1432094400, + 'upload_date': '20150521', }, }, { - 'url': 'http://www.nba.com/timberwolves/wiggins-shootaround#', - 'info_dict': { - 'id': 'teams/timberwolves/2014/12/12/Wigginsmp4-3462601', - 'ext': 'mp4', - 'title': 'Shootaround Access - Dec. 12 | Andrew Wiggins', - 'description': 'Wolves rookie Andrew Wiggins addresses the media after Friday\'s shootaround.', - 'upload_date': '20141212', - 'timestamp': 1418418600, - }, - 'params': { - 'noplaylist': True, - # m3u8 download - 'skip_download': True, - }, - 'expected_warnings': ['Unable to download f4m manifest'], + 'url': 'http://watch.nba.com/nba/video/channels/nba_tv/2015/06/11/YT_go_big_go_home_Game4_061115', + 'only_matching': True, + }, { + # only CVP mp4 format available + 'url': 'https://watch.nba.com/video/teams/cavaliers/2012/10/15/sloan121015mov-2249106', + 'only_matching': True, + }, { + 'url': 'https://watch.nba.com/video/top-100-dunks-from-the-2019-20-season?plsrc=nba&collection=2019-20-season-highlights', + 'only_matching': True, }] - _PAGE_SIZE = 30 + def _real_extract(self, url): + display_id = self._match_id(url) + collection_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('collection', [None])[0] + if collection_id: + if self._downloader.params.get('noplaylist'): + self.to_screen('Downloading just video %s because of --no-playlist' % display_id) + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % collection_id) + return self.url_result( + 'https://www.nba.com/watch/list/collection/' + collection_id, + NBAWatchCollectionIE.ie_key(), collection_id) + return self._extract_video('seoName', display_id) - def _fetch_page(self, team, video_id, page): - search_url = 'http://searchapp2.nba.com/nba-search/query.jsp?' + compat_urllib_parse_urlencode({ - 'type': 'teamvideo', - 'start': page * self._PAGE_SIZE + 1, - 'npp': (page + 1) * self._PAGE_SIZE + 1, - 'sort': 'recent', - 'output': 'json', - 'site': team, - }) - results = self._download_json( - search_url, video_id, note='Download page %d of playlist data' % page)['results'][0] - for item in results: - yield self.url_result(compat_urlparse.urljoin('http://www.nba.com/', item['url'])) - def _extract_playlist(self, orig_path, video_id, webpage): - team = orig_path.split('/')[0] +class NBAWatchCollectionIE(NBAWatchBaseIE): + IE_NAME = 'nba:watch:collection' + _VALID_URL = NBAWatchBaseIE._VALID_URL_BASE + r'list/collection/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://watch.nba.com/list/collection/season-preview-2020', + 'info_dict': { + 'id': 'season-preview-2020', + }, + 'playlist_mincount': 43, + }] + _PAGE_SIZE = 100 - if self._downloader.params.get('noplaylist'): - self.to_screen('Downloading just video because of --no-playlist') - video_path = self._search_regex( - r'nbaVideoCore\.firstVideo\s*=\s*\'([^\']+)\';', webpage, 'video path') - video_url = 'http://www.nba.com/%s/video/%s' % (team, video_path) - return self.url_result(video_url) - - self.to_screen('Downloading playlist - add --no-playlist to just download video') - playlist_title = self._og_search_title(webpage, fatal=False) - entries = OnDemandPagedList( - functools.partial(self._fetch_page, team, video_id), - self._PAGE_SIZE) - - return self.playlist_result(entries, team, playlist_title) + def _fetch_page(self, collection_id, page): + page += 1 + videos = self._download_json( + 'https://content-api-prod.nba.com/public/1/endeavor/video-list/collection/' + collection_id, + collection_id, 'Downloading page %d JSON metadata' % page, query={ + 'count': self._PAGE_SIZE, + 'page': page, + })['results']['videos'] + for video in videos: + program = video.get('program') or {} + seo_name = program.get('seoName') or program.get('slug') + if not seo_name: + continue + yield { + '_type': 'url', + 'id': program.get('id'), + 'title': program.get('title') or video.get('title'), + 'url': 'https://www.nba.com/watch/video/' + seo_name, + 'thumbnail': video.get('image'), + 'description': program.get('description') or video.get('description'), + 'duration': parse_duration(program.get('runtimeHours')), + 'timestamp': parse_iso8601(video.get('releaseDate')), + } def _real_extract(self, url): - path, video_id = re.match(self._VALID_URL, url).groups() - orig_path = path - if path.startswith('nba/'): - path = path[3:] + collection_id = self._match_id(url) + entries = OnDemandPagedList( + functools.partial(self._fetch_page, collection_id), + self._PAGE_SIZE) + return self.playlist_result(entries, collection_id) - if 'video/' not in path: - webpage = self._download_webpage(url, video_id) - path = remove_start(self._search_regex(r'data-videoid="([^"]+)"', webpage, 'video id'), '/') - if path == '{{id}}': - return self._extract_playlist(orig_path, video_id, webpage) +class NBABaseIE(NBACVPBaseIE): + _VALID_URL_BASE = r'''(?x) + https?://(?:www\.)?nba\.com/ + (?P + blazers| + bucks| + bulls| + cavaliers| + celtics| + clippers| + grizzlies| + hawks| + heat| + hornets| + jazz| + kings| + knicks| + lakers| + magic| + mavericks| + nets| + nuggets| + pacers| + pelicans| + pistons| + raptors| + rockets| + sixers| + spurs| + suns| + thunder| + timberwolves| + warriors| + wizards + ) + (?:/play\#)?/''' + _CHANNEL_PATH_REGEX = r'video/channel|series' - # See prepareContentId() of pkgCvp.js - if path.startswith('video/teams'): - path = 'video/channels/proxy/' + path[6:] + def _embed_url_result(self, team, content_id): + return self.url_result(update_url_query( + 'https://secure.nba.com/assets/amp/include/video/iframe.html', { + 'contentId': content_id, + 'team': team, + }), NBAEmbedIE.ie_key()) - return self._extract_cvp_info( - 'http://www.nba.com/%s.xml' % path, video_id, { - 'default': { - 'media_src': 'http://nba.cdn.turner.com/nba/big', - }, - 'm3u8': { - 'media_src': 'http://nbavod-f.akamaihd.net', - }, + def _call_api(self, team, content_id, query, resource): + return self._download_json( + 'https://api.nba.net/2/%s/video,imported_video,wsc/' % team, + content_id, 'Download %s JSON metadata' % resource, + query=query, headers={ + 'accessToken': 'internal|bb88df6b4c2244e78822812cecf1ee1b', + })['response']['result'] + + def _extract_video(self, video, team, extract_all=True): + video_id = compat_str(video['nid']) + team = video['brand'] + + info = { + 'id': video_id, + 'title': video.get('title') or video.get('headline') or video['shortHeadline'], + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('published')), + } + + subtitles = {} + captions = try_get(video, lambda x: x['videoCaptions']['sidecars'], dict) or {} + for caption_url in captions.values(): + subtitles.setdefault('en', []).append({'url': caption_url}) + + formats = [] + mp4_url = video.get('mp4') + if mp4_url: + formats.append({ + 'url': mp4_url, }) + + if extract_all: + source_url = video.get('videoSource') + if source_url and not source_url.startswith('s3://') and self._is_valid_url(source_url, video_id, 'source'): + formats.append({ + 'format_id': 'source', + 'url': source_url, + 'preference': 1, + }) + + m3u8_url = video.get('m3u8') + if m3u8_url: + if '.akamaihd.net/i/' in m3u8_url: + formats.extend(self._extract_akamai_formats( + m3u8_url, video_id, {'http': 'pmd.cdn.turner.com'})) + else: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + content_xml = video.get('contentXml') + if team and content_xml: + cvp_info = self._extract_nba_cvp_info( + team + content_xml, video_id, fatal=False) + if cvp_info: + formats.extend(cvp_info['formats']) + subtitles = self._merge_subtitles(subtitles, cvp_info['subtitles']) + info = merge_dicts(info, cvp_info) + + self._sort_formats(formats) + else: + info.update(self._embed_url_result(team, video['videoId'])) + + info.update({ + 'formats': formats, + 'subtitles': subtitles, + }) + + return info + + def _real_extract(self, url): + team, display_id = re.match(self._VALID_URL, url).groups() + if '/play#/' in url: + display_id = compat_urllib_parse_unquote(display_id) + else: + webpage = self._download_webpage(url, display_id) + display_id = self._search_regex( + self._CONTENT_ID_REGEX + r'\s*:\s*"([^"]+)"', webpage, 'video id') + return self._extract_url_results(team, display_id) + + +class NBAEmbedIE(NBABaseIE): + IENAME = 'nba:embed' + _VALID_URL = r'https?://secure\.nba\.com/assets/amp/include/video/(?:topI|i)frame\.html\?.*?\bcontentId=(?P[^?#&]+)' + _TESTS = [{ + 'url': 'https://secure.nba.com/assets/amp/include/video/topIframe.html?contentId=teams/bulls/2020/12/04/3478774/1607105587854-20201204_SCHEDULE_RELEASE_FINAL_DRUPAL-3478774&team=bulls&adFree=false&profile=71&videoPlayerName=TAMPCVP&baseUrl=&videoAdsection=nba.com_mobile_web_teamsites_chicagobulls&Env=', + 'only_matching': True, + }, { + 'url': 'https://secure.nba.com/assets/amp/include/video/iframe.html?contentId=2016/10/29/0021600027boschaplay7&adFree=false&profile=71&team=&videoPlayerName=LAMPCVP', + 'only_matching': True, + }] + + def _real_extract(self, url): + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + content_id = qs['contentId'][0] + team = qs.get('team', [None])[0] + if not team: + return self.url_result( + 'https://watch.nba.com/video/' + content_id, NBAWatchIE.ie_key()) + video = self._call_api(team, content_id, {'videoid': content_id}, 'video')[0] + return self._extract_video(video, team) + + +class NBAIE(NBABaseIE): + IENAME = 'nba' + _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?!%s)video/(?P(?:[^/]+/)*[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX + _TESTS = [{ + 'url': 'https://www.nba.com/bulls/video/teams/bulls/2020/12/04/3478774/1607105587854-20201204schedulereleasefinaldrupal-3478774', + 'info_dict': { + 'id': '45039', + 'ext': 'mp4', + 'title': 'AND WE BACK.', + 'description': 'Part 1 of our 2020-21 schedule is here! Watch our games on NBC Sports Chicago.', + 'duration': 94, + 'timestamp': 1607112000, + 'upload_date': '20201218', + }, + }, { + 'url': 'https://www.nba.com/bucks/play#/video/teams%2Fbucks%2F2020%2F12%2F17%2F64860%2F1608252863446-Op_Dream_16x9-64860', + 'only_matching': True, + }, { + 'url': 'https://www.nba.com/bucks/play#/video/wsc%2Fteams%2F2787C911AA1ACD154B5377F7577CCC7134B2A4B0', + 'only_matching': True, + }] + _CONTENT_ID_REGEX = r'videoID' + + def _extract_url_results(self, team, content_id): + return self._embed_url_result(team, content_id) + + +class NBAChannelIE(NBABaseIE): + IENAME = 'nba:channel' + _VALID_URL = NBABaseIE._VALID_URL_BASE + '(?:%s)/(?P[^/?#&]+)' % NBABaseIE._CHANNEL_PATH_REGEX + _TESTS = [{ + 'url': 'https://www.nba.com/blazers/video/channel/summer_league', + 'info_dict': { + 'title': 'Summer League', + }, + 'playlist_mincount': 138, + }, { + 'url': 'https://www.nba.com/bucks/play#/series/On%20This%20Date', + 'only_matching': True, + }] + _CONTENT_ID_REGEX = r'videoSubCategory' + _PAGE_SIZE = 100 + + def _fetch_page(self, team, channel, page): + results = self._call_api(team, channel, { + 'channels': channel, + 'count': self._PAGE_SIZE, + 'offset': page * self._PAGE_SIZE, + }, 'page %d' % (page + 1)) + for video in results: + yield self._extract_video(video, team, False) + + def _extract_url_results(self, team, content_id): + entries = OnDemandPagedList( + functools.partial(self._fetch_page, team, content_id), + self._PAGE_SIZE) + return self.playlist_result(entries, playlist_title=content_id) diff --git a/haruhi_dl/extractor/nbc.py b/haruhi_dl/extractor/nbc.py index 6f3cb3003..0d77648c2 100644 --- a/haruhi_dl/extractor/nbc.py +++ b/haruhi_dl/extractor/nbc.py @@ -10,7 +10,6 @@ from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( int_or_none, - js_to_json, parse_duration, smuggle_url, try_get, @@ -159,7 +158,8 @@ class NBCIE(AdobePassIE): class NBCSportsVPlayerIE(InfoExtractor): - _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' + _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/' + _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P[0-9a-zA-Z_]+)' _TESTS = [{ 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', @@ -175,12 +175,15 @@ class NBCSportsVPlayerIE(InfoExtractor): }, { 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/_hqLjQ95yx8Z', 'only_matching': True, + }, { + 'url': 'https://www.nbcsports.com/vplayer/p/BxmELC/nbcsports/select/PHJSaFWbrTY9?form=html&autoPlay=true', + 'only_matching': True, }] @staticmethod def _extract_url(webpage): iframe_m = re.search( - r']+src="(?Phttps?://vplayer\.nbcsports\.com/[^"]+)"', webpage) + r'<(?:iframe[^>]+|div[^>]+data-(?:mpx-)?)src="(?P%s[^"]+)"' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage) if iframe_m: return iframe_m.group('url') @@ -193,21 +196,29 @@ class NBCSportsVPlayerIE(InfoExtractor): class NBCSportsIE(InfoExtractor): - # Does not include https because its certificate is invalid - _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?:[^/]+/)+(?P[0-9a-z-]+)' + _VALID_URL = r'https?://(?:www\.)?nbcsports\.com//?(?!vplayer/)(?:[^/]+/)+(?P[0-9a-z-]+)' - _TEST = { + _TESTS = [{ + # iframe src 'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke', 'info_dict': { 'id': 'PHJSaFWbrTY9', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke', 'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113', 'uploader': 'NBCU-SPORTS', 'upload_date': '20150330', 'timestamp': 1427726529, } - } + }, { + # data-mpx-src + 'url': 'https://www.nbcsports.com/philadelphia/philadelphia-phillies/bruce-bochy-hector-neris-hes-idiot', + 'only_matching': True, + }, { + # data-src + 'url': 'https://www.nbcsports.com/boston/video/report-card-pats-secondary-no-match-josh-allen', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -275,33 +286,6 @@ class NBCSportsStreamIE(AdobePassIE): } -class CSNNEIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?csnne\.com/video/(?P[0-9a-z-]+)' - - _TEST = { - 'url': 'http://www.csnne.com/video/snc-evening-update-wright-named-red-sox-no-5-starter', - 'info_dict': { - 'id': 'yvBLLUgQ8WU0', - 'ext': 'mp4', - 'title': 'SNC evening update: Wright named Red Sox\' No. 5 starter.', - 'description': 'md5:1753cfee40d9352b19b4c9b3e589b9e3', - 'timestamp': 1459369979, - 'upload_date': '20160330', - 'uploader': 'NBCU-SPORTS', - } - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - return { - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': self._html_search_meta('twitter:player:stream', webpage), - 'display_id': display_id, - } - - class NBCNewsIE(ThePlatformIE): _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P[^/?]+)' @@ -394,8 +378,8 @@ class NBCNewsIE(ThePlatformIE): webpage = self._download_webpage(url, video_id) data = self._parse_json(self._search_regex( - r'window\.__data\s*=\s*({.+});', webpage, - 'bootstrap json'), video_id, js_to_json) + r']+id="__NEXT_DATA__"[^>]*>({.+?})', + webpage, 'bootstrap json'), video_id)['props']['initialState'] video_data = try_get(data, lambda x: x['video']['current'], dict) if not video_data: video_data = data['article']['content'][0]['primaryMedia']['video'] diff --git a/haruhi_dl/extractor/ndr.py b/haruhi_dl/extractor/ndr.py index 2447c812e..ddd828d92 100644 --- a/haruhi_dl/extractor/ndr.py +++ b/haruhi_dl/extractor/ndr.py @@ -81,6 +81,29 @@ class NDRIE(NDRBaseIE): 'params': { 'skip_download': True, }, + }, { + # with subtitles + 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', + 'info_dict': { + 'id': 'extra18674', + 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', + 'ext': 'mp4', + 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', + 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6', + 'uploader': 'ndrtv', + 'upload_date': '20201113', + 'duration': 1749, + 'subtitles': { + 'de': [{ + 'ext': 'ttml', + 'url': r're:^https://www\.ndr\.de.+', + }], + }, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html', 'only_matching': True, @@ -239,6 +262,20 @@ class NDREmbedBaseIE(InfoExtractor): 'preference': quality_key(thumbnail.get('quality')), }) + subtitles = {} + tracks = config.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + track_url = urljoin(url, track.get('src')) + if not track_url: + continue + subtitles.setdefault(track.get('srclang') or 'de', []).append({ + 'url': track_url, + 'ext': 'ttml', + }) + return { 'id': video_id, 'title': title, @@ -248,6 +285,7 @@ class NDREmbedBaseIE(InfoExtractor): 'duration': duration, 'thumbnails': thumbnails, 'formats': formats, + 'subtitles': subtitles, } diff --git a/haruhi_dl/extractor/nfl.py b/haruhi_dl/extractor/nfl.py index 460deb162..871923e4c 100644 --- a/haruhi_dl/extractor/nfl.py +++ b/haruhi_dl/extractor/nfl.py @@ -4,19 +4,15 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlparse, -) from ..utils import ( - ExtractorError, - int_or_none, - remove_end, + clean_html, + determine_ext, + get_element_by_class, ) -class NFLIE(InfoExtractor): - IE_NAME = 'nfl.com' - _VALID_URL = r'''(?x) +class NFLBaseIE(InfoExtractor): + _VALID_URL_BASE = r'''(?x) https?:// (?P (?:www\.)? @@ -34,15 +30,15 @@ class NFLIE(InfoExtractor): houstontexans| colts| jaguars| - titansonline| + (?:titansonline|tennesseetitans)| denverbroncos| - kcchiefs| + (?:kc)?chiefs| raiders| chargers| dallascowboys| giants| philadelphiaeagles| - redskins| + (?:redskins|washingtonfootball)| chicagobears| detroitlions| packers| @@ -52,180 +48,113 @@ class NFLIE(InfoExtractor): neworleanssaints| buccaneers| azcardinals| - stlouisrams| + (?:stlouis|the)rams| 49ers| seahawks )\.com| .+?\.clubs\.nfl\.com ) )/ - (?:.+?/)* - (?P[^/#?&]+) ''' + _VIDEO_CONFIG_REGEX = r']+id="[^"]*video-config-[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}[^"]*"[^>]*>\s*({.+})' + _WORKING = False + + def _parse_video_config(self, video_config, display_id): + video_config = self._parse_json(video_config, display_id) + item = video_config['playlist'][0] + mcp_id = item.get('mcpID') + if mcp_id: + info = self.url_result( + 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:' + mcp_id, + 'Anvato', mcp_id) + else: + media_id = item.get('id') or item['entityId'] + title = item['title'] + item_url = item['url'] + info = {'id': media_id} + ext = determine_ext(item_url) + if ext == 'm3u8': + info['formats'] = self._extract_m3u8_formats(item_url, media_id, 'mp4') + self._sort_formats(info['formats']) + else: + info['url'] = item_url + if item.get('audio') is True: + info['vcodec'] = 'none' + is_live = video_config.get('live') is True + thumbnails = None + image_url = item.get(item.get('imageSrc')) or item.get(item.get('posterImage')) + if image_url: + thumbnails = [{ + 'url': image_url, + 'ext': determine_ext(image_url, 'jpg'), + }] + info.update({ + 'title': self._live_title(title) if is_live else title, + 'is_live': is_live, + 'description': clean_html(item.get('description')), + 'thumbnails': thumbnails, + }) + return info + + +class NFLIE(NFLBaseIE): + IE_NAME = 'nfl.com' + _VALID_URL = NFLBaseIE._VALID_URL_BASE + r'(?:videos?|listen|audio)/(?P[^/#?&]+)' _TESTS = [{ - 'url': 'http://www.nfl.com/videos/nfl-game-highlights/0ap3000000398478/Week-3-Redskins-vs-Eagles-highlights', - 'md5': '394ef771ddcd1354f665b471d78ec4c6', + 'url': 'https://www.nfl.com/videos/baker-mayfield-s-game-changing-plays-from-3-td-game-week-14', 'info_dict': { - 'id': '0ap3000000398478', + 'id': '899441', 'ext': 'mp4', - 'title': 'Week 3: Redskins vs. Eagles highlights', - 'description': 'md5:56323bfb0ac4ee5ab24bd05fdf3bf478', - 'upload_date': '20140921', - 'timestamp': 1411337580, + 'title': "Baker Mayfield's game-changing plays from 3-TD game Week 14", + 'description': 'md5:85e05a3cc163f8c344340f220521136d', + 'upload_date': '20201215', + 'timestamp': 1608009755, 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'NFL', } }, { - 'url': 'http://prod.www.steelers.clubs.nfl.com/video-and-audio/videos/LIVE_Post_Game_vs_Browns/9d72f26a-9e2b-4718-84d3-09fb4046c266', - 'md5': 'cf85bdb4bc49f6e9d3816d130c78279c', + 'url': 'https://www.chiefs.com/listen/patrick-mahomes-travis-kelce-react-to-win-over-dolphins-the-breakdown', + 'md5': '6886b32c24b463038c760ceb55a34566', 'info_dict': { - 'id': '9d72f26a-9e2b-4718-84d3-09fb4046c266', - 'ext': 'mp4', - 'title': 'LIVE: Post Game vs. Browns', - 'description': 'md5:6a97f7e5ebeb4c0e69a418a89e0636e8', - 'upload_date': '20131229', - 'timestamp': 1388354455, - 'thumbnail': r're:^https?://.*\.jpg$', + 'id': 'd87e8790-3e14-11eb-8ceb-ff05c2867f99', + 'ext': 'mp3', + 'title': 'Patrick Mahomes, Travis Kelce React to Win Over Dolphins | The Breakdown', + 'description': 'md5:12ada8ee70e6762658c30e223e095075', } }, { - 'url': 'http://www.nfl.com/news/story/0ap3000000467586/article/patriots-seahawks-involved-in-lategame-skirmish', - 'info_dict': { - 'id': '0ap3000000467607', - 'ext': 'mp4', - 'title': 'Frustrations flare on the field', - 'description': 'Emotions ran high at the end of the Super Bowl on both sides of the ball after a dramatic finish.', - 'timestamp': 1422850320, - 'upload_date': '20150202', - }, - }, { - 'url': 'http://www.patriots.com/video/2015/09/18/10-days-gillette', - 'md5': '4c319e2f625ffd0b481b4382c6fc124c', - 'info_dict': { - 'id': 'n-238346', - 'ext': 'mp4', - 'title': '10 Days at Gillette', - 'description': 'md5:8cd9cd48fac16de596eadc0b24add951', - 'timestamp': 1442618809, - 'upload_date': '20150918', - }, - }, { - # lowercase data-contentid - 'url': 'http://www.steelers.com/news/article-1/Tomlin-on-Ben-getting-Vick-ready/56399c96-4160-48cf-a7ad-1d17d4a3aef7', - 'info_dict': { - 'id': '12693586-6ea9-4743-9c1c-02c59e4a5ef2', - 'ext': 'mp4', - 'title': 'Tomlin looks ahead to Ravens on a short week', - 'description': 'md5:32f3f7b139f43913181d5cbb24ecad75', - 'timestamp': 1443459651, - 'upload_date': '20150928', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.nfl.com/videos/nfl-network-top-ten/09000d5d810a6bd4/Top-10-Gutsiest-Performances-Jack-Youngblood', + 'url': 'https://www.buffalobills.com/video/buffalo-bills-military-recognition-week-14', 'only_matching': True, }, { - 'url': 'http://www.buffalobills.com/video/videos/Rex_Ryan_Show_World_Wide_Rex/b1dcfab2-3190-4bb1-bfc0-d6e603d6601a', + 'url': 'https://www.raiders.com/audio/instant-reactions-raiders-week-14-loss-to-indianapolis-colts-espn-jason-fitz', 'only_matching': True, }] - @staticmethod - def prepend_host(host, url): - if not url.startswith('http'): - if not url.startswith('/'): - url = '/%s' % url - url = 'http://{0:}{1:}'.format(host, url) - return url + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + return self._parse_video_config(self._search_regex( + self._VIDEO_CONFIG_REGEX, webpage, 'video config'), display_id) - @staticmethod - def format_from_stream(stream, protocol, host, path_prefix='', - preference=0, note=None): - url = '{protocol:}://{host:}/{prefix:}{path:}'.format( - protocol=protocol, - host=host, - prefix=path_prefix, - path=stream.get('path'), - ) - return { - 'url': url, - 'vbr': int_or_none(stream.get('rate', 0), 1000), - 'preference': preference, - 'format_note': note, - } + +class NFLArticleIE(NFLBaseIE): + IE_NAME = 'nfl.com:article' + _VALID_URL = NFLBaseIE._VALID_URL_BASE + r'news/(?P[^/#?&]+)' + _TEST = { + 'url': 'https://www.buffalobills.com/news/the-only-thing-we-ve-earned-is-the-noise-bills-coaches-discuss-handling-rising-e', + 'info_dict': { + 'id': 'the-only-thing-we-ve-earned-is-the-noise-bills-coaches-discuss-handling-rising-e', + 'title': "'The only thing we've earned is the noise' | Bills coaches discuss handling rising expectations", + }, + 'playlist_count': 4, + } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, host = mobj.group('id'), mobj.group('host') - - webpage = self._download_webpage(url, video_id) - - config_url = NFLIE.prepend_host(host, self._search_regex( - r'(?:(?:config|configURL)\s*:\s*|]+data-config\s*=\s*)(["\'])(?P.+?)\1', - webpage, 'config URL', default='static/content/static/config/video/config.json', - group='config')) - # For articles, the id in the url is not the video id - video_id = self._search_regex( - r'(?:]+data-content[Ii]d\s*=\s*|content[Ii]d\s*:\s*)(["\'])(?P(?:(?!\1).)+)\1', - webpage, 'video id', default=video_id, group='id') - config = self._download_json(config_url, video_id, 'Downloading player config') - url_template = NFLIE.prepend_host( - host, '{contentURLTemplate:}'.format(**config)) - video_data = self._download_json( - url_template.format(id=video_id), video_id) - - formats = [] - cdn_data = video_data.get('cdnData', {}) - streams = cdn_data.get('bitrateInfo', []) - if cdn_data.get('format') == 'EXTERNAL_HTTP_STREAM': - parts = compat_urllib_parse_urlparse(cdn_data.get('uri')) - protocol, host = parts.scheme, parts.netloc - for stream in streams: - formats.append( - NFLIE.format_from_stream(stream, protocol, host)) - else: - cdns = config.get('cdns') - if not cdns: - raise ExtractorError('Failed to get CDN data', expected=True) - - for name, cdn in cdns.items(): - # LimeLight streams don't seem to work - if cdn.get('name') == 'LIMELIGHT': - continue - - protocol = cdn.get('protocol') - host = remove_end(cdn.get('host', ''), '/') - if not (protocol and host): - continue - - prefix = cdn.get('pathprefix', '') - if prefix and not prefix.endswith('/'): - prefix = '%s/' % prefix - - preference = 0 - if protocol == 'rtmp': - preference = -2 - elif 'prog' in name.lower(): - preference = 1 - - for stream in streams: - formats.append( - NFLIE.format_from_stream(stream, protocol, host, - prefix, preference, name)) - - self._sort_formats(formats) - - thumbnail = None - for q in ('xl', 'l', 'm', 's', 'xs'): - thumbnail = video_data.get('imagePaths', {}).get(q) - if thumbnail: - break - - return { - 'id': video_id, - 'title': video_data.get('headline'), - 'formats': formats, - 'description': video_data.get('caption'), - 'duration': video_data.get('duration'), - 'thumbnail': thumbnail, - 'timestamp': int_or_none(video_data.get('posted'), 1000), - } + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + entries = [] + for video_config in re.findall(self._VIDEO_CONFIG_REGEX, webpage): + entries.append(self._parse_video_config(video_config, display_id)) + title = clean_html(get_element_by_class( + 'nfl-c-article__title', webpage)) or self._html_search_meta( + ['og:title', 'twitter:title'], webpage) + return self.playlist_result(entries, display_id, title) diff --git a/haruhi_dl/extractor/nhk.py b/haruhi_dl/extractor/nhk.py index de6a707c4..8a9331a79 100644 --- a/haruhi_dl/extractor/nhk.py +++ b/haruhi_dl/extractor/nhk.py @@ -3,51 +3,33 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import urljoin -class NhkVodIE(InfoExtractor): - _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P[a-z]{2})/ondemand/(?Pvideo|audio)/(?P\d{7}|[^/]+?-\d{8}-\d+)' - # Content available only for a limited period of time. Visit - # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. - _TESTS = [{ - # clip - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', - 'md5': '256a1be14f48d960a7e61e2532d95ec3', - 'info_dict': { - 'id': 'a95j5iza', - 'ext': 'mp4', - 'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU", - 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', - 'timestamp': 1565965194, - 'upload_date': '20190816', - }, - }, { - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', - 'only_matching': True, - }, { - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/', - 'only_matching': True, - }, { - 'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/', - 'only_matching': True, - }, { - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/', - 'only_matching': True, - }] - _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/episode/%s/%s/all%s.json' +class NhkBaseIE(InfoExtractor): + _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json' + _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P[a-z]{2})/ondemand' + _TYPE_REGEX = r'/(?Pvideo|audio)/' - def _real_extract(self, url): - lang, m_type, episode_id = re.match(self._VALID_URL, url).groups() + def _call_api(self, m_id, lang, is_video, is_episode, is_clip): + return self._download_json( + self._API_URL_TEMPLATE % ( + 'v' if is_video else 'r', + 'clip' if is_clip else 'esd', + 'episode' if is_episode else 'program', + m_id, lang, '/all' if is_video else ''), + m_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'] or [] + + def _extract_episode_info(self, url, episode=None): + fetch_episode = episode is None + lang, m_type, episode_id = re.match(NhkVodIE._VALID_URL, url).groups() if episode_id.isdigit(): episode_id = episode_id[:4] + '-' + episode_id[4:] is_video = m_type == 'video' - episode = self._download_json( - self._API_URL_TEMPLATE % ( - 'v' if is_video else 'r', - 'clip' if episode_id[:4] == '9999' else 'esd', - episode_id, lang, '/all' if is_video else ''), - episode_id, query={'apikey': 'EJfK8jdS57GqlupFgAfAAwr573q01y6k'})['data']['episodes'][0] + if fetch_episode: + episode = self._call_api( + episode_id, lang, is_video, True, episode_id[:4] == '9999')[0] title = episode.get('sub_title_clean') or episode['sub_title'] def get_clean_field(key): @@ -76,18 +58,121 @@ class NhkVodIE(InfoExtractor): 'episode': title, } if is_video: + vod_id = episode['vod_id'] info.update({ '_type': 'url_transparent', 'ie_key': 'Piksel', - 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + episode['vod_id'], + 'url': 'https://player.piksel.com/v/refid/nhkworld/prefid/' + vod_id, + 'id': vod_id, }) else: - audio = episode['audio'] - audio_path = audio['audio'] - info['formats'] = self._extract_m3u8_formats( - 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, - episode_id, 'm4a', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False) - for f in info['formats']: - f['language'] = lang + if fetch_episode: + audio_path = episode['audio']['audio'] + info['formats'] = self._extract_m3u8_formats( + 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, + episode_id, 'm4a', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + for f in info['formats']: + f['language'] = lang + else: + info.update({ + '_type': 'url_transparent', + 'ie_key': NhkVodIE.ie_key(), + 'url': url, + }) return info + + +class NhkVodIE(NhkBaseIE): + _VALID_URL = r'%s%s(?P\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + # Content available only for a limited period of time. Visit + # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. + _TESTS = [{ + # video clip + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999011/', + 'md5': '7a90abcfe610ec22a6bfe15bd46b30ca', + 'info_dict': { + 'id': 'a95j5iza', + 'ext': 'mp4', + 'title': "Dining with the Chef - Chef Saito's Family recipe: MENCHI-KATSU", + 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', + 'timestamp': 1565965194, + 'upload_date': '20190816', + }, + }, { + # audio clip + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/r_inventions-20201104-1/', + 'info_dict': { + 'id': 'r_inventions-20201104-1-en', + 'ext': 'm4a', + 'title': "Japan's Top Inventions - Miniature Video Cameras", + 'description': 'md5:07ea722bdbbb4936fdd360b6a480c25b', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/2015173/', + 'only_matching': True, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/', + 'only_matching': True, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/', + 'only_matching': True, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/', + 'only_matching': True, + }] + + def _real_extract(self, url): + return self._extract_episode_info(url) + + +class NhkVodProgramIE(NhkBaseIE): + _VALID_URL = r'%s/program%s(?P[0-9a-z]+)(?:.+?\btype=(?Pclip|(?:radio|tv)Episode))?' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + _TESTS = [{ + # video program episodes + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway', + 'info_dict': { + 'id': 'japanrailway', + 'title': 'Japan Railway Journal', + }, + 'playlist_mincount': 1, + }, { + # video program clips + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip', + 'info_dict': { + 'id': 'japanrailway', + 'title': 'Japan Railway Journal', + }, + 'playlist_mincount': 5, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/', + 'only_matching': True, + }, { + # audio program + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/', + 'only_matching': True, + }] + + def _real_extract(self, url): + lang, m_type, program_id, episode_type = re.match(self._VALID_URL, url).groups() + + episodes = self._call_api( + program_id, lang, m_type == 'video', False, episode_type == 'clip') + + entries = [] + for episode in episodes: + episode_path = episode.get('url') + if not episode_path: + continue + entries.append(self._extract_episode_info( + urljoin(url, episode_path), episode)) + + program_title = None + if entries: + program_title = entries[0].get('series') + + return self.playlist_result(entries, program_id, program_title) diff --git a/haruhi_dl/extractor/niconico.py b/haruhi_dl/extractor/niconico.py index eb07ca776..a85fc3d5c 100644 --- a/haruhi_dl/extractor/niconico.py +++ b/haruhi_dl/extractor/niconico.py @@ -1,20 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals -import json import datetime +import functools +import json +import math from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_urlparse, + compat_urllib_parse_urlparse, ) from ..utils import ( determine_ext, dict_get, ExtractorError, - int_or_none, float_or_none, + InAdvancePagedList, + int_or_none, parse_duration, parse_iso8601, remove_start, @@ -181,7 +184,7 @@ class NiconicoIE(InfoExtractor): if urlh is False: login_ok = False else: - parts = compat_urlparse.urlparse(urlh.geturl()) + parts = compat_urllib_parse_urlparse(urlh.geturl()) if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login': login_ok = False if not login_ok: @@ -292,7 +295,7 @@ class NiconicoIE(InfoExtractor): 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1', video_id, 'Downloading flv info') - flv_info = compat_urlparse.parse_qs(flv_info_webpage) + flv_info = compat_parse_qs(flv_info_webpage) if 'url' not in flv_info: if 'deleted' in flv_info: raise ExtractorError('The video has been deleted.', @@ -437,34 +440,76 @@ class NiconicoIE(InfoExtractor): class NiconicoPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/mylist/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/)?mylist/(?P\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.nicovideo.jp/mylist/27411728', 'info_dict': { 'id': '27411728', 'title': 'AKB48のオールナイトニッポン', + 'description': 'md5:d89694c5ded4b6c693dea2db6e41aa08', + 'uploader': 'のっく', + 'uploader_id': '805442', }, 'playlist_mincount': 225, - } + }, { + 'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728', + 'only_matching': True, + }] + _PAGE_SIZE = 100 + + def _call_api(self, list_id, resource, query): + return self._download_json( + 'https://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id, + 'Downloading %s JSON metatdata' % resource, query=query, + headers={'X-Frontend-Id': 6})['data']['mylist'] + + def _parse_owner(self, item): + owner = item.get('owner') or {} + if owner: + return { + 'uploader': owner.get('name'), + 'uploader_id': owner.get('id'), + } + return {} + + def _fetch_page(self, list_id, page): + page += 1 + items = self._call_api(list_id, 'page %d' % page, { + 'page': page, + 'pageSize': self._PAGE_SIZE, + })['items'] + for item in items: + video = item.get('video') or {} + video_id = video.get('id') + if not video_id: + continue + count = video.get('count') or {} + get_count = lambda x: int_or_none(count.get(x)) + info = { + '_type': 'url', + 'id': video_id, + 'title': video.get('title'), + 'url': 'https://www.nicovideo.jp/watch/' + video_id, + 'description': video.get('shortDescription'), + 'duration': int_or_none(video.get('duration')), + 'view_count': get_count('view'), + 'comment_count': get_count('comment'), + 'ie_key': NiconicoIE.ie_key(), + } + info.update(self._parse_owner(video)) + yield info def _real_extract(self, url): list_id = self._match_id(url) - webpage = self._download_webpage(url, list_id) - - entries_json = self._search_regex(r'Mylist\.preload\(\d+, (\[.*\])\);', - webpage, 'entries') - entries = json.loads(entries_json) - entries = [{ - '_type': 'url', - 'ie_key': NiconicoIE.ie_key(), - 'url': ('http://www.nicovideo.jp/watch/%s' % - entry['item_data']['video_id']), - } for entry in entries] - - return { - '_type': 'playlist', - 'title': self._search_regex(r'\s+name: "(.*?)"', webpage, 'title'), - 'id': list_id, - 'entries': entries, - } + mylist = self._call_api(list_id, 'list', { + 'pageSize': 1, + }) + entries = InAdvancePagedList( + functools.partial(self._fetch_page, list_id), + math.ceil(mylist['totalItemCount'] / self._PAGE_SIZE), + self._PAGE_SIZE) + result = self.playlist_result( + entries, list_id, mylist.get('name'), mylist.get('description')) + result.update(self._parse_owner(mylist)) + return result diff --git a/haruhi_dl/extractor/ninecninemedia.py b/haruhi_dl/extractor/ninecninemedia.py index 65754c5e7..a569c889e 100644 --- a/haruhi_dl/extractor/ninecninemedia.py +++ b/haruhi_dl/extractor/ninecninemedia.py @@ -5,10 +5,11 @@ import re from .common import InfoExtractor from ..utils import ( - parse_iso8601, - float_or_none, ExtractorError, + float_or_none, int_or_none, + parse_iso8601, + try_get, ) @@ -35,7 +36,7 @@ class NineCNineMediaIE(InfoExtractor): '$include': '[HasClosedCaptions]', }) - if content_package.get('Constraints', {}).get('Security', {}).get('Type'): + if try_get(content_package, lambda x: x['Constraints']['Security']['Type']): raise ExtractorError('This video is DRM protected.', expected=True) manifest_base_url = content_package_url + 'manifest.' @@ -52,7 +53,7 @@ class NineCNineMediaIE(InfoExtractor): self._sort_formats(formats) thumbnails = [] - for image in content.get('Images', []): + for image in (content.get('Images') or []): image_url = image.get('Url') if not image_url: continue @@ -70,7 +71,7 @@ class NineCNineMediaIE(InfoExtractor): continue container.append(e_name) - season = content.get('Season', {}) + season = content.get('Season') or {} info = { 'id': content_id, @@ -79,13 +80,14 @@ class NineCNineMediaIE(InfoExtractor): 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), 'episode_number': int_or_none(content.get('Episode')), 'season': season.get('Name'), - 'season_number': season.get('Number'), + 'season_number': int_or_none(season.get('Number')), 'season_id': season.get('Id'), - 'series': content.get('Media', {}).get('Name'), + 'series': try_get(content, lambda x: x['Media']['Name']), 'tags': tags, 'categories': categories, 'duration': float_or_none(content_package.get('Duration')), 'formats': formats, + 'thumbnails': thumbnails, } if content_package.get('HasClosedCaptions'): diff --git a/haruhi_dl/extractor/ninegag.py b/haruhi_dl/extractor/ninegag.py index dc6a27d36..14390823b 100644 --- a/haruhi_dl/extractor/ninegag.py +++ b/haruhi_dl/extractor/ninegag.py @@ -1,104 +1,130 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import str_to_int +from ..utils import ( + ExtractorError, + determine_ext, + int_or_none, + try_get, + unescapeHTML, + url_or_none, +) class NineGagIE(InfoExtractor): IE_NAME = '9gag' - _VALID_URL = r'https?://(?:www\.)?9gag(?:\.com/tv|\.tv)/(?:p|embed)/(?P[a-zA-Z0-9]+)(?:/(?P[^?#/]+))?' + _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P[^/?&#]+)' _TESTS = [{ - 'url': 'http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome', + 'url': 'https://9gag.com/gag/ae5Ag7B', 'info_dict': { - 'id': 'kXzwOKyGlSA', + 'id': 'ae5Ag7B', 'ext': 'mp4', - 'description': 'This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)', - 'title': '\"People Are Awesome 2013\" Is Absolutely Awesome', - 'uploader_id': 'UCdEH6EjDKwtTe-sO2f0_1XA', - 'uploader': 'CompilationChannel', - 'upload_date': '20131110', - 'view_count': int, - }, - 'add_ie': ['Youtube'], + 'title': 'Capybara Agility Training', + 'upload_date': '20191108', + 'timestamp': 1573237208, + 'categories': ['Awesome'], + 'tags': ['Weimaraner', 'American Pit Bull Terrier'], + 'duration': 44, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + } }, { - 'url': 'http://9gag.com/tv/p/aKolP3', - 'info_dict': { - 'id': 'aKolP3', - 'ext': 'mp4', - 'title': 'This Guy Travelled 11 countries In 44 days Just To Make This Amazing Video', - 'description': "I just saw more in 1 minute than I've seen in 1 year. This guy's video is epic!!", - 'uploader_id': 'rickmereki', - 'uploader': 'Rick Mereki', - 'upload_date': '20110803', - 'view_count': int, - }, - 'add_ie': ['Vimeo'], - }, { - 'url': 'http://9gag.com/tv/p/KklwM', - 'only_matching': True, - }, { - 'url': 'http://9gag.tv/p/Kk2X5', - 'only_matching': True, - }, { - 'url': 'http://9gag.com/tv/embed/a5Dmvl', + # HTML escaped title + 'url': 'https://9gag.com/gag/av5nvyb', 'only_matching': True, }] - _EXTERNAL_VIDEO_PROVIDER = { - '1': { - 'url': '%s', - 'ie_key': 'Youtube', - }, - '2': { - 'url': 'http://player.vimeo.com/video/%s', - 'ie_key': 'Vimeo', - }, - '3': { - 'url': 'http://instagram.com/p/%s', - 'ie_key': 'Instagram', - }, - '4': { - 'url': 'http://vine.co/v/%s', - 'ie_key': 'Vine', - }, - } - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id + post_id = self._match_id(url) + post = self._download_json( + 'https://9gag.com/v1/post', post_id, query={ + 'id': post_id + })['data']['post'] - webpage = self._download_webpage(url, display_id) + if post.get('type') != 'Animated': + raise ExtractorError( + 'The given url does not contain a video', + expected=True) - post_view = self._parse_json( - self._search_regex( - r'var\s+postView\s*=\s*new\s+app\.PostView\({\s*post:\s*({.+?})\s*,\s*posts:\s*prefetchedCurrentPost', - webpage, 'post view'), - display_id) + title = unescapeHTML(post['title']) - ie_key = None - source_url = post_view.get('sourceUrl') - if not source_url: - external_video_id = post_view['videoExternalId'] - external_video_provider = post_view['videoExternalProvider'] - source_url = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['url'] % external_video_id - ie_key = self._EXTERNAL_VIDEO_PROVIDER[external_video_provider]['ie_key'] - title = post_view['title'] - description = post_view.get('description') - view_count = str_to_int(post_view.get('externalView')) - thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w') + duration = None + formats = [] + thumbnails = [] + for key, image in (post.get('images') or {}).items(): + image_url = url_or_none(image.get('url')) + if not image_url: + continue + ext = determine_ext(image_url) + image_id = key.strip('image') + common = { + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } + if ext in ('jpg', 'png'): + webp_url = image.get('webpUrl') + if webp_url: + t = common.copy() + t.update({ + 'id': image_id + '-webp', + 'url': webp_url, + }) + thumbnails.append(t) + common.update({ + 'id': image_id, + 'ext': ext, + }) + thumbnails.append(common) + elif ext in ('webm', 'mp4'): + if not duration: + duration = int_or_none(image.get('duration')) + common['acodec'] = 'none' if image.get('hasAudio') == 0 else None + for vcodec in ('vp8', 'vp9', 'h265'): + c_url = image.get(vcodec + 'Url') + if not c_url: + continue + c_f = common.copy() + c_f.update({ + 'format_id': image_id + '-' + vcodec, + 'url': c_url, + 'vcodec': vcodec, + }) + formats.append(c_f) + common.update({ + 'ext': ext, + 'format_id': image_id, + }) + formats.append(common) + self._sort_formats(formats) + + section = try_get(post, lambda x: x['postSection']['name']) + + tags = None + post_tags = post.get('tags') + if post_tags: + tags = [] + for tag in post_tags: + tag_key = tag.get('key') + if not tag_key: + continue + tags.append(tag_key) + + get_count = lambda x: int_or_none(post.get(x + 'Count')) return { - '_type': 'url_transparent', - 'url': source_url, - 'ie_key': ie_key, - 'id': video_id, - 'display_id': display_id, + 'id': post_id, 'title': title, - 'description': description, - 'view_count': view_count, - 'thumbnail': thumbnail, + 'timestamp': int_or_none(post.get('creationTs')), + 'duration': duration, + 'formats': formats, + 'thumbnails': thumbnails, + 'like_count': get_count('upVote'), + 'dislike_count': get_count('downVote'), + 'comment_count': get_count('comments'), + 'age_limit': 18 if post.get('nsfw') == 1 else None, + 'categories': [section] if section else None, + 'tags': tags, } diff --git a/haruhi_dl/extractor/njpwworld.py b/haruhi_dl/extractor/njpwworld.py index 025c5d249..3639d142f 100644 --- a/haruhi_dl/extractor/njpwworld.py +++ b/haruhi_dl/extractor/njpwworld.py @@ -6,30 +6,40 @@ import re from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( - extract_attributes, get_element_by_class, urlencode_postdata, ) class NJPWWorldIE(InfoExtractor): - _VALID_URL = r'https?://njpwworld\.com/p/(?P[a-z0-9_]+)' + _VALID_URL = r'https?://(front\.)?njpwworld\.com/p/(?P[a-z0-9_]+)' IE_DESC = '新日本プロレスワールド' _NETRC_MACHINE = 'njpwworld' - _TEST = { + _TESTS = [{ 'url': 'http://njpwworld.com/p/s_series_00155_1_9/', 'info_dict': { 'id': 's_series_00155_1_9', 'ext': 'mp4', - 'title': '第9試合 ランディ・サベージ vs リック・スタイナー', + 'title': '闘強導夢2000 2000年1月4日 東京ドーム 第9試合 ランディ・サベージ VS リック・スタイナー', 'tags': list, }, 'params': { 'skip_download': True, # AES-encrypted m3u8 }, 'skip': 'Requires login', - } + }, { + 'url': 'https://front.njpwworld.com/p/s_series_00563_16_bs', + 'info_dict': { + 'id': 's_series_00563_16_bs', + 'ext': 'mp4', + 'title': 'WORLD TAG LEAGUE 2020 & BEST OF THE SUPER Jr.27 2020年12月6日 福岡・福岡国際センター バックステージコメント(字幕あり)', + 'tags': ["福岡・福岡国際センター", "バックステージコメント", "2020", "20年代"], + }, + 'params': { + 'skip_download': True, + }, + }] _LOGIN_URL = 'https://front.njpwworld.com/auth/login' @@ -64,35 +74,27 @@ class NJPWWorldIE(InfoExtractor): webpage = self._download_webpage(url, video_id) formats = [] - for mobj in re.finditer(r']+\bhref=(["\'])/player.+?[^>]*>', webpage): - player = extract_attributes(mobj.group(0)) - player_path = player.get('href') - if not player_path: - continue - kind = self._search_regex( - r'(low|high)$', player.get('class') or '', 'kind', - default='low') + for kind, vid in re.findall(r'if\s+\(\s*imageQualityType\s*==\s*\'([^\']+)\'\s*\)\s*{\s*video_id\s*=\s*"(\d+)"', webpage): + player_path = '/intent?id=%s&type=url' % vid player_url = compat_urlparse.urljoin(url, player_path) - player_page = self._download_webpage( - player_url, video_id, note='Downloading player page') - entries = self._parse_html5_media_entries( - player_url, player_page, video_id, m3u8_id='hls-%s' % kind, - m3u8_entry_protocol='m3u8_native') - kind_formats = entries[0]['formats'] - for f in kind_formats: - f['quality'] = 2 if kind == 'high' else 1 - formats.extend(kind_formats) + formats.append({ + 'url': player_url, + 'format_id': kind, + 'ext': 'mp4', + 'protocol': 'm3u8', + 'quality': 2 if kind == 'high' else 1, + }) self._sort_formats(formats) - post_content = get_element_by_class('post-content', webpage) + tag_block = get_element_by_class('tag-block', webpage) tags = re.findall( - r']+class="tag-[^"]+">]*>([^<]+)', post_content - ) if post_content else None + r']+class="tag-[^"]+"[^>]*>([^<]+)', tag_block + ) if tag_block else None return { 'id': video_id, - 'title': self._og_search_title(webpage), + 'title': get_element_by_class('article-title', webpage) or self._og_search_title(webpage), 'formats': formats, 'tags': tags, } diff --git a/haruhi_dl/extractor/nrk.py b/haruhi_dl/extractor/nrk.py index 84aacbcda..40dee2162 100644 --- a/haruhi_dl/extractor/nrk.py +++ b/haruhi_dl/extractor/nrk.py @@ -1,199 +1,67 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools +import random import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, -) +from ..compat import compat_str from ..utils import ( + determine_ext, ExtractorError, int_or_none, - js_to_json, - NO_DEFAULT, - parse_age_limit, parse_duration, + str_or_none, try_get, + urljoin, + url_or_none, ) class NRKBaseIE(InfoExtractor): _GEO_COUNTRIES = ['NO'] + _CDN_REPL_REGEX = r'''(?x):// + (?: + nrkod\d{1,2}-httpcache0-47115-cacheod0\.dna\.ip-only\.net/47115-cacheod0| + nrk-od-no\.telenorcdn\.net| + minicdn-od\.nrk\.no/od/nrkhd-osl-rr\.netwerk\.no/no + )/''' - _api_host = None + def _extract_nrk_formats(self, asset_url, video_id): + if re.match(r'https?://[^/]+\.akamaihd\.net/i/', asset_url): + return self._extract_akamai_formats(asset_url, video_id) + asset_url = re.sub(r'(?:bw_(?:low|high)=\d+|no_audio_only)&?', '', asset_url) + formats = self._extract_m3u8_formats( + asset_url, video_id, 'mp4', 'm3u8_native', fatal=False) + if not formats and re.search(self._CDN_REPL_REGEX, asset_url): + formats = self._extract_m3u8_formats( + re.sub(self._CDN_REPL_REGEX, '://nrk-od-%02d.akamaized.net/no/' % random.randint(0, 99), asset_url), + video_id, 'mp4', 'm3u8_native', fatal=False) + return formats - def _real_extract(self, url): - video_id = self._match_id(url) - - api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS - - for api_host in api_hosts: - data = self._download_json( - 'http://%s/mediaelement/%s' % (api_host, video_id), - video_id, 'Downloading mediaelement JSON', - fatal=api_host == api_hosts[-1]) - if not data: - continue - self._api_host = api_host - break - - title = data.get('fullTitle') or data.get('mainTitle') or data['title'] - video_id = data.get('id') or video_id - - entries = [] - - conviva = data.get('convivaStatistics') or {} - live = (data.get('mediaElementType') == 'Live' - or data.get('isLive') is True or conviva.get('isLive')) - - def make_title(t): - return self._live_title(t) if live else t - - media_assets = data.get('mediaAssets') - if media_assets and isinstance(media_assets, list): - def video_id_and_title(idx): - return ((video_id, title) if len(media_assets) == 1 - else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) - for num, asset in enumerate(media_assets, 1): - asset_url = asset.get('url') - if not asset_url: - continue - formats = self._extract_akamai_formats(asset_url, video_id) - if not formats: - continue - self._sort_formats(formats) - - # Some f4m streams may not work with hdcore in fragments' URLs - for f in formats: - extra_param = f.get('extra_param_to_segment_url') - if extra_param and 'hdcore' in extra_param: - del f['extra_param_to_segment_url'] - - entry_id, entry_title = video_id_and_title(num) - duration = parse_duration(asset.get('duration')) - subtitles = {} - for subtitle in ('webVtt', 'timedText'): - subtitle_url = asset.get('%sSubtitlesUrl' % subtitle) - if subtitle_url: - subtitles.setdefault('no', []).append({ - 'url': compat_urllib_parse_unquote(subtitle_url) - }) - entries.append({ - 'id': asset.get('carrierId') or entry_id, - 'title': make_title(entry_title), - 'duration': duration, - 'subtitles': subtitles, - 'formats': formats, - }) - - if not entries: - media_url = data.get('mediaUrl') - if media_url: - formats = self._extract_akamai_formats(media_url, video_id) - self._sort_formats(formats) - duration = parse_duration(data.get('duration')) - entries = [{ - 'id': video_id, - 'title': make_title(title), - 'duration': duration, - 'formats': formats, - }] - - if not entries: - MESSAGES = { - 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', - 'ProgramRightsHasExpired': 'Programmet har gått ut', - 'NoProgramRights': 'Ikke tilgjengelig', - 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', - } - message_type = data.get('messageType', '') - # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* - if 'IsGeoBlocked' in message_type: - self.raise_geo_restricted( - msg=MESSAGES.get('ProgramIsGeoBlocked'), - countries=self._GEO_COUNTRIES) - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, MESSAGES.get( - message_type, message_type)), - expected=True) - - series = conviva.get('seriesName') or data.get('seriesTitle') - episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') - - season_number = None - episode_number = None - if data.get('mediaElementType') == 'Episode': - _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \ - data.get('relativeOriginUrl', '') - EPISODENUM_RE = [ - r'/s(?P\d{,2})e(?P\d{,2})\.', - r'/sesong-(?P\d{,2})/episode-(?P\d{,2})', - ] - season_number = int_or_none(self._search_regex( - EPISODENUM_RE, _season_episode, 'season number', - default=None, group='season')) - episode_number = int_or_none(self._search_regex( - EPISODENUM_RE, _season_episode, 'episode number', - default=None, group='episode')) - - thumbnails = None - images = data.get('images') - if images and isinstance(images, dict): - web_images = images.get('webImages') - if isinstance(web_images, list): - thumbnails = [{ - 'url': image['imageUrl'], - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - } for image in web_images if image.get('imageUrl')] - - description = data.get('description') - category = data.get('mediaAnalytics', {}).get('category') - - common_info = { - 'description': description, - 'series': series, - 'episode': episode, - 'season_number': season_number, - 'episode_number': episode_number, - 'categories': [category] if category else None, - 'age_limit': parse_age_limit(data.get('legalAge')), - 'thumbnails': thumbnails, + def _raise_error(self, data): + MESSAGES = { + 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', + 'ProgramRightsHasExpired': 'Programmet har gått ut', + 'NoProgramRights': 'Ikke tilgjengelig', + 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', } + message_type = data.get('messageType', '') + # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* + if 'IsGeoBlocked' in message_type or try_get(data, lambda x: x['usageRights']['isGeoBlocked']) is True: + self.raise_geo_restricted( + msg=MESSAGES.get('ProgramIsGeoBlocked'), + countries=self._GEO_COUNTRIES) + message = data.get('endUserMessage') or MESSAGES.get(message_type, message_type) + raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True) - vcodec = 'none' if data.get('mediaType') == 'Audio' else None - - for entry in entries: - entry.update(common_info) - for f in entry['formats']: - f['vcodec'] = vcodec - - points = data.get('shortIndexPoints') - if isinstance(points, list): - chapters = [] - for next_num, point in enumerate(points, start=1): - if not isinstance(point, dict): - continue - start_time = parse_duration(point.get('startPoint')) - if start_time is None: - continue - end_time = parse_duration( - data.get('duration') - if next_num == len(points) - else points[next_num].get('startPoint')) - if end_time is None: - continue - chapters.append({ - 'start_time': start_time, - 'end_time': end_time, - 'title': point.get('title'), - }) - if chapters and len(entries) == 1: - entries[0]['chapters'] = chapters - - return self.playlist_result(entries, video_id, title, description) + def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None): + return self._download_json( + urljoin('http://psapi.nrk.no/', path), + video_id, note or 'Downloading %s JSON' % item, + fatal=fatal, query=query, + headers={'Accept-Encoding': 'gzip, deflate, br'}) class NRKIE(NRKBaseIE): @@ -202,17 +70,17 @@ class NRKIE(NRKBaseIE): nrk:| https?:// (?: - (?:www\.)?nrk\.no/video/PS\*| + (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)| v8[-.]psapi\.nrk\.no/mediaelement/ ) ) - (?P[^?#&]+) + (?P[^?\#&]+) ''' - _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no') + _TESTS = [{ # video 'url': 'http://www.nrk.no/video/PS*150533', - 'md5': '706f34cdf1322577589e369e522b50ef', + 'md5': 'f46be075326e23ad0e524edfcb06aeb6', 'info_dict': { 'id': '150533', 'ext': 'mp4', @@ -226,7 +94,7 @@ class NRKIE(NRKBaseIE): # MD5 is unstable 'info_dict': { 'id': '154915', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Slik høres internett ut når du er blind', 'description': 'md5:a621f5cc1bd75c8d5104cb048c6b8568', 'duration': 20, @@ -240,55 +108,230 @@ class NRKIE(NRKBaseIE): }, { 'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9', 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', + 'only_matching': True, + }, { + 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', + 'only_matching': True, + }, { + # podcast + 'url': 'nrk:l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + 'url': 'nrk:podcast/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + # clip + 'url': 'nrk:150533', + 'only_matching': True, + }, { + 'url': 'nrk:clip/150533', + 'only_matching': True, + }, { + # program + 'url': 'nrk:MDDP12000117', + 'only_matching': True, + }, { + 'url': 'nrk:program/ENRK10100318', + 'only_matching': True, + }, { + # direkte + 'url': 'nrk:nrk1', + 'only_matching': True, + }, { + 'url': 'nrk:channel/nrk1', + 'only_matching': True, }] + def _real_extract(self, url): + video_id = self._match_id(url).split('/')[-1] -class NRKTVIE(NRKBaseIE): + path_templ = 'playback/%s/' + video_id + + def call_playback_api(item, query=None): + return self._call_api(path_templ % item, video_id, item, query=query) + # known values for preferredCdn: akamai, iponly, minicdn and telenor + manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'}) + + video_id = try_get(manifest, lambda x: x['id'], compat_str) or video_id + + if manifest.get('playability') == 'nonPlayable': + self._raise_error(manifest['nonPlayable']) + + playable = manifest['playable'] + + formats = [] + for asset in playable['assets']: + if not isinstance(asset, dict): + continue + if asset.get('encrypted'): + continue + format_url = url_or_none(asset.get('url')) + if not format_url: + continue + asset_format = (asset.get('format') or '').lower() + if asset_format == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_nrk_formats(format_url, video_id)) + elif asset_format == 'mp3': + formats.append({ + 'url': format_url, + 'format_id': asset_format, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + data = call_playback_api('metadata') + + preplay = data['preplay'] + titles = preplay['titles'] + title = titles['title'] + alt_title = titles.get('subtitle') + + description = preplay.get('description') + duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) + + thumbnails = [] + for image in try_get( + preplay, lambda x: x['poster']['images'], list) or []: + if not isinstance(image, dict): + continue + image_url = url_or_none(image.get('url')) + if not image_url: + continue + thumbnails.append({ + 'url': image_url, + 'width': int_or_none(image.get('pixelWidth')), + 'height': int_or_none(image.get('pixelHeight')), + }) + + subtitles = {} + for sub in try_get(playable, lambda x: x['subtitles'], list) or []: + if not isinstance(sub, dict): + continue + sub_url = url_or_none(sub.get('webVtt')) + if not sub_url: + continue + sub_key = str_or_none(sub.get('language')) or 'nb' + sub_type = str_or_none(sub.get('type')) + if sub_type: + sub_key += '-%s' % sub_type + subtitles.setdefault(sub_key, []).append({ + 'url': sub_url, + }) + + legal_age = try_get( + data, lambda x: x['legalAge']['body']['rating']['code'], compat_str) + # https://en.wikipedia.org/wiki/Norwegian_Media_Authority + age_limit = None + if legal_age: + if legal_age == 'A': + age_limit = 0 + elif legal_age.isdigit(): + age_limit = int_or_none(legal_age) + + is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' + + info = { + 'id': video_id, + 'title': title, + 'alt_title': alt_title, + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'age_limit': age_limit, + 'formats': formats, + 'subtitles': subtitles, + } + + if is_series: + series = season_id = season_number = episode = episode_number = None + programs = self._call_api( + 'programs/%s' % video_id, video_id, 'programs', fatal=False) + if programs and isinstance(programs, dict): + series = str_or_none(programs.get('seriesTitle')) + season_id = str_or_none(programs.get('seasonId')) + season_number = int_or_none(programs.get('seasonNumber')) + episode = str_or_none(programs.get('episodeTitle')) + episode_number = int_or_none(programs.get('episodeNumber')) + if not series: + series = title + if alt_title: + title += ' - %s' % alt_title + if not season_number: + season_number = int_or_none(self._search_regex( + r'Sesong\s+(\d+)', description or '', 'season number', + default=None)) + if not episode: + episode = alt_title if is_series else None + if not episode_number: + episode_number = int_or_none(self._search_regex( + r'^(\d+)\.', episode or '', 'episode number', + default=None)) + if not episode_number: + episode_number = int_or_none(self._search_regex( + r'\((\d+)\s*:\s*\d+\)', description or '', + 'episode number', default=None)) + info.update({ + 'title': title, + 'series': series, + 'season_id': season_id, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + }) + + return info + + +class NRKTVIE(InfoExtractor): IE_DESC = 'NRK TV and NRK Radio' _EPISODE_RE = r'(?P[a-zA-Z]{4}\d{8})' - _VALID_URL = r'''(?x) - https?:// - (?:tv|radio)\.nrk(?:super)?\.no/ - (?:serie(?:/[^/]+){1,2}|program)/ - (?![Ee]pisodes)%s - (?:/\d{2}-\d{2}-\d{4})? - (?:\#del=(?P\d+))? - ''' % _EPISODE_RE - _API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no') + _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE _TESTS = [{ 'url': 'https://tv.nrk.no/program/MDDP12000117', - 'md5': '8270824df46ec629b66aeaa5796b36fb', + 'md5': 'c4a5960f1b00b40d47db65c1064e0ab1', 'info_dict': { - 'id': 'MDDP12000117AA', + 'id': 'MDDP12000117', 'ext': 'mp4', 'title': 'Alarm Trolltunga', 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', - 'duration': 2223, + 'duration': 2223.44, 'age_limit': 6, + 'subtitles': { + 'nb-nor': [{ + 'ext': 'vtt', + }], + 'nb-ttv': [{ + 'ext': 'vtt', + }] + }, }, }, { 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', - 'md5': '9a167e54d04671eb6317a37b7bc8a280', + 'md5': '8d40dab61cea8ab0114e090b029a0565', 'info_dict': { - 'id': 'MUHH48000314AA', + 'id': 'MUHH48000314', 'ext': 'mp4', - 'title': '20 spørsmål 23.05.2014', + 'title': '20 spørsmål - 23. mai 2014', + 'alt_title': '23. mai 2014', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', 'duration': 1741, 'series': '20 spørsmål', - 'episode': '23.05.2014', + 'episode': '23. mai 2014', + 'age_limit': 0, }, - 'skip': 'NoProgramRights', }, { 'url': 'https://tv.nrk.no/program/mdfp15000514', 'info_dict': { - 'id': 'MDFP15000514CA', + 'id': 'MDFP15000514', 'ext': 'mp4', - 'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014', + 'title': 'Kunnskapskanalen - Grunnlovsjubiléet - Stor ståhei for ingenting', 'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db', - 'duration': 4605, + 'duration': 4605.08, 'series': 'Kunnskapskanalen', - 'episode': '24.05.2014', + 'episode': 'Grunnlovsjubiléet - Stor ståhei for ingenting', + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -297,63 +340,41 @@ class NRKTVIE(NRKBaseIE): # single playlist video 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2', 'info_dict': { - 'id': 'MSPO40010515-part2', - 'ext': 'flv', - 'title': 'Tour de Ski: Sprint fri teknikk, kvinner og menn 06.01.2015 (del 2:2)', - 'description': 'md5:238b67b97a4ac7d7b4bf0edf8cc57d26', + 'id': 'MSPO40010515', + 'ext': 'mp4', + 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', + 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'age_limit': 0, }, 'params': { 'skip_download': True, }, - 'expected_warnings': ['Video is geo restricted'], + 'expected_warnings': ['Failed to download m3u8 information'], 'skip': 'particular part is not supported currently', }, { 'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015', - 'playlist': [{ - 'info_dict': { - 'id': 'MSPO40010515AH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 772, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }, { - 'info_dict': { - 'id': 'MSPO40010515BH', - 'ext': 'mp4', - 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', - 'duration': 6175, - 'series': 'Tour de Ski', - 'episode': '06.01.2015', - }, - 'params': { - 'skip_download': True, - }, - }], 'info_dict': { 'id': 'MSPO40010515', + 'ext': 'mp4', 'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015', - 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d', + 'description': 'md5:c03aba1e917561eface5214020551b7a', + 'age_limit': 0, }, - 'expected_warnings': ['Video is geo restricted'], + 'expected_warnings': ['Failed to download m3u8 information'], + 'skip': 'Ikke tilgjengelig utenfor Norge', }, { 'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13', 'info_dict': { - 'id': 'KMTE50001317AA', + 'id': 'KMTE50001317', 'ext': 'mp4', - 'title': 'Anno 13:30', + 'title': 'Anno - 13. episode', 'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa', 'duration': 2340, 'series': 'Anno', - 'episode': '13:30', + 'episode': '13. episode', 'season_number': 3, 'episode_number': 13, + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -361,40 +382,50 @@ class NRKTVIE(NRKBaseIE): }, { 'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017', 'info_dict': { - 'id': 'MUHH46000317AA', + 'id': 'MUHH46000317', 'ext': 'mp4', 'title': 'Nytt på Nytt 27.01.2017', 'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b', 'duration': 1796, 'series': 'Nytt på nytt', 'episode': '27.01.2017', + 'age_limit': 0, }, 'params': { 'skip_download': True, }, + 'skip': 'ProgramRightsHasExpired', }, { 'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#', 'only_matching': True, }, { 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201507/NPUB21019315', + 'only_matching': True, }] + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) + class NRKTVEpisodeIE(InfoExtractor): - _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+/sesong/\d+/episode/\d+)' + _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P[^/]+/sesong/(?P\d+)/episode/(?P\d+))' _TESTS = [{ 'url': 'https://tv.nrk.no/serie/hellums-kro/sesong/1/episode/2', 'info_dict': { - 'id': 'MUHH36005220BA', + 'id': 'MUHH36005220', 'ext': 'mp4', - 'title': 'Kro, krig og kjærlighet 2:6', - 'description': 'md5:b32a7dc0b1ed27c8064f58b97bda4350', - 'duration': 1563, + 'title': 'Hellums kro - 2. Kro, krig og kjærlighet', + 'description': 'md5:ad92ddffc04cea8ce14b415deef81787', + 'duration': 1563.92, 'series': 'Hellums kro', 'season_number': 1, 'episode_number': 2, - 'episode': '2:6', + 'episode': '2. Kro, krig og kjærlighet', 'age_limit': 6, }, 'params': { @@ -403,15 +434,16 @@ class NRKTVEpisodeIE(InfoExtractor): }, { 'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8', 'info_dict': { - 'id': 'MSUI14000816AA', + 'id': 'MSUI14000816', 'ext': 'mp4', - 'title': 'Backstage 8:30', + 'title': 'Backstage - 8. episode', 'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4', 'duration': 1320, 'series': 'Backstage', 'season_number': 1, 'episode_number': 8, - 'episode': '8:30', + 'episode': '8. episode', + 'age_limit': 0, }, 'params': { 'skip_download': True, @@ -420,7 +452,7 @@ class NRKTVEpisodeIE(InfoExtractor): }] def _real_extract(self, url): - display_id = self._match_id(url) + display_id, season_number, episode_number = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) @@ -432,91 +464,170 @@ class NRKTVEpisodeIE(InfoExtractor): assert re.match(NRKTVIE._EPISODE_RE, nrk_id) info.update({ - '_type': 'url_transparent', + '_type': 'url', 'id': nrk_id, 'url': 'nrk:%s' % nrk_id, 'ie_key': NRKIE.ie_key(), + 'season_number': int(season_number), + 'episode_number': int(episode_number), }) return info -class NRKTVSerieBaseIE(InfoExtractor): - def _extract_series(self, webpage, display_id, fatal=True): - config = self._parse_json( - self._search_regex( - (r'INITIAL_DATA(?:_V\d)?_*\s*=\s*({.+?})\s*;', - r'({.+?})\s*,\s*"[^"]+"\s*\)\s*'), - webpage, 'config', default='{}' if not fatal else NO_DEFAULT), - display_id, fatal=False, transform_source=js_to_json) - if not config: - return - return try_get( - config, - (lambda x: x['initialState']['series'], lambda x: x['series']), - dict) - - def _extract_seasons(self, seasons): - if not isinstance(seasons, list): - return [] - entries = [] - for season in seasons: - entries.extend(self._extract_episodes(season)) - return entries - - def _extract_episodes(self, season): - if not isinstance(season, dict): - return [] - return self._extract_entries(season.get('episodes')) - +class NRKTVSerieBaseIE(NRKBaseIE): def _extract_entries(self, entry_list): if not isinstance(entry_list, list): return [] entries = [] for episode in entry_list: - nrk_id = episode.get('prfId') + nrk_id = episode.get('prfId') or episode.get('episodeId') if not nrk_id or not isinstance(nrk_id, compat_str): continue entries.append(self.url_result( 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)) return entries + _ASSETS_KEYS = ('episodes', 'instalments',) + + def _extract_assets_key(self, embedded): + for asset_key in self._ASSETS_KEYS: + if embedded.get(asset_key): + return asset_key + + @staticmethod + def _catalog_name(serie_kind): + return 'podcast' if serie_kind in ('podcast', 'podkast') else 'series' + + def _entries(self, data, display_id): + for page_num in itertools.count(1): + embedded = data.get('_embedded') or data + if not isinstance(embedded, dict): + break + assets_key = self._extract_assets_key(embedded) + if not assets_key: + break + # Extract entries + entries = try_get( + embedded, + (lambda x: x[assets_key]['_embedded'][assets_key], + lambda x: x[assets_key]), + list) + for e in self._extract_entries(entries): + yield e + # Find next URL + next_url_path = try_get( + data, + (lambda x: x['_links']['next']['href'], + lambda x: x['_embedded'][assets_key]['_links']['next']['href']), + compat_str) + if not next_url_path: + break + data = self._call_api( + next_url_path, display_id, + note='Downloading %s JSON page %d' % (assets_key, page_num), + fatal=False) + if not data: + break + class NRKTVSeasonIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P\d+)' - _TEST = { + _VALID_URL = r'''(?x) + https?:// + (?Ptv|radio)\.nrk\.no/ + (?Pserie|pod[ck]ast)/ + (?P[^/]+)/ + (?: + (?:sesong/)?(?P\d+)| + sesong/(?P[^/?#&]+) + ) + ''' + _TESTS = [{ 'url': 'https://tv.nrk.no/serie/backstage/sesong/1', 'info_dict': { - 'id': '1', + 'id': 'backstage/1', 'title': 'Sesong 1', }, 'playlist_mincount': 30, - } + }, { + # no /sesong/ in path + 'url': 'https://tv.nrk.no/serie/lindmo/2016', + 'info_dict': { + 'id': 'lindmo/2016', + 'title': '2016', + }, + 'playlist_mincount': 29, + }, { + # weird nested _embedded in catalog JSON response + 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens/sesong/1', + 'info_dict': { + 'id': 'dickie-dick-dickens/1', + 'title': 'Sesong 1', + }, + 'playlist_mincount': 11, + }, { + # 841 entries, multi page + 'url': 'https://radio.nrk.no/serie/dagsnytt/sesong/201509', + 'info_dict': { + 'id': 'dagsnytt/201509', + 'title': 'September 2015', + }, + 'playlist_mincount': 841, + }, { + # 180 entries, single page + 'url': 'https://tv.nrk.no/serie/spangas/sesong/1', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/diagnose-kverulant', + 'info_dict': { + 'id': 'hele_historien/diagnose-kverulant', + 'title': 'Diagnose kverulant', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://radio.nrk.no/podkast/loerdagsraadet/sesong/202101', + 'only_matching': True, + }] @classmethod def suitable(cls, url): - return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) + return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) or NRKRadioPodkastIE.suitable(url) else super(NRKTVSeasonIE, cls).suitable(url)) def _real_extract(self, url): - display_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + domain = mobj.group('domain') + serie_kind = mobj.group('serie_kind') + serie = mobj.group('serie') + season_id = mobj.group('id') or mobj.group('id_2') + display_id = '%s/%s' % (serie, season_id) - webpage = self._download_webpage(url, display_id) + data = self._call_api( + '%s/catalog/%s/%s/seasons/%s' + % (domain, self._catalog_name(serie_kind), serie, season_id), + display_id, 'season', query={'pageSize': 50}) - series = self._extract_series(webpage, display_id) - - season = next( - s for s in series['seasons'] - if int(display_id) == s.get('seasonNumber')) - - title = try_get(season, lambda x: x['titles']['title'], compat_str) + title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id return self.playlist_result( - self._extract_episodes(season), display_id, title) + self._entries(data, display_id), + display_id, title) class NRKTVSeriesIE(NRKTVSerieBaseIE): - _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P[^/]+)' - _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P\d+)' + _VALID_URL = r'https?://(?P(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/(?Pserie|pod[ck]ast)/(?P[^/]+)' _TESTS = [{ + # new layout, instalments + 'url': 'https://tv.nrk.no/serie/groenn-glede', + 'info_dict': { + 'id': 'groenn-glede', + 'title': 'Grønn glede', + 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', + }, + 'playlist_mincount': 90, + }, { + # new layout, instalments, more entries + 'url': 'https://tv.nrk.no/serie/lindmo', + 'only_matching': True, + }, { 'url': 'https://tv.nrk.no/serie/blank', 'info_dict': { 'id': 'blank', @@ -530,25 +641,16 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): 'info_dict': { 'id': 'backstage', 'title': 'Backstage', - 'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3', + 'description': 'md5:63692ceb96813d9a207e9910483d948b', }, 'playlist_mincount': 60, - }, { - # new layout, instalments - 'url': 'https://tv.nrk.no/serie/groenn-glede', - 'info_dict': { - 'id': 'groenn-glede', - 'title': 'Grønn glede', - 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608', - }, - 'playlist_mincount': 10, }, { # old layout 'url': 'https://tv.nrksuper.no/serie/labyrint', 'info_dict': { 'id': 'labyrint', 'title': 'Labyrint', - 'description': 'md5:318b597330fdac5959247c9b69fdb1ec', + 'description': 'I Daidalos sin undersjøiske Labyrint venter spennende oppgaver, skumle robotskapninger og slim.', }, 'playlist_mincount': 3, }, { @@ -560,53 +662,75 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): }, { 'url': 'https://tv.nrk.no/serie/postmann-pat', 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/serie/dickie-dick-dickens', + 'info_dict': { + 'id': 'dickie-dick-dickens', + 'title': 'Dickie Dick Dickens', + 'description': 'md5:19e67411ffe57f7dce08a943d7a0b91f', + }, + 'playlist_mincount': 8, + }, { + 'url': 'https://nrksuper.no/serie/labyrint', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers', + 'info_dict': { + 'id': 'ulrikkes_univers', + }, + 'playlist_mincount': 10, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/nrkno-poddkast-26588-134079-05042018030000', + 'only_matching': True, }] @classmethod def suitable(cls, url): return ( False if any(ie.suitable(url) - for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE)) + for ie in (NRKTVIE, NRKTVEpisodeIE, NRKRadioPodkastIE, NRKTVSeasonIE)) else super(NRKTVSeriesIE, cls).suitable(url)) def _real_extract(self, url): - series_id = self._match_id(url) + site, serie_kind, series_id = re.match(self._VALID_URL, url).groups() + is_radio = site == 'radio.nrk' + domain = 'radio' if is_radio else 'tv' - webpage = self._download_webpage(url, series_id) + size_prefix = 'p' if is_radio else 'embeddedInstalmentsP' + series = self._call_api( + '%s/catalog/%s/%s' + % (domain, self._catalog_name(serie_kind), series_id), + series_id, 'serie', query={size_prefix + 'ageSize': 50}) + titles = try_get(series, [ + lambda x: x['titles'], + lambda x: x[x['type']]['titles'], + lambda x: x[x['seriesType']]['titles'], + ]) or {} - # New layout (e.g. https://tv.nrk.no/serie/backstage) - series = self._extract_series(webpage, series_id, fatal=False) - if series: - title = try_get(series, lambda x: x['titles']['title'], compat_str) - description = try_get( - series, lambda x: x['titles']['subtitle'], compat_str) - entries = [] - entries.extend(self._extract_seasons(series.get('seasons'))) - entries.extend(self._extract_entries(series.get('instalments'))) - entries.extend(self._extract_episodes(series.get('extraMaterial'))) - return self.playlist_result(entries, series_id, title, description) + entries = [] + entries.extend(self._entries(series, series_id)) + embedded = series.get('_embedded') or {} + linked_seasons = try_get(series, lambda x: x['_links']['seasons']) or [] + embedded_seasons = embedded.get('seasons') or [] + if len(linked_seasons) > len(embedded_seasons): + for season in linked_seasons: + season_url = urljoin(url, season.get('href')) + if not season_url: + season_name = season.get('name') + if season_name and isinstance(season_name, compat_str): + season_url = 'https://%s.nrk.no/serie/%s/sesong/%s' % (domain, series_id, season_name) + if season_url: + entries.append(self.url_result( + season_url, ie=NRKTVSeasonIE.ie_key(), + video_title=season.get('title'))) + else: + for season in embedded_seasons: + entries.extend(self._entries(season, series_id)) + entries.extend(self._entries( + embedded.get('extraMaterial') or {}, series_id)) - # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint) - entries = [ - self.url_result( - 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format( - series=series_id, season=season_id)) - for season_id in re.findall(self._ITEM_RE, webpage) - ] - - title = self._html_search_meta( - 'seriestitle', webpage, - 'title', default=None) or self._og_search_title( - webpage, fatal=False) - if title: - title = self._search_regex( - r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title) - - description = self._html_search_meta( - 'series_description', webpage, - 'description', default=None) or self._og_search_description(webpage) - - return self.playlist_result(entries, series_id, title, description) + return self.playlist_result( + entries, series_id, titles.get('title'), titles.get('subtitle')) class NRKTVDirekteIE(NRKTVIE): @@ -622,6 +746,38 @@ class NRKTVDirekteIE(NRKTVIE): }] +class NRKRadioPodkastIE(InfoExtractor): + _VALID_URL = r'https?://radio\.nrk\.no/pod[ck]ast/(?:[^/]+/)+(?Pl_[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + _TESTS = [{ + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'md5': '8d40dab61cea8ab0114e090b029a0565', + 'info_dict': { + 'id': 'MUHH48000314AA', + 'ext': 'mp4', + 'title': '20 spørsmål 23.05.2014', + 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', + 'duration': 1741, + 'series': '20 spørsmål', + 'episode': '23.05.2014', + }, + }, { + 'url': 'https://radio.nrk.no/podcast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/sesong/1/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8', + 'only_matching': True, + }, { + 'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/bortfoert-i-bergen/l_774d1a2c-7aa7-4965-8d1a-2c7aa7d9652c', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + 'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id) + + class NRKPlaylistBaseIE(InfoExtractor): def _extract_description(self, webpage): pass @@ -710,14 +866,8 @@ class NRKSkoleIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://mimir.nrk.no/plugin/1.0/static?mediaId=%s' % video_id, - video_id) - - nrk_id = self._parse_json( - self._search_regex( - r']+type=["\']application/json["\'][^>]*>({.+?})', - webpage, 'application json'), - video_id)['activeMedia']['psId'] + nrk_id = self._download_json( + 'https://nrkno-skole-prod.kube.nrk.no/skole/api/media/%s' % video_id, + video_id)['psId'] return self.url_result('nrk:%s' % nrk_id) diff --git a/haruhi_dl/extractor/nytimes.py b/haruhi_dl/extractor/nytimes.py index fc78ca56c..976b1c694 100644 --- a/haruhi_dl/extractor/nytimes.py +++ b/haruhi_dl/extractor/nytimes.py @@ -221,3 +221,41 @@ class NYTimesArticleIE(NYTimesBaseIE): r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'), webpage, 'podcast data') return self._extract_podcast_from_json(podcast_data, page_id, webpage) + + +class NYTimesCookingIE(NYTimesBaseIE): + _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P\d+)' + _TESTS = [{ + 'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart', + 'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3', + 'info_dict': { + 'id': '100000004756089', + 'ext': 'mov', + 'timestamp': 1479383008, + 'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON', + 'title': 'Cranberry Tart', + 'upload_date': '20161117', + 'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.', + }, + }, { + 'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey', + 'md5': '4b2e8c70530a89b8d905a2b572316eb8', + 'info_dict': { + 'id': '100000003951728', + 'ext': 'mov', + 'timestamp': 1445509539, + 'description': 'Turkey guide', + 'upload_date': '20151022', + 'title': 'Turkey', + } + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + + webpage = self._download_webpage(url, page_id) + + video_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'video id') + + return self._extract_video_from_id(video_id) diff --git a/haruhi_dl/extractor/peertube.py b/haruhi_dl/extractor/peertube.py index f89ccda7f..6cba97a7b 100644 --- a/haruhi_dl/extractor/peertube.py +++ b/haruhi_dl/extractor/peertube.py @@ -53,6 +53,18 @@ class PeerTubeSHIE(SelfhostedInfoExtractor): 'tags': ['framasoft', 'peertube'], 'categories': ['Science & Technology'], } + }, { + # Issue #26002 + 'url': 'peertube:spacepub.space:d8943b2d-8280-497b-85ec-bc282ec2afdc', + 'info_dict': { + 'id': 'd8943b2d-8280-497b-85ec-bc282ec2afdc', + 'ext': 'mp4', + 'title': 'Dot matrix printer shell demo', + 'uploader_id': '3', + 'timestamp': 1587401293, + 'upload_date': '20200420', + 'uploader': 'Drew DeVault', + } }, { 'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44', 'only_matching': True, @@ -117,7 +129,15 @@ class PeerTubeSHIE(SelfhostedInfoExtractor): title = video['name'] formats = [] - for file_ in video['files']: + files = video.get('files') or [] + for playlist in (video.get('streamingPlaylists') or []): + if not isinstance(playlist, dict): + continue + playlist_files = playlist.get('files') + if not (playlist_files and isinstance(playlist_files, list)): + continue + files.extend(playlist_files) + for file_ in files: if not isinstance(file_, dict): continue file_url = url_or_none(file_.get('fileUrl')) @@ -132,6 +152,10 @@ class PeerTubeSHIE(SelfhostedInfoExtractor): 'format_id': format_id, 'filesize': file_size, }) + if format_id == '0p': + f['vcodec'] = 'none' + else: + f['fps'] = int_or_none(file_.get('fps')) formats.append(f) self._sort_formats(formats) diff --git a/haruhi_dl/extractor/piksel.py b/haruhi_dl/extractor/piksel.py index 88b6859b0..ecf56ff8f 100644 --- a/haruhi_dl/extractor/piksel.py +++ b/haruhi_dl/extractor/piksel.py @@ -6,16 +6,33 @@ import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( - ExtractorError, dict_get, + ExtractorError, int_or_none, - unescapeHTML, parse_iso8601, + try_get, + unescapeHTML, ) class PikselIE(InfoExtractor): - _VALID_URL = r'https?://player\.piksel\.com/v/(?:refid/[^/]+/prefid/)?(?P[a-z0-9_]+)' + _VALID_URL = r'''(?x)https?:// + (?: + (?: + player\. + (?: + olympusattelecom| + vibebyvista + )| + (?:api|player)\.multicastmedia| + (?:api-ovp|player)\.piksel + )\.com| + (?: + mz-edge\.stream\.co| + movie-s\.nhk\.or + )\.jp| + vidego\.baltimorecity\.gov + )/v/(?:refid/(?P[^/]+)/prefid/)?(?P[\w-]+)''' _TESTS = [ { 'url': 'http://player.piksel.com/v/ums2867l', @@ -56,46 +73,41 @@ class PikselIE(InfoExtractor): if mobj: return mobj.group('url') + def _call_api(self, app_token, resource, display_id, query, fatal=True): + response = (self._download_json( + 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token), + display_id, query=query, fatal=fatal) or {}).get('response') + failure = try_get(response, lambda x: x['failure']['reason']) + if failure: + if fatal: + raise ExtractorError(failure, expected=True) + self.report_warning(failure) + return response + def _real_extract(self, url): - display_id = self._match_id(url) + ref_id, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'data-de-program-uuid=[\'"]([a-z0-9]+)', - webpage, 'program uuid', default=display_id) app_token = self._search_regex([ r'clientAPI\s*:\s*"([^"]+)"', r'data-de-api-key\s*=\s*"([^"]+)"' ], webpage, 'app token') - response = self._download_json( - 'http://player.piksel.com/ws/ws_program/api/%s/mode/json/apiv/5' % app_token, - video_id, query={ - 'v': video_id - })['response'] - failure = response.get('failure') - if failure: - raise ExtractorError(response['failure']['reason'], expected=True) - video_data = response['WsProgramResponse']['program']['asset'] + query = {'refid': ref_id, 'prefid': display_id} if ref_id else {'v': display_id} + program = self._call_api( + app_token, 'program', display_id, query)['WsProgramResponse']['program'] + video_id = program['uuid'] + video_data = program['asset'] title = video_data['title'] + asset_type = dict_get(video_data, ['assetType', 'asset_type']) formats = [] - m3u8_url = dict_get(video_data, [ - 'm3u8iPadURL', - 'ipadM3u8Url', - 'm3u8AndroidURL', - 'm3u8iPhoneURL', - 'iphoneM3u8Url']) - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - - asset_type = dict_get(video_data, ['assetType', 'asset_type']) - for asset_file in video_data.get('assetFiles', []): + def process_asset_file(asset_file): + if not asset_file: + return # TODO: extract rtmp formats http_url = asset_file.get('http_url') if not http_url: - continue + return tbr = None vbr = int_or_none(asset_file.get('videoBitrate'), 1024) abr = int_or_none(asset_file.get('audioBitrate'), 1024) @@ -118,6 +130,43 @@ class PikselIE(InfoExtractor): 'filesize': int_or_none(asset_file.get('filesize')), 'tbr': tbr, }) + + def process_asset_files(asset_files): + for asset_file in (asset_files or []): + process_asset_file(asset_file) + + process_asset_files(video_data.get('assetFiles')) + process_asset_file(video_data.get('referenceFile')) + if not formats: + asset_id = video_data.get('assetid') or program.get('assetid') + if asset_id: + process_asset_files(try_get(self._call_api( + app_token, 'asset_file', display_id, { + 'assetid': asset_id, + }, False), lambda x: x['WsAssetFileResponse']['AssetFiles'])) + + m3u8_url = dict_get(video_data, [ + 'm3u8iPadURL', + 'ipadM3u8Url', + 'm3u8AndroidURL', + 'm3u8iPhoneURL', + 'iphoneM3u8Url']) + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + smil_url = dict_get(video_data, ['httpSmil', 'hdSmil', 'rtmpSmil']) + if smil_url: + transform_source = None + if ref_id == 'nhkworld': + # TODO: figure out if this is something to be fixed in urljoin, + # _parse_smil_formats or keep it here + transform_source = lambda x: x.replace('src="/', 'src="').replace('/media"', '/media/"') + formats.extend(self._extract_smil_formats( + re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id, + transform_source=transform_source, fatal=False)) + self._sort_formats(formats) subtitles = {} diff --git a/haruhi_dl/extractor/pinterest.py b/haruhi_dl/extractor/pinterest.py new file mode 100644 index 000000000..b249c9eda --- /dev/null +++ b/haruhi_dl/extractor/pinterest.py @@ -0,0 +1,201 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +class PinterestBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' + + def _call_api(self, resource, video_id, options): + return self._download_json( + 'https://www.pinterest.com/resource/%sResource/get/' % resource, + video_id, 'Download %s JSON metadata' % resource, query={ + 'data': json.dumps({'options': options}) + })['resource_response'] + + def _extract_video(self, data, extract_formats=True): + video_id = data['id'] + + title = (data.get('title') or data.get('grid_title') or video_id).strip() + + formats = [] + duration = None + if extract_formats: + for format_id, format_dict in data['videos']['video_list'].items(): + if not isinstance(format_dict, dict): + continue + format_url = url_or_none(format_dict.get('url')) + if not format_url: + continue + duration = float_or_none(format_dict.get('duration'), scale=1000) + ext = determine_ext(format_url) + if 'hls' in format_id.lower() or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'width': int_or_none(format_dict.get('width')), + 'height': int_or_none(format_dict.get('height')), + 'duration': duration, + }) + self._sort_formats( + formats, field_preference=('height', 'width', 'tbr', 'format_id')) + + description = data.get('description') or data.get('description_html') or data.get('seo_description') + timestamp = unified_timestamp(data.get('created_at')) + + def _u(field): + return try_get(data, lambda x: x['closeup_attribution'][field], compat_str) + + uploader = _u('full_name') + uploader_id = _u('id') + + repost_count = int_or_none(data.get('repin_count')) + comment_count = int_or_none(data.get('comment_count')) + categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list) + tags = data.get('hashtags') + + thumbnails = [] + images = data.get('images') + if isinstance(images, dict): + for thumbnail_id, thumbnail in images.items(): + if not isinstance(thumbnail, dict): + continue + thumbnail_url = url_or_none(thumbnail.get('url')) + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'repost_count': repost_count, + 'comment_count': comment_count, + 'categories': categories, + 'tags': tags, + 'formats': formats, + 'extractor_key': PinterestIE.ie_key(), + } + + +class PinterestIE(PinterestBaseIE): + _VALID_URL = r'%s/pin/(?P\d+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.com/pin/664281013778109217/', + 'md5': '6550c2af85d6d9f3fe3b88954d1577fc', + 'info_dict': { + 'id': '664281013778109217', + 'ext': 'mp4', + 'title': 'Origami', + 'description': 'md5:b9d90ddf7848e897882de9e73344f7dd', + 'duration': 57.7, + 'timestamp': 1593073622, + 'upload_date': '20200625', + 'uploader': 'Love origami -I am Dafei', + 'uploader_id': '586523688879454212', + 'repost_count': 50, + 'comment_count': 0, + 'categories': list, + 'tags': list, + }, + }, { + 'url': 'https://co.pinterest.com/pin/824721750502199491/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._call_api( + 'Pin', video_id, { + 'field_set_key': 'unauth_react_main_pin', + 'id': video_id, + })['data'] + return self._extract_video(data) + + +class PinterestCollectionIE(PinterestBaseIE): + _VALID_URL = r'%s/(?P[^/]+)/(?P[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', + 'info_dict': { + 'id': '585890301462791043', + 'title': 'cool diys', + }, + 'playlist_count': 8, + }, { + 'url': 'https://www.pinterest.ca/fudohub/videos/', + 'info_dict': { + 'id': '682858430939307450', + 'title': 'VIDEOS', + }, + 'playlist_mincount': 365, + 'skip': 'Test with extract_formats=False', + }] + + @classmethod + def suitable(cls, url): + return False if PinterestIE.suitable(url) else super( + PinterestCollectionIE, cls).suitable(url) + + def _real_extract(self, url): + username, slug = re.match(self._VALID_URL, url).groups() + board = self._call_api( + 'Board', slug, { + 'slug': slug, + 'username': username + })['data'] + board_id = board['id'] + options = { + 'board_id': board_id, + 'page_size': 250, + } + bookmark = None + entries = [] + while True: + if bookmark: + options['bookmarks'] = [bookmark] + board_feed = self._call_api('BoardFeed', board_id, options) + for item in (board_feed.get('data') or []): + if not isinstance(item, dict) or item.get('type') != 'pin': + continue + video_id = item.get('id') + if video_id: + # Some pins may not be available anonymously via pin URL + # video = self._extract_video(item, extract_formats=False) + # video.update({ + # '_type': 'url_transparent', + # 'url': 'https://www.pinterest.com/pin/%s/' % video_id, + # }) + # entries.append(video) + entries.append(self._extract_video(item)) + bookmark = board_feed.get('bookmark') + if not bookmark: + break + return self.playlist_result( + entries, playlist_id=board_id, playlist_title=board.get('name')) diff --git a/haruhi_dl/extractor/pornhub.py b/haruhi_dl/extractor/pornhub.py index d91c869c4..97b5508eb 100644 --- a/haruhi_dl/extractor/pornhub.py +++ b/haruhi_dl/extractor/pornhub.py @@ -22,18 +22,26 @@ from ..utils import ( orderedSet, remove_quotes, str_to_int, + update_url_query, + urlencode_postdata, url_or_none, ) class PornHubBaseIE(InfoExtractor): _REQUIRES_PLAYWRIGHT = True + _NETRC_MACHINE = 'pornhub' def _download_webpage_handle(self, *args, **kwargs): def dl(*args, **kwargs): return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs) - webpage, urlh = dl(*args, **kwargs) + ret = dl(*args, **kwargs) + + if not ret: + return ret + + webpage, urlh = ret if any(re.search(p, webpage) for p in ( r']+\bonload=["\']go\(\)', @@ -51,6 +59,66 @@ class PornHubBaseIE(InfoExtractor): return webpage, urlh + def _real_initialize(self): + self._logged_in = False + + def _login(self, host): + if self._logged_in: + return + + site = host.split('.')[0] + + # Both sites pornhub and pornhubpremium have separate accounts + # so there should be an option to provide credentials for both. + # At the same time some videos are available under the same video id + # on both sites so that we have to identify them as the same video. + # For that purpose we have to keep both in the same extractor + # but under different netrc machines. + username, password = self._get_login_info(netrc_machine=site) + if username is None: + return + + login_url = 'https://www.%s/%slogin' % (host, 'premium/' if 'premium' in host else '') + login_page = self._download_webpage( + login_url, None, 'Downloading %s login page' % site) + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'class=["\']signOut', + r'>Sign\s+[Oo]ut\s*<')) + + if is_logged(login_page): + self._logged_in = True + return + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + response = self._download_json( + 'https://www.%s/front/authenticate' % host, None, + 'Logging in to %s' % site, + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': login_url, + 'X-Requested-With': 'XMLHttpRequest', + }) + + if response.get('success') == '1': + self._logged_in = True + return + + message = response.get('message') + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % message, expected=True) + + raise ExtractorError('Unable to log in') + class PornHubIE(PornHubBaseIE): IE_DESC = 'PornHub and Thumbzilla' @@ -161,12 +229,20 @@ class PornHubIE(PornHubBaseIE): }, { 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82', 'only_matching': True, + }, { + # Some videos are available with the same id on both premium + # and non-premium sites (e.g. this and the following test) + 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, + }, { + 'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage, **kwargs): return re.findall( - r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)', + r']+?src=["\'](?P(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)', webpage) def _extract_count(self, pattern, webpage, name): @@ -178,12 +254,7 @@ class PornHubIE(PornHubBaseIE): host = mobj.group('host') or 'pornhub.com' video_id = mobj.group('id') - if 'premium' in host: - if not self._downloader.params.get('cookiefile'): - raise ExtractorError( - 'PornHub Premium requires authentication.' - ' You may want to use --cookies.', - expected=True) + self._login(host) self._set_cookie(host, 'age_verified', '1') @@ -286,14 +357,24 @@ class PornHubIE(PornHubBaseIE): video_urls.append((v_url, None)) video_urls_set.add(v_url) + def parse_quality_items(quality_items): + q_items = self._parse_json(quality_items, video_id, fatal=False) + if not isinstance(q_items, list): + return + for item in q_items: + if isinstance(item, dict): + add_video_url(item.get('url')) + if not video_urls: - FORMAT_PREFIXES = ('media', 'quality') + FORMAT_PREFIXES = ('media', 'quality', 'qualityItems') js_vars = extract_js_vars( webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES), default=None) if js_vars: for key, format_url in js_vars.items(): - if any(key.startswith(p) for p in FORMAT_PREFIXES): + if key.startswith(FORMAT_PREFIXES[-1]): + parse_quality_items(format_url) + elif any(key.startswith(p) for p in FORMAT_PREFIXES[:2]): add_video_url(format_url) if not video_urls and re.search( r'<[^>]+\bid=["\']lockedPlayer', webpage): @@ -389,6 +470,10 @@ class PornHubIE(PornHubBaseIE): class PornHubPlaylistBaseIE(PornHubBaseIE): + def _extract_page(self, url): + return int_or_none(self._search_regex( + r'\bpage=(\d+)', url, 'page', default=None)) + def _extract_entries(self, webpage, host): # Only process container div with main playlist content skipping # drop-down menu that uses similar pattern for videos (see @@ -406,26 +491,6 @@ class PornHubPlaylistBaseIE(PornHubBaseIE): container)) ] - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - playlist_id = mobj.group('id') - - webpage = self._download_webpage(url, playlist_id) - - entries = self._extract_entries(webpage, host) - - playlist = self._parse_json( - self._search_regex( - r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage, - 'playlist', default='{}'), - playlist_id, fatal=False) - title = playlist.get('title') or self._search_regex( - r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False) - - return self.playlist_result( - entries, playlist_id, title, playlist.get('description')) - class PornHubUserIE(PornHubPlaylistBaseIE): _VALID_URL = r'(?Phttps?://(?:[^/]+\.)?(?Ppornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P[^/?#&]+))(?:[?#&]|/(?!videos)|$)' @@ -447,14 +512,27 @@ class PornHubUserIE(PornHubPlaylistBaseIE): }, { 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1', 'only_matching': True, + }, { + # Unavailable via /videos page, but available with direct pagination + # on pornstar page (see [1]), requires premium + # 1. https://github.com/hdl-org/haruhi-dl/issues/27853 + 'url': 'https://www.pornhubpremium.com/pornstar/sienna-west', + 'only_matching': True, + }, { + # Same as before, multi page + 'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) user_id = mobj.group('id') + videos_url = '%s/videos' % mobj.group('url') + page = self._extract_page(url) + if page: + videos_url = update_url_query(videos_url, {'page': page}) return self.url_result( - '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(), - video_id=user_id) + videos_url, ie=PornHubPagedVideoListIE.ie_key(), video_id=user_id) class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): @@ -467,32 +545,55 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE): ]+\bid=["\']moreDataBtn ''', webpage) is not None - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - host = mobj.group('host') - item_id = mobj.group('id') + def _entries(self, url, host, item_id): + page = self._extract_page(url) - page = int_or_none(self._search_regex( - r'\bpage=(\d+)', url, 'page', default=None)) + VIDEOS = '/videos' - entries = [] - for page_num in (page, ) if page is not None else itertools.count(1): + def download_page(base_url, num, fallback=False): + note = 'Downloading page %d%s' % (num, ' (switch to fallback)' if fallback else '') + return self._download_webpage( + base_url, item_id, note, query={'page': num}) + + def is_404(e): + return isinstance(e.cause, compat_HTTPError) and e.cause.code == 404 + + base_url = url + has_page = page is not None + first_page = page if has_page else 1 + for page_num in (first_page, ) if has_page else itertools.count(first_page): try: - webpage = self._download_webpage( - url, item_id, 'Downloading page %d' % page_num, - query={'page': page_num}) + try: + webpage = download_page(base_url, page_num) + except ExtractorError as e: + # Some sources may not be available via /videos page, + # trying to fallback to main page pagination (see [1]) + # 1. https://github.com/hdl-org/haruhi-dl/issues/27853 + if is_404(e) and page_num == first_page and VIDEOS in base_url: + base_url = base_url.replace(VIDEOS, '') + webpage = download_page(base_url, page_num, fallback=True) + else: + raise except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + if is_404(e) and page_num != first_page: break raise page_entries = self._extract_entries(webpage, host) if not page_entries: break - entries.extend(page_entries) + for e in page_entries: + yield e if not self._has_more(webpage): break - return self.playlist_result(orderedSet(entries), item_id) + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + host = mobj.group('host') + item_id = mobj.group('id') + + self._login(host) + + return self.playlist_result(self._entries(url, host, item_id), item_id) class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE): diff --git a/haruhi_dl/extractor/rai.py b/haruhi_dl/extractor/rai.py index 207a6c247..67b86fc72 100644 --- a/haruhi_dl/extractor/rai.py +++ b/haruhi_dl/extractor/rai.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -15,9 +16,9 @@ from ..utils import ( GeoRestrictedError, int_or_none, parse_duration, + remove_start, strip_or_none, try_get, - unescapeHTML, unified_strdate, unified_timestamp, update_url_query, @@ -67,7 +68,7 @@ class RaiBaseIE(InfoExtractor): # This does not imply geo restriction (e.g. # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) - if media_url == 'http://download.rai.it/video_no_available.mp4': + if '/video_no_available.mp4' in media_url: continue ext = determine_ext(media_url) @@ -102,61 +103,50 @@ class RaiBaseIE(InfoExtractor): }.items() if v is not None) @staticmethod - def _extract_subtitles(url, subtitle_url): + def _extract_subtitles(url, video_data): + STL_EXT = 'stl' + SRT_EXT = 'srt' subtitles = {} - if subtitle_url and isinstance(subtitle_url, compat_str): - subtitle_url = urljoin(url, subtitle_url) - STL_EXT = '.stl' - SRT_EXT = '.srt' - subtitles['it'] = [{ - 'ext': 'stl', - 'url': subtitle_url, - }] - if subtitle_url.endswith(STL_EXT): - srt_url = subtitle_url[:-len(STL_EXT)] + SRT_EXT - subtitles['it'].append({ - 'ext': 'srt', - 'url': srt_url, + subtitles_array = video_data.get('subtitlesArray') or [] + for k in ('subtitles', 'subtitlesUrl'): + subtitles_array.append({'url': video_data.get(k)}) + for subtitle in subtitles_array: + sub_url = subtitle.get('url') + if sub_url and isinstance(sub_url, compat_str): + sub_lang = subtitle.get('language') or 'it' + sub_url = urljoin(url, sub_url) + sub_ext = determine_ext(sub_url, SRT_EXT) + subtitles.setdefault(sub_lang, []).append({ + 'ext': sub_ext, + 'url': sub_url, }) + if STL_EXT == sub_ext: + subtitles[sub_lang].append({ + 'ext': SRT_EXT, + 'url': sub_url[:-len(STL_EXT)] + SRT_EXT, + }) return subtitles class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/.+?-(?P%s)\.html)' % RaiBaseIE._UUID_RE + _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/.+?-(?P%s))\.(?:html|json)' % RaiBaseIE._UUID_RE _TESTS = [{ - 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', - 'md5': '340aa3b7afb54bfd14a8c11786450d76', - 'info_dict': { - 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', - 'ext': 'mp4', - 'title': 'La Casa Bianca', - 'alt_title': 'S2016 - Puntata del 23/10/2016', - 'description': 'md5:a09d45890850458077d1f68bb036e0a5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 3', - 'creator': 'Rai 3', - 'duration': 3278, - 'timestamp': 1477764300, - 'upload_date': '20161029', - 'series': 'La Casa Bianca', - 'season': '2016', - }, - }, { 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', 'ext': 'mp4', 'title': 'Report del 07/04/2014', - 'alt_title': 'S2013/14 - Puntata del 07/04/2014', - 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', + 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014', + 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 5', - 'creator': 'Rai 5', + 'uploader': 'Rai Gulp', 'duration': 6160, 'series': 'Report', - 'season_number': 5, 'season': '2013/14', + 'subtitles': { + 'it': 'count:2', + }, }, 'params': { 'skip_download': True, @@ -164,51 +154,59 @@ class RaiPlayIE(RaiBaseIE): }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', 'only_matching': True, + }, { + # subtitles at 'subtitlesArray' key (see #27698) + 'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - url, video_id = mobj.group('url', 'id') + base, video_id = re.match(self._VALID_URL, url).groups() media = self._download_json( - '%s?json' % url, video_id, 'Downloading video JSON') + base + '.json', video_id, 'Downloading video JSON') title = media['name'] video = media['video'] - relinker_info = self._extract_relinker_info(video['contentUrl'], video_id) + relinker_info = self._extract_relinker_info(video['content_url'], video_id) self._sort_formats(relinker_info['formats']) thumbnails = [] - if 'images' in media: - for _, value in media.get('images').items(): - if value: - thumbnails.append({ - 'url': value.replace('[RESOLUTION]', '600x400') - }) + for _, value in media.get('images', {}).items(): + if value: + thumbnails.append({ + 'url': urljoin(url, value), + }) - timestamp = unified_timestamp(try_get( - media, lambda x: x['availabilities'][0]['start'], compat_str)) + date_published = media.get('date_published') + time_published = media.get('time_published') + if date_published and time_published: + date_published += ' ' + time_published - subtitles = self._extract_subtitles(url, video.get('subtitles')) + subtitles = self._extract_subtitles(url, video) + + program_info = media.get('program_info') or {} + season = media.get('season') info = { - 'id': video_id, + 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, + 'display_id': video_id, 'title': self._live_title(title) if relinker_info.get( 'is_live') else title, - 'alt_title': media.get('subtitle'), + 'alt_title': strip_or_none(media.get('subtitle')), 'description': media.get('description'), 'uploader': strip_or_none(media.get('channel')), - 'creator': strip_or_none(media.get('editor')), + 'creator': strip_or_none(media.get('editor') or None), 'duration': parse_duration(video.get('duration')), - 'timestamp': timestamp, + 'timestamp': unified_timestamp(date_published), 'thumbnails': thumbnails, - 'series': try_get( - media, lambda x: x['isPartOf']['name'], compat_str), - 'season_number': int_or_none(try_get( - media, lambda x: x['isPartOf']['numeroStagioni'])), - 'season': media.get('stagione') or None, + 'series': program_info.get('name'), + 'season_number': int_or_none(season), + 'season': season if (season and not season.isdigit()) else None, + 'episode': media.get('episode_title'), + 'episode_number': int_or_none(media.get('episode')), 'subtitles': subtitles, } @@ -216,16 +214,16 @@ class RaiPlayIE(RaiBaseIE): return info -class RaiPlayLiveIE(RaiBaseIE): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P[^/?#&]+)' - _TEST = { +class RaiPlayLiveIE(RaiPlayIE): + _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/dirette/(?P[^/?#&]+))' + _TESTS = [{ 'url': 'http://www.raiplay.it/dirette/rainews24', 'info_dict': { 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', 'display_id': 'rainews24', 'ext': 'mp4', 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:6eca31500550f9376819f174e5644754', + 'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497', 'uploader': 'Rai News 24', 'creator': 'Rai News 24', 'is_live': True, @@ -233,58 +231,50 @@ class RaiPlayLiveIE(RaiBaseIE): 'params': { 'skip_download': True, }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - video_id = self._search_regex( - r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE, - webpage, 'content id') - - return { - '_type': 'url_transparent', - 'ie_key': RaiPlayIE.ie_key(), - 'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, - 'id': video_id, - 'display_id': display_id, - } + }] class RaiPlayPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P[^/?#&]+)' + _VALID_URL = r'(?Phttps?://(?:www\.)?raiplay\.it/programmi/(?P[^/?#&]+))' _TESTS = [{ 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/', 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo', - 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', + 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', }, 'playlist_mincount': 12, }] def _real_extract(self, url): - playlist_id = self._match_id(url) + base, playlist_id = re.match(self._VALID_URL, url).groups() - webpage = self._download_webpage(url, playlist_id) - - title = self._html_search_meta( - ('programma', 'nomeProgramma'), webpage, 'title') - description = unescapeHTML(self._html_search_meta( - ('description', 'og:description'), webpage, 'description')) + program = self._download_json( + base + '.json', playlist_id, 'Downloading program JSON') entries = [] - for mobj in re.finditer( - r']+\bhref=(["\'])(?P/raiplay/video/.+?)\1', - webpage): - video_url = urljoin(url, mobj.group('path')) - entries.append(self.url_result( - video_url, ie=RaiPlayIE.ie_key(), - video_id=RaiPlayIE._match_id(video_url))) + for b in (program.get('blocks') or []): + for s in (b.get('sets') or []): + s_id = s.get('id') + if not s_id: + continue + medias = self._download_json( + '%s/%s.json' % (base, s_id), s_id, + 'Downloading content set JSON', fatal=False) + if not medias: + continue + for m in (medias.get('items') or []): + path_id = m.get('path_id') + if not path_id: + continue + video_url = urljoin(url, path_id) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) - return self.playlist_result(entries, playlist_id, title, description) + return self.playlist_result( + entries, playlist_id, program.get('name'), + try_get(program, lambda x: x['program_info']['description'])) class RaiIE(RaiBaseIE): @@ -300,7 +290,8 @@ class RaiIE(RaiBaseIE): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 1758, 'upload_date': '20140612', - } + }, + 'skip': 'This content is available only in Italy', }, { # with ContentItem in many metas 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', @@ -316,7 +307,7 @@ class RaiIE(RaiBaseIE): }, { # with ContentItem in og:url 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', - 'md5': '11959b4e44fa74de47011b5799490adf', + 'md5': '6865dd00cf0bbf5772fdd89d59bd768a', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', @@ -326,18 +317,6 @@ class RaiIE(RaiBaseIE): 'duration': 2214, 'upload_date': '20161103', } - }, { - # drawMediaRaiTV(...) - 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': '2dd727e61114e1ee9c47f0da6914e178', - 'info_dict': { - 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'mp4', - 'title': 'Il pacco', - 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20141221', - }, }, { # initEdizione('ContentItem-...' 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', @@ -349,17 +328,6 @@ class RaiIE(RaiBaseIE): 'upload_date': '20170401', }, 'skip': 'Changes daily', - }, { - # HDS live stream with only relinker URL - 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', - 'info_dict': { - 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', - 'ext': 'flv', - 'title': 'EuroNews', - }, - 'params': { - 'skip_download': True, - }, }, { # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', @@ -371,6 +339,22 @@ class RaiIE(RaiBaseIE): 'params': { 'skip_download': True, }, + }, { + # ContentItem in iframe (see #12652) and subtitle at 'subtitlesUrl' key + 'url': 'http://www.presadiretta.rai.it/dl/portali/site/puntata/ContentItem-3ed19d13-26c2-46ff-a551-b10828262f1b.html', + 'info_dict': { + 'id': '1ad6dc64-444a-42a4-9bea-e5419ad2f5fd', + 'ext': 'mp4', + 'title': 'Partiti acchiappavoti - Presa diretta del 13/09/2015', + 'description': 'md5:d291b03407ec505f95f27970c0b025f4', + 'upload_date': '20150913', + 'subtitles': { + 'it': 'count:2', + }, + }, + 'params': { + 'skip_download': True, + }, }, { # Direct MMS URL 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', @@ -411,7 +395,7 @@ class RaiIE(RaiBaseIE): 'url': compat_urlparse.urljoin(url, thumbnail_url), }) - subtitles = self._extract_subtitles(url, media.get('subtitlesUrl')) + subtitles = self._extract_subtitles(url, media) info = { 'id': content_id, @@ -448,7 +432,8 @@ class RaiIE(RaiBaseIE): r'''(?x) (?: (?:initEdizione|drawMediaRaiTV)\(| - <(?:[^>]+\bdata-id|var\s+uniquename)= + <(?:[^>]+\bdata-id|var\s+uniquename)=| + ]+\bsrc= ) (["\']) (?:(?!\1).)*\bContentItem-(?P%s) @@ -469,7 +454,7 @@ class RaiIE(RaiBaseIE): except ExtractorError: pass - relinker_url = self._search_regex( + relinker_url = self._proto_relative_url(self._search_regex( r'''(?x) (?: var\s+videoURL| @@ -481,7 +466,7 @@ class RaiIE(RaiBaseIE): //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 ''', - webpage, 'relinker URL', group='url') + webpage, 'relinker URL', group='url')) relinker_info = self._extract_relinker_info( urljoin(url, relinker_url), video_id) diff --git a/haruhi_dl/extractor/reddit.py b/haruhi_dl/extractor/reddit.py index 663f622b3..222fa0172 100644 --- a/haruhi_dl/extractor/reddit.py +++ b/haruhi_dl/extractor/reddit.py @@ -7,6 +7,8 @@ from ..utils import ( ExtractorError, int_or_none, float_or_none, + try_get, + unescapeHTML, url_or_none, ) @@ -55,10 +57,12 @@ class RedditRIE(InfoExtractor): 'id': 'zv89llsvexdz', 'ext': 'mp4', 'title': 'That small heart attack.', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:4', 'timestamp': 1501941939, 'upload_date': '20170805', 'uploader': 'Antw87', + 'duration': 12, 'like_count': int, 'dislike_count': int, 'comment_count': int, @@ -116,13 +120,40 @@ class RedditRIE(InfoExtractor): else: age_limit = None + thumbnails = [] + + def add_thumbnail(src): + if not isinstance(src, dict): + return + thumbnail_url = url_or_none(src.get('url')) + if not thumbnail_url: + return + thumbnails.append({ + 'url': unescapeHTML(thumbnail_url), + 'width': int_or_none(src.get('width')), + 'height': int_or_none(src.get('height')), + }) + + for image in try_get(data, lambda x: x['preview']['images']) or []: + if not isinstance(image, dict): + continue + add_thumbnail(image.get('source')) + resolutions = image.get('resolutions') + if isinstance(resolutions, list): + for resolution in resolutions: + add_thumbnail(resolution) + return { '_type': 'url_transparent', 'url': video_url, 'title': data.get('title'), - 'thumbnail': url_or_none(data.get('thumbnail')), + 'thumbnails': thumbnails, 'timestamp': float_or_none(data.get('created_utc')), 'uploader': data.get('author'), + 'duration': int_or_none(try_get( + data, + (lambda x: x['media']['reddit_video']['duration'], + lambda x: x['secure_media']['reddit_video']['duration']))), 'like_count': int_or_none(data.get('ups')), 'dislike_count': int_or_none(data.get('downs')), 'comment_count': int_or_none(data.get('num_comments')), diff --git a/haruhi_dl/extractor/rts.py b/haruhi_dl/extractor/rts.py index 48f17b828..aed35f8a9 100644 --- a/haruhi_dl/extractor/rts.py +++ b/haruhi_dl/extractor/rts.py @@ -6,11 +6,12 @@ import re from .srgssr import SRGSSRIE from ..compat import compat_str from ..utils import ( + determine_ext, int_or_none, parse_duration, parse_iso8601, unescapeHTML, - determine_ext, + urljoin, ) @@ -21,7 +22,7 @@ class RTSIE(SRGSSRIE): _TESTS = [ { 'url': 'http://www.rts.ch/archives/tv/divers/3449373-les-enfants-terribles.html', - 'md5': 'ff7f8450a90cf58dacb64e29707b4a8e', + 'md5': '753b877968ad8afaeddccc374d4256a5', 'info_dict': { 'id': '3449373', 'display_id': 'les-enfants-terribles', @@ -35,6 +36,7 @@ class RTSIE(SRGSSRIE): 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rts.ch/emissions/passe-moi-les-jumelles/5624067-entre-ciel-et-mer.html', @@ -63,11 +65,12 @@ class RTSIE(SRGSSRIE): # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], 'skip': 'Blocked outside Switzerland', }, { 'url': 'http://www.rts.ch/video/info/journal-continu/5745356-londres-cachee-par-un-epais-smog.html', - 'md5': '1bae984fe7b1f78e94abc74e802ed99f', + 'md5': '9bb06503773c07ce83d3cbd793cebb91', 'info_dict': { 'id': '5745356', 'display_id': 'londres-cachee-par-un-epais-smog', @@ -81,6 +84,7 @@ class RTSIE(SRGSSRIE): 'thumbnail': r're:^https?://.*\.image', 'view_count': int, }, + 'expected_warnings': ['Unable to download f4m manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rts.ch/audio/couleur3/programmes/la-belle-video-de-stephane-laurenceau/5706148-urban-hippie-de-damien-krisl-03-04-2014.html', @@ -160,7 +164,7 @@ class RTSIE(SRGSSRIE): media_type = 'video' if 'video' in all_info else 'audio' # check for errors - self.get_media_data('rts', media_type, media_id) + self._get_media_data('rts', media_type, media_id) info = all_info['video']['JSONinfo'] if 'video' in all_info else all_info['audio'] @@ -194,6 +198,7 @@ class RTSIE(SRGSSRIE): 'tbr': extract_bitrate(format_url), }) + download_base = 'http://rtsww%s-d.rts.ch/' % ('-a' if media_type == 'audio' else '') for media in info.get('media', []): media_url = media.get('url') if not media_url or re.match(r'https?://', media_url): @@ -205,7 +210,7 @@ class RTSIE(SRGSSRIE): format_id += '-%dk' % rate formats.append({ 'format_id': format_id, - 'url': 'http://download-video.rts.ch/' + media_url, + 'url': urljoin(download_base, media_url), 'tbr': rate or extract_bitrate(media_url), }) diff --git a/haruhi_dl/extractor/rumble.py b/haruhi_dl/extractor/rumble.py new file mode 100644 index 000000000..4a0225109 --- /dev/null +++ b/haruhi_dl/extractor/rumble.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + int_or_none, + parse_iso8601, + try_get, +) + + +class RumbleEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P[0-9a-z]+)' + _TESTS = [{ + 'url': 'https://rumble.com/embed/v5pv5f', + 'md5': '36a18a049856720189f30977ccbb2c34', + 'info_dict': { + 'id': 'v5pv5f', + 'ext': 'mp4', + 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', + 'timestamp': 1571611968, + 'upload_date': '20191020', + } + }, { + 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._download_json( + 'https://rumble.com/embedJS/', video_id, + query={'request': 'video', 'v': video_id}) + title = video['title'] + + formats = [] + for height, ua in (video.get('ua') or {}).items(): + for i in range(2): + f_url = try_get(ua, lambda x: x[i], compat_str) + if f_url: + ext = determine_ext(f_url) + f = { + 'ext': ext, + 'format_id': '%s-%sp' % (ext, height), + 'height': int_or_none(height), + 'url': f_url, + } + bitrate = try_get(ua, lambda x: x[i + 2]['bitrate']) + if bitrate: + f['tbr'] = int_or_none(bitrate) + formats.append(f) + self._sort_formats(formats) + + author = video.get('author') or {} + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': video.get('i'), + 'timestamp': parse_iso8601(video.get('pubDate')), + 'channel': author.get('name'), + 'channel_url': author.get('url'), + 'duration': int_or_none(video.get('duration')), + } diff --git a/haruhi_dl/extractor/ruutu.py b/haruhi_dl/extractor/ruutu.py index f984040aa..c50cd3ecd 100644 --- a/haruhi_dl/extractor/ruutu.py +++ b/haruhi_dl/extractor/ruutu.py @@ -6,14 +6,24 @@ from ..compat import compat_urllib_parse_urlparse from ..utils import ( determine_ext, ExtractorError, + find_xpath_attr, int_or_none, + unified_strdate, + url_or_none, xpath_attr, xpath_text, ) class RuutuIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla)/(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?(?:ruutu|supla)\.fi/(?:video|supla|audio)/| + static\.nelonenmedia\.fi/player/misc/embed_player\.html\?.*?\bnid= + ) + (?P\d+) + ''' _TESTS = [ { 'url': 'http://www.ruutu.fi/video/2058907', @@ -71,15 +81,53 @@ class RuutuIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'age_limit': 0, }, - 'expected_warnings': ['HTTP Error 502: Bad Gateway'], - } + 'expected_warnings': [ + 'HTTP Error 502: Bad Gateway', + 'Failed to download m3u8 information', + ], + }, + { + 'url': 'http://www.supla.fi/audio/2231370', + 'only_matching': True, + }, + { + 'url': 'https://static.nelonenmedia.fi/player/misc/embed_player.html?nid=3618790', + 'only_matching': True, + }, + { + # episode + 'url': 'https://www.ruutu.fi/video/3401964', + 'info_dict': { + 'id': '3401964', + 'ext': 'mp4', + 'title': 'Temptation Island Suomi - Kausi 5 - Jakso 17', + 'description': 'md5:87cf01d5e1e88adf0c8a2937d2bd42ba', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2582, + 'age_limit': 12, + 'upload_date': '20190508', + 'series': 'Temptation Island Suomi', + 'season_number': 5, + 'episode_number': 17, + 'categories': ['Reality ja tositapahtumat', 'Kotimaiset suosikit', 'Romantiikka ja parisuhde'], + }, + 'params': { + 'skip_download': True, + }, + }, + { + # premium + 'url': 'https://www.ruutu.fi/video/3618715', + 'only_matching': True, + }, ] + _API_BASE = 'https://gatling.nelonenmedia.fi' def _real_extract(self, url): video_id = self._match_id(url) video_xml = self._download_xml( - 'https://gatling.nelonenmedia.fi/media-xml-cache', video_id, + '%s/media-xml-cache' % self._API_BASE, video_id, query={'id': video_id}) formats = [] @@ -96,9 +144,18 @@ class RuutuIE(InfoExtractor): continue processed_urls.append(video_url) ext = determine_ext(video_url) + auth_video_url = url_or_none(self._download_webpage( + '%s/auth/access/v2' % self._API_BASE, video_id, + note='Downloading authenticated %s stream URL' % ext, + fatal=False, query={'stream': video_url})) + if auth_video_url: + processed_urls.append(auth_video_url) + video_url = auth_video_url if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + video_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id='hds', fatal=False)) @@ -136,18 +193,35 @@ class RuutuIE(InfoExtractor): extract_formats(video_xml.find('./Clip')) - drm = xpath_text(video_xml, './Clip/DRM', default=None) - if not formats and drm: - raise ExtractorError('This video is DRM protected.', expected=True) + def pv(name): + node = find_xpath_attr( + video_xml, './Clip/PassthroughVariables/variable', 'name', name) + if node is not None: + return node.get('value') + + if not formats: + drm = xpath_text(video_xml, './Clip/DRM', default=None) + if drm: + raise ExtractorError('This video is DRM protected.', expected=True) + ns_st_cds = pv('ns_st_cds') + if ns_st_cds != 'free': + raise ExtractorError('This video is %s.' % ns_st_cds, expected=True) self._sort_formats(formats) + themes = pv('themes') + return { 'id': video_id, 'title': xpath_attr(video_xml, './/Behavior/Program', 'program_name', 'title', fatal=True), 'description': xpath_attr(video_xml, './/Behavior/Program', 'description', 'description'), 'thumbnail': xpath_attr(video_xml, './/Behavior/Startpicture', 'href', 'thumbnail'), - 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')), + 'duration': int_or_none(xpath_text(video_xml, './/Runtime', 'duration')) or int_or_none(pv('runtime')), 'age_limit': int_or_none(xpath_text(video_xml, './/AgeLimit', 'age limit')), + 'upload_date': unified_strdate(pv('date_start')), + 'series': pv('series_name'), + 'season_number': int_or_none(pv('season_number')), + 'episode_number': int_or_none(pv('episode_number')), + 'categories': themes.split(',') if themes else [], 'formats': formats, } diff --git a/haruhi_dl/extractor/samplefocus.py b/haruhi_dl/extractor/samplefocus.py new file mode 100644 index 000000000..806c3c354 --- /dev/null +++ b/haruhi_dl/extractor/samplefocus.py @@ -0,0 +1,100 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + get_element_by_attribute, + int_or_none, +) + + +class SampleFocusIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?samplefocus\.com/samples/(?P[^/?&#]+)' + _TESTS = [{ + 'url': 'https://samplefocus.com/samples/lil-peep-sad-emo-guitar', + 'md5': '48c8d62d60be467293912e0e619a5120', + 'info_dict': { + 'id': '40316', + 'display_id': 'lil-peep-sad-emo-guitar', + 'ext': 'mp3', + 'title': 'Lil Peep Sad Emo Guitar', + 'thumbnail': r're:^https?://.+\.png', + 'license': 'Standard License', + 'uploader': 'CapsCtrl', + 'uploader_id': 'capsctrl', + 'like_count': int, + 'comment_count': int, + 'categories': ['Samples', 'Guitar', 'Electric guitar'], + }, + }, { + 'url': 'https://samplefocus.com/samples/dababy-style-bass-808', + 'only_matching': True + }, { + 'url': 'https://samplefocus.com/samples/young-chop-kick', + 'only_matching': True + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + sample_id = self._search_regex( + r']+id=(["\'])sample_id\1[^>]+value=(?:["\'])(?P\d+)', + webpage, 'sample id', group='id') + + title = self._og_search_title(webpage, fatal=False) or self._html_search_regex( + r'

(.+?)

', webpage, 'title') + + mp3_url = self._search_regex( + r']+id=(["\'])sample_mp3\1[^>]+value=(["\'])(?P(?:(?!\2).)+)', + webpage, 'mp3', fatal=False, group='url') or extract_attributes(self._search_regex( + r']+itemprop=(["\'])contentUrl\1[^>]*>', + webpage, 'mp3 url', group=0))['content'] + + thumbnail = self._og_search_thumbnail(webpage) or self._html_search_regex( + r']+class=(?:["\'])waveform responsive-img[^>]+src=(["\'])(?P(?:(?!\1).)+)', + webpage, 'mp3', fatal=False, group='url') + + comments = [] + for author_id, author, body in re.findall(r'(?s)]+class="comment-author">]+href="/users/([^"]+)">([^"]+).+?]+class="comment-body">([^>]+)

', webpage): + comments.append({ + 'author': author, + 'author_id': author_id, + 'text': body, + }) + + uploader_id = uploader = None + mobj = re.search(r'>By ]+href="/users/([^"]+)"[^>]*>([^<]+)', webpage) + if mobj: + uploader_id, uploader = mobj.groups() + + breadcrumb = get_element_by_attribute('typeof', 'BreadcrumbList', webpage) + categories = [] + if breadcrumb: + for _, name in re.findall(r']+property=(["\'])name\1[^>]*>([^<]+)', breadcrumb): + categories.append(name) + + def extract_count(klass): + return int_or_none(self._html_search_regex( + r']+class=(?:["\'])?%s-count[^>]*>(\d+)' % klass, + webpage, klass, fatal=False)) + + return { + 'id': sample_id, + 'title': title, + 'url': mp3_url, + 'display_id': display_id, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'license': self._html_search_regex( + r']+href=(["\'])/license\1[^>]*>(?P[^<]+)<', + webpage, 'license', fatal=False, group='license'), + 'uploader_id': uploader_id, + 'like_count': extract_count('sample-%s-favorites' % sample_id), + 'comment_count': extract_count('comments'), + 'comments': comments, + 'categories': categories, + } diff --git a/haruhi_dl/extractor/sbs.py b/haruhi_dl/extractor/sbs.py index 0e623ff7b..f722528cd 100644 --- a/haruhi_dl/extractor/sbs.py +++ b/haruhi_dl/extractor/sbs.py @@ -10,7 +10,7 @@ from ..utils import ( class SBSIE(InfoExtractor): IE_DESC = 'sbs.com.au' - _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand|news)/video/(?:single/)?(?P[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=)|news/(?:embeds/)?video/)(?P[0-9]+)' _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: @@ -18,7 +18,7 @@ class SBSIE(InfoExtractor): 'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed', 'md5': '3150cf278965eeabb5b4cea1c963fe0a', 'info_dict': { - 'id': '320403011771', + 'id': '_rFBPRPO4pMR', 'ext': 'mp4', 'title': 'Dingo Conservation (The Feed)', 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5', @@ -34,6 +34,15 @@ class SBSIE(InfoExtractor): }, { 'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9', 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/?play=1836638787723', + 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/program/inside-windsor-castle?play=1283505731842', + 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/news/embeds/video/1840778819866', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/haruhi_dl/extractor/servus.py b/haruhi_dl/extractor/servus.py index 9401bf2cf..1610ddc2c 100644 --- a/haruhi_dl/extractor/servus.py +++ b/haruhi_dl/extractor/servus.py @@ -1,9 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + unified_timestamp, + urlencode_postdata, + url_or_none, +) class ServusIE(InfoExtractor): @@ -12,20 +18,29 @@ class ServusIE(InfoExtractor): (?:www\.)? (?: servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| - servustv\.com/videos + (?:servustv|pm-wissen)\.com/videos ) /(?P[aA]{2}-\w+|\d+-\d+) ''' _TESTS = [{ # new URL schema 'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', - 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', + 'md5': '60474d4c21f3eb148838f215c37f02b9', 'info_dict': { 'id': 'AA-1T6VBU5PW1W12', 'ext': 'mp4', 'title': 'Die Grünen aus Sicht des Volkes', + 'alt_title': 'Talk im Hangar-7 Voxpops Gruene', 'description': 'md5:1247204d85783afe3682644398ff2ec4', 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 62.442, + 'timestamp': 1605193976, + 'upload_date': '20201112', + 'series': 'Talk im Hangar-7', + 'season': 'Season 9', + 'season_number': 9, + 'episode': 'Episode 31 - September 14', + 'episode_number': 31, } }, { # old URL schema @@ -40,30 +55,94 @@ class ServusIE(InfoExtractor): }, { 'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/', 'only_matching': True, + }, { + 'url': 'https://www.pm-wissen.com/videos/aa-24mus4g2w2112/', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url).upper() - webpage = self._download_webpage(url, video_id) - title = self._search_regex( - (r'videoLabel\s*=\s*(["\'])(?P(?:(?!\1).)+)\1', - r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'), - webpage, 'title', default=None, - group='title') or self._og_search_title(webpage) - title = re.sub(r'\s*-\s*Servus TV\s*$', '', title) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + token = self._download_json( + 'https://auth.redbullmediahouse.com/token', video_id, + 'Downloading token', data=urlencode_postdata({ + 'grant_type': 'client_credentials', + }), headers={ + 'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==', + }) + access_token = token['access_token'] + token_type = token.get('token_type', 'Bearer') - formats = self._extract_m3u8_formats( - 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id, - video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') + video = self._download_json( + 'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id, + video_id, 'Downloading video JSON', headers={ + 'Authorization': '%s %s' % (token_type, access_token), + }) + + formats = [] + thumbnail = None + for resource in video['resources']: + if not isinstance(resource, dict): + continue + format_url = url_or_none(resource.get('url')) + if not format_url: + continue + extension = resource.get('extension') + type_ = resource.get('type') + if extension == 'jpg' or type_ == 'reference_keyframe': + thumbnail = format_url + continue + ext = determine_ext(format_url) + if type_ == 'dash' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, video_id, mpd_id='dash', fatal=False)) + elif type_ == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif extension == 'mp4' or ext == 'mp4': + formats.append({ + 'url': format_url, + 'format_id': type_, + 'width': int_or_none(resource.get('width')), + 'height': int_or_none(resource.get('height')), + }) self._sort_formats(formats) + attrs = {} + for attribute in video['attributes']: + if not isinstance(attribute, dict): + continue + key = attribute.get('fieldKey') + value = attribute.get('fieldValue') + if not key or not value: + continue + attrs[key] = value + + title = attrs.get('title_stv') or video_id + alt_title = attrs.get('title') + description = attrs.get('long_description') or attrs.get('short_description') + series = attrs.get('label') + season = attrs.get('season') + episode = attrs.get('chapter') + duration = float_or_none(attrs.get('duration'), scale=1000) + season_number = int_or_none(self._search_regex( + r'Season (\d+)', season or '', 'season number', default=None)) + episode_number = int_or_none(self._search_regex( + r'Episode (\d+)', episode or '', 'episode number', default=None)) + return { 'id': video_id, 'title': title, + 'alt_title': alt_title, 'description': description, 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': unified_timestamp(video.get('lastPublished')), + 'series': series, + 'season': season, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, 'formats': formats, } diff --git a/haruhi_dl/extractor/sevenplus.py b/haruhi_dl/extractor/sevenplus.py index 84568ac69..240afc18f 100644 --- a/haruhi_dl/extractor/sevenplus.py +++ b/haruhi_dl/extractor/sevenplus.py @@ -4,8 +4,12 @@ from __future__ import unicode_literals import re from .brightcove import BrightcoveNewIE -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( + ExtractorError, try_get, update_url_query, ) @@ -41,16 +45,22 @@ class SevenPlusIE(BrightcoveNewIE): def _real_extract(self, url): path, episode_id = re.match(self._VALID_URL, url).groups() - media = self._download_json( - 'https://videoservice.swm.digital/playback', episode_id, query={ - 'appId': '7plus', - 'deviceType': 'web', - 'platformType': 'web', - 'accountId': 5303576322001, - 'referenceId': 'ref:' + episode_id, - 'deliveryId': 'csai', - 'videoType': 'vod', - })['media'] + try: + media = self._download_json( + 'https://videoservice.swm.digital/playback', episode_id, query={ + 'appId': '7plus', + 'deviceType': 'web', + 'platformType': 'web', + 'accountId': 5303576322001, + 'referenceId': 'ref:' + episode_id, + 'deliveryId': 'csai', + 'videoType': 'vod', + })['media'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + raise ExtractorError(self._parse_json( + e.cause.read().decode(), episode_id)[0]['error_code'], expected=True) + raise for source in media.get('sources', {}): src = source.get('src') diff --git a/haruhi_dl/extractor/simplecast.py b/haruhi_dl/extractor/simplecast.py new file mode 100644 index 000000000..98d8f7de6 --- /dev/null +++ b/haruhi_dl/extractor/simplecast.py @@ -0,0 +1,160 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + parse_iso8601, + strip_or_none, + try_get, + urlencode_postdata, +) + + +class SimplecastBaseIE(InfoExtractor): + _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}' + _API_BASE = 'https://api.simplecast.com/' + + def _call_api(self, path_tmpl, video_id): + return self._download_json( + self._API_BASE + path_tmpl % video_id, video_id) + + def _call_search_api(self, resource, resource_id, resource_url): + return self._download_json( + 'https://api.simplecast.com/%ss/search' % resource, resource_id, + data=urlencode_postdata({'url': resource_url})) + + def _parse_episode(self, episode): + episode_id = episode['id'] + title = episode['title'].strip() + audio_file = episode.get('audio_file') or {} + audio_file_url = audio_file.get('url') or episode.get('audio_file_url') or episode['enclosure_url'] + + season = episode.get('season') or {} + season_href = season.get('href') + season_id = None + if season_href: + season_id = self._search_regex( + r'https?://api.simplecast.com/seasons/(%s)' % self._UUID_REGEX, + season_href, 'season id', default=None) + + webpage_url = episode.get('episode_url') + channel_url = None + if webpage_url: + channel_url = self._search_regex( + r'(https?://[^/]+\.simplecast\.com)', + webpage_url, 'channel url', default=None) + + return { + 'id': episode_id, + 'display_id': episode.get('slug'), + 'title': title, + 'url': clean_podcast_url(audio_file_url), + 'webpage_url': webpage_url, + 'channel_url': channel_url, + 'series': try_get(episode, lambda x: x['podcast']['title']), + 'season_number': int_or_none(season.get('number')), + 'season_id': season_id, + 'thumbnail': episode.get('image_url'), + 'episode_id': episode_id, + 'episode_number': int_or_none(episode.get('number')), + 'description': strip_or_none(episode.get('description')), + 'timestamp': parse_iso8601(episode.get('published_at')), + 'duration': int_or_none(episode.get('duration')), + 'filesize': int_or_none(audio_file.get('size') or episode.get('audio_file_size')), + } + + +class SimplecastIE(SimplecastBaseIE): + IE_NAME = 'simplecast' + _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P<id>%s)' % SimplecastBaseIE._UUID_REGEX + _COMMON_TEST_INFO = { + 'display_id': 'errant-signal-chris-franklin-new-wave-video-essays', + 'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'ext': 'mp3', + 'title': 'Errant Signal - Chris Franklin & New Wave Video Essays', + 'episode_number': 1, + 'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'description': 'md5:34752789d3d2702e2d2c975fbd14f357', + 'season_number': 1, + 'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13', + 'series': 'The RE:BIND.io Podcast', + 'duration': 5343, + 'timestamp': 1580979475, + 'upload_date': '20200206', + 'webpage_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', + 'channel_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com$', + } + _TESTS = [{ + 'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'md5': '8c93be7be54251bf29ee97464eabe61c', + 'info_dict': _COMMON_TEST_INFO, + }, { + 'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage, **kw): + return re.findall( + r'''(?x)<iframe[^>]+src=["\'] + ( + https?://(?:embed\.simplecast\.com/[0-9a-f]{8}| + player\.simplecast\.com/%s + ))''' % SimplecastBaseIE._UUID_REGEX, webpage) + + def _real_extract(self, url): + episode_id = self._match_id(url) + episode = self._call_api('episodes/%s', episode_id) + return self._parse_episode(episode) + + +class SimplecastEpisodeIE(SimplecastBaseIE): + IE_NAME = 'simplecast:episode' + _VALID_URL = r'https?://(?!api\.)[^/]+\.simplecast\.com/episodes/(?P<id>[^/?&#]+)' + _TEST = { + 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays', + 'md5': '8c93be7be54251bf29ee97464eabe61c', + 'info_dict': SimplecastIE._COMMON_TEST_INFO, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + episode = self._call_search_api( + 'episode', mobj.group(1), mobj.group(0)) + return self._parse_episode(episode) + + +class SimplecastPodcastIE(SimplecastBaseIE): + IE_NAME = 'simplecast:podcast' + _VALID_URL = r'https?://(?!(?:api|cdn|embed|feeds|player)\.)(?P<id>[^/]+)\.simplecast\.com(?!/episodes/[^/?&#]+)' + _TESTS = [{ + 'url': 'https://the-re-bind-io-podcast.simplecast.com', + 'playlist_mincount': 33, + 'info_dict': { + 'id': '07d28d26-7522-42eb-8c53-2bdcfc81c43c', + 'title': 'The RE:BIND.io Podcast', + }, + }, { + 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes', + 'only_matching': True, + }] + + def _real_extract(self, url): + subdomain = self._match_id(url) + site = self._call_search_api('site', subdomain, url) + podcast = site['podcast'] + podcast_id = podcast['id'] + podcast_title = podcast.get('title') + + def entries(): + episodes = self._call_api('podcasts/%s/episodes', podcast_id) + for episode in (episodes.get('collection') or []): + info = self._parse_episode(episode) + info['series'] = podcast_title + yield info + + return self.playlist_result(entries(), podcast_id, podcast_title) diff --git a/haruhi_dl/extractor/sky.py b/haruhi_dl/extractor/sky.py index ea30d6e62..ff2c977a0 100644 --- a/haruhi_dl/extractor/sky.py +++ b/haruhi_dl/extractor/sky.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( extract_attributes, @@ -11,38 +13,61 @@ from ..utils import ( class SkyBaseIE(InfoExtractor): - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_data = extract_attributes(self._search_regex( - r'(<div.+?class="[^"]*sdc-article-video__media-ooyala[^"]*"[^>]+>)', - webpage, 'video data')) + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + _SDC_EL_REGEX = r'(?s)(<div[^>]+data-(?:component-name|fn)="sdc-(?:articl|sit)e-video"[^>]*>)' - video_url = 'ooyala:%s' % video_data['data-video-id'] - if video_data.get('data-token-required') == 'true': - token_fetch_options = self._parse_json(video_data.get( - 'data-token-fetch-options', '{}'), video_id, fatal=False) or {} - token_fetch_url = token_fetch_options.get('url') - if token_fetch_url: - embed_token = self._download_webpage(urljoin( - url, token_fetch_url), video_id, fatal=False) - if embed_token: - video_url = smuggle_url( - video_url, {'embed_token': embed_token.strip('"')}) + def _process_ooyala_element(self, webpage, sdc_el, url): + sdc = extract_attributes(sdc_el) + provider = sdc.get('data-provider') + if provider == 'ooyala': + video_id = sdc['data-sdc-video-id'] + video_url = 'ooyala:%s' % video_id + ie_key = 'Ooyala' + ooyala_el = self._search_regex( + r'(<div[^>]+class="[^"]*\bsdc-article-video__media-ooyala\b[^"]*"[^>]+data-video-id="%s"[^>]*>)' % video_id, + webpage, 'video data', fatal=False) + if ooyala_el: + ooyala_attrs = extract_attributes(ooyala_el) or {} + if ooyala_attrs.get('data-token-required') == 'true': + token_fetch_url = (self._parse_json(ooyala_attrs.get( + 'data-token-fetch-options', '{}'), + video_id, fatal=False) or {}).get('url') + if token_fetch_url: + embed_token = self._download_json(urljoin( + url, token_fetch_url), video_id, fatal=False) + if embed_token: + video_url = smuggle_url( + video_url, {'embed_token': embed_token}) + elif provider == 'brightcove': + video_id = sdc['data-video-id'] + account_id = sdc.get('data-account-id') or '6058004172001' + player_id = sdc.get('data-player-id') or 'RC9PQUaJ6' + video_url = self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id) + ie_key = 'BrightcoveNew' return { '_type': 'url_transparent', 'id': video_id, 'url': video_url, + 'ie_key': ie_key, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + info = self._process_ooyala_element(webpage, self._search_regex( + self._SDC_EL_REGEX, webpage, 'sdc element'), url) + info.update({ 'title': self._og_search_title(webpage), 'description': strip_or_none(self._og_search_description(webpage)), - 'ie_key': 'Ooyala', - } + }) + return info class SkySportsIE(SkyBaseIE): - _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)' - _TEST = { + IE_NAME = 'sky:sports' + _VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/([^/]+/)*(?P<id>[0-9]+)' + _TESTS = [{ 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', 'md5': '77d59166cddc8d3cb7b13e35eaf0f5ec', 'info_dict': { @@ -52,19 +77,55 @@ class SkySportsIE(SkyBaseIE): 'description': 'md5:e88bda94ae15f7720c5cb467e777bb6d', }, 'add_ie': ['Ooyala'], - } + }, { + 'url': 'https://www.skysports.com/watch/video/sports/f1/12160544/abu-dhabi-gp-the-notebook', + 'only_matching': True, + }, { + 'url': 'https://www.skysports.com/watch/video/tv-shows/12118508/rainford-brent-how-ace-programme-helps', + 'only_matching': True, + }] class SkyNewsIE(SkyBaseIE): + IE_NAME = 'sky:news' _VALID_URL = r'https?://news\.sky\.com/video/[0-9a-z-]+-(?P<id>[0-9]+)' _TEST = { 'url': 'https://news.sky.com/video/russian-plane-inspected-after-deadly-fire-11712962', - 'md5': 'd6327e581473cea9976a3236ded370cd', + 'md5': '411e8893fd216c75eaf7e4c65d364115', 'info_dict': { - 'id': '1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM', + 'id': 'ref:1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM', 'ext': 'mp4', 'title': 'Russian plane inspected after deadly fire', 'description': 'The Russian Investigative Committee has released video of the wreckage of a passenger plane which caught fire near Moscow.', + 'uploader_id': '6058004172001', + 'timestamp': 1567112345, + 'upload_date': '20190829', }, - 'add_ie': ['Ooyala'], + 'add_ie': ['BrightcoveNew'], } + + +class SkySportsNewsIE(SkyBaseIE): + IE_NAME = 'sky:sports:news' + _VALID_URL = r'https?://(?:www\.)?skysports\.com/([^/]+/)*news/\d+/(?P<id>\d+)' + _TEST = { + 'url': 'http://www.skysports.com/golf/news/12176/10871916/dustin-johnson-ready-to-conquer-players-championship-at-tpc-sawgrass', + 'info_dict': { + 'id': '10871916', + 'title': 'Dustin Johnson ready to conquer Players Championship at TPC Sawgrass', + 'description': 'Dustin Johnson is confident he can continue his dominant form in 2017 by adding the Players Championship to his list of victories.', + }, + 'playlist_count': 2, + } + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + + entries = [] + for sdc_el in re.findall(self._SDC_EL_REGEX, webpage): + entries.append(self._process_ooyala_element(webpage, sdc_el, url)) + + return self.playlist_result( + entries, article_id, self._og_search_title(webpage), + self._html_search_meta(['og:description', 'description'], webpage)) diff --git a/haruhi_dl/extractor/skyit.py b/haruhi_dl/extractor/skyit.py new file mode 100644 index 000000000..14a4d8d4c --- /dev/null +++ b/haruhi_dl/extractor/skyit.py @@ -0,0 +1,239 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_parse_qs, + compat_urllib_parse_urlparse, +) +from ..utils import ( + dict_get, + int_or_none, + parse_duration, + unified_timestamp, +) + + +class SkyItPlayerIE(InfoExtractor): + IE_NAME = 'player.sky.it' + _VALID_URL = r'https?://player\.sky\.it/player/(?:external|social)\.html\?.*?\bid=(?P<id>\d+)' + _GEO_BYPASS = False + _DOMAIN = 'sky' + _PLAYER_TMPL = 'https://player.sky.it/player/external.html?id=%s&domain=%s' + # http://static.sky.it/static/skyplayer/conf.json + _TOKEN_MAP = { + 'cielo': 'Hh9O7M8ks5yi6nSROL7bKYz933rdf3GhwZlTLMgvy4Q', + 'hotclub': 'kW020K2jq2lk2eKRJD2vWEg832ncx2EivZlTLQput2C', + 'mtv8': 'A5Nn9GGb326CI7vP5e27d7E4PIaQjota', + 'salesforce': 'C6D585FD1615272C98DE38235F38BD86', + 'sitocommerciale': 'VJwfFuSGnLKnd9Phe9y96WkXgYDCguPMJ2dLhGMb2RE', + 'sky': 'F96WlOd8yoFmLQgiqv6fNQRvHZcsWk5jDaYnDvhbiJk', + 'skyacademy': 'A6LAn7EkO2Q26FRy0IAMBekX6jzDXYL3', + 'skyarte': 'LWk29hfiU39NNdq87ePeRach3nzTSV20o0lTv2001Cd', + 'theupfront': 'PRSGmDMsg6QMGc04Obpoy7Vsbn7i2Whp', + } + + def _player_url_result(self, video_id): + return self.url_result( + self._PLAYER_TMPL % (video_id, self._DOMAIN), + SkyItPlayerIE.ie_key(), video_id) + + def _parse_video(self, video, video_id): + title = video['title'] + is_live = video.get('type') == 'live' + hls_url = video.get(('streaming' if is_live else 'hls') + '_url') + if not hls_url and video.get('geoblock' if is_live else 'geob'): + self.raise_geo_restricted(countries=['IT']) + + if is_live: + formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4') + else: + formats = self._extract_akamai_formats( + hls_url, video_id, {'http': 'videoplatform.sky.it'}) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'formats': formats, + 'thumbnail': dict_get(video, ('video_still', 'video_still_medium', 'thumb')), + 'description': video.get('short_desc') or None, + 'timestamp': unified_timestamp(video.get('create_date')), + 'duration': int_or_none(video.get('duration_sec')) or parse_duration(video.get('duration')), + 'is_live': is_live, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + domain = compat_parse_qs(compat_urllib_parse_urlparse( + url).query).get('domain', [None])[0] + token = dict_get(self._TOKEN_MAP, (domain, 'sky')) + video = self._download_json( + 'https://apid.sky.it/vdp/v1/getVideoData', + video_id, query={ + 'caller': 'sky', + 'id': video_id, + 'token': token + }, headers=self.geo_verification_headers()) + return self._parse_video(video, video_id) + + +class SkyItVideoIE(SkyItPlayerIE): + IE_NAME = 'video.sky.it' + _VALID_URL = r'https?://(?:masterchef|video|xfactor)\.sky\.it(?:/[^/]+)*/video/[0-9a-z-]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://video.sky.it/news/mondo/video/uomo-ucciso-da-uno-squalo-in-australia-631227', + 'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd', + 'info_dict': { + 'id': '631227', + 'ext': 'mp4', + 'title': 'Uomo ucciso da uno squalo in Australia', + 'timestamp': 1606036192, + 'upload_date': '20201122', + } + }, { + 'url': 'https://xfactor.sky.it/video/x-factor-2020-replay-audizioni-1-615820', + 'only_matching': True, + }, { + 'url': 'https://masterchef.sky.it/video/masterchef-9-cosa-e-successo-nella-prima-puntata-562831', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self._player_url_result(video_id) + + +class SkyItVideoLiveIE(SkyItPlayerIE): + IE_NAME = 'video.sky.it:live' + _VALID_URL = r'https?://video\.sky\.it/diretta/(?P<id>[^/?&#]+)' + _TEST = { + 'url': 'https://video.sky.it/diretta/tg24', + 'info_dict': { + 'id': '1', + 'ext': 'mp4', + 'title': r're:Diretta TG24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'description': 'Guarda la diretta streaming di SkyTg24, segui con Sky tutti gli appuntamenti e gli speciali di Tg24.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + asset_id = compat_str(self._parse_json(self._search_regex( + r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', + webpage, 'next data'), display_id)['props']['initialState']['livePage']['content']['asset_id']) + livestream = self._download_json( + 'https://apid.sky.it/vdp/v1/getLivestream', + asset_id, query={'id': asset_id}) + return self._parse_video(livestream, asset_id) + + +class SkyItIE(SkyItPlayerIE): + IE_NAME = 'sky.it' + _VALID_URL = r'https?://(?:sport|tg24)\.sky\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://sport.sky.it/calcio/serie-a/2020/11/21/juventus-cagliari-risultato-gol', + 'info_dict': { + 'id': '631201', + 'ext': 'mp4', + 'title': 'Un rosso alla violenza: in campo per i diritti delle donne', + 'upload_date': '20201121', + 'timestamp': 1605995753, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'https://tg24.sky.it/mondo/2020/11/22/australia-squalo-uccide-uomo', + 'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd', + 'info_dict': { + 'id': '631227', + 'ext': 'mp4', + 'title': 'Uomo ucciso da uno squalo in Australia', + 'timestamp': 1606036192, + 'upload_date': '20201122', + }, + }] + _VIDEO_ID_REGEX = r'data-videoid="(\d+)"' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + self._VIDEO_ID_REGEX, webpage, 'video id') + return self._player_url_result(video_id) + + +class SkyItAcademyIE(SkyItIE): + IE_NAME = 'skyacademy.it' + _VALID_URL = r'https?://(?:www\.)?skyacademy\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://www.skyacademy.it/eventi-speciali/2019/07/05/a-lezione-di-cinema-con-sky-academy-/', + 'md5': 'ced5c26638b7863190cbc44dd6f6ba08', + 'info_dict': { + 'id': '523458', + 'ext': 'mp4', + 'title': 'Sky Academy "The Best CineCamp 2019"', + 'timestamp': 1562843784, + 'upload_date': '20190711', + } + }] + _DOMAIN = 'skyacademy' + _VIDEO_ID_REGEX = r'id="news-videoId_(\d+)"' + + +class SkyItArteIE(SkyItIE): + IE_NAME = 'arte.sky.it' + _VALID_URL = r'https?://arte\.sky\.it/video/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://arte.sky.it/video/serie-musei-venezia-collezionismo-12-novembre/', + 'md5': '515aee97b87d7a018b6c80727d3e7e17', + 'info_dict': { + 'id': '627926', + 'ext': 'mp4', + 'title': "Musei Galleria Franchetti alla Ca' d'Oro Palazzo Grimani", + 'upload_date': '20201106', + 'timestamp': 1604664493, + } + }] + _DOMAIN = 'skyarte' + _VIDEO_ID_REGEX = r'(?s)<iframe[^>]+src="(?:https:)?//player\.sky\.it/player/external\.html\?[^"]*\bid=(\d+)' + + +class CieloTVItIE(SkyItIE): + IE_NAME = 'cielotv.it' + _VALID_URL = r'https?://(?:www\.)?cielotv\.it/video/(?P<id>[^.]+)\.html' + _TESTS = [{ + 'url': 'https://www.cielotv.it/video/Il-lunedi-e-sempre-un-dramma.html', + 'md5': 'c4deed77552ba901c2a0d9258320304b', + 'info_dict': { + 'id': '499240', + 'ext': 'mp4', + 'title': 'Il lunedì è sempre un dramma', + 'upload_date': '20190329', + 'timestamp': 1553862178, + } + }] + _DOMAIN = 'cielo' + _VIDEO_ID_REGEX = r'videoId\s*=\s*"(\d+)"' + + +class TV8ItIE(SkyItVideoIE): + IE_NAME = 'tv8.it' + _VALID_URL = r'https?://tv8\.it/showvideo/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://tv8.it/showvideo/630529/ogni-mattina-ucciso-asino-di-andrea-lo-cicero/18-11-2020/', + 'md5': '9ab906a3f75ea342ed928442f9dabd21', + 'info_dict': { + 'id': '630529', + 'ext': 'mp4', + 'title': 'Ogni mattina - Ucciso asino di Andrea Lo Cicero', + 'timestamp': 1605721374, + 'upload_date': '20201118', + } + }] + _DOMAIN = 'mtv8' diff --git a/haruhi_dl/extractor/slideslive.py b/haruhi_dl/extractor/slideslive.py index d9ea76831..9409a0100 100644 --- a/haruhi_dl/extractor/slideslive.py +++ b/haruhi_dl/extractor/slideslive.py @@ -2,7 +2,12 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import smuggle_url +from ..utils import ( + bool_or_none, + smuggle_url, + try_get, + url_or_none, +) class SlidesLiveIE(InfoExtractor): @@ -18,8 +23,21 @@ class SlidesLiveIE(InfoExtractor): 'description': 'Watch full version of this video at https://slideslive.com/38902413.', 'uploader': 'SlidesLive Videos - A', 'uploader_id': 'UC62SdArr41t_-_fX40QCLRw', + 'timestamp': 1597615266, 'upload_date': '20170925', } + }, { + # video_service_name = yoda + 'url': 'https://slideslive.com/38935785', + 'md5': '575cd7a6c0acc6e28422fe76dd4bcb1a', + 'info_dict': { + 'id': 'RMraDYN5ozA_', + 'ext': 'mp4', + 'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges', + }, + 'params': { + 'format': 'bestvideo', + }, }, { # video_service_name = youtube 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', @@ -39,18 +57,48 @@ class SlidesLiveIE(InfoExtractor): video_data = self._download_json( 'https://ben.slideslive.com/player/' + video_id, video_id) service_name = video_data['video_service_name'].lower() - assert service_name in ('url', 'vimeo', 'youtube') + assert service_name in ('url', 'yoda', 'vimeo', 'youtube') service_id = video_data['video_service_id'] + subtitles = {} + for sub in try_get(video_data, lambda x: x['subtitles'], list) or []: + if not isinstance(sub, dict): + continue + webvtt_url = url_or_none(sub.get('webvtt_url')) + if not webvtt_url: + continue + lang = sub.get('language') or 'en' + subtitles.setdefault(lang, []).append({ + 'url': webvtt_url, + }) info = { 'id': video_id, 'thumbnail': video_data.get('thumbnail'), - 'url': service_id, + 'is_live': bool_or_none(video_data.get('is_live')), + 'subtitles': subtitles, } - if service_name == 'url': + if service_name in ('url', 'yoda'): info['title'] = video_data['title'] + if service_name == 'url': + info['url'] = service_id + else: + formats = [] + _MANIFEST_PATTERN = 'https://01.cdn.yoda.slideslive.com/%s/master.%s' + # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol + formats.extend(self._extract_m3u8_formats( + _MANIFEST_PATTERN % (service_id, 'm3u8'), + service_id, 'mp4', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_mpd_formats( + _MANIFEST_PATTERN % (service_id, 'mpd'), service_id, + mpd_id='dash', fatal=False)) + self._sort_formats(formats) + info.update({ + 'id': service_id, + 'formats': formats, + }) else: info.update({ '_type': 'url_transparent', + 'url': service_id, 'ie_key': service_name.capitalize(), 'title': video_data.get('title'), }) diff --git a/haruhi_dl/extractor/smotri.py b/haruhi_dl/extractor/smotri.py deleted file mode 100644 index 45995f30f..000000000 --- a/haruhi_dl/extractor/smotri.py +++ /dev/null @@ -1,416 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import json -import hashlib -import uuid - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - int_or_none, - sanitized_Request, - unified_strdate, - urlencode_postdata, - xpath_text, -) - - -class SmotriIE(InfoExtractor): - IE_DESC = 'Smotri.com' - IE_NAME = 'smotri' - _VALID_URL = r'https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})' - _NETRC_MACHINE = 'smotri' - - _TESTS = [ - # real video id 2610366 - { - 'url': 'http://smotri.com/video/view/?id=v261036632ab', - 'md5': '02c0dfab2102984e9c5bb585cc7cc321', - 'info_dict': { - 'id': 'v261036632ab', - 'ext': 'mp4', - 'title': 'катастрофа с камер видеонаблюдения', - 'uploader': 'rbc2008', - 'uploader_id': 'rbc08', - 'upload_date': '20131118', - 'thumbnail': 'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg', - }, - }, - # real video id 57591 - { - 'url': 'http://smotri.com/video/view/?id=v57591cb20', - 'md5': '830266dfc21f077eac5afd1883091bcd', - 'info_dict': { - 'id': 'v57591cb20', - 'ext': 'flv', - 'title': 'test', - 'uploader': 'Support Photofile@photofile', - 'uploader_id': 'support-photofile', - 'upload_date': '20070704', - 'thumbnail': 'http://frame4.loadup.ru/03/ed/57591.2.3.jpg', - }, - }, - # video-password, not approved by moderator - { - 'url': 'http://smotri.com/video/view/?id=v1390466a13c', - 'md5': 'f6331cef33cad65a0815ee482a54440b', - 'info_dict': { - 'id': 'v1390466a13c', - 'ext': 'mp4', - 'title': 'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1', - 'uploader': 'timoxa40', - 'uploader_id': 'timoxa40', - 'upload_date': '20100404', - 'thumbnail': 'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg', - }, - 'params': { - 'videopassword': 'qwerty', - }, - 'skip': 'Video is not approved by moderator', - }, - # video-password - { - 'url': 'http://smotri.com/video/view/?id=v6984858774#', - 'md5': 'f11e01d13ac676370fc3b95b9bda11b0', - 'info_dict': { - 'id': 'v6984858774', - 'ext': 'mp4', - 'title': 'Дача Солженицина ПАРОЛЬ 223322', - 'uploader': 'psavari1', - 'uploader_id': 'psavari1', - 'upload_date': '20081103', - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'params': { - 'videopassword': '223322', - }, - }, - # age limit + video-password, not approved by moderator - { - 'url': 'http://smotri.com/video/view/?id=v15408898bcf', - 'md5': '91e909c9f0521adf5ee86fbe073aad70', - 'info_dict': { - 'id': 'v15408898bcf', - 'ext': 'flv', - 'title': 'этот ролик не покажут по ТВ', - 'uploader': 'zzxxx', - 'uploader_id': 'ueggb', - 'upload_date': '20101001', - 'thumbnail': 'http://frame3.loadup.ru/75/75/1540889.1.3.jpg', - 'age_limit': 18, - }, - 'params': { - 'videopassword': '333' - }, - 'skip': 'Video is not approved by moderator', - }, - # age limit + video-password - { - 'url': 'http://smotri.com/video/view/?id=v7780025814', - 'md5': 'b4599b068422559374a59300c5337d72', - 'info_dict': { - 'id': 'v7780025814', - 'ext': 'mp4', - 'title': 'Sexy Beach (пароль 123)', - 'uploader': 'вАся', - 'uploader_id': 'asya_prosto', - 'upload_date': '20081218', - 'thumbnail': r're:^https?://.*\.jpg$', - 'age_limit': 18, - }, - 'params': { - 'videopassword': '123' - }, - }, - # swf player - { - 'url': 'http://pics.smotri.com/scrubber_custom8.swf?file=v9188090500', - 'md5': '31099eeb4bc906712c5f40092045108d', - 'info_dict': { - 'id': 'v9188090500', - 'ext': 'mp4', - 'title': 'Shakira - Don\'t Bother', - 'uploader': 'HannahL', - 'uploader_id': 'lisaha95', - 'upload_date': '20090331', - 'thumbnail': 'http://frame8.loadup.ru/44/0b/918809.7.3.jpg', - }, - }, - ] - - @classmethod - def _extract_url(cls, webpage): - mobj = re.search( - r'<embed[^>]src=(["\'])(?P<url>http://pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=v.+?\1)', - webpage) - if mobj is not None: - return mobj.group('url') - - mobj = re.search( - r'''(?x)<div\s+class="video_file">http://smotri\.com/video/download/file/[^<]+</div>\s* - <div\s+class="video_image">[^<]+</div>\s* - <div\s+class="video_id">(?P<id>[^<]+)</div>''', webpage) - if mobj is not None: - return 'http://smotri.com/video/view/?id=%s' % mobj.group('id') - - def _search_meta(self, name, html, display_name=None): - if display_name is None: - display_name = name - return self._html_search_meta(name, html, display_name) - - def _real_extract(self, url): - video_id = self._match_id(url) - - video_form = { - 'ticket': video_id, - 'video_url': '1', - 'frame_url': '1', - 'devid': 'LoadupFlashPlayer', - 'getvideoinfo': '1', - } - - video_password = self._downloader.params.get('videopassword') - if video_password: - video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() - - video = self._download_json( - 'http://smotri.com/video/view/url/bot/', - video_id, 'Downloading video JSON', - data=urlencode_postdata(video_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - video_url = video.get('_vidURL') or video.get('_vidURL_mp4') - - if not video_url: - if video.get('_moderate_no'): - raise ExtractorError( - 'Video %s has not been approved by moderator' % video_id, expected=True) - - if video.get('error'): - raise ExtractorError('Video %s does not exist' % video_id, expected=True) - - if video.get('_pass_protected') == 1: - msg = ('Invalid video password' if video_password - else 'This video is protected by a password, use the --video-password option') - raise ExtractorError(msg, expected=True) - - title = video['title'] - thumbnail = video.get('_imgURL') - upload_date = unified_strdate(video.get('added')) - uploader = video.get('userNick') - uploader_id = video.get('userLogin') - duration = int_or_none(video.get('duration')) - - # Video JSON does not provide enough meta data - # We will extract some from the video web page instead - webpage_url = 'http://smotri.com/video/view/?id=%s' % video_id - webpage = self._download_webpage(webpage_url, video_id, 'Downloading video page') - - # Warning if video is unavailable - warning = self._html_search_regex( - r'<div[^>]+class="videoUnModer"[^>]*>(.+?)</div>', webpage, - 'warning message', default=None) - if warning is not None: - self._downloader.report_warning( - 'Video %s may not be available; smotri said: %s ' % - (video_id, warning)) - - # Adult content - if 'EroConfirmText">' in webpage: - self.report_age_confirmation() - confirm_string = self._html_search_regex( - r'<a[^>]+href="/video/view/\?id=%s&confirm=([^"]+)"' % video_id, - webpage, 'confirm string') - confirm_url = webpage_url + '&confirm=%s' % confirm_string - webpage = self._download_webpage( - confirm_url, video_id, - 'Downloading video page (age confirmed)') - adult_content = True - else: - adult_content = False - - view_count = self._html_search_regex( - r'(?s)Общее количество просмотров.*?<span class="Number">(\d+)</span>', - webpage, 'view count', fatal=False) - - return { - 'id': video_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, - 'uploader_id': uploader_id, - 'duration': duration, - 'view_count': int_or_none(view_count), - 'age_limit': 18 if adult_content else 0, - } - - -class SmotriCommunityIE(InfoExtractor): - IE_DESC = 'Smotri.com community videos' - IE_NAME = 'smotri:community' - _VALID_URL = r'https?://(?:www\.)?smotri\.com/community/video/(?P<id>[0-9A-Za-z_\'-]+)' - _TEST = { - 'url': 'http://smotri.com/community/video/kommuna', - 'info_dict': { - 'id': 'kommuna', - }, - 'playlist_mincount': 4, - } - - def _real_extract(self, url): - community_id = self._match_id(url) - - rss = self._download_xml( - 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id, - community_id, 'Downloading community RSS') - - entries = [ - self.url_result(video_url.text, SmotriIE.ie_key()) - for video_url in rss.findall('./channel/item/link')] - - return self.playlist_result(entries, community_id) - - -class SmotriUserIE(InfoExtractor): - IE_DESC = 'Smotri.com user videos' - IE_NAME = 'smotri:user' - _VALID_URL = r'https?://(?:www\.)?smotri\.com/user/(?P<id>[0-9A-Za-z_\'-]+)' - _TESTS = [{ - 'url': 'http://smotri.com/user/inspector', - 'info_dict': { - 'id': 'inspector', - 'title': 'Inspector', - }, - 'playlist_mincount': 9, - }] - - def _real_extract(self, url): - user_id = self._match_id(url) - - rss = self._download_xml( - 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id, - user_id, 'Downloading user RSS') - - entries = [self.url_result(video_url.text, 'Smotri') - for video_url in rss.findall('./channel/item/link')] - - description_text = xpath_text(rss, './channel/description') or '' - user_nickname = self._search_regex( - '^Видео режиссера (.+)$', description_text, - 'user nickname', fatal=False) - - return self.playlist_result(entries, user_id, user_nickname) - - -class SmotriBroadcastIE(InfoExtractor): - IE_DESC = 'Smotri.com broadcasts' - IE_NAME = 'smotri:broadcast' - _VALID_URL = r'https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<id>[^/]+))/?.*' - _NETRC_MACHINE = 'smotri' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - broadcast_id = mobj.group('id') - - broadcast_url = 'http://' + mobj.group('url') - broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page') - - if re.search('>Режиссер с логином <br/>"%s"<br/> <span>не существует<' % broadcast_id, broadcast_page) is not None: - raise ExtractorError( - 'Broadcast %s does not exist' % broadcast_id, expected=True) - - # Adult content - if re.search('EroConfirmText">', broadcast_page) is not None: - - (username, password) = self._get_login_info() - if username is None: - self.raise_login_required( - 'Erotic broadcasts allowed only for registered users') - - login_form = { - 'login-hint53': '1', - 'confirm_erotic': '1', - 'login': username, - 'password': password, - } - - request = sanitized_Request( - broadcast_url + '/?no_redirect=1', urlencode_postdata(login_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - broadcast_page = self._download_webpage( - request, broadcast_id, 'Logging in and confirming age') - - if '>Неверный логин или пароль<' in broadcast_page: - raise ExtractorError( - 'Unable to log in: bad username or password', expected=True) - - adult_content = True - else: - adult_content = False - - ticket = self._html_search_regex( - (r'data-user-file=(["\'])(?P<ticket>(?!\1).+)\1', - r"window\.broadcast_control\.addFlashVar\('file'\s*,\s*'(?P<ticket>[^']+)'\)"), - broadcast_page, 'broadcast ticket', group='ticket') - - broadcast_url = 'http://smotri.com/broadcast/view/url/?ticket=%s' % ticket - - broadcast_password = self._downloader.params.get('videopassword') - if broadcast_password: - broadcast_url += '&pass=%s' % hashlib.md5(broadcast_password.encode('utf-8')).hexdigest() - - broadcast_json_page = self._download_webpage( - broadcast_url, broadcast_id, 'Downloading broadcast JSON') - - try: - broadcast_json = json.loads(broadcast_json_page) - - protected_broadcast = broadcast_json['_pass_protected'] == 1 - if protected_broadcast and not broadcast_password: - raise ExtractorError( - 'This broadcast is protected by a password, use the --video-password option', - expected=True) - - broadcast_offline = broadcast_json['is_play'] == 0 - if broadcast_offline: - raise ExtractorError('Broadcast %s is offline' % broadcast_id, expected=True) - - rtmp_url = broadcast_json['_server'] - mobj = re.search(r'^rtmp://[^/]+/(?P<app>.+)/?$', rtmp_url) - if not mobj: - raise ExtractorError('Unexpected broadcast rtmp URL') - - broadcast_playpath = broadcast_json['_streamName'] - broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL']) - broadcast_thumbnail = broadcast_json.get('_imgURL') - broadcast_title = self._live_title(broadcast_json['title']) - broadcast_description = broadcast_json.get('description') - broadcaster_nick = broadcast_json.get('nick') - broadcaster_login = broadcast_json.get('login') - rtmp_conn = 'S:%s' % uuid.uuid4().hex - except KeyError: - if protected_broadcast: - raise ExtractorError('Bad broadcast password', expected=True) - raise ExtractorError('Unexpected broadcast JSON') - - return { - 'id': broadcast_id, - 'url': rtmp_url, - 'title': broadcast_title, - 'thumbnail': broadcast_thumbnail, - 'description': broadcast_description, - 'uploader': broadcaster_nick, - 'uploader_id': broadcaster_login, - 'age_limit': 18 if adult_content else 0, - 'ext': 'flv', - 'play_path': broadcast_playpath, - 'player_url': 'http://pics.smotri.com/broadcast_play.swf', - 'app': broadcast_app, - 'rtmp_live': True, - 'rtmp_conn': rtmp_conn, - 'is_live': True, - } diff --git a/haruhi_dl/extractor/sonyliv.py b/haruhi_dl/extractor/sonyliv.py index 58a8c0d4d..fedfceb62 100644 --- a/haruhi_dl/extractor/sonyliv.py +++ b/haruhi_dl/extractor/sonyliv.py @@ -1,40 +1,112 @@ # coding: utf-8 from __future__ import unicode_literals +import time +import uuid + from .common import InfoExtractor -from ..utils import smuggle_url +from ..compat import compat_HTTPError +from ..utils import ( + ExtractorError, + int_or_none, +) class SonyLIVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/(?:s(?:how|port)s/[^/]+|movies|clip|trailer|music-videos)/[^/?#&]+-(?P<id>\d+)' _TESTS = [{ - 'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight", + 'url': 'https://www.sonyliv.com/shows/bachelors-delight-1700000113/achaari-cheese-toast-1000022678?watch=true', 'info_dict': { - 'title': "Ep. 1 - Achaari Cheese Toast - Bachelor's Delight", - 'id': 'ref:5024612095001', + 'title': 'Bachelors Delight - Achaari Cheese Toast', + 'id': '1000022678', 'ext': 'mp4', - 'upload_date': '20170923', - 'description': 'md5:7f28509a148d5be9d0782b4d5106410d', - 'uploader_id': '5182475815001', - 'timestamp': 1506200547, + 'upload_date': '20200411', + 'description': 'md5:3957fa31d9309bf336ceb3f37ad5b7cb', + 'timestamp': 1586632091, + 'duration': 185, + 'season_number': 1, + 'episode': 'Achaari Cheese Toast', + 'episode_number': 1, + 'release_year': 2016, }, 'params': { 'skip_download': True, }, - 'add_ie': ['BrightcoveNew'], }, { - 'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)', + 'url': 'https://www.sonyliv.com/movies/tahalka-1000050121?watch=true', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/clip/jigarbaaz-1000098925', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/trailer/sandwiched-forever-1000100286?watch=true', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/sports/india-tour-of-australia-2020-21-1700000286/cricket-hls-day-3-1st-test-aus-vs-ind-19-dec-2020-1000100959?watch=true', + 'only_matching': True, + }, { + 'url': 'https://www.sonyliv.com/music-videos/yeh-un-dinon-ki-baat-hai-1000018779', 'only_matching': True, }] + _GEO_COUNTRIES = ['IN'] + _TOKEN = None - # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5182475815001/default_default/index.html?videoId=ref:%s' + def _call_api(self, version, path, video_id): + headers = {} + if self._TOKEN: + headers['security_token'] = self._TOKEN + try: + return self._download_json( + 'https://apiv2.sonyliv.com/AGL/%s/A/ENG/WEB/%s' % (version, path), + video_id, headers=headers)['resultObj'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + message = self._parse_json( + e.cause.read().decode(), video_id)['message'] + if message == 'Geoblocked Country': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + raise ExtractorError(message) + raise + + def _real_initialize(self): + self._TOKEN = self._call_api('1.4', 'ALL/GETTOKEN', None) def _real_extract(self, url): - brightcove_id = self._match_id(url) - return self.url_result( - smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, { - 'geo_countries': ['IN'], - 'referrer': url, - }), - 'BrightcoveNew', brightcove_id) + video_id = self._match_id(url) + content = self._call_api( + '1.5', 'IN/CONTENT/VIDEOURL/VOD/' + video_id, video_id) + if content.get('isEncrypted'): + raise ExtractorError('This video is DRM protected.', expected=True) + dash_url = content['videoURL'] + headers = { + 'x-playback-session-id': '%s-%d' % (uuid.uuid4().hex, time.time() * 1000) + } + formats = self._extract_mpd_formats( + dash_url, video_id, mpd_id='dash', headers=headers, fatal=False) + formats.extend(self._extract_m3u8_formats( + dash_url.replace('.mpd', '.m3u8').replace('/DASH/', '/HLS/'), + video_id, 'mp4', m3u8_id='hls', headers=headers, fatal=False)) + for f in formats: + f.setdefault('http_headers', {}).update(headers) + self._sort_formats(formats) + + metadata = self._call_api( + '1.6', 'IN/DETAIL/' + video_id, video_id)['containers'][0]['metadata'] + title = metadata['title'] + episode = metadata.get('episodeTitle') + if episode and title != episode: + title += ' - ' + episode + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': content.get('posterURL'), + 'description': metadata.get('longDescription') or metadata.get('shortDescription'), + 'timestamp': int_or_none(metadata.get('creationDate'), 1000), + 'duration': int_or_none(metadata.get('duration')), + 'season_number': int_or_none(metadata.get('season')), + 'episode': episode, + 'episode_number': int_or_none(metadata.get('episodeNumber')), + 'release_year': int_or_none(metadata.get('year')), + } diff --git a/haruhi_dl/extractor/spankbang.py b/haruhi_dl/extractor/spankbang.py index 61ca902ce..37cb8c839 100644 --- a/haruhi_dl/extractor/spankbang.py +++ b/haruhi_dl/extractor/spankbang.py @@ -7,17 +7,24 @@ from ..utils import ( determine_ext, ExtractorError, merge_dicts, - orderedSet, parse_duration, parse_resolution, str_to_int, url_or_none, urlencode_postdata, + urljoin, ) class SpankBangIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/(?:video|play|embed)\b' + _VALID_URL = r'''(?x) + https?:// + (?:[^/]+\.)?spankbang\.com/ + (?: + (?P<id>[\da-z]+)/(?:video|play|embed)\b| + [\da-z]+-(?P<id_2>[\da-z]+)/playlist/[^/?#&]+ + ) + ''' _TESTS = [{ 'url': 'http://spankbang.com/3vvn/video/fantasy+solo', 'md5': '1cc433e1d6aa14bc376535b8679302f7', @@ -57,10 +64,14 @@ class SpankBangIE(InfoExtractor): }, { 'url': 'https://spankbang.com/2y3td/embed/', 'only_matching': True, + }, { + 'url': 'https://spankbang.com/2v7ik-7ecbgu/playlist/latina+booty', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') or mobj.group('id_2') webpage = self._download_webpage( url.replace('/%s/embed' % video_id, '/%s/video' % video_id), video_id, headers={'Cookie': 'country=US'}) @@ -155,30 +166,33 @@ class SpankBangIE(InfoExtractor): class SpankBangPlaylistIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/[^/]+' + _VALID_URL = r'https?://(?:[^/]+\.)?spankbang\.com/(?P<id>[\da-z]+)/playlist/(?P<display_id>[^/]+)' _TEST = { 'url': 'https://spankbang.com/ug0k/playlist/big+ass+titties', 'info_dict': { 'id': 'ug0k', 'title': 'Big Ass Titties', }, - 'playlist_mincount': 50, + 'playlist_mincount': 40, } def _real_extract(self, url): - playlist_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + display_id = mobj.group('display_id') webpage = self._download_webpage( url, playlist_id, headers={'Cookie': 'country=US; mobile=on'}) entries = [self.url_result( - 'https://spankbang.com/%s/video' % video_id, - ie=SpankBangIE.ie_key(), video_id=video_id) - for video_id in orderedSet(re.findall( - r'<a[^>]+\bhref=["\']/?([\da-z]+)/play/', webpage))] + urljoin(url, mobj.group('path')), + ie=SpankBangIE.ie_key(), video_id=mobj.group('id')) + for mobj in re.finditer( + r'<a[^>]+\bhref=(["\'])(?P<path>/?[\da-z]+-(?P<id>[\da-z]+)/playlist/%s(?:(?!\1).)*)\1' + % re.escape(display_id), webpage)] title = self._html_search_regex( - r'<h1>([^<]+)\s+playlist</h1>', webpage, 'playlist title', + r'<h1>([^<]+)\s+playlist\s*<', webpage, 'playlist title', fatal=False) return self.playlist_result(entries, playlist_id, title) diff --git a/haruhi_dl/extractor/spiegel.py b/haruhi_dl/extractor/spiegel.py index 4df7f4ddc..2da32b9b2 100644 --- a/haruhi_dl/extractor/spiegel.py +++ b/haruhi_dl/extractor/spiegel.py @@ -1,159 +1,54 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .spiegeltv import SpiegeltvIE -from ..compat import compat_urlparse -from ..utils import ( - parse_duration, - strip_or_none, - unified_timestamp, -) +from .jwplatform import JWPlatformIE class SpiegelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE _TESTS = [{ 'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', - 'md5': 'b57399839d055fccfeb9a0455c439868', + 'md5': '50c7948883ec85a3e431a0a44b7ad1d6', 'info_dict': { - 'id': '563747', + 'id': 'II0BUyxY', + 'display_id': '1259285', 'ext': 'mp4', - 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', + 'title': 'Vulkan Tungurahua in Ecuador ist wieder aktiv - DER SPIEGEL - Wissenschaft', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', - 'duration': 49, + 'duration': 48.0, 'upload_date': '20130311', - 'timestamp': 1362994320, + 'timestamp': 1362997920, }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', - 'md5': '5b6c2f4add9d62912ed5fc78a1faed80', - 'info_dict': { - 'id': '580988', - 'ext': 'mp4', - 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', - 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', - 'duration': 983, - 'upload_date': '20131115', - 'timestamp': 1384546642, - }, - }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', - 'md5': '97b91083a672d72976faa8433430afb9', - 'info_dict': { - 'id': '601883', - 'ext': 'mp4', - 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', - 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', - 'upload_date': '20140904', - 'timestamp': 1409834160, - } - }, { - 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', 'only_matching': True, }, { - # nexx video + 'url': 'https://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html', + 'only_matching': True, + }, { + 'url': 'https://www.spiegel.de/panorama/urteile-im-goldmuenzenprozess-haftstrafen-fuer-clanmitglieder-a-aae8df48-43c1-4c61-867d-23f0a2d254b7', + 'only_matching': True, + }, { 'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html', 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id - handle = self._request_webpage(metadata_url, video_id) - - # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html - if SpiegeltvIE.suitable(handle.geturl()): - return self.url_result(handle.geturl(), 'Spiegeltv') - - video_data = self._parse_json(self._webpage_read_content( - handle, metadata_url, video_id), video_id) - title = video_data['title'] - nexx_id = video_data['nexxOmniaId'] - domain_id = video_data.get('nexxOmniaDomain') or '748' - - return { - '_type': 'url_transparent', - 'id': video_id, - 'url': 'nexx:%s:%s' % (domain_id, nexx_id), - 'title': title, - 'description': strip_or_none(video_data.get('teaser')), - 'duration': parse_duration(video_data.get('duration')), - 'timestamp': unified_timestamp(video_data.get('datum')), - 'ie_key': NexxIE.ie_key(), - } - - -class SpiegelArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' - IE_NAME = 'Spiegel:Article' - IE_DESC = 'Articles on spiegel.de' - _TESTS = [{ + }, { 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html', - 'info_dict': { - 'id': '1516455', - 'ext': 'mp4', - 'title': 'Faszination Badminton: Nennt es bloß nicht Federball', - 'description': 're:^Patrick Kämnitz gehört.{100,}', - 'upload_date': '20140825', - }, - }, { - 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', - 'info_dict': { - - }, - 'playlist_count': 6, - }, { - # Nexx iFrame embed - 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html', - 'info_dict': { - 'id': '161464', - 'ext': 'mp4', - 'title': 'Nervenkitzel Achterbahn', - 'alt_title': 'Karussellbauer in Deutschland', - 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', - 'release_year': 2005, - 'creator': 'SPIEGEL TV', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 2761, - 'timestamp': 1394021479, - 'upload_date': '20140305', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - - # Single video on top of the page - video_link = self._search_regex( - r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage, - 'video page URL', default=None) - if video_link: - video_url = compat_urlparse.urljoin( - self.http_scheme() + '//spiegel.de/', video_link) - return self.url_result(video_url) - - # Multiple embedded videos - embeds = re.findall( - r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"', - webpage) - entries = [ - self.url_result(compat_urlparse.urljoin( - self.http_scheme() + '//spiegel.de/', embed_path)) - for embed_path in embeds] - if embeds: - return self.playlist_result(entries) - - return self.playlist_from_matches( - NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key()) + media_id = self._html_search_regex( + r'("|["\'])mediaId\1\s*:\s*("|["\'])(?P<id>(?:(?!\2).)+)\2', + webpage, 'media id', group='id') + return { + '_type': 'url_transparent', + 'id': video_id, + 'display_id': video_id, + 'url': 'jwplatform:%s' % media_id, + 'title': self._og_search_title(webpage, default=None), + 'ie_key': JWPlatformIE.ie_key(), + } diff --git a/haruhi_dl/extractor/spiegeltv.py b/haruhi_dl/extractor/spiegeltv.py deleted file mode 100644 index 6ccf4c342..000000000 --- a/haruhi_dl/extractor/spiegeltv.py +++ /dev/null @@ -1,17 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from .nexx import NexxIE - - -class SpiegeltvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)' - _TEST = { - 'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/', - 'only_matching': True, - } - - def _real_extract(self, url): - return self.url_result( - 'https://api.nexx.cloud/v3/748/videos/byid/%s' - % self._match_id(url), ie=NexxIE.ie_key()) diff --git a/haruhi_dl/extractor/spike.py b/haruhi_dl/extractor/spike.py index aabff7a3c..5805f3d44 100644 --- a/haruhi_dl/extractor/spike.py +++ b/haruhi_dl/extractor/spike.py @@ -20,9 +20,6 @@ class BellatorIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.bellator.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] - def _extract_mgid(self, webpage): - return self._extract_triforce_mgid(webpage) - class ParamountNetworkIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' @@ -40,16 +37,12 @@ class ParamountNetworkIE(MTVServicesInfoExtractor): }, }] - _FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/' + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' _GEO_COUNTRIES = ['US'] - def _extract_mgid(self, webpage): - root_data = self._parse_json(self._search_regex( - r'window\.__DATA__\s*=\s*({.+})', - webpage, 'data'), None) - - def find_sub_data(data, data_type): - return next(c for c in data['children'] if c.get('type') == data_type) - - c = find_sub_data(find_sub_data(root_data, 'MainContainer'), 'VideoPlayer') - return c['props']['media']['video']['config']['uri'] + def _get_feed_query(self, uri): + return { + 'arcEp': 'paramountnetwork.com', + 'imageEp': 'paramountnetwork.com', + 'mgid': uri, + } diff --git a/haruhi_dl/extractor/spotify.py b/haruhi_dl/extractor/spotify.py new file mode 100644 index 000000000..826f98cff --- /dev/null +++ b/haruhi_dl/extractor/spotify.py @@ -0,0 +1,156 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + float_or_none, + int_or_none, + strip_or_none, + try_get, + unified_strdate, +) + + +class SpotifyBaseIE(InfoExtractor): + _ACCESS_TOKEN = None + _OPERATION_HASHES = { + 'Episode': '8276d4423d709ae9b68ec1b74cc047ba0f7479059a37820be730f125189ac2bf', + 'MinimalShow': '13ee079672fad3f858ea45a55eb109553b4fb0969ed793185b2e34cbb6ee7cc0', + 'ShowEpisodes': 'e0e5ce27bd7748d2c59b4d44ba245a8992a05be75d6fabc3b20753fc8857444d', + } + _VALID_URL_TEMPL = r'https?://open\.spotify\.com/%s/(?P<id>[^/?&#]+)' + + def _real_initialize(self): + self._ACCESS_TOKEN = self._download_json( + 'https://open.spotify.com/get_access_token', None)['accessToken'] + + def _call_api(self, operation, video_id, variables): + return self._download_json( + 'https://api-partner.spotify.com/pathfinder/v1/query', video_id, query={ + 'operationName': 'query' + operation, + 'variables': json.dumps(variables), + 'extensions': json.dumps({ + 'persistedQuery': { + 'sha256Hash': self._OPERATION_HASHES[operation], + }, + }) + }, headers={'authorization': 'Bearer ' + self._ACCESS_TOKEN})['data'] + + def _extract_episode(self, episode, series): + episode_id = episode['id'] + title = episode['name'].strip() + + formats = [] + audio_preview = episode.get('audioPreview') or {} + audio_preview_url = audio_preview.get('url') + if audio_preview_url: + f = { + 'url': audio_preview_url.replace('://p.scdn.co/mp3-preview/', '://anon-podcast.scdn.co/'), + 'vcodec': 'none', + } + audio_preview_format = audio_preview.get('format') + if audio_preview_format: + f['format_id'] = audio_preview_format + mobj = re.match(r'([0-9A-Z]{3})_(?:[A-Z]+_)?(\d+)', audio_preview_format) + if mobj: + f.update({ + 'abr': int(mobj.group(2)), + 'ext': mobj.group(1).lower(), + }) + formats.append(f) + + for item in (try_get(episode, lambda x: x['audio']['items']) or []): + item_url = item.get('url') + if not (item_url and item.get('externallyHosted')): + continue + formats.append({ + 'url': clean_podcast_url(item_url), + 'vcodec': 'none', + }) + + thumbnails = [] + for source in (try_get(episode, lambda x: x['coverArt']['sources']) or []): + source_url = source.get('url') + if not source_url: + continue + thumbnails.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + }) + + return { + 'id': episode_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': strip_or_none(episode.get('description')), + 'duration': float_or_none(try_get( + episode, lambda x: x['duration']['totalMilliseconds']), 1000), + 'release_date': unified_strdate(try_get( + episode, lambda x: x['releaseDate']['isoString'])), + 'series': series, + } + + +class SpotifyIE(SpotifyBaseIE): + IE_NAME = 'spotify' + _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'episode' + _TEST = { + 'url': 'https://open.spotify.com/episode/4Z7GAJ50bgctf6uclHlWKo', + 'md5': '74010a1e3fa4d9e1ab3aa7ad14e42d3b', + 'info_dict': { + 'id': '4Z7GAJ50bgctf6uclHlWKo', + 'ext': 'mp3', + 'title': 'From the archive: Why time management is ruining our lives', + 'description': 'md5:b120d9c4ff4135b42aa9b6d9cde86935', + 'duration': 2083.605, + 'release_date': '20201217', + 'series': "The Guardian's Audio Long Reads", + } + } + + def _real_extract(self, url): + episode_id = self._match_id(url) + episode = self._call_api('Episode', episode_id, { + 'uri': 'spotify:episode:' + episode_id + })['episode'] + return self._extract_episode( + episode, try_get(episode, lambda x: x['podcast']['name'])) + + +class SpotifyShowIE(SpotifyBaseIE): + IE_NAME = 'spotify:show' + _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'show' + _TEST = { + 'url': 'https://open.spotify.com/show/4PM9Ke6l66IRNpottHKV9M', + 'info_dict': { + 'id': '4PM9Ke6l66IRNpottHKV9M', + 'title': 'The Story from the Guardian', + 'description': 'The Story podcast is dedicated to our finest audio documentaries, investigations and long form stories', + }, + 'playlist_mincount': 36, + } + + def _real_extract(self, url): + show_id = self._match_id(url) + podcast = self._call_api('ShowEpisodes', show_id, { + 'limit': 1000000000, + 'offset': 0, + 'uri': 'spotify:show:' + show_id, + })['podcast'] + podcast_name = podcast.get('name') + + entries = [] + for item in (try_get(podcast, lambda x: x['episodes']['items']) or []): + episode = item.get('episode') + if not episode: + continue + entries.append(self._extract_episode(episode, podcast_name)) + + return self.playlist_result( + entries, show_id, podcast_name, podcast.get('description')) diff --git a/haruhi_dl/extractor/spreaker.py b/haruhi_dl/extractor/spreaker.py new file mode 100644 index 000000000..6c7e40ae4 --- /dev/null +++ b/haruhi_dl/extractor/spreaker.py @@ -0,0 +1,176 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + float_or_none, + int_or_none, + str_or_none, + try_get, + unified_timestamp, + url_or_none, +) + + +def _extract_episode(data, episode_id=None): + title = data['title'] + download_url = data['download_url'] + + series = try_get(data, lambda x: x['show']['title'], compat_str) + uploader = try_get(data, lambda x: x['author']['fullname'], compat_str) + + thumbnails = [] + for image in ('image_original', 'image_medium', 'image'): + image_url = url_or_none(data.get('%s_url' % image)) + if image_url: + thumbnails.append({'url': image_url}) + + def stats(key): + return int_or_none(try_get( + data, + (lambda x: x['%ss_count' % key], + lambda x: x['stats']['%ss' % key]))) + + def duration(key): + return float_or_none(data.get(key), scale=1000) + + return { + 'id': compat_str(episode_id or data['episode_id']), + 'url': download_url, + 'display_id': data.get('permalink'), + 'title': title, + 'description': data.get('description'), + 'timestamp': unified_timestamp(data.get('published_at')), + 'uploader': uploader, + 'uploader_id': str_or_none(data.get('author_id')), + 'creator': uploader, + 'duration': duration('duration') or duration('length'), + 'view_count': stats('play'), + 'like_count': stats('like'), + 'comment_count': stats('message'), + 'format': 'MPEG Layer 3', + 'format_id': 'mp3', + 'container': 'mp3', + 'ext': 'mp3', + 'thumbnails': thumbnails, + 'series': series, + 'extractor_key': SpreakerIE.ie_key(), + } + + +class SpreakerIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + api\.spreaker\.com/ + (?: + (?:download/)?episode| + v2/episodes + )/ + (?P<id>\d+) + ''' + _TESTS = [{ + 'url': 'https://api.spreaker.com/episode/12534508', + 'info_dict': { + 'id': '12534508', + 'display_id': 'swm-ep15-how-to-market-your-music-part-2', + 'ext': 'mp3', + 'title': 'EP:15 | Music Marketing (Likes) - Part 2', + 'description': 'md5:0588c43e27be46423e183076fa071177', + 'timestamp': 1502250336, + 'upload_date': '20170809', + 'uploader': 'SWM', + 'uploader_id': '9780658', + 'duration': 1063.42, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'series': 'Success With Music (SWM)', + }, + }, { + 'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3', + 'only_matching': True, + }, { + 'url': 'https://api.spreaker.com/v2/episodes/12534508?export=episode_segments', + 'only_matching': True, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + data = self._download_json( + 'https://api.spreaker.com/v2/episodes/%s' % episode_id, + episode_id)['response']['episode'] + return _extract_episode(data, episode_id) + + +class SpreakerPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spreaker\.com/user/[^/]+/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/user/9780658/swm-ep15-how-to-market-your-music-part-2', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + episode_id = self._search_regex( + (r'data-episode_id=["\'](?P<id>\d+)', + r'episode_id\s*:\s*(?P<id>\d+)'), webpage, 'episode id') + return self.url_result( + 'https://api.spreaker.com/episode/%s' % episode_id, + ie=SpreakerIE.ie_key(), video_id=episode_id) + + +class SpreakerShowIE(InfoExtractor): + _VALID_URL = r'https?://api\.spreaker\.com/show/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://api.spreaker.com/show/4652058', + 'info_dict': { + 'id': '4652058', + }, + 'playlist_mincount': 118, + }] + + def _entries(self, show_id): + for page_num in itertools.count(1): + episodes = self._download_json( + 'https://api.spreaker.com/show/%s/episodes' % show_id, + show_id, note='Downloading JSON page %d' % page_num, query={ + 'page': page_num, + 'max_per_page': 100, + }) + pager = try_get(episodes, lambda x: x['response']['pager'], dict) + if not pager: + break + results = pager.get('results') + if not results or not isinstance(results, list): + break + for result in results: + if not isinstance(result, dict): + continue + yield _extract_episode(result) + if page_num == pager.get('last_page'): + break + + def _real_extract(self, url): + show_id = self._match_id(url) + return self.playlist_result(self._entries(show_id), playlist_id=show_id) + + +class SpreakerShowPageIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spreaker\.com/show/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.spreaker.com/show/success-with-music', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + show_id = self._search_regex( + r'show_id\s*:\s*(?P<id>\d+)', webpage, 'show id') + return self.url_result( + 'https://api.spreaker.com/show/%s' % show_id, + ie=SpreakerShowIE.ie_key(), video_id=show_id) diff --git a/haruhi_dl/extractor/sprout.py b/haruhi_dl/extractor/sprout.py index 8467bf49d..e243732f2 100644 --- a/haruhi_dl/extractor/sprout.py +++ b/haruhi_dl/extractor/sprout.py @@ -3,50 +3,62 @@ from __future__ import unicode_literals from .adobepass import AdobePassIE from ..utils import ( - extract_attributes, - update_url_query, + int_or_none, smuggle_url, + update_url_query, ) class SproutIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?sproutonline\.com/watch/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.sproutonline.com/watch/cowboy-adventure', - 'md5': '74bf14128578d1e040c3ebc82088f45f', + _VALID_URL = r'https?://(?:www\.)?(?:sproutonline|universalkids)\.com/(?:watch|(?:[^/]+/)*videos)/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.universalkids.com/shows/remy-and-boo/season/1/videos/robot-bike-race', 'info_dict': { - 'id': '9dexnwtmh8_X', + 'id': 'bm0foJFaTKqb', 'ext': 'mp4', - 'title': 'A Cowboy Adventure', - 'description': 'Ruff-Ruff, Tweet and Dave get to be cowboys for the day at Six Cow Corral.', - 'timestamp': 1437758640, - 'upload_date': '20150724', - 'uploader': 'NBCU-SPROUT-NEW', - } - } + 'title': 'Robot Bike Race', + 'description': 'md5:436b1d97117cc437f54c383f4debc66d', + 'timestamp': 1606148940, + 'upload_date': '20201123', + 'uploader': 'NBCU-MPAT', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.sproutonline.com/watch/cowboy-adventure', + 'only_matching': True, + }, { + 'url': 'https://www.universalkids.com/watch/robot-bike-race', + 'only_matching': True, + }] + _GEO_COUNTRIES = ['US'] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_component = self._search_regex( - r'(?s)(<div[^>]+data-component="video"[^>]*?>)', - webpage, 'video component', default=None) - if video_component: - options = self._parse_json(extract_attributes( - video_component)['data-options'], video_id) - theplatform_url = options['video'] - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } - if options.get('protected'): - query['auth'] = self._extract_mvpd_auth(url, options['pid'], 'sprout', 'sprout') - theplatform_url = smuggle_url(update_url_query( - theplatform_url, query), {'force_smil_url': True}) - else: - iframe = self._search_regex( - r'(<iframe[^>]+id="sproutVideoIframe"[^>]*?>)', - webpage, 'iframe') - theplatform_url = extract_attributes(iframe)['src'] - - return self.url_result(theplatform_url, 'ThePlatform') + display_id = self._match_id(url) + mpx_metadata = self._download_json( + # http://nbcuunikidsprod.apps.nbcuni.com/networks/universalkids/content/videos/ + 'https://www.universalkids.com/_api/videos/' + display_id, + display_id)['mpxMetadata'] + media_pid = mpx_metadata['mediaPid'] + theplatform_url = 'https://link.theplatform.com/s/HNK2IC/' + media_pid + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + if mpx_metadata.get('entitlement') == 'auth': + query['auth'] = self._extract_mvpd_auth(url, media_pid, 'sprout', 'sprout') + theplatform_url = smuggle_url( + update_url_query(theplatform_url, query), { + 'force_smil_url': True, + 'geo_countries': self._GEO_COUNTRIES, + }) + return { + '_type': 'url_transparent', + 'id': media_pid, + 'url': theplatform_url, + 'series': mpx_metadata.get('seriesName'), + 'season_number': int_or_none(mpx_metadata.get('seasonNumber')), + 'episode_number': int_or_none(mpx_metadata.get('episodeNumber')), + 'ie_key': 'ThePlatform', + } diff --git a/haruhi_dl/extractor/srgssr.py b/haruhi_dl/extractor/srgssr.py index f63a1359a..ac018e740 100644 --- a/haruhi_dl/extractor/srgssr.py +++ b/haruhi_dl/extractor/srgssr.py @@ -4,16 +4,32 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..utils import ( ExtractorError, + float_or_none, + int_or_none, parse_iso8601, qualities, + try_get, ) class SRGSSRIE(InfoExtractor): - _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)' + _VALID_URL = r'''(?x) + (?: + https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn| + srgssr + ): + (?P<bu> + srf|rts|rsi|rtr|swi + ):(?:[^:]+:)? + (?P<type> + video|audio + ): + (?P<id> + [0-9a-f\-]{36}|\d+ + ) + ''' _GEO_BYPASS = False _GEO_COUNTRIES = ['CH'] @@ -25,25 +41,39 @@ class SRGSSRIE(InfoExtractor): 'LEGAL': 'The video cannot be transmitted for legal reasons.', 'STARTDATE': 'This video is not yet available. Please try again later.', } + _DEFAULT_LANGUAGE_CODES = { + 'srf': 'de', + 'rts': 'fr', + 'rsi': 'it', + 'rtr': 'rm', + 'swi': 'en', + } def _get_tokenized_src(self, url, video_id, format_id): - sp = compat_urllib_parse_urlparse(url).path.split('/') token = self._download_json( - 'http://tp.srgssr.ch/akahd/token?acl=/%s/%s/*' % (sp[1], sp[2]), + 'http://tp.srgssr.ch/akahd/token?acl=*', video_id, 'Downloading %s token' % format_id, fatal=False) or {} - auth_params = token.get('token', {}).get('authparams') + auth_params = try_get(token, lambda x: x['token']['authparams']) if auth_params: - url += '?' + auth_params + url += ('?' if '?' not in url else '&') + auth_params return url - def get_media_data(self, bu, media_type, media_id): - media_data = self._download_json( - 'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id), - media_id)[media_type.capitalize()] + def _get_media_data(self, bu, media_type, media_id): + query = {'onlyChapters': True} if media_type == 'video' else {} + full_media_data = self._download_json( + 'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json' + % (bu, media_type, media_id), + media_id, query=query)['chapterList'] + try: + media_data = next( + x for x in full_media_data if x.get('id') == media_id) + except StopIteration: + raise ExtractorError('No media information found') - if media_data.get('block') and media_data['block'] in self._ERRORS: - message = self._ERRORS[media_data['block']] - if media_data['block'] == 'GEOBLOCK': + block_reason = media_data.get('blockReason') + if block_reason and block_reason in self._ERRORS: + message = self._ERRORS[block_reason] + if block_reason == 'GEOBLOCK': self.raise_geo_restricted( msg=message, countries=self._GEO_COUNTRIES) raise ExtractorError( @@ -53,53 +83,75 @@ class SRGSSRIE(InfoExtractor): def _real_extract(self, url): bu, media_type, media_id = re.match(self._VALID_URL, url).groups() + media_data = self._get_media_data(bu, media_type, media_id) + title = media_data['title'] - media_data = self.get_media_data(bu, media_type, media_id) - - metadata = media_data['AssetMetadatas']['AssetMetadata'][0] - title = metadata['title'] - description = metadata.get('description') - created_date = media_data.get('createdDate') or metadata.get('createdDate') - timestamp = parse_iso8601(created_date) - - thumbnails = [{ - 'id': image.get('id'), - 'url': image['url'], - } for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])] - - preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD']) formats = [] - for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []): - protocol = source.get('@protocol') - for asset in source['url']: - asset_url = asset['text'] - quality = asset['@quality'] - format_id = '%s-%s' % (protocol, quality) - if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'): - asset_url = self._get_tokenized_src(asset_url, media_id, format_id) - if protocol.startswith('HTTP-HDS'): - formats.extend(self._extract_f4m_formats( - asset_url + ('?' if '?' not in asset_url else '&') + 'hdcore=3.4.0', - media_id, f4m_id=format_id, fatal=False)) - elif protocol.startswith('HTTP-HLS'): - formats.extend(self._extract_m3u8_formats( - asset_url, media_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - else: - formats.append({ - 'format_id': format_id, - 'url': asset_url, - 'preference': preference(quality), - 'ext': 'flv' if protocol == 'RTMP' else None, - }) + q = qualities(['SD', 'HD']) + for source in (media_data.get('resourceList') or []): + format_url = source.get('url') + if not format_url: + continue + protocol = source.get('protocol') + quality = source.get('quality') + format_id = [] + for e in (protocol, source.get('encoding'), quality): + if e: + format_id.append(e) + format_id = '-'.join(format_id) + + if protocol in ('HDS', 'HLS'): + if source.get('tokenType') == 'AKAMAI': + format_url = self._get_tokenized_src( + format_url, media_id, format_id) + formats.extend(self._extract_akamai_formats( + format_url, media_id)) + elif protocol == 'HLS': + formats.extend(self._extract_m3u8_formats( + format_url, media_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) + elif protocol in ('HTTP', 'HTTPS'): + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'quality': q(quality), + }) + + # This is needed because for audio medias the podcast url is usually + # always included, even if is only an audio segment and not the + # whole episode. + if int_or_none(media_data.get('position')) == 0: + for p in ('S', 'H'): + podcast_url = media_data.get('podcast%sdUrl' % p) + if not podcast_url: + continue + quality = p + 'D' + formats.append({ + 'format_id': 'PODCAST-' + quality, + 'url': podcast_url, + 'quality': q(quality), + }) self._sort_formats(formats) + subtitles = {} + if media_type == 'video': + for sub in (media_data.get('subtitleList') or []): + sub_url = sub.get('url') + if not sub_url: + continue + lang = sub.get('locale') or self._DEFAULT_LANGUAGE_CODES[bu] + subtitles.setdefault(lang, []).append({ + 'url': sub_url, + }) + return { 'id': media_id, 'title': title, - 'description': description, - 'timestamp': timestamp, - 'thumbnails': thumbnails, + 'description': media_data.get('description'), + 'timestamp': parse_iso8601(media_data.get('date')), + 'thumbnail': media_data.get('imageUrl'), + 'duration': float_or_none(media_data.get('duration'), 1000), + 'subtitles': subtitles, 'formats': formats, } @@ -119,26 +171,17 @@ class SRGSSRPlayIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', - 'md5': 'da6b5b3ac9fa4761a942331cef20fcb3', + 'md5': '6db2226ba97f62ad42ce09783680046c', 'info_dict': { 'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', 'ext': 'mp4', 'upload_date': '20130701', 'title': 'Snowden beantragt Asyl in Russland', - 'timestamp': 1372713995, - } - }, { - # No Speichern (Save) button - 'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa', - 'md5': '0a274ce38fda48c53c01890651985bc6', - 'info_dict': { - 'id': '677f5829-e473-4823-ac83-a1087fe97faa', - 'ext': 'flv', - 'upload_date': '20130710', - 'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', - 'description': 'md5:88604432b60d5a38787f152dec89cd56', - 'timestamp': 1373493600, + 'timestamp': 1372708215, + 'duration': 113.827, + 'thumbnail': r're:^https?://.*1383719781\.png$', }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', 'info_dict': { @@ -146,7 +189,8 @@ class SRGSSRPlayIE(InfoExtractor): 'ext': 'mp3', 'upload_date': '20151013', 'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', - 'timestamp': 1444750398, + 'timestamp': 1444709160, + 'duration': 336.816, }, 'params': { # rtmp download @@ -159,19 +203,32 @@ class SRGSSRPlayIE(InfoExtractor): 'id': '6348260', 'display_id': '6348260', 'ext': 'mp4', - 'duration': 1796, + 'duration': 1796.76, 'title': 'Le 19h30', - 'description': '', - 'uploader': '19h30', 'upload_date': '20141201', 'timestamp': 1417458600, 'thumbnail': r're:^https?://.*\.image', - 'view_count': int, }, 'params': { # m3u8 download 'skip_download': True, } + }, { + 'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270', + 'info_dict': { + 'id': '42960270', + 'ext': 'mp4', + 'title': 'Why people were against tax reforms', + 'description': 'md5:7ac442c558e9630e947427469c4b824d', + 'duration': 94.0, + 'upload_date': '20170215', + 'timestamp': 1487173560, + 'thumbnail': r're:https?://www\.swissinfo\.ch/srgscalableimage/42961964', + 'subtitles': 'count:9', + }, + 'params': { + 'skip_download': True, + } }, { 'url': 'https://www.srf.ch/play/tv/popupvideoplayer?id=c4dba0ca-e75b-43b2-a34f-f708a4932e01', 'only_matching': True, @@ -181,6 +238,10 @@ class SRGSSRPlayIE(InfoExtractor): }, { 'url': 'https://www.rts.ch/play/tv/19h30/video/le-19h30?urn=urn:rts:video:6348260', 'only_matching': True, + }, { + # audio segment, has podcastSdUrl of the full episode + 'url': 'https://www.srf.ch/play/radio/popupaudioplayer?id=50b20dc8-f05b-4972-bf03-e438ff2833eb', + 'only_matching': True, }] def _real_extract(self, url): @@ -188,5 +249,4 @@ class SRGSSRPlayIE(InfoExtractor): bu = mobj.group('bu') media_type = mobj.group('type') or mobj.group('type_2') media_id = mobj.group('id') - # other info can be extracted from url + '&layout=json' return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') diff --git a/haruhi_dl/extractor/stitcher.py b/haruhi_dl/extractor/stitcher.py index 97d1ff681..822782507 100644 --- a/haruhi_dl/extractor/stitcher.py +++ b/haruhi_dl/extractor/stitcher.py @@ -1,28 +1,74 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - determine_ext, + clean_html, + clean_podcast_url, + ExtractorError, int_or_none, - js_to_json, - unescapeHTML, + str_or_none, + try_get, + url_or_none, ) -class StitcherIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)' +class StitcherBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/' + + def _call_api(self, path, video_id, query): + resp = self._download_json( + 'https://api.prod.stitcher.com/' + path, + video_id, query=query) + error_massage = try_get(resp, lambda x: x['errors'][0]['message']) + if error_massage: + raise ExtractorError(error_massage, expected=True) + return resp['data'] + + def _extract_description(self, data): + return clean_html(data.get('html_description') or data.get('description')) + + def _extract_audio_url(self, episode): + return url_or_none(episode.get('audio_url') or episode.get('guid')) + + def _extract_show_info(self, show): + return { + 'thumbnail': show.get('image_base_url'), + 'series': show.get('title'), + } + + def _extract_episode(self, episode, audio_url, show_info): + info = { + 'id': compat_str(episode['id']), + 'display_id': episode.get('slug'), + 'title': episode['title'].strip(), + 'description': self._extract_description(episode), + 'duration': int_or_none(episode.get('duration')), + 'url': clean_podcast_url(audio_url), + 'vcodec': 'none', + 'timestamp': int_or_none(episode.get('date_published')), + 'season_number': int_or_none(episode.get('season')), + 'season_id': str_or_none(episode.get('season_id')), + } + info.update(show_info) + return info + + +class StitcherIE(StitcherBaseIE): + _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?:[^/]+/)+e(?:pisode)?/(?:[^/#?&]+-)?(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true', - 'md5': '391dd4e021e6edeb7b8e68fbf2e9e940', + 'md5': 'e9635098e0da10b21a0e2b85585530f6', 'info_dict': { 'id': '40789481', 'ext': 'mp3', 'title': 'Machine Learning Mastery and Cancer Clusters', - 'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3', + 'description': 'md5:547adb4081864be114ae3831b4c2b42f', 'duration': 1604, 'thumbnail': r're:^https?://.*\.jpg', + 'upload_date': '20151008', + 'timestamp': 1444285800, + 'series': 'Talking Machines', }, }, { 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true', @@ -38,6 +84,7 @@ class StitcherIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Page Not Found', }, { # escaped title 'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true', @@ -45,37 +92,53 @@ class StitcherIE(InfoExtractor): }, { 'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true', 'only_matching': True, + }, { + 'url': 'https://www.stitcher.com/show/threedom/episode/circles-on-a-stick-200212584', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - audio_id = mobj.group('id') - display_id = mobj.group('display_id') or audio_id + audio_id = self._match_id(url) + data = self._call_api( + 'shows/episodes', audio_id, {'episode_ids': audio_id}) + episode = data['episodes'][0] + audio_url = self._extract_audio_url(episode) + if not audio_url: + self.raise_login_required() + show = try_get(data, lambda x: x['shows'][0], dict) or {} + return self._extract_episode( + episode, audio_url, self._extract_show_info(show)) - webpage = self._download_webpage(url, display_id) - episode = self._parse_json( - js_to_json(self._search_regex( - r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')), - display_id)['config']['episode'] +class StitcherShowIE(StitcherBaseIE): + _VALID_URL = StitcherBaseIE._VALID_URL_BASE + r'(?P<id>[^/#?&]+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'http://www.stitcher.com/podcast/the-talking-machines', + 'info_dict': { + 'id': 'the-talking-machines', + 'title': 'Talking Machines', + 'description': 'md5:831f0995e40f26c10231af39cf1ebf0b', + }, + 'playlist_mincount': 106, + }, { + 'url': 'https://www.stitcher.com/show/the-talking-machines', + 'only_matching': True, + }] - title = unescapeHTML(episode['title']) - formats = [{ - 'url': episode[episode_key], - 'ext': determine_ext(episode[episode_key]) or 'mp3', - 'vcodec': 'none', - } for episode_key in ('episodeURL',) if episode.get(episode_key)] - description = self._search_regex( - r'Episode Info:\s*</span>([^<]+)<', webpage, 'description', fatal=False) - duration = int_or_none(episode.get('duration')) - thumbnail = episode.get('episodeImage') + def _real_extract(self, url): + show_slug = self._match_id(url) + data = self._call_api( + 'search/show/%s/allEpisodes' % show_slug, show_slug, {'count': 10000}) + show = try_get(data, lambda x: x['shows'][0], dict) or {} + show_info = self._extract_show_info(show) - return { - 'id': audio_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'duration': duration, - 'thumbnail': thumbnail, - 'formats': formats, - } + entries = [] + for episode in (data.get('episodes') or []): + audio_url = self._extract_audio_url(episode) + if not audio_url: + continue + entries.append(self._extract_episode(episode, audio_url, show_info)) + + return self.playlist_result( + entries, show_slug, show.get('title'), + self._extract_description(show)) diff --git a/haruhi_dl/extractor/storyfire.py b/haruhi_dl/extractor/storyfire.py new file mode 100644 index 000000000..9c698626f --- /dev/null +++ b/haruhi_dl/extractor/storyfire.py @@ -0,0 +1,151 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import functools + +from .common import InfoExtractor +from ..utils import ( + # HEADRequest, + int_or_none, + OnDemandPagedList, + smuggle_url, +) + + +class StoryFireBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?storyfire\.com/' + + def _call_api(self, path, video_id, resource, query=None): + return self._download_json( + 'https://storyfire.com/app/%s/%s' % (path, video_id), video_id, + 'Downloading %s JSON metadata' % resource, query=query) + + def _parse_video(self, video): + title = video['title'] + vimeo_id = self._search_regex( + r'https?://player\.vimeo\.com/external/(\d+)', + video['vimeoVideoURL'], 'vimeo id') + + # video_url = self._request_webpage( + # HEADRequest(video['vimeoVideoURL']), video_id).geturl() + # formats = [] + # for v_url, suffix in [(video_url, '_sep'), (video_url.replace('/sep/video/', '/video/'), '')]: + # formats.extend(self._extract_m3u8_formats( + # v_url, video_id, 'mp4', 'm3u8_native', + # m3u8_id='hls' + suffix, fatal=False)) + # formats.extend(self._extract_mpd_formats( + # v_url.replace('.m3u8', '.mpd'), video_id, + # mpd_id='dash' + suffix, fatal=False)) + # self._sort_formats(formats) + + uploader_id = video.get('hostID') + + return { + '_type': 'url_transparent', + 'id': vimeo_id, + 'title': title, + 'description': video.get('description'), + 'url': smuggle_url( + 'https://player.vimeo.com/video/' + vimeo_id, { + 'http_headers': { + 'Referer': 'https://storyfire.com/', + } + }), + # 'formats': formats, + 'thumbnail': video.get('storyImage'), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likesCount')), + 'comment_count': int_or_none(video.get('commentsCount')), + 'duration': int_or_none(video.get('videoDuration')), + 'timestamp': int_or_none(video.get('publishDate')), + 'uploader': video.get('username'), + 'uploader_id': uploader_id, + 'uploader_url': 'https://storyfire.com/user/%s/video' % uploader_id if uploader_id else None, + 'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')), + } + + +class StoryFireIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'video-details/(?P<id>[0-9a-f]{24})' + _TEST = { + 'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181', + 'md5': 'caec54b9e4621186d6079c7ec100c1eb', + 'info_dict': { + 'id': '378954662', + 'ext': 'mp4', + 'title': 'Buzzfeed Teaches You About Memes', + 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1', + 'timestamp': 1576129028, + 'description': 'md5:0b4e28021548e144bed69bb7539e62ea', + 'uploader': 'whang!', + 'upload_date': '20191212', + 'duration': 418, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download JSON metadata'] + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._call_api( + 'generic/video-detail', video_id, 'video')['video'] + return self._parse_video(video) + + +class StoryFireUserIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'user/(?P<id>[^/]+)/video' + _TEST = { + 'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video', + 'info_dict': { + 'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2', + }, + 'playlist_mincount': 151, + } + _PAGE_SIZE = 20 + + def _fetch_page(self, user_id, page): + videos = self._call_api( + 'publicVideos', user_id, 'page %d' % (page + 1), { + 'skip': page * self._PAGE_SIZE, + })['videos'] + for video in videos: + yield self._parse_video(video) + + def _real_extract(self, url): + user_id = self._match_id(url) + entries = OnDemandPagedList(functools.partial( + self._fetch_page, user_id), self._PAGE_SIZE) + return self.playlist_result(entries, user_id) + + +class StoryFireSeriesIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'write/series/stories/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/', + 'info_dict': { + 'id': '-Lq6MsuIHLODO6d2dDkr', + }, + 'playlist_mincount': 13, + }, { + 'url': 'https://storyfire.com/write/series/stories/the_mortal_one/', + 'info_dict': { + 'id': 'the_mortal_one', + }, + 'playlist_count': 0, + }] + + def _extract_videos(self, stories): + for story in stories.values(): + if story.get('hasVideo'): + yield self._parse_video(story) + + def _real_extract(self, url): + series_id = self._match_id(url) + stories = self._call_api( + 'seriesStories', series_id, 'series stories') + return self.playlist_result(self._extract_videos(stories), series_id) diff --git a/haruhi_dl/extractor/streetvoice.py b/haruhi_dl/extractor/streetvoice.py index 91612c7f2..f21681ae7 100644 --- a/haruhi_dl/extractor/streetvoice.py +++ b/haruhi_dl/extractor/streetvoice.py @@ -2,25 +2,40 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_str -from ..utils import unified_strdate +from ..utils import ( + int_or_none, + parse_iso8601, + str_or_none, + strip_or_none, + try_get, + urljoin, +) class StreetVoiceIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?streetvoice\.com/[^/]+/songs/(?P<id>[0-9]+)' _TESTS = [{ - 'url': 'http://streetvoice.com/skippylu/songs/94440/', - 'md5': '15974627fc01a29e492c98593c2fd472', + 'url': 'https://streetvoice.com/skippylu/songs/123688/', + 'md5': '0eb535970629a5195685355f3ed60bfd', 'info_dict': { - 'id': '94440', + 'id': '123688', 'ext': 'mp3', - 'title': '輸', - 'description': 'Crispy脆樂團 - 輸', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 260, - 'upload_date': '20091018', + 'title': '流浪', + 'description': 'md5:8eb0bfcc9dcd8aa82bd6efca66e3fea6', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 270, + 'upload_date': '20100923', 'uploader': 'Crispy脆樂團', 'uploader_id': '627810', + 'uploader_url': 're:^https?://streetvoice.com/skippylu/', + 'timestamp': 1285261661, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'repost_count': int, + 'track': '流浪', + 'track_id': '123688', + 'album': '2010', } }, { 'url': 'http://tw.streetvoice.com/skippylu/songs/94440/', @@ -29,21 +44,57 @@ class StreetVoiceIE(InfoExtractor): def _real_extract(self, url): song_id = self._match_id(url) - - song = self._download_json( - 'https://streetvoice.com/api/v1/public/song/%s/' % song_id, song_id, data=b'') - + base_url = 'https://streetvoice.com/api/v4/song/%s/' % song_id + song = self._download_json(base_url, song_id, query={ + 'fields': 'album,comments_count,created_at,id,image,length,likes_count,name,nickname,plays_count,profile,share_count,synopsis,user,username', + }) title = song['name'] - author = song['user']['nickname'] + + formats = [] + for suffix, format_id in [('hls/file', 'hls'), ('file', 'http'), ('file/original', 'original')]: + f_url = (self._download_json( + base_url + suffix + '/', song_id, + 'Downloading %s format URL' % format_id, + data=b'', fatal=False) or {}).get('file') + if not f_url: + continue + f = { + 'ext': 'mp3', + 'format_id': format_id, + 'url': f_url, + 'vcodec': 'none', + } + if format_id == 'hls': + f['protocol'] = 'm3u8_native' + abr = self._search_regex(r'\.mp3\.(\d+)k', f_url, 'bitrate', default=None) + if abr: + abr = int(abr) + f.update({ + 'abr': abr, + 'tbr': abr, + }) + formats.append(f) + + user = song.get('user') or {} + username = user.get('username') + get_count = lambda x: int_or_none(song.get(x + '_count')) return { 'id': song_id, - 'url': song['file'], + 'formats': formats, 'title': title, - 'description': '%s - %s' % (author, title), - 'thumbnail': self._proto_relative_url(song.get('image'), 'http:'), - 'duration': song.get('length'), - 'upload_date': unified_strdate(song.get('created_at')), - 'uploader': author, - 'uploader_id': compat_str(song['user']['id']), + 'description': strip_or_none(song.get('synopsis')), + 'thumbnail': song.get('image'), + 'duration': int_or_none(song.get('length')), + 'timestamp': parse_iso8601(song.get('created_at')), + 'uploader': try_get(user, lambda x: x['profile']['nickname']), + 'uploader_id': str_or_none(user.get('id')), + 'uploader_url': urljoin(url, '/%s/' % username) if username else None, + 'view_count': get_count('plays'), + 'like_count': get_count('likes'), + 'comment_count': get_count('comments'), + 'repost_count': get_count('share'), + 'track': title, + 'track_id': song_id, + 'album': try_get(song, lambda x: x['album']['name']), } diff --git a/haruhi_dl/extractor/stv.py b/haruhi_dl/extractor/stv.py index bae8b71f4..539220a94 100644 --- a/haruhi_dl/extractor/stv.py +++ b/haruhi_dl/extractor/stv.py @@ -8,13 +8,17 @@ from ..utils import ( compat_str, float_or_none, int_or_none, + smuggle_url, + str_or_none, + try_get, ) class STVPlayerIE(InfoExtractor): IE_NAME = 'stv:player' _VALID_URL = r'https?://player\.stv\.tv/(?P<type>episode|video)/(?P<id>[a-z0-9]{4})' - _TEST = { + _TESTS = [{ + # shortform 'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/', 'md5': '5adf9439c31d554f8be0707c7abe7e0a', 'info_dict': { @@ -27,7 +31,11 @@ class STVPlayerIE(InfoExtractor): 'uploader_id': '1486976045', }, 'skip': 'this resource is unavailable outside of the UK', - } + }, { + # episodes + 'url': 'https://player.stv.tv/episode/4125/jennifer-saunders-memory-lane', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s' _PTYPE_MAP = { 'episode': 'episodes', @@ -36,11 +44,31 @@ class STVPlayerIE(InfoExtractor): def _real_extract(self, url): ptype, video_id = re.match(self._VALID_URL, url).groups() - resp = self._download_json( - 'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], video_id), - video_id) - result = resp['results'] + webpage = self._download_webpage(url, video_id, fatal=False) or '' + props = (self._parse_json(self._search_regex( + r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', + webpage, 'next data', default='{}'), video_id, + fatal=False) or {}).get('props') or {} + player_api_cache = try_get( + props, lambda x: x['initialReduxState']['playerApiCache']) or {} + + api_path, resp = None, {} + for k, v in player_api_cache.items(): + if k.startswith('/episodes/') or k.startswith('/shortform/'): + api_path, resp = k, v + break + else: + episode_id = str_or_none(try_get( + props, lambda x: x['pageProps']['episodeId'])) + api_path = '/%s/%s' % (self._PTYPE_MAP[ptype], episode_id or video_id) + + result = resp.get('results') + if not result: + resp = self._download_json( + 'https://player.api.stv.tv/v1' + api_path, video_id) + result = resp['results'] + video = result['video'] video_id = compat_str(video['id']) @@ -57,7 +85,7 @@ class STVPlayerIE(InfoExtractor): return { '_type': 'url_transparent', 'id': video_id, - 'url': self.BRIGHTCOVE_URL_TEMPLATE % video_id, + 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['GB']}), 'description': result.get('summary'), 'duration': float_or_none(video.get('length'), 1000), 'subtitles': subtitles, diff --git a/haruhi_dl/extractor/svt.py b/haruhi_dl/extractor/svt.py index a5e480f0b..2f768edf4 100644 --- a/haruhi_dl/extractor/svt.py +++ b/haruhi_dl/extractor/svt.py @@ -9,6 +9,7 @@ from ..utils import ( determine_ext, dict_get, int_or_none, + unified_timestamp, str_or_none, strip_or_none, try_get, @@ -44,7 +45,8 @@ class SVTBaseIE(InfoExtractor): 'format_id': player_type, 'url': vurl, }) - if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): + rights = try_get(video_info, lambda x: x['rights'], dict) or {} + if not formats and rights.get('geoBlockedSweden'): self.raise_geo_restricted( 'This video is only available in Sweden', countries=self._GEO_COUNTRIES) @@ -70,6 +72,7 @@ class SVTBaseIE(InfoExtractor): episode = video_info.get('episodeTitle') episode_number = int_or_none(video_info.get('episodeNumber')) + timestamp = unified_timestamp(rights.get('validFrom')) duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration'))) age_limit = None adult = dict_get( @@ -84,6 +87,7 @@ class SVTBaseIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, 'duration': duration, + 'timestamp': timestamp, 'age_limit': age_limit, 'series': series, 'season_number': season_number, @@ -135,26 +139,39 @@ class SVTPlayIE(SVTPlayBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' _VALID_URL = r'''(?x) (?: - svt:(?P<svt_id>[^/?#&]+)| + (?: + svt:| + https?://(?:www\.)?svt\.se/barnkanalen/barnplay/[^/]+/ + ) + (?P<svt_id>[^/?#&]+)| https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+) ) ''' _TESTS = [{ - 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', - 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', + 'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen', + 'md5': '2382036fd6f8c994856c323fe51c426e', 'info_dict': { - 'id': '5996901', + 'id': 'jNwpV9P', 'ext': 'mp4', - 'title': 'Flygplan till Haile Selassie', - 'duration': 3527, - 'thumbnail': r're:^https?://.*[\.-]jpg$', + 'title': 'Det här är himlen', + 'timestamp': 1586044800, + 'upload_date': '20200405', + 'duration': 3515, + 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$', 'age_limit': 0, 'subtitles': { 'sv': [{ - 'ext': 'wsrt', + 'ext': 'vtt', }] }, }, + 'params': { + 'format': 'bestvideo', + # skip for now due to download test asserts that segment is > 10000 bytes and svt uses + # init segments that are smaller + # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B + 'skip_download': True, + }, }, { # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -171,6 +188,12 @@ class SVTPlayIE(SVTPlayBaseIE): }, { 'url': 'svt:14278044', 'only_matching': True, + }, { + 'url': 'https://www.svt.se/barnkanalen/barnplay/kar/eWv5MLX/', + 'only_matching': True, + }, { + 'url': 'svt:eWv5MLX', + 'only_matching': True, }] def _adjust_title(self, info): @@ -231,11 +254,16 @@ class SVTPlayIE(SVTPlayBaseIE): svt_id = self._search_regex( (r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', r'["\']videoSvtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']videoSvtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)', r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"', - r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'), + r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)', + r'["\']svtId\\?["\']\s*:\s*\\?["\']([\da-zA-Z-]+)'), webpage, 'video id') - return self._extract_by_video_id(svt_id, webpage) + info_dict = self._extract_by_video_id(svt_id, webpage) + info_dict['thumbnail'] = thumbnail + + return info_dict class SVTSeriesIE(SVTPlayBaseIE): @@ -359,7 +387,7 @@ class SVTPageIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) + return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPageIE, cls).suitable(url) def _real_extract(self, url): path, display_id = re.match(self._VALID_URL, url).groups() diff --git a/haruhi_dl/extractor/tastytrade.py b/haruhi_dl/extractor/tastytrade.py deleted file mode 100644 index 7fe96bd5f..000000000 --- a/haruhi_dl/extractor/tastytrade.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from .ooyala import OoyalaIE - - -class TastyTradeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/shows/[^/]+/episodes/(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'https://www.tastytrade.com/tt/shows/market-measures/episodes/correlation-in-short-volatility-06-28-2017', - 'info_dict': { - 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', - 'ext': 'mp4', - 'title': 'A History of Teaming', - 'description': 'md5:2a9033db8da81f2edffa4c99888140b3', - 'duration': 422.255, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - }, { - 'url': 'https://www.tastytrade.com/tt/shows/daily-dose/episodes/daily-dose-06-30-2017', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - ooyala_code = self._search_regex( - r'data-media-id=(["\'])(?P<code>(?:(?!\1).)+)\1', - webpage, 'ooyala code', group='code') - - info = self._search_json_ld(webpage, display_id, fatal=False) - info.update({ - '_type': 'url_transparent', - 'ie_key': OoyalaIE.ie_key(), - 'url': 'ooyala:%s' % ooyala_code, - 'display_id': display_id, - }) - return info diff --git a/haruhi_dl/extractor/teachable.py b/haruhi_dl/extractor/teachable.py index 5557a9925..3a337afd8 100644 --- a/haruhi_dl/extractor/teachable.py +++ b/haruhi_dl/extractor/teachable.py @@ -140,7 +140,7 @@ class TeachableIE(TeachableBaseIE): @staticmethod def _is_teachable(webpage): return 'teachableTracker.linker:autoLink' in webpage and re.search( - r'<link[^>]+href=["\']https?://process\.fs\.teachablecdn\.com', + r'<link[^>]+href=["\']https?://(?:process\.fs|assets)\.teachablecdn\.com', webpage) @staticmethod @@ -269,7 +269,7 @@ class TeachableCourseIE(TeachableBaseIE): r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)', webpage): li = mobj.group('li') - if 'fa-youtube-play' not in li: + if 'fa-youtube-play' not in li and not re.search(r'\d{1,2}:\d{2}', li): continue lecture_url = self._search_regex( r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li, diff --git a/haruhi_dl/extractor/telecinco.py b/haruhi_dl/extractor/telecinco.py index 9ba3da341..eecd6a5c9 100644 --- a/haruhi_dl/extractor/telecinco.py +++ b/haruhi_dl/extractor/telecinco.py @@ -5,14 +5,11 @@ import json import re from .common import InfoExtractor -from .ooyala import OoyalaIE from ..utils import ( clean_html, - determine_ext, int_or_none, str_or_none, try_get, - urljoin, ) @@ -28,7 +25,7 @@ class TelecincoIE(InfoExtractor): 'description': 'md5:716caf5601e25c3c5ab6605b1ae71529', }, 'playlist': [{ - 'md5': 'adb28c37238b675dad0f042292f209a7', + 'md5': '7ee56d665cfd241c0e6d80fd175068b0', 'info_dict': { 'id': 'JEA5ijCnF6p5W08A1rNKn7', 'ext': 'mp4', @@ -38,7 +35,7 @@ class TelecincoIE(InfoExtractor): }] }, { 'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html', - 'md5': '9468140ebc300fbb8b9d65dc6e5c4b43', + 'md5': 'c86fe0d99e3bdb46b7950d38bf6ef12a', 'info_dict': { 'id': 'jn24Od1zGLG4XUZcnUnZB6', 'ext': 'mp4', @@ -48,7 +45,7 @@ class TelecincoIE(InfoExtractor): }, }, { 'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html', - 'md5': 'ae2dc6b7b50b2392076a51c0f70e01f6', + 'md5': 'eddb50291df704ce23c74821b995bcac', 'info_dict': { 'id': 'aywerkD2Sv1vGNqq9b85Q2', 'ext': 'mp4', @@ -90,58 +87,24 @@ class TelecincoIE(InfoExtractor): def _parse_content(self, content, url): video_id = content['dataMediaId'] - if content.get('dataCmsId') == 'ooyala': - return self.url_result( - 'ooyala:%s' % video_id, OoyalaIE.ie_key(), video_id) - config_url = urljoin(url, content['dataConfig']) config = self._download_json( - config_url, video_id, 'Downloading config JSON') + content['dataConfig'], video_id, 'Downloading config JSON') title = config['info']['title'] - - def mmc_url(mmc_type): - return re.sub( - r'/(?:flash|html5)\.json', '/%s.json' % mmc_type, - config['services']['mmc']) - - duration = None - formats = [] - for mmc_type in ('flash', 'html5'): - mmc = self._download_json( - mmc_url(mmc_type), video_id, - 'Downloading %s mmc JSON' % mmc_type, fatal=False) - if not mmc: - continue - if not duration: - duration = int_or_none(mmc.get('duration')) - for location in mmc['locations']: - gat = self._proto_relative_url(location.get('gat'), 'http:') - gcp = location.get('gcp') - ogn = location.get('ogn') - if None in (gat, gcp, ogn): - continue - token_data = { - 'gcp': gcp, - 'ogn': ogn, - 'sta': 0, - } - media = self._download_json( - gat, video_id, data=json.dumps(token_data).encode('utf-8'), - headers={ - 'Content-Type': 'application/json;charset=utf-8', - 'Referer': url, - }, fatal=False) or {} - stream = media.get('stream') or media.get('file') - if not stream: - continue - ext = determine_ext(stream) - if ext == 'f4m': - formats.extend(self._extract_f4m_formats( - stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', - video_id, f4m_id='hds', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - stream, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) + services = config['services'] + caronte = self._download_json(services['caronte'], video_id) + stream = caronte['dls'][0]['stream'] + headers = self.geo_verification_headers() + headers.update({ + 'Content-Type': 'application/json;charset=UTF-8', + 'Origin': re.match(r'https?://[^/]+', url).group(0), + }) + cdn = self._download_json( + caronte['cerbero'], video_id, data=json.dumps({ + 'bbx': caronte['bbx'], + 'gbx': self._download_json(services['gbx'], video_id)['gbx'], + }).encode(), headers=headers)['tokens']['1']['cdn'] + formats = self._extract_m3u8_formats( + stream + '?' + cdn, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) return { @@ -149,7 +112,7 @@ class TelecincoIE(InfoExtractor): 'title': title, 'formats': formats, 'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'), - 'duration': duration, + 'duration': int_or_none(content.get('dataDuration')), } def _real_extract(self, url): diff --git a/haruhi_dl/extractor/telequebec.py b/haruhi_dl/extractor/telequebec.py index b4c485b9b..800d87b70 100644 --- a/haruhi_dl/extractor/telequebec.py +++ b/haruhi_dl/extractor/telequebec.py @@ -12,25 +12,16 @@ from ..utils import ( class TeleQuebecBaseIE(InfoExtractor): + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + @staticmethod - def _result(url, ie_key): + def _brightcove_result(brightcove_id, player_id, account_id='6150020952001'): return { '_type': 'url_transparent', - 'url': smuggle_url(url, {'geo_countries': ['CA']}), - 'ie_key': ie_key, + 'url': smuggle_url(TeleQuebecBaseIE.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, brightcove_id), {'geo_countries': ['CA']}), + 'ie_key': 'BrightcoveNew', } - @staticmethod - def _limelight_result(media_id): - return TeleQuebecBaseIE._result( - 'limelight:media:' + media_id, 'LimelightMedia') - - @staticmethod - def _brightcove_result(brightcove_id): - return TeleQuebecBaseIE._result( - 'http://players.brightcove.net/6150020952001/default_default/index.html?videoId=%s' - % brightcove_id, 'BrightcoveNew') - class TeleQuebecIE(TeleQuebecBaseIE): _VALID_URL = r'''(?x) @@ -44,14 +35,18 @@ class TeleQuebecIE(TeleQuebecBaseIE): # available till 01.01.2023 'url': 'http://zonevideo.telequebec.tv/media/37578/un-petit-choc-et-puis-repart/un-chef-a-la-cabane', 'info_dict': { - 'id': '577116881b4b439084e6b1cf4ef8b1b3', + 'id': '6155972771001', 'ext': 'mp4', 'title': 'Un petit choc et puis repart!', - 'description': 'md5:067bc84bd6afecad85e69d1000730907', + 'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374', + 'timestamp': 1589262469, + 'uploader_id': '6150020952001', + 'upload_date': '20200512', }, 'params': { - 'skip_download': True, + 'format': 'bestvideo', }, + 'add_ie': ['BrightcoveNew'], }, { 'url': 'https://zonevideo.telequebec.tv/media/55267/le-soleil/passe-partout', 'info_dict': { @@ -65,7 +60,6 @@ class TeleQuebecIE(TeleQuebecBaseIE): }, 'params': { 'format': 'bestvideo', - 'skip_download': True, }, 'add_ie': ['BrightcoveNew'], }, { @@ -79,25 +73,20 @@ class TeleQuebecIE(TeleQuebecBaseIE): def _real_extract(self, url): media_id = self._match_id(url) - - media_data = self._download_json( - 'https://mnmedias.api.telequebec.tv/api/v2/media/' + media_id, + media = self._download_json( + 'https://mnmedias.api.telequebec.tv/api/v3/media/' + media_id, media_id)['media'] - - source_id = media_data['streamInfo']['sourceId'] - source = (try_get( - media_data, lambda x: x['streamInfo']['source'], - compat_str) or 'limelight').lower() - if source == 'brightcove': - info = self._brightcove_result(source_id) - else: - info = self._limelight_result(source_id) + source_id = next(source_info['sourceId'] for source_info in media['streamInfos'] if source_info.get('source') == 'Brightcove') + info = self._brightcove_result(source_id, '22gPKdt7f') + product = media.get('product') or {} + season = product.get('season') or {} info.update({ - 'title': media_data.get('title'), - 'description': try_get( - media_data, lambda x: x['descriptions'][0]['text'], compat_str), - 'duration': int_or_none( - media_data.get('durationInMilliseconds'), 1000), + 'description': try_get(media, lambda x: x['descriptions'][-1]['text'], compat_str), + 'series': try_get(season, lambda x: x['serie']['titre']), + 'season': season.get('name'), + 'season_number': int_or_none(season.get('seasonNo')), + 'episode': product.get('titre'), + 'episode_number': int_or_none(product.get('episodeNo')), }) return info @@ -148,7 +137,7 @@ class TeleQuebecSquatIE(InfoExtractor): } -class TeleQuebecEmissionIE(TeleQuebecBaseIE): +class TeleQuebecEmissionIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: @@ -160,15 +149,16 @@ class TeleQuebecEmissionIE(TeleQuebecBaseIE): _TESTS = [{ 'url': 'http://lindicemcsween.telequebec.tv/emissions/100430013/des-soins-esthetiques-a-377-d-interets-annuels-ca-vous-tente', 'info_dict': { - 'id': '66648a6aef914fe3badda25e81a4d50a', + 'id': '6154476028001', 'ext': 'mp4', - 'title': "Des soins esthétiques à 377 % d'intérêts annuels, ça vous tente?", - 'description': 'md5:369e0d55d0083f1fc9b71ffb640ea014', - 'upload_date': '20171024', - 'timestamp': 1508862118, + 'title': 'Des soins esthétiques à 377 % d’intérêts annuels, ça vous tente?', + 'description': 'md5:cb4d378e073fae6cce1f87c00f84ae9f', + 'upload_date': '20200505', + 'timestamp': 1588713424, + 'uploader_id': '6150020952001', }, 'params': { - 'skip_download': True, + 'format': 'bestvideo', }, }, { 'url': 'http://bancpublic.telequebec.tv/emissions/emission-49/31986/jeunes-meres-sous-pression', @@ -187,26 +177,26 @@ class TeleQuebecEmissionIE(TeleQuebecBaseIE): webpage = self._download_webpage(url, display_id) media_id = self._search_regex( - r'mediaUID\s*:\s*["\'][Ll]imelight_(?P<id>[a-z0-9]{32})', webpage, - 'limelight id') + r'mediaId\s*:\s*(?P<id>\d+)', webpage, 'media id') - info = self._limelight_result(media_id) - info.update({ - 'title': self._og_search_title(webpage, default=None), - 'description': self._og_search_description(webpage, default=None), - }) - return info + return self.url_result( + 'http://zonevideo.telequebec.tv/media/' + media_id, + TeleQuebecIE.ie_key()) -class TeleQuebecLiveIE(InfoExtractor): +class TeleQuebecLiveIE(TeleQuebecBaseIE): _VALID_URL = r'https?://zonevideo\.telequebec\.tv/(?P<id>endirect)' _TEST = { 'url': 'http://zonevideo.telequebec.tv/endirect/', 'info_dict': { - 'id': 'endirect', + 'id': '6159095684001', 'ext': 'mp4', - 'title': 're:^Télé-Québec - En direct [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^Télé-Québec [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, + 'description': 'Canal principal de Télé-Québec', + 'uploader_id': '6150020952001', + 'timestamp': 1590439901, + 'upload_date': '20200525', }, 'params': { 'skip_download': True, @@ -214,25 +204,49 @@ class TeleQuebecLiveIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + return self._brightcove_result('6159095684001', 'skCsmi2Uw') - m3u8_url = None - webpage = self._download_webpage( - 'https://player.telequebec.tv/Tq_VideoPlayer.js', video_id, - fatal=False) - if webpage: - m3u8_url = self._search_regex( - r'm3U8Url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'm3u8 url', default=None, group='url') - if not m3u8_url: - m3u8_url = 'https://teleqmmd.mmdlive.lldns.net/teleqmmd/f386e3b206814e1f8c8c1c71c0f8e748/manifest.m3u8' - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', m3u8_id='hls') - self._sort_formats(formats) - return { - 'id': video_id, - 'title': self._live_title('Télé-Québec - En direct'), - 'is_live': True, - 'formats': formats, - } +class TeleQuebecVideoIE(TeleQuebecBaseIE): + _VALID_URL = r'https?://video\.telequebec\.tv/player(?:-live)?/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://video.telequebec.tv/player/31110/stream', + 'info_dict': { + 'id': '6202570652001', + 'ext': 'mp4', + 'title': 'Le coût du véhicule le plus vendu au Canada / Tous les frais liés à la procréation assistée', + 'description': 'md5:685a7e4c450ba777c60adb6e71e41526', + 'upload_date': '20201019', + 'timestamp': 1603115930, + 'uploader_id': '6101674910001', + }, + 'params': { + 'format': 'bestvideo', + }, + }, { + 'url': 'https://video.telequebec.tv/player-live/28527', + 'only_matching': True, + }] + + def _call_api(self, path, video_id): + return self._download_json( + 'http://beacon.playback.api.brightcove.com/telequebec/api/assets/' + path, + video_id, query={'device_layout': 'web', 'device_type': 'web'})['data'] + + def _real_extract(self, url): + asset_id = self._match_id(url) + asset = self._call_api(asset_id, asset_id)['asset'] + stream = self._call_api( + asset_id + '/streams/' + asset['streams'][0]['id'], asset_id)['stream'] + stream_url = stream['url'] + account_id = try_get( + stream, lambda x: x['video_provider_details']['account_id']) or '6101674910001' + info = self._brightcove_result(stream_url, 'default', account_id) + info.update({ + 'description': asset.get('long_description') or asset.get('short_description'), + 'series': asset.get('series_original_name'), + 'season_number': int_or_none(asset.get('season_number')), + 'episode': asset.get('original_name'), + 'episode_number': int_or_none(asset.get('episode_number')), + }) + return info diff --git a/haruhi_dl/extractor/tenplay.py b/haruhi_dl/extractor/tenplay.py index af325fea8..cd30d57f4 100644 --- a/haruhi_dl/extractor/tenplay.py +++ b/haruhi_dl/extractor/tenplay.py @@ -3,9 +3,10 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + HEADRequest, parse_age_limit, parse_iso8601, - smuggle_url, + # smuggle_url, ) @@ -24,14 +25,16 @@ class TenPlayIE(InfoExtractor): 'uploader_id': '2199827728001', }, 'params': { - 'format': 'bestvideo', + # 'format': 'bestvideo', 'skip_download': True, } }, { 'url': 'https://10play.com.au/how-to-stay-married/web-extras/season-1/terrys-talks-ep-1-embracing-change/tpv190915ylupc', 'only_matching': True, }] - BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s' + # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s' + _GEO_BYPASS = False + _FASTLY_URL_TEMPL = 'https://10-selector.global.ssl.fastly.net/s/kYEXFC/media/%s?mbr=true&manifest=m3u&format=redirect' def _real_extract(self, url): content_id = self._match_id(url) @@ -40,19 +43,28 @@ class TenPlayIE(InfoExtractor): video = data.get('video') or {} metadata = data.get('metaData') or {} brightcove_id = video.get('videoId') or metadata['showContentVideoId'] - brightcove_url = smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, - {'geo_countries': ['AU']}) + # brightcove_url = smuggle_url( + # self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + # {'geo_countries': ['AU']}) + m3u8_url = self._request_webpage(HEADRequest( + self._FASTLY_URL_TEMPL % brightcove_id), brightcove_id).geturl() + if '10play-not-in-oz' in m3u8_url: + self.raise_geo_restricted(countries=['AU']) + formats = self._extract_m3u8_formats(m3u8_url, brightcove_id, 'mp4') + self._sort_formats(formats) return { - '_type': 'url_transparent', - 'url': brightcove_url, - 'id': content_id, - 'title': video.get('title') or metadata.get('pageContentName') or metadata.get('showContentName'), + # '_type': 'url_transparent', + # 'url': brightcove_url, + 'formats': formats, + 'id': brightcove_id, + 'title': video.get('title') or metadata.get('pageContentName') or metadata['showContentName'], 'description': video.get('description'), 'age_limit': parse_age_limit(video.get('showRatingClassification') or metadata.get('showProgramClassification')), 'series': metadata.get('showName'), 'season': metadata.get('showContentSeason'), 'timestamp': parse_iso8601(metadata.get('contentPublishDate') or metadata.get('pageContentPublishDate')), - 'ie_key': 'BrightcoveNew', + 'thumbnail': video.get('poster'), + 'uploader_id': '2199827728001', + # 'ie_key': 'BrightcoveNew', } diff --git a/haruhi_dl/extractor/tf1.py b/haruhi_dl/extractor/tf1.py index 55e2a0721..23c2808a1 100644 --- a/haruhi_dl/extractor/tf1.py +++ b/haruhi_dl/extractor/tf1.py @@ -1,92 +1,87 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor -from ..compat import compat_str +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, +) class TF1IE(InfoExtractor): - """TF1 uses the wat.tv player.""" - _VALID_URL = r'https?://(?:(?:videos|www|lci)\.tf1|(?:www\.)?(?:tfou|ushuaiatv|histoire|tvbreizh))\.fr/(?:[^/]+/)*(?P<id>[^/?#.]+)' + _VALID_URL = r'https?://(?:www\.)?tf1\.fr/[^/]+/(?P<program_slug>[^/]+)/videos/(?P<id>[^/?&#]+)\.html' _TESTS = [{ - 'url': 'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html', - 'info_dict': { - 'id': '10635995', - 'ext': 'mp4', - 'title': 'Citroën Grand C4 Picasso 2013 : présentation officielle', - 'description': 'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.', - }, - 'params': { - # Sometimes wat serves the whole file with the --test option - 'skip_download': True, - }, - 'expected_warnings': ['HTTP Error 404'], - }, { - 'url': 'http://www.tfou.fr/chuggington/videos/le-grand-mysterioso-chuggington-7085291-739.html', - 'info_dict': { - 'id': 'le-grand-mysterioso-chuggington-7085291-739', - 'ext': 'mp4', - 'title': 'Le grand Mystérioso - Chuggington', - 'description': 'Le grand Mystérioso - Emery rêve qu\'un article lui soit consacré dans le journal.', - 'upload_date': '20150103', - }, - 'params': { - # Sometimes wat serves the whole file with the --test option - 'skip_download': True, - }, - 'skip': 'HTTP Error 410: Gone', - }, { - 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', - 'only_matching': True, - }, { - 'url': 'http://lci.tf1.fr/sept-a-huit/videos/sept-a-huit-du-24-mai-2015-8611550.html', - 'only_matching': True, - }, { - 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', - 'only_matching': True, - }, { 'url': 'https://www.tf1.fr/tmc/quotidien-avec-yann-barthes/videos/quotidien-premiere-partie-11-juin-2019.html', 'info_dict': { 'id': '13641379', 'ext': 'mp4', 'title': 'md5:f392bc52245dc5ad43771650c96fb620', - 'description': 'md5:44bc54f0a21322f5b91d68e76a544eae', + 'description': 'md5:a02cdb217141fb2d469d6216339b052f', 'upload_date': '20190611', + 'timestamp': 1560273989, + 'duration': 1738, + 'series': 'Quotidien avec Yann Barthès', + 'tags': ['intégrale', 'quotidien', 'Replay'], }, 'params': { # Sometimes wat serves the whole file with the --test option 'skip_download': True, + 'format': 'bestvideo', }, + }, { + 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', + 'only_matching': True, + }, { + 'url': 'http://www.tf1.fr/hd1/documentaire/videos/mylene-farmer-d-une-icone.html', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + program_slug, slug = re.match(self._VALID_URL, url).groups() + video = self._download_json( + 'https://www.tf1.fr/graphql/web', slug, query={ + 'id': '9b80783950b85247541dd1d851f9cc7fa36574af015621f853ab111a679ce26f', + 'variables': json.dumps({ + 'programSlug': program_slug, + 'slug': slug, + }) + })['data']['videoBySlug'] + wat_id = video['streamId'] - webpage = self._download_webpage(url, video_id) + tags = [] + for tag in (video.get('tags') or []): + label = tag.get('label') + if not label: + continue + tags.append(label) - wat_id = None + decoration = video.get('decoration') or {} - data = self._parse_json( - self._search_regex( - r'__APOLLO_STATE__\s*=\s*({.+?})\s*(?:;|</script>)', webpage, - 'data', default='{}'), video_id, fatal=False) + thumbnails = [] + for source in (try_get(decoration, lambda x: x['image']['sources'], list) or []): + source_url = source.get('url') + if not source_url: + continue + thumbnails.append({ + 'url': source_url, + 'width': int_or_none(source.get('width')), + }) - if data: - try: - wat_id = next( - video.get('streamId') - for key, video in data.items() - if isinstance(video, dict) - and video.get('slug') == video_id) - if not isinstance(wat_id, compat_str) or not wat_id.isdigit(): - wat_id = None - except StopIteration: - pass - - if not wat_id: - wat_id = self._html_search_regex( - (r'(["\'])(?:https?:)?//www\.wat\.tv/embedframe/.*?(?P<id>\d{8})\1', - r'(["\']?)streamId\1\s*:\s*(["\']?)(?P<id>\d+)\2'), - webpage, 'wat id', group='id') - - return self.url_result('wat:%s' % wat_id, 'Wat') + return { + '_type': 'url_transparent', + 'id': wat_id, + 'url': 'wat:' + wat_id, + 'title': video.get('title'), + 'thumbnails': thumbnails, + 'description': decoration.get('description'), + 'timestamp': parse_iso8601(video.get('date')), + 'duration': int_or_none(try_get(video, lambda x: x['publicPlayingInfos']['duration'])), + 'tags': tags, + 'series': decoration.get('programLabel'), + 'season_number': int_or_none(video.get('season')), + 'episode_number': int_or_none(video.get('episode')), + } diff --git a/haruhi_dl/extractor/theplatform.py b/haruhi_dl/extractor/theplatform.py index 5b14bbf82..cdba10e40 100644 --- a/haruhi_dl/extractor/theplatform.py +++ b/haruhi_dl/extractor/theplatform.py @@ -234,6 +234,9 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass({ + 'countries': smuggled_data.get('geo_countries'), + }) mobj = re.match(self._VALID_URL, url) provider_id = mobj.group('provider_id') diff --git a/haruhi_dl/extractor/theweatherchannel.py b/haruhi_dl/extractor/theweatherchannel.py index c34a49d03..b2a8c3797 100644 --- a/haruhi_dl/extractor/theweatherchannel.py +++ b/haruhi_dl/extractor/theweatherchannel.py @@ -1,18 +1,22 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .theplatform import ThePlatformIE from ..utils import ( determine_ext, parse_duration, + parse_iso8601, ) class TheWeatherChannelIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?weather\.com/(?:[^/]+/)*video/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?weather\.com(?P<asset_name>(?:/(?P<locale>[a-z]{2}-[A-Z]{2}))?/(?:[^/]+/)*video/(?P<id>[^/?#]+))' _TESTS = [{ 'url': 'https://weather.com/series/great-outdoors/video/ice-climber-is-in-for-a-shock', - 'md5': 'ab924ac9574e79689c24c6b95e957def', + 'md5': 'c4cbe74c9c17c5676b704b950b73dd92', 'info_dict': { 'id': 'cc82397e-cc3f-4d11-9390-a785add090e8', 'ext': 'mp4', @@ -20,18 +24,33 @@ class TheWeatherChannelIE(ThePlatformIE): 'description': 'md5:55606ce1378d4c72e6545e160c9d9695', 'uploader': 'TWC - Digital (No Distro)', 'uploader_id': '6ccd5455-16bb-46f2-9c57-ff858bb9f62c', + 'upload_date': '20160720', + 'timestamp': 1469018835, } + }, { + 'url': 'https://weather.com/en-CA/international/videos/video/unidentified-object-falls-from-sky-in-india', + 'only_matching': True, }] def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - drupal_settings = self._parse_json(self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings'), display_id) - video_id = drupal_settings['twc']['contexts']['node']['uuid'] - video_data = self._download_json( - 'https://dsx.weather.com/cms/v4/asset-collection/en_US/' + video_id, video_id) + asset_name, locale, display_id = re.match(self._VALID_URL, url).groups() + if not locale: + locale = 'en-US' + video_data = list(self._download_json( + 'https://weather.com/api/v1/p/redux-dal', display_id, data=json.dumps([{ + 'name': 'getCMSAssetsUrlConfig', + 'params': { + 'language': locale.replace('-', '_'), + 'query': { + 'assetName': { + '$in': asset_name, + }, + }, + } + }]).encode(), headers={ + 'Content-Type': 'application/json', + })['dal']['getCMSAssetsUrlConfig'].values())[0]['data'][0] + video_id = video_data['id'] seo_meta = video_data.get('seometa', {}) title = video_data.get('title') or seo_meta['title'] @@ -66,6 +85,8 @@ class TheWeatherChannelIE(ThePlatformIE): }) self._sort_formats(formats) + cc_url = video_data.get('cc_url') + return { 'id': video_id, 'display_id': display_id, @@ -74,6 +95,8 @@ class TheWeatherChannelIE(ThePlatformIE): 'duration': parse_duration(video_data.get('duration')), 'uploader': video_data.get('providername'), 'uploader_id': video_data.get('providerid'), + 'timestamp': parse_iso8601(video_data.get('publishdate')), + 'subtitles': {locale[:2]: [{'url': cc_url}]} if cc_url else None, 'thumbnails': thumbnails, 'formats': formats, } diff --git a/haruhi_dl/extractor/threeqsdn.py b/haruhi_dl/extractor/threeqsdn.py index f26937da1..f6d37bb9e 100644 --- a/haruhi_dl/extractor/threeqsdn.py +++ b/haruhi_dl/extractor/threeqsdn.py @@ -3,10 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_HTTPError from ..utils import ( determine_ext, - js_to_json, - mimetype2ext, + ExtractorError, + float_or_none, + int_or_none, + parse_iso8601, ) @@ -15,29 +18,35 @@ class ThreeQSDNIE(InfoExtractor): IE_DESC = '3Q SDN' _VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _TESTS = [{ - # ondemand from http://www.philharmonie.tv/veranstaltung/26/ - 'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http', - 'md5': 'ab040e37bcfa2e0c079f92cb1dd7f6cd', + # https://player.3qsdn.com/demo.html + 'url': 'https://playout.3qsdn.com/7201c779-6b3c-11e7-a40e-002590c750be', + 'md5': '64a57396b16fa011b15e0ea60edce918', 'info_dict': { - 'id': '0280d6b9-1215-11e6-b427-0cc47a188158', + 'id': '7201c779-6b3c-11e7-a40e-002590c750be', 'ext': 'mp4', - 'title': '0280d6b9-1215-11e6-b427-0cc47a188158', + 'title': 'Video Ads', 'is_live': False, + 'description': 'Video Ads Demo', + 'timestamp': 1500334803, + 'upload_date': '20170717', + 'duration': 888.032, + 'subtitles': { + 'eng': 'count:1', + }, }, - 'expected_warnings': ['Failed to download MPD manifest', 'Failed to parse JSON'], + 'expected_warnings': ['Unknown MIME type application/mp4 in DASH manifest'], }, { # live video stream - 'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true', + 'url': 'https://playout.3qsdn.com/66e68995-11ca-11e8-9273-002590c750be', 'info_dict': { - 'id': 'd755d94b-4ab9-11e3-9162-0025907ad44f', + 'id': '66e68995-11ca-11e8-9273-002590c750be', 'ext': 'mp4', - 'title': 're:^d755d94b-4ab9-11e3-9162-0025907ad44f [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^66e68995-11ca-11e8-9273-002590c750be [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, }, 'params': { 'skip_download': True, # m3u8 downloads }, - 'expected_warnings': ['Failed to download MPD manifest'], }, { # live audio stream 'url': 'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48', @@ -58,6 +67,14 @@ class ThreeQSDNIE(InfoExtractor): # live video with rtmp link 'url': 'https://playout.3qsdn.com/6092bb9e-8f72-11e4-a173-002590c750be', 'only_matching': True, + }, { + # ondemand from http://www.philharmonie.tv/veranstaltung/26/ + 'url': 'http://playout.3qsdn.com/0280d6b9-1215-11e6-b427-0cc47a188158?protocol=http', + 'only_matching': True, + }, { + # live video stream + 'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true', + 'only_matching': True, }] @staticmethod @@ -70,73 +87,78 @@ class ThreeQSDNIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - js = self._download_webpage( - 'http://playout.3qsdn.com/%s' % video_id, video_id, - query={'js': 'true'}) + try: + config = self._download_json( + url.replace('://playout.3qsdn.com/', '://playout.3qsdn.com/config/'), video_id) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + self.raise_geo_restricted() + raise - if any(p in js for p in ( - '>This content is not available in your country', - 'playout.3qsdn.com/forbidden')): - self.raise_geo_restricted() - - stream_content = self._search_regex( - r'streamContent\s*:\s*(["\'])(?P<content>.+?)\1', js, - 'stream content', default='demand', group='content') - - live = stream_content == 'live' - - stream_type = self._search_regex( - r'streamType\s*:\s*(["\'])(?P<type>audio|video)\1', js, - 'stream type', default='video', group='type') + live = config.get('streamContent') == 'live' + aspect = float_or_none(config.get('aspect')) formats = [] - urls = set() - - def extract_formats(item_url, item={}): - if not item_url or item_url in urls: - return - urls.add(item_url) - ext = mimetype2ext(item.get('type')) or determine_ext(item_url, default_ext=None) - if ext == 'mpd': - formats.extend(self._extract_mpd_formats( - item_url, video_id, mpd_id='mpd', fatal=False)) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - item_url, video_id, 'mp4', - entry_protocol='m3u8' if live else 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - item_url, video_id, f4m_id='hds', fatal=False)) - else: - if not self._is_valid_url(item_url, video_id): - return - formats.append({ - 'url': item_url, - 'format_id': item.get('quality'), - 'ext': 'mp4' if item_url.startswith('rtsp') else ext, - 'vcodec': 'none' if stream_type == 'audio' else None, - }) - - for item_js in re.findall(r'({[^{]*?\b(?:src|source)\s*:\s*["\'].+?})', js): - f = self._parse_json( - item_js, video_id, transform_source=js_to_json, fatal=False) - if not f: + for source_type, source in (config.get('sources') or {}).items(): + if not source: continue - extract_formats(f.get('src'), f) + if source_type == 'dash': + formats.extend(self._extract_mpd_formats( + source, video_id, mpd_id='mpd', fatal=False)) + elif source_type == 'hls': + formats.extend(self._extract_m3u8_formats( + source, video_id, 'mp4', 'm3u8' if live else 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif source_type == 'progressive': + for s in source: + src = s.get('src') + if not (src and self._is_valid_url(src, video_id)): + continue + width = None + format_id = ['http'] + ext = determine_ext(src) + if ext: + format_id.append(ext) + height = int_or_none(s.get('height')) + if height: + format_id.append('%dp' % height) + if aspect: + width = int(height * aspect) + formats.append({ + 'ext': ext, + 'format_id': '-'.join(format_id), + 'height': height, + 'source_preference': 0, + 'url': src, + 'vcodec': 'none' if height == 0 else None, + 'width': width, + }) + for f in formats: + if f.get('acodec') == 'none': + f['preference'] = -40 + elif f.get('vcodec') == 'none': + f['preference'] = -50 + self._sort_formats(formats, ('preference', 'width', 'height', 'source_preference', 'tbr', 'vbr', 'abr', 'ext', 'format_id')) - # More relaxed version to collect additional URLs and acting - # as a future-proof fallback - for _, src in re.findall(r'\b(?:src|source)\s*:\s*(["\'])((?:https?|rtsp)://.+?)\1', js): - extract_formats(src) + subtitles = {} + for subtitle in (config.get('subtitles') or []): + src = subtitle.get('src') + if not src: + continue + subtitles.setdefault(subtitle.get('label') or 'eng', []).append({ + 'url': src, + }) - self._sort_formats(formats) - - title = self._live_title(video_id) if live else video_id + title = config.get('title') or video_id return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if live else title, + 'thumbnail': config.get('poster') or None, + 'description': config.get('description') or None, + 'timestamp': parse_iso8601(config.get('upload_date')), + 'duration': float_or_none(config.get('vlength')) or None, 'is_live': live, 'formats': formats, + 'subtitles': subtitles, } diff --git a/haruhi_dl/extractor/tmz.py b/haruhi_dl/extractor/tmz.py index 419f9d92e..3d1bf75ff 100644 --- a/haruhi_dl/extractor/tmz.py +++ b/haruhi_dl/extractor/tmz.py @@ -2,55 +2,110 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .jwplatform import JWPlatformIE +from .kaltura import KalturaIE +from ..utils import ( + int_or_none, + unified_timestamp, +) class TMZIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.tmz.com/videos/0_okj015ty/', - 'md5': '4d22a51ef205b6c06395d8394f72d560', - 'info_dict': { - 'id': '0_okj015ty', - 'ext': 'mp4', - 'title': 'Kim Kardashian\'s Boobs Unlock a Mystery!', - 'description': 'Did Kim Kardasain try to one-up Khloe by one-upping Kylie??? Or is she just showing off her amazing boobs?', - 'timestamp': 1394747163, - 'uploader_id': 'batchUser', - 'upload_date': '20140313', - } - }, { 'url': 'http://www.tmz.com/videos/0-cegprt2p/', + 'md5': '31f9223e20eef55954973359afa61a20', + 'info_dict': { + 'id': 'P6YjLBLk', + 'ext': 'mp4', + 'title': "No Charges Against Hillary Clinton? Harvey Says It Ain't Over Yet", + 'description': 'md5:b714359fc18607715ebccbd2da8ff488', + 'timestamp': 1467831837, + 'upload_date': '20160706', + }, + 'add_ie': [JWPlatformIE.ie_key()], + }, { + 'url': 'http://www.tmz.com/videos/0_okj015ty/', + 'only_matching': True, + }, { + 'url': 'https://www.tmz.com/videos/071119-chris-morgan-women-4590005-0-zcsejvcr/', + 'only_matching': True, + }, { + 'url': 'https://www.tmz.com/videos/2021-02-19-021921-floyd-mayweather-1043872/', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url).replace('-', '_') - return self.url_result('kaltura:591531:%s' % video_id, 'Kaltura', video_id) + + webpage = self._download_webpage(url, video_id, fatal=False) + if webpage: + tmz_video_id = self._search_regex( + r'nodeRef\s*:\s*["\']tmz:video:([\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12})', + webpage, 'video id', default=None) + video = self._download_json( + 'https://www.tmz.com/_/video/%s' % tmz_video_id, video_id, + fatal=False) + if video: + message = video['message'] + info = { + '_type': 'url_transparent', + 'title': message.get('title'), + 'description': message.get('description'), + 'timestamp': unified_timestamp(message.get('published_at')), + 'duration': int_or_none(message.get('duration')), + } + jwplatform_id = message.get('jwplayer_media_id') + if jwplatform_id: + info.update({ + 'url': 'jwplatform:%s' % jwplatform_id, + 'ie_key': JWPlatformIE.ie_key(), + }) + else: + kaltura_entry_id = message.get('kaltura_entry_id') or video_id + kaltura_partner_id = message.get('kaltura_partner_id') or '591531' + info.update({ + 'url': 'kaltura:%s:%s' % (kaltura_partner_id, kaltura_entry_id), + 'ie_key': KalturaIE.ie_key(), + }) + return info + + return self.url_result( + 'kaltura:591531:%s' % video_id, KalturaIE.ie_key(), video_id) class TMZArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P<id>[^/]+)/?' + _VALID_URL = r'https?://(?:www\.)?tmz\.com/\d{4}/\d{2}/\d{2}/(?P<id>[^/?#&]+)' _TEST = { 'url': 'http://www.tmz.com/2015/04/19/bobby-brown-bobbi-kristina-awake-video-concert', - 'md5': '3316ff838ae5bb7f642537825e1e90d2', 'info_dict': { - 'id': '0_6snoelag', - 'ext': 'mov', + 'id': 'PAKZa97W', + 'ext': 'mp4', 'title': 'Bobby Brown Tells Crowd ... Bobbi Kristina is Awake', 'description': 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."', - 'timestamp': 1429467813, + 'timestamp': 1429466400, 'upload_date': '20150419', - 'uploader_id': 'batchUser', - } + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [JWPlatformIE.ie_key()], } def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + + tmz_url = self._search_regex( + r'clickLink\s*\(\s*["\'](?P<url>%s)' % TMZIE._VALID_URL, webpage, + 'video id', default=None, group='url') + if tmz_url: + return self.url_result(tmz_url, ie=TMZIE.ie_key()) + embedded_video_info = self._parse_json(self._html_search_regex( r'tmzVideoEmbed\(({.+?})\);', webpage, 'embedded video info'), video_id) - return self.url_result( - 'http://www.tmz.com/videos/%s/' % embedded_video_info['id']) + 'http://www.tmz.com/videos/%s/' % embedded_video_info['id'], + ie=TMZIE.ie_key()) diff --git a/haruhi_dl/extractor/toggle.py b/haruhi_dl/extractor/toggle.py index ca2e36efe..270c84daa 100644 --- a/haruhi_dl/extractor/toggle.py +++ b/haruhi_dl/extractor/toggle.py @@ -11,13 +11,13 @@ from ..utils import ( float_or_none, int_or_none, parse_iso8601, - sanitized_Request, + strip_or_none, ) class ToggleIE(InfoExtractor): IE_NAME = 'toggle' - _VALID_URL = r'https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)' + _VALID_URL = r'(?:https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}|toggle:)(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.mewatch.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115', 'info_dict': { @@ -84,28 +84,12 @@ class ToggleIE(InfoExtractor): 'only_matching': True, }] - _FORMAT_PREFERENCES = { - 'wvm-STBMain': -10, - 'wvm-iPadMain': -20, - 'wvm-iPhoneMain': -30, - 'wvm-Android': -40, - } _API_USER = 'tvpapi_147' _API_PASS = '11111' def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - url, video_id, note='Downloading video page') - - api_user = self._search_regex( - r'apiUser\s*:\s*(["\'])(?P<user>.+?)\1', webpage, 'apiUser', - default=self._API_USER, group='user') - api_pass = self._search_regex( - r'apiPass\s*:\s*(["\'])(?P<pass>.+?)\1', webpage, 'apiPass', - default=self._API_PASS, group='pass') - params = { 'initObj': { 'Locale': { @@ -118,17 +102,16 @@ class ToggleIE(InfoExtractor): 'SiteGuid': 0, 'DomainID': '0', 'UDID': '', - 'ApiUser': api_user, - 'ApiPass': api_pass + 'ApiUser': self._API_USER, + 'ApiPass': self._API_PASS }, 'MediaID': video_id, 'mediaType': 0, } - req = sanitized_Request( + info = self._download_json( 'http://tvpapi.as.tvinci.com/v2_9/gateways/jsonpostgw.aspx?m=GetMediaInfo', - json.dumps(params).encode('utf-8')) - info = self._download_json(req, video_id, 'Downloading video info json') + video_id, 'Downloading video info json', data=json.dumps(params).encode('utf-8')) title = info['MediaName'] @@ -141,11 +124,16 @@ class ToggleIE(InfoExtractor): vid_format = vid_format.replace(' ', '') # if geo-restricted, m3u8 is inaccessible, but mp4 is okay if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + m3u8_formats = self._extract_m3u8_formats( video_url, video_id, ext='mp4', m3u8_id=vid_format, note='Downloading %s m3u8 information' % vid_format, errnote='Failed to download %s m3u8 information' % vid_format, - fatal=False)) + fatal=False) + for f in m3u8_formats: + # Apple FairPlay Streaming + if '/fpshls/' in f['url']: + continue + formats.append(f) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( video_url, video_id, mpd_id=vid_format, @@ -158,28 +146,21 @@ class ToggleIE(InfoExtractor): note='Downloading %s ISM manifest' % vid_format, errnote='Failed to download %s ISM manifest' % vid_format, fatal=False)) - elif ext in ('mp4', 'wvm'): - # wvm are drm-protected files + elif ext == 'mp4': formats.append({ 'ext': ext, 'url': video_url, 'format_id': vid_format, - 'preference': self._FORMAT_PREFERENCES.get(ext + '-' + vid_format) or -1, - 'format_note': 'DRM-protected video' if ext == 'wvm' else None }) if not formats: + for meta in (info.get('Metas') or []): + if meta.get('Key') == 'Encryption' and meta.get('Value') == '1': + raise ExtractorError( + 'This video is DRM protected.', expected=True) # Most likely because geo-blocked raise ExtractorError('No downloadable videos found', expected=True) self._sort_formats(formats) - duration = int_or_none(info.get('Duration')) - description = info.get('Description') - created_at = parse_iso8601(info.get('CreationDate') or None) - - average_rating = float_or_none(info.get('Rating')) - view_count = int_or_none(info.get('ViewCounter') or info.get('view_counter')) - like_count = int_or_none(info.get('LikeCounter') or info.get('like_counter')) - thumbnails = [] for picture in info.get('Pictures', []): if not isinstance(picture, dict): @@ -199,15 +180,55 @@ class ToggleIE(InfoExtractor): }) thumbnails.append(thumbnail) + def counter(prefix): + return int_or_none( + info.get(prefix + 'Counter') or info.get(prefix.lower() + '_counter')) + return { 'id': video_id, 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': created_at, - 'average_rating': average_rating, - 'view_count': view_count, - 'like_count': like_count, + 'description': strip_or_none(info.get('Description')), + 'duration': int_or_none(info.get('Duration')), + 'timestamp': parse_iso8601(info.get('CreationDate') or None), + 'average_rating': float_or_none(info.get('Rating')), + 'view_count': counter('View'), + 'like_count': counter('Like'), 'thumbnails': thumbnails, 'formats': formats, } + + +class MeWatchIE(InfoExtractor): + IE_NAME = 'mewatch' + _VALID_URL = r'https?://(?:(?:www|live)\.)?mewatch\.sg/watch/[^/?#&]+-(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://www.mewatch.sg/watch/Recipe-Of-Life-E1-179371', + 'info_dict': { + 'id': '1008625', + 'ext': 'mp4', + 'title': 'Recipe Of Life 味之道', + 'timestamp': 1603306526, + 'description': 'md5:6e88cde8af2068444fc8e1bc3ebf257c', + 'upload_date': '20201021', + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + }, { + 'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-搜密。打卡。小红点-S2-E1-176232', + 'only_matching': True, + }, { + 'url': 'https://www.mewatch.sg/watch/Little-Red-Dot-Detectives-S2-%E6%90%9C%E5%AF%86%E3%80%82%E6%89%93%E5%8D%A1%E3%80%82%E5%B0%8F%E7%BA%A2%E7%82%B9-S2-E1-176232', + 'only_matching': True, + }, { + 'url': 'https://live.mewatch.sg/watch/Recipe-Of-Life-E41-189759', + 'only_matching': True, + }] + + def _real_extract(self, url): + item_id = self._match_id(url) + custom_id = self._download_json( + 'https://cdn.mewatch.sg/api/items/' + item_id, + item_id, query={'segments': 'all'})['customId'] + return self.url_result( + 'toggle:' + custom_id, ToggleIE.ie_key(), custom_id) diff --git a/haruhi_dl/extractor/trovo.py b/haruhi_dl/extractor/trovo.py new file mode 100644 index 000000000..43745213d --- /dev/null +++ b/haruhi_dl/extractor/trovo.py @@ -0,0 +1,193 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + try_get, +) + + +class TrovoBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?trovo\.live/' + + def _extract_streamer_info(self, data): + streamer_info = data.get('streamerInfo') or {} + username = streamer_info.get('userName') + return { + 'uploader': streamer_info.get('nickName'), + 'uploader_id': str_or_none(streamer_info.get('uid')), + 'uploader_url': 'https://trovo.live/' + username if username else None, + } + + +class TrovoIE(TrovoBaseIE): + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?!(?:clip|video)/)(?P<id>[^/?&#]+)' + + def _real_extract(self, url): + username = self._match_id(url) + live_info = self._download_json( + 'https://gql.trovo.live/', username, query={ + 'query': '''{ + getLiveInfo(params: {userName: "%s"}) { + isLive + programInfo { + coverUrl + id + streamInfo { + desc + playUrl + } + title + } + streamerInfo { + nickName + uid + userName + } + } +}''' % username, + })['data']['getLiveInfo'] + if live_info.get('isLive') == 0: + raise ExtractorError('%s is offline' % username, expected=True) + program_info = live_info['programInfo'] + program_id = program_info['id'] + title = self._live_title(program_info['title']) + + formats = [] + for stream_info in (program_info.get('streamInfo') or []): + play_url = stream_info.get('playUrl') + if not play_url: + continue + format_id = stream_info.get('desc') + formats.append({ + 'format_id': format_id, + 'height': int_or_none(format_id[:-1]) if format_id else None, + 'url': play_url, + }) + self._sort_formats(formats) + + info = { + 'id': program_id, + 'title': title, + 'formats': formats, + 'thumbnail': program_info.get('coverUrl'), + 'is_live': True, + } + info.update(self._extract_streamer_info(live_info)) + return info + + +class TrovoVodIE(TrovoBaseIE): + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?:clip|video)/(?P<id>[^/?&#]+)' + _TESTS = [{ + 'url': 'https://trovo.live/video/ltv-100095501_100095501_1609596043', + 'info_dict': { + 'id': 'ltv-100095501_100095501_1609596043', + 'ext': 'mp4', + 'title': 'Spontaner 12 Stunden Stream! - Ok Boomer!', + 'uploader': 'Exsl', + 'timestamp': 1609640305, + 'upload_date': '20210103', + 'uploader_id': '100095501', + 'duration': 43977, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + 'comments': 'mincount:8', + 'categories': ['Grand Theft Auto V'], + }, + }, { + 'url': 'https://trovo.live/clip/lc-5285890810184026005', + 'only_matching': True, + }] + + def _real_extract(self, url): + vid = self._match_id(url) + resp = self._download_json( + 'https://gql.trovo.live/', vid, data=json.dumps([{ + 'query': '''{ + batchGetVodDetailInfo(params: {vids: ["%s"]}) { + VodDetailInfos + } +}''' % vid, + }, { + 'query': '''{ + getCommentList(params: {appInfo: {postID: "%s"}, pageSize: 1000000000, preview: {}}) { + commentList { + author { + nickName + uid + } + commentID + content + createdAt + parentID + } + } +}''' % vid, + }]).encode(), headers={ + 'Content-Type': 'application/json', + }) + vod_detail_info = resp[0]['data']['batchGetVodDetailInfo']['VodDetailInfos'][vid] + vod_info = vod_detail_info['vodInfo'] + title = vod_info['title'] + + language = vod_info.get('languageName') + formats = [] + for play_info in (vod_info.get('playInfos') or []): + play_url = play_info.get('playUrl') + if not play_url: + continue + format_id = play_info.get('desc') + formats.append({ + 'ext': 'mp4', + 'filesize': int_or_none(play_info.get('fileSize')), + 'format_id': format_id, + 'height': int_or_none(format_id[:-1]) if format_id else None, + 'language': language, + 'protocol': 'm3u8_native', + 'tbr': int_or_none(play_info.get('bitrate')), + 'url': play_url, + }) + self._sort_formats(formats) + + category = vod_info.get('categoryName') + get_count = lambda x: int_or_none(vod_info.get(x + 'Num')) + + comment_list = try_get(resp, lambda x: x[1]['data']['getCommentList']['commentList'], list) or [] + comments = [] + for comment in comment_list: + content = comment.get('content') + if not content: + continue + author = comment.get('author') or {} + parent = comment.get('parentID') + comments.append({ + 'author': author.get('nickName'), + 'author_id': str_or_none(author.get('uid')), + 'id': str_or_none(comment.get('commentID')), + 'text': content, + 'timestamp': int_or_none(comment.get('createdAt')), + 'parent': 'root' if parent == 0 else str_or_none(parent), + }) + + info = { + 'id': vid, + 'title': title, + 'formats': formats, + 'thumbnail': vod_info.get('coverUrl'), + 'timestamp': int_or_none(vod_info.get('publishTs')), + 'duration': int_or_none(vod_info.get('duration')), + 'view_count': get_count('watch'), + 'like_count': get_count('like'), + 'comment_count': get_count('comment'), + 'comments': comments, + 'categories': [category] if category else None, + } + info.update(self._extract_streamer_info(vod_detail_info)) + return info diff --git a/haruhi_dl/extractor/tubitv.py b/haruhi_dl/extractor/tubitv.py index a51fa6515..ebfb05c63 100644 --- a/haruhi_dl/extractor/tubitv.py +++ b/haruhi_dl/extractor/tubitv.py @@ -33,6 +33,19 @@ class TubiTvIE(InfoExtractor): }, { 'url': 'http://tubitv.com/movies/383676/tracker', 'only_matching': True, + }, { + 'url': 'https://tubitv.com/movies/560057/penitentiary?start=true', + 'info_dict': { + 'id': '560057', + 'ext': 'mp4', + 'title': 'Penitentiary', + 'description': 'md5:8d2fc793a93cc1575ff426fdcb8dd3f9', + 'uploader_id': 'd8fed30d4f24fcb22ec294421b9defc2', + 'release_year': 1979, + }, + 'params': { + 'skip_download': True, + }, }] def _login(self): @@ -93,4 +106,5 @@ class TubiTvIE(InfoExtractor): 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), 'uploader_id': video_data.get('publisher_id'), + 'release_year': int_or_none(video_data.get('year')), } diff --git a/haruhi_dl/extractor/turner.py b/haruhi_dl/extractor/turner.py index 4a6cbfbb8..820e3cbe1 100644 --- a/haruhi_dl/extractor/turner.py +++ b/haruhi_dl/extractor/turner.py @@ -6,6 +6,7 @@ import re from .adobepass import AdobePassIE from ..compat import compat_str from ..utils import ( + fix_xml_ampersands, xpath_text, int_or_none, determine_ext, @@ -49,8 +50,13 @@ class TurnerBaseIE(AdobePassIE): self._AKAMAI_SPE_TOKEN_CACHE[secure_path] = token return video_url + '?hdnea=' + token - def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}): - video_data = self._download_xml(data_src, video_id) + def _extract_cvp_info(self, data_src, video_id, path_data={}, ap_data={}, fatal=False): + video_data = self._download_xml( + data_src, video_id, + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=fatal) + if not video_data: + return {} video_id = video_data.attrib['id'] title = xpath_text(video_data, 'headline', fatal=True) content_id = xpath_text(video_data, 'contentId') or video_id @@ -63,12 +69,14 @@ class TurnerBaseIE(AdobePassIE): urls = [] formats = [] + thumbnails = [] + subtitles = {} rex = re.compile( r'(?P<width>[0-9]+)x(?P<height>[0-9]+)(?:_(?P<bitrate>[0-9]+))?') # Possible formats locations: files/file, files/groupFiles/files # and maybe others for video_file in video_data.findall('.//file'): - video_url = video_file.text.strip() + video_url = url_or_none(video_file.text.strip()) if not video_url: continue ext = determine_ext(video_url) @@ -108,9 +116,28 @@ class TurnerBaseIE(AdobePassIE): continue urls.append(video_url) format_id = video_file.get('bitrate') - if ext == 'smil': + if ext in ('scc', 'srt', 'vtt'): + subtitles.setdefault('en', []).append({ + 'ext': ext, + 'url': video_url, + }) + elif ext == 'png': + thumbnails.append({ + 'id': format_id, + 'url': video_url, + }) + elif ext == 'smil': formats.extend(self._extract_smil_formats( video_url, video_id, fatal=False)) + elif re.match(r'https?://[^/]+\.akamaihd\.net/[iz]/', video_url): + formats.extend(self._extract_akamai_formats( + video_url, video_id, { + 'hds': path_data.get('f4m', {}).get('host'), + # nba.cdn.turner.com, ht.cdn.turner.com, ht2.cdn.turner.com + # ht3.cdn.turner.com, i.cdn.turner.com, s.cdn.turner.com + # ssl.cdn.turner.com + 'http': 'pmd.cdn.turner.com', + })) elif ext == 'm3u8': m3u8_formats = self._extract_m3u8_formats( video_url, video_id, 'mp4', @@ -129,7 +156,7 @@ class TurnerBaseIE(AdobePassIE): 'url': video_url, 'ext': ext, } - mobj = rex.search(format_id + video_url) + mobj = rex.search(video_url) if mobj: f.update({ 'width': int(mobj.group('width')), @@ -152,7 +179,6 @@ class TurnerBaseIE(AdobePassIE): formats.append(f) self._sort_formats(formats) - subtitles = {} for source in video_data.findall('closedCaptions/source'): for track in source.findall('track'): track_url = url_or_none(track.get('url')) @@ -168,12 +194,12 @@ class TurnerBaseIE(AdobePassIE): }.get(source.get('format')) }) - thumbnails = [{ - 'id': image.get('cut'), + thumbnails.extend({ + 'id': image.get('cut') or image.get('name'), 'url': image.text, 'width': int_or_none(image.get('width')), 'height': int_or_none(image.get('height')), - } for image in video_data.findall('images/image')] + } for image in video_data.findall('images/image')) is_live = xpath_text(video_data, 'isLive') == 'true' diff --git a/haruhi_dl/extractor/tv2.py b/haruhi_dl/extractor/tv2.py index 4a19b9be6..334b7d540 100644 --- a/haruhi_dl/extractor/tv2.py +++ b/haruhi_dl/extractor/tv2.py @@ -20,7 +20,7 @@ from ..utils import ( class TV2IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { 'id': '916509', @@ -33,7 +33,7 @@ class TV2IE(InfoExtractor): 'view_count': int, 'categories': list, }, - } + }] _API_DOMAIN = 'sumo.tv2.no' _PROTOCOLS = ('HDS', 'HLS', 'DASH') _GEO_COUNTRIES = ['NO'] @@ -42,6 +42,12 @@ class TV2IE(InfoExtractor): video_id = self._match_id(url) api_base = 'http://%s/api/web/asset/%s' % (self._API_DOMAIN, video_id) + asset = self._download_json( + api_base + '.json', video_id, + 'Downloading metadata JSON')['asset'] + title = asset.get('subtitle') or asset['title'] + is_live = asset.get('live') is True + formats = [] format_urls = [] for protocol in self._PROTOCOLS: @@ -81,7 +87,8 @@ class TV2IE(InfoExtractor): elif ext == 'm3u8': if not data.get('drmProtected'): formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', + video_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', m3u8_id=format_id, fatal=False)) elif ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -99,11 +106,6 @@ class TV2IE(InfoExtractor): raise ExtractorError('This video is DRM protected.', expected=True) self._sort_formats(formats) - asset = self._download_json( - api_base + '.json', video_id, - 'Downloading metadata JSON')['asset'] - title = asset['title'] - thumbnails = [{ 'id': thumbnail.get('@type'), 'url': thumbnail.get('url'), @@ -112,7 +114,7 @@ class TV2IE(InfoExtractor): return { 'id': video_id, 'url': video_url, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'description': strip_or_none(asset.get('description')), 'thumbnails': thumbnails, 'timestamp': parse_iso8601(asset.get('createTime')), @@ -120,6 +122,7 @@ class TV2IE(InfoExtractor): 'view_count': int_or_none(asset.get('views')), 'categories': asset.get('keywords', '').split(','), 'formats': formats, + 'is_live': is_live, } @@ -168,13 +171,13 @@ class TV2ArticleIE(InfoExtractor): class KatsomoIE(TV2IE): - _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv)\.fi/(?:#!/)?(?:[^/]+/[0-9a-z-]+-\d+/[0-9a-z-]+-|[^/]+/\d+/[^/]+/)(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv(uutiset)?)\.fi/(?:sarja/[0-9a-z-]+-\d+/[0-9a-z-]+-|(?:#!/)?jakso/(?:\d+/[^/]+/)?|video/prog)(?P<id>\d+)' + _TESTS = [{ 'url': 'https://www.mtv.fi/sarja/mtv-uutiset-live-33001002003/lahden-pelicans-teki-kovan-ratkaisun-ville-nieminen-pihalle-1181321', 'info_dict': { 'id': '1181321', 'ext': 'mp4', - 'title': 'MTV Uutiset Live', + 'title': 'Lahden Pelicans teki kovan ratkaisun – Ville Nieminen pihalle', 'description': 'Päätöksen teki Pelicansin hallitus.', 'timestamp': 1575116484, 'upload_date': '20191130', @@ -186,7 +189,60 @@ class KatsomoIE(TV2IE): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://www.katsomo.fi/#!/jakso/33001005/studio55-fi/658521/jukka-kuoppamaki-tekee-yha-lauluja-vaikka-lentokoneessa', + 'only_matching': True, + }, { + 'url': 'https://www.mtvuutiset.fi/video/prog1311159', + 'only_matching': True, + }, { + 'url': 'https://www.katsomo.fi/#!/jakso/1311159', + 'only_matching': True, + }] _API_DOMAIN = 'api.katsomo.fi' _PROTOCOLS = ('HLS', 'MPD') _GEO_COUNTRIES = ['FI'] + + +class MTVUutisetArticleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)mtvuutiset\.fi/artikkeli/[^/]+/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.mtvuutiset.fi/artikkeli/tallaisia-vaurioita-viking-amorellassa-on-useamman-osaston-alla-vetta/7931384', + 'info_dict': { + 'id': '1311159', + 'ext': 'mp4', + 'title': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', + 'description': 'Viking Amorellan matkustajien evakuointi on alkanut – tältä operaatio näyttää laivalla', + 'timestamp': 1600608966, + 'upload_date': '20200920', + 'duration': 153.7886666, + 'view_count': int, + 'categories': list, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # multiple Youtube embeds + 'url': 'https://www.mtvuutiset.fi/artikkeli/50-vuotta-subarun-vastaiskua/6070962', + 'only_matching': True, + }] + + def _real_extract(self, url): + article_id = self._match_id(url) + article = self._download_json( + 'http://api.mtvuutiset.fi/mtvuutiset/api/json/' + article_id, + article_id) + + def entries(): + for video in (article.get('videos') or []): + video_type = video.get('videotype') + video_url = video.get('url') + if not (video_url and video_type in ('katsomo', 'youtube')): + continue + yield self.url_result( + video_url, video_type.capitalize(), video.get('video_id')) + + return self.playlist_result( + entries(), article_id, article.get('title'), article.get('description')) diff --git a/haruhi_dl/extractor/tv4.py b/haruhi_dl/extractor/tv4.py index c498b0191..b73bab9a8 100644 --- a/haruhi_dl/extractor/tv4.py +++ b/haruhi_dl/extractor/tv4.py @@ -17,7 +17,7 @@ class TV4IE(InfoExtractor): tv4\.se/(?:[^/]+)/klipp/(?:.*)-| tv4play\.se/ (?: - (?:program|barn)/(?:[^/]+/|(?:[^\?]+)\?video_id=)| + (?:program|barn)/(?:(?:[^/]+/){1,2}|(?:[^\?]+)\?video_id=)| iframe/video/| film/| sport/| @@ -65,6 +65,10 @@ class TV4IE(InfoExtractor): { 'url': 'http://www.tv4play.se/program/farang/3922081', 'only_matching': True, + }, + { + 'url': 'https://www.tv4play.se/program/nyheterna/avsnitt/13315940', + 'only_matching': True, } ] diff --git a/haruhi_dl/extractor/tv5unis.py b/haruhi_dl/extractor/tv5unis.py new file mode 100644 index 000000000..eabdc2271 --- /dev/null +++ b/haruhi_dl/extractor/tv5unis.py @@ -0,0 +1,121 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_age_limit, + smuggle_url, + try_get, +) + + +class TV5UnisBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['CA'] + + def _real_extract(self, url): + groups = re.match(self._VALID_URL, url).groups() + product = self._download_json( + 'https://api.tv5unis.ca/graphql', groups[0], query={ + 'query': '''{ + %s(%s) { + collection { + title + } + episodeNumber + rating { + name + } + seasonNumber + tags + title + videoElement { + ... on Video { + mediaId + } + } + } +}''' % (self._GQL_QUERY_NAME, self._gql_args(groups)), + })['data'][self._GQL_QUERY_NAME] + media_id = product['videoElement']['mediaId'] + + return { + '_type': 'url_transparent', + 'id': media_id, + 'title': product.get('title'), + 'url': smuggle_url('limelight:media:' + media_id, {'geo_countries': self._GEO_COUNTRIES}), + 'age_limit': parse_age_limit(try_get(product, lambda x: x['rating']['name'])), + 'tags': product.get('tags'), + 'series': try_get(product, lambda x: x['collection']['title']), + 'season_number': int_or_none(product.get('seasonNumber')), + 'episode_number': int_or_none(product.get('episodeNumber')), + 'ie_key': 'LimelightMedia', + } + + +class TV5UnisVideoIE(TV5UnisBaseIE): + IE_NAME = 'tv5unis:video' + _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/[^/]+/(?P<id>\d+)' + _TEST = { + 'url': 'https://www.tv5unis.ca/videos/bande-annonces/71843', + 'md5': '3d794164928bda97fb87a17e89923d9b', + 'info_dict': { + 'id': 'a883684aecb2486cad9bdc7bbe17f861', + 'ext': 'mp4', + 'title': 'Watatatow', + 'duration': 10.01, + } + } + _GQL_QUERY_NAME = 'productById' + + @staticmethod + def _gql_args(groups): + return 'id: %s' % groups + + +class TV5UnisIE(TV5UnisBaseIE): + IE_NAME = 'tv5unis' + _VALID_URL = r'https?://(?:www\.)?tv5unis\.ca/videos/(?P<id>[^/]+)(?:/saisons/(?P<season_number>\d+)/episodes/(?P<episode_number>\d+))?/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://www.tv5unis.ca/videos/watatatow/saisons/6/episodes/1', + 'md5': 'a479907d2e531a73e1f8dc48d6388d02', + 'info_dict': { + 'id': 'e5ee23a586c44612a56aad61accf16ef', + 'ext': 'mp4', + 'title': 'Je ne peux pas lui résister', + 'description': "Atys, le nouveau concierge de l'école, a réussi à ébranler la confiance de Mado en affirmant qu\'une médaille, ce n'est que du métal. Comme Mado essaie de lui prouver que ses valeurs sont solides, il veut la mettre à l'épreuve...", + 'subtitles': { + 'fr': 'count:1', + }, + 'duration': 1370, + 'age_limit': 8, + 'tags': 'count:3', + 'series': 'Watatatow', + 'season_number': 6, + 'episode_number': 1, + }, + }, { + 'url': 'https://www.tv5unis.ca/videos/le-voyage-de-fanny', + 'md5': '9ca80ebb575c681d10cae1adff3d4774', + 'info_dict': { + 'id': '726188eefe094d8faefb13381d42bc06', + 'ext': 'mp4', + 'title': 'Le voyage de Fanny', + 'description': "Fanny, 12 ans, cachée dans un foyer loin de ses parents, s'occupe de ses deux soeurs. Devant fuir, Fanny prend la tête d'un groupe de huit enfants et s'engage dans un dangereux périple à travers la France occupée pour rejoindre la frontière suisse.", + 'subtitles': { + 'fr': 'count:1', + }, + 'duration': 5587.034, + 'tags': 'count:4', + }, + }] + _GQL_QUERY_NAME = 'productByRootProductSlug' + + @staticmethod + def _gql_args(groups): + args = 'rootProductSlug: "%s"' % groups[0] + if groups[1]: + args += ', seasonNumber: %s, episodeNumber: %s' % groups[1:] + return args diff --git a/haruhi_dl/extractor/tva.py b/haruhi_dl/extractor/tva.py index 443f46e8a..52a4ddf32 100644 --- a/haruhi_dl/extractor/tva.py +++ b/haruhi_dl/extractor/tva.py @@ -4,7 +4,9 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( float_or_none, + int_or_none, smuggle_url, + strip_or_none, ) @@ -23,7 +25,8 @@ class TVAIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'https://video.tva.ca/details/_5596811470001', 'only_matching': True, @@ -32,26 +35,54 @@ class TVAIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'https://videos.tva.ca/proxy/item/_' + video_id, video_id, headers={ - 'Accept': 'application/json', - }, query={ - 'appId': '5955fc5f23eec60006c951f1', - }) - - def get_attribute(key): - for attribute in video_data.get('attributes', []): - if attribute.get('key') == key: - return attribute.get('value') - return None return { '_type': 'url_transparent', 'id': video_id, - 'title': get_attribute('title'), 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['CA']}), - 'description': get_attribute('description'), - 'thumbnail': get_attribute('image-background') or get_attribute('image-landscape'), - 'duration': float_or_none(get_attribute('video-duration'), 1000), 'ie_key': 'BrightcoveNew', } + + +class QubIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?qub\.ca/(?:[^/]+/)*[0-9a-z-]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.qub.ca/tvaplus/tva/alerte-amber/saison-1/episode-01-1000036619', + 'md5': '949490fd0e7aee11d0543777611fbd53', + 'info_dict': { + 'id': '6084352463001', + 'ext': 'mp4', + 'title': 'Épisode 01', + 'uploader_id': '5481942443001', + 'upload_date': '20190907', + 'timestamp': 1567899756, + 'description': 'md5:9c0d7fbb90939420c651fd977df90145', + }, + }, { + 'url': 'https://www.qub.ca/tele/video/lcn-ca-vous-regarde-rev-30s-ap369664-1009357943', + 'only_matching': True, + }] + # reference_id also works with old account_id(5481942443001) + # BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5813221784001/default_default/index.html?videoId=ref:%s' + + def _real_extract(self, url): + entity_id = self._match_id(url) + entity = self._download_json( + 'https://www.qub.ca/proxy/pfu/content-delivery-service/v1/entities', + entity_id, query={'id': entity_id}) + video_id = entity['videoId'] + episode = strip_or_none(entity.get('name')) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': episode, + # 'url': self.BRIGHTCOVE_URL_TEMPLATE % entity['referenceId'], + 'url': 'https://videos.tva.ca/details/_' + video_id, + 'description': entity.get('longDescription'), + 'duration': float_or_none(entity.get('durationMillis'), 1000), + 'episode': episode, + 'episode_number': int_or_none(entity.get('episodeNumber')), + # 'ie_key': 'BrightcoveNew', + 'ie_key': TVAIE.ie_key(), + } diff --git a/haruhi_dl/extractor/tver.py b/haruhi_dl/extractor/tver.py new file mode 100644 index 000000000..931d4d650 --- /dev/null +++ b/haruhi_dl/extractor/tver.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + remove_start, + smuggle_url, + try_get, +) + + +class TVerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P<path>(?:corner|episode|feature)/(?P<id>f?\d+))' + # videos are only available for 7 days + _TESTS = [{ + 'url': 'https://tver.jp/corner/f0062178', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/feature/f0062413', + 'only_matching': True, + }, { + 'url': 'https://tver.jp/episode/79622438', + 'only_matching': True, + }] + _TOKEN = None + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' + + def _real_initialize(self): + self._TOKEN = self._download_json( + 'https://tver.jp/api/access_token.php', None)['token'] + + def _real_extract(self, url): + path, video_id = re.match(self._VALID_URL, url).groups() + main = self._download_json( + 'https://api.tver.jp/v4/' + path, video_id, + query={'token': self._TOKEN})['main'] + p_id = main['publisher_id'] + service = remove_start(main['service'], 'ts_') + info = { + '_type': 'url_transparent', + 'description': try_get(main, lambda x: x['note'][0]['text'], compat_str), + 'episode_number': int_or_none(try_get(main, lambda x: x['ext']['episode_number'])), + } + + if service == 'cx': + info.update({ + 'title': main.get('subtitle') or main['title'], + 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id), + 'ie_key': 'FujiTVFODPlus7', + }) + else: + r_id = main['reference_id'] + if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): + r_id = 'ref:' + r_id + bc_url = smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), + {'geo_countries': ['JP']}) + info.update({ + 'url': bc_url, + 'ie_key': 'BrightcoveNew', + }) + + return info diff --git a/haruhi_dl/extractor/tvplay.py b/haruhi_dl/extractor/tvplay.py index 3c2450dd0..0d858c025 100644 --- a/haruhi_dl/extractor/tvplay.py +++ b/haruhi_dl/extractor/tvplay.py @@ -12,11 +12,13 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + parse_duration, parse_iso8601, qualities, try_get, update_url_query, url_or_none, + urljoin, ) @@ -414,7 +416,7 @@ class ViafreeIE(InfoExtractor): class TVPlayHomeIE(InfoExtractor): - _VALID_URL = r'https?://tvplay\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/[^/]+/[^/?#&]+-(?P<id>\d+)' + _VALID_URL = r'https?://(?:tv3?)?play\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/(?:[^/]+/)*[^/?#&]+-(?P<id>\d+)' _TESTS = [{ 'url': 'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/', 'info_dict': { @@ -433,80 +435,58 @@ class TVPlayHomeIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [TVPlayIE.ie_key()], }, { 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/', 'only_matching': True, }, { 'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/', 'only_matching': True, + }, { + 'url': 'https://play.tv3.lt/aferistai-10047125', + 'only_matching': True, + }, { + 'url': 'https://tv3play.skaties.lv/vinas-melo-labak-10280317', + 'only_matching': True, + }, { + 'url': 'https://play.tv3.ee/cool-d-ga-mehhikosse-10044354', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + asset = self._download_json( + urljoin(url, '/sb/public/asset/' + video_id), video_id) - video_id = self._search_regex( - r'data-asset-id\s*=\s*["\'](\d{5,})\b', webpage, 'video id') - - if len(video_id) < 8: - return self.url_result( - 'mtg:%s' % video_id, ie=TVPlayIE.ie_key(), video_id=video_id) - - m3u8_url = self._search_regex( - r'data-file\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'm3u8 url', group='url') + m3u8_url = asset['movie']['contentUrl'] + video_id = asset['assetId'] + asset_title = asset['title'] + title = asset_title['title'] formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) - title = self._search_regex( - r'data-title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, - 'title', default=None, group='value') or self._html_search_meta( - 'title', webpage, default=None) or self._og_search_title( - webpage) + thumbnails = None + image_url = asset.get('imageUrl') + if image_url: + thumbnails = [{ + 'url': urljoin(url, image_url), + 'ext': 'jpg', + }] - description = self._html_search_meta( - 'description', webpage, - default=None) or self._og_search_description(webpage) - - thumbnail = self._search_regex( - r'data-image\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'thumbnail', default=None, group='url') or self._html_search_meta( - 'thumbnail', webpage, default=None) or self._og_search_thumbnail( - webpage) - - duration = int_or_none(self._search_regex( - r'data-duration\s*=\s*["\'](\d+)', webpage, 'duration', - fatal=False)) - - season = self._search_regex( - (r'data-series-title\s*=\s*(["\'])[^/]+/(?P<value>(?:(?!\1).)+)\1', - r'\bseason\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, - 'season', default=None, group='value') - season_number = int_or_none(self._search_regex( - r'(\d+)(?:[.\s]+sezona|\s+HOOAEG)', season or '', 'season number', - default=None)) - episode = self._search_regex( - (r'\bepisode\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', - r'data-subtitle\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage, - 'episode', default=None, group='value') - episode_number = int_or_none(self._search_regex( - r'(?:S[eē]rija|Osa)\s+(\d+)', episode or '', 'episode number', - default=None)) + metadata = asset.get('metadata') or {} return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'season': season, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, + 'description': asset_title.get('summaryLong') or asset_title.get('summaryShort'), + 'thumbnails': thumbnails, + 'duration': parse_duration(asset_title.get('runTime')), + 'series': asset.get('tvSeriesTitle'), + 'season': asset.get('tvSeasonTitle'), + 'season_number': int_or_none(metadata.get('seasonNumber')), + 'episode': asset_title.get('titleBrief'), + 'episode_number': int_or_none(metadata.get('episodeNumber')), 'formats': formats, } diff --git a/haruhi_dl/extractor/twitcasting.py b/haruhi_dl/extractor/twitcasting.py index 2dbe89f5b..6596eef9f 100644 --- a/haruhi_dl/extractor/twitcasting.py +++ b/haruhi_dl/extractor/twitcasting.py @@ -1,11 +1,20 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import urlencode_postdata - import re +from .common import InfoExtractor +from ..utils import ( + clean_html, + float_or_none, + get_element_by_class, + get_element_by_id, + parse_duration, + str_to_int, + unified_timestamp, + urlencode_postdata, +) + class TwitCastingIE(InfoExtractor): _VALID_URL = r'https?://(?:[^/]+\.)?twitcasting\.tv/(?P<uploader_id>[^/]+)/movie/(?P<id>\d+)' @@ -17,8 +26,12 @@ class TwitCastingIE(InfoExtractor): 'ext': 'mp4', 'title': 'Live #2357609', 'uploader_id': 'ivetesangalo', - 'description': "Moi! I'm live on TwitCasting from my iPhone.", + 'description': 'Twitter Oficial da cantora brasileira Ivete Sangalo.', 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20110822', + 'timestamp': 1314010824, + 'duration': 32, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -30,8 +43,12 @@ class TwitCastingIE(InfoExtractor): 'ext': 'mp4', 'title': 'Live playing something #3689740', 'uploader_id': 'mttbernardini', - 'description': "I'm live on TwitCasting from my iPad. password: abc (Santa Marinella/Lazio, Italia)", + 'description': 'Salve, io sono Matto (ma con la e). Questa è la mia presentazione, in quanto sono letteralmente matto (nel senso di strano), con qualcosa in più.', 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20120212', + 'timestamp': 1329028024, + 'duration': 681, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -40,9 +57,7 @@ class TwitCastingIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - uploader_id = mobj.group('uploader_id') + uploader_id, video_id = re.match(self._VALID_URL, url).groups() video_password = self._downloader.params.get('videopassword') request_data = None @@ -52,30 +67,45 @@ class TwitCastingIE(InfoExtractor): }) webpage = self._download_webpage(url, video_id, data=request_data) - title = self._html_search_regex( - r'(?s)<[^>]+id=["\']movietitle[^>]+>(.+?)</', - webpage, 'title', default=None) or self._html_search_meta( - 'twitter:title', webpage, fatal=True) + title = clean_html(get_element_by_id( + 'movietitle', webpage)) or self._html_search_meta( + ['og:title', 'twitter:title'], webpage, fatal=True) + video_js_data = {} m3u8_url = self._search_regex( - (r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - r'(["\'])(?P<url>http.+?\.m3u8.*?)\1'), - webpage, 'm3u8 url', group='url') + r'data-movie-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'm3u8 url', group='url', default=None) + if not m3u8_url: + video_js_data = self._parse_json(self._search_regex( + r"data-movie-playlist='(\[[^']+\])'", + webpage, 'movie playlist'), video_id)[0] + m3u8_url = video_js_data['source']['url'] + # use `m3u8` entry_protocol until EXT-X-MAP is properly supported by `m3u8_native` entry_protocol formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + m3u8_url, video_id, 'mp4', m3u8_id='hls') - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'twitter:description', webpage) + thumbnail = video_js_data.get('thumbnailUrl') or self._og_search_thumbnail(webpage) + description = clean_html(get_element_by_id( + 'authorcomment', webpage)) or self._html_search_meta( + ['description', 'og:description', 'twitter:description'], webpage) + duration = float_or_none(video_js_data.get( + 'duration'), 1000) or parse_duration(clean_html( + get_element_by_class('tw-player-duration-time', webpage))) + view_count = str_to_int(self._search_regex( + r'Total\s*:\s*([\d,]+)\s*Views', webpage, 'views', None)) + timestamp = unified_timestamp(self._search_regex( + r'data-toggle="true"[^>]+datetime="([^"]+)"', + webpage, 'datetime', None)) return { 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, + 'timestamp': timestamp, 'uploader_id': uploader_id, + 'duration': duration, + 'view_count': view_count, 'formats': formats, } diff --git a/haruhi_dl/extractor/twitch.py b/haruhi_dl/extractor/twitch.py index ab6654432..a7867f4d3 100644 --- a/haruhi_dl/extractor/twitch.py +++ b/haruhi_dl/extractor/twitch.py @@ -9,7 +9,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_kwargs, compat_parse_qs, compat_str, compat_urlparse, @@ -18,6 +17,7 @@ from ..compat import ( ) from ..utils import ( clean_html, + dict_get, ExtractorError, float_or_none, int_or_none, @@ -42,30 +42,16 @@ class TwitchBaseIE(InfoExtractor): _CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko' _NETRC_MACHINE = 'twitch' - def _handle_error(self, response): - if not isinstance(response, dict): - return - error = response.get('error') - if error: - raise ExtractorError( - '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')), - expected=True) - - def _call_api(self, path, item_id, *args, **kwargs): - headers = kwargs.get('headers', {}).copy() - headers.update({ - 'Accept': 'application/vnd.twitchtv.v5+json; charset=UTF-8', - 'Client-ID': self._CLIENT_ID, - }) - kwargs.update({ - 'headers': headers, - 'expected_status': (400, 410), - }) - response = self._download_json( - '%s/%s' % (self._API_BASE, path), item_id, - *args, **compat_kwargs(kwargs)) - self._handle_error(response) - return response + _OPERATION_HASHES = { + 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14', + 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb', + 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777', + 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84', + 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e', + 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', + 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', + 'VideoMetadata': '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687', + } def _real_initialize(self): self._login() @@ -91,14 +77,14 @@ class TwitchBaseIE(InfoExtractor): headers = { 'Referer': page_url, - 'Origin': page_url, + 'Origin': 'https://www.twitch.tv', 'Content-Type': 'text/plain;charset=UTF-8', } response = self._download_json( post_url, None, note, data=json.dumps(form).encode(), headers=headers, expected_status=400) - error = response.get('error_description') or response.get('error_code') + error = dict_get(response, ('error', 'error_description', 'error_code')) if error: fail(error) @@ -151,13 +137,50 @@ class TwitchBaseIE(InfoExtractor): }) self._sort_formats(formats) - def _download_access_token(self, channel_name): - return self._call_api( - 'api/channels/%s/access_token' % channel_name, channel_name, - 'Downloading access token JSON') + def _download_base_gql(self, video_id, ops, note, fatal=True): + headers = { + 'Content-Type': 'text/plain;charset=UTF-8', + 'Client-ID': self._CLIENT_ID, + } + gql_auth = self._get_cookies('https://gql.twitch.tv').get('auth-token') + if gql_auth: + headers['Authorization'] = 'OAuth ' + gql_auth.value + return self._download_json( + 'https://gql.twitch.tv/gql', video_id, note, + data=json.dumps(ops).encode(), + headers=headers, fatal=fatal) - def _extract_channel_id(self, token, channel_name): - return compat_str(self._parse_json(token, channel_name)['channel_id']) + def _download_gql(self, video_id, ops, note, fatal=True): + for op in ops: + op['extensions'] = { + 'persistedQuery': { + 'version': 1, + 'sha256Hash': self._OPERATION_HASHES[op['operationName']], + } + } + return self._download_base_gql(video_id, ops, note) + + def _download_access_token(self, video_id, token_kind, param_name): + method = '%sPlaybackAccessToken' % token_kind + ops = { + 'query': '''{ + %s( + %s: "%s", + params: { + platform: "web", + playerBackend: "mediaplayer", + playerType: "site" + } + ) + { + value + signature + } + }''' % (method, param_name, video_id), + } + return self._download_base_gql( + video_id, ops, + 'Downloading %s access token GraphQL' % token_kind)['data'][method] class TwitchVodIE(TwitchBaseIE): @@ -170,8 +193,6 @@ class TwitchVodIE(TwitchBaseIE): ) (?P<id>\d+) ''' - _ITEM_TYPE = 'vod' - _ITEM_SHORTCUT = 'v' _TESTS = [{ 'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s', @@ -181,7 +202,7 @@ class TwitchVodIE(TwitchBaseIE): 'title': 'LCK Summer Split - Week 6 Day 1', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 17208, - 'timestamp': 1435131709, + 'timestamp': 1435131734, 'upload_date': '20150624', 'uploader': 'Riot Games', 'uploader_id': 'riotgames', @@ -230,10 +251,20 @@ class TwitchVodIE(TwitchBaseIE): }] def _download_info(self, item_id): - return self._extract_info( - self._call_api( - 'kraken/videos/%s' % item_id, item_id, - 'Downloading video info JSON')) + data = self._download_gql( + item_id, [{ + 'operationName': 'VideoMetadata', + 'variables': { + 'channelLogin': '', + 'videoID': item_id, + }, + }], + 'Downloading stream metadata GraphQL')[0]['data'] + video = data.get('video') + if video is None: + raise ExtractorError( + 'Video %s does not exist' % item_id, expected=True) + return self._extract_info_gql(video, item_id) @staticmethod def _extract_info(info): @@ -272,13 +303,33 @@ class TwitchVodIE(TwitchBaseIE): 'is_live': is_live, } + @staticmethod + def _extract_info_gql(info, item_id): + vod_id = info.get('id') or item_id + # id backward compatibility for download archives + if vod_id[0] != 'v': + vod_id = 'v%s' % vod_id + thumbnail = url_or_none(info.get('previewThumbnailURL')) + if thumbnail: + for p in ('width', 'height'): + thumbnail = thumbnail.replace('{%s}' % p, '0') + return { + 'id': vod_id, + 'title': info.get('title') or 'Untitled Broadcast', + 'description': info.get('description'), + 'duration': int_or_none(info.get('lengthSeconds')), + 'thumbnail': thumbnail, + 'uploader': try_get(info, lambda x: x['owner']['displayName'], compat_str), + 'uploader_id': try_get(info, lambda x: x['owner']['login'], compat_str), + 'timestamp': unified_timestamp(info.get('publishedAt')), + 'view_count': int_or_none(info.get('viewCount')), + } + def _real_extract(self, url): vod_id = self._match_id(url) info = self._download_info(vod_id) - access_token = self._call_api( - 'api/vods/%s/access_token' % vod_id, vod_id, - 'Downloading %s access token' % self._ITEM_TYPE) + access_token = self._download_access_token(vod_id, 'video', 'id') formats = self._extract_m3u8_formats( '%s/vod/%s.m3u8?%s' % ( @@ -289,8 +340,8 @@ class TwitchVodIE(TwitchBaseIE): 'allow_spectre': 'true', 'player': 'twitchweb', 'playlist_include_framerate': 'true', - 'nauth': access_token['token'], - 'nauthsig': access_token['sig'], + 'nauth': access_token['value'], + 'nauthsig': access_token['signature'], })), vod_id, 'mp4', entry_protocol='m3u8_native') @@ -333,37 +384,7 @@ def _make_video_result(node): } -class TwitchGraphQLBaseIE(TwitchBaseIE): - _PAGE_LIMIT = 100 - - _OPERATION_HASHES = { - 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14', - 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb', - 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777', - 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84', - 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e', - 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', - 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', - } - - def _download_gql(self, video_id, ops, note, fatal=True): - for op in ops: - op['extensions'] = { - 'persistedQuery': { - 'version': 1, - 'sha256Hash': self._OPERATION_HASHES[op['operationName']], - } - } - return self._download_json( - 'https://gql.twitch.tv/gql', video_id, note, - data=json.dumps(ops).encode(), - headers={ - 'Content-Type': 'text/plain;charset=UTF-8', - 'Client-ID': self._CLIENT_ID, - }, fatal=fatal) - - -class TwitchCollectionIE(TwitchGraphQLBaseIE): +class TwitchCollectionIE(TwitchBaseIE): _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/collections/(?P<id>[^/]+)' _TESTS = [{ @@ -400,7 +421,9 @@ class TwitchCollectionIE(TwitchGraphQLBaseIE): entries, playlist_id=collection_id, playlist_title=title) -class TwitchPlaylistBaseIE(TwitchGraphQLBaseIE): +class TwitchPlaylistBaseIE(TwitchBaseIE): + _PAGE_LIMIT = 100 + def _entries(self, channel_name, *args): cursor = None variables_common = self._make_variables(channel_name, *args) @@ -440,49 +463,6 @@ class TwitchPlaylistBaseIE(TwitchGraphQLBaseIE): if not cursor or not isinstance(cursor, compat_str): break - # Deprecated kraken v5 API - def _entries_kraken(self, channel_name, broadcast_type, sort): - access_token = self._download_access_token(channel_name) - channel_id = self._extract_channel_id(access_token['token'], channel_name) - offset = 0 - counter_override = None - for counter in itertools.count(1): - response = self._call_api( - 'kraken/channels/%s/videos/' % channel_id, - channel_id, - 'Downloading video JSON page %s' % (counter_override or counter), - query={ - 'offset': offset, - 'limit': self._PAGE_LIMIT, - 'broadcast_type': broadcast_type, - 'sort': sort, - }) - videos = response.get('videos') - if not isinstance(videos, list): - break - for video in videos: - if not isinstance(video, dict): - continue - video_url = url_or_none(video.get('url')) - if not video_url: - continue - yield { - '_type': 'url_transparent', - 'ie_key': TwitchVodIE.ie_key(), - 'id': video.get('_id'), - 'url': video_url, - 'title': video.get('title'), - 'description': video.get('description'), - 'timestamp': unified_timestamp(video.get('published_at')), - 'duration': float_or_none(video.get('length')), - 'view_count': int_or_none(video.get('views')), - 'language': video.get('language'), - } - offset += self._PAGE_LIMIT - total = int_or_none(response.get('_total')) - if total and offset >= total: - break - class TwitchVideosIE(TwitchPlaylistBaseIE): _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/(?P<id>[^/]+)/(?:videos|profile)' @@ -724,7 +704,7 @@ class TwitchVideosCollectionsIE(TwitchPlaylistBaseIE): playlist_title='%s - Collections' % channel_name) -class TwitchStreamIE(TwitchGraphQLBaseIE): +class TwitchStreamIE(TwitchBaseIE): IE_NAME = 'twitch:stream' _VALID_URL = r'''(?x) https?:// @@ -814,8 +794,9 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): if not stream: raise ExtractorError('%s is offline' % channel_name, expected=True) - access_token = self._download_access_token(channel_name) - token = access_token['token'] + access_token = self._download_access_token( + channel_name, 'stream', 'channelName') + token = access_token['value'] stream_id = stream.get('id') or channel_name query = { @@ -826,7 +807,7 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): 'player': 'twitchweb', 'playlist_include_framerate': 'true', 'segment_preference': '4', - 'sig': access_token['sig'].encode('utf-8'), + 'sig': access_token['signature'].encode('utf-8'), 'token': token.encode('utf-8'), } formats = self._extract_m3u8_formats( @@ -912,8 +893,8 @@ class TwitchClipsIE(TwitchBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - clip = self._download_json( - 'https://gql.twitch.tv/gql', video_id, data=json.dumps({ + clip = self._download_base_gql( + video_id, { 'query': '''{ clip(slug: "%s") { broadcaster { @@ -937,10 +918,7 @@ class TwitchClipsIE(TwitchBaseIE): } viewCount } -}''' % video_id, - }).encode(), headers={ - 'Client-ID': self._CLIENT_ID, - })['data']['clip'] +}''' % video_id}, 'Downloading clip GraphQL')['data']['clip'] if not clip: raise ExtractorError( diff --git a/haruhi_dl/extractor/twitter.py b/haruhi_dl/extractor/twitter.py index 4284487db..ed495f297 100644 --- a/haruhi_dl/extractor/twitter.py +++ b/haruhi_dl/extractor/twitter.py @@ -251,10 +251,10 @@ class TwitterIE(TwitterBaseIE): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'simon vetugo - BEAT PROD: @suhmeduh #Damndaniel', + 'title': 'simon vertugo - BEAT PROD: @suhmeduh #Damndaniel', 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'simon vetugo', + 'uploader': 'simon vertugo', 'uploader_id': 'simonvertugo', 'duration': 30.0, 'timestamp': 1455777459, @@ -312,6 +312,7 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1492000653, 'upload_date': '20170412', }, + 'skip': 'Account suspended', }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', 'info_dict': { @@ -372,6 +373,24 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': '1eVjYOLGkGrQL', }, 'add_ie': ['TwitterBroadcast'], + }, { + # unified card + 'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20', + 'info_dict': { + 'id': '1349794411333394432', + 'ext': 'mp4', + 'title': 'md5:d1c4941658e4caaa6cb579260d85dcba', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:71ead15ec44cee55071547d6447c6a3e', + 'uploader': 'Brooklyn Nets', + 'uploader_id': 'BrooklynNets', + 'duration': 324.484, + 'timestamp': 1610651040, + 'upload_date': '20210114', + }, + 'params': { + 'skip_download': True, + }, }, { # Twitch Clip Embed 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', @@ -380,6 +399,30 @@ class TwitterIE(TwitterBaseIE): # promo_video_website card 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', 'only_matching': True, + }, { + # promo_video_convo card + 'url': 'https://twitter.com/poco_dandy/status/1047395834013384704', + 'only_matching': True, + }, { + # appplayer card + 'url': 'https://twitter.com/poco_dandy/status/1150646424461176832', + 'only_matching': True, + }, { + # video_direct_message card + 'url': 'https://twitter.com/qarev001/status/1348948114569269251', + 'only_matching': True, + }, { + # poll2choice_video card + 'url': 'https://twitter.com/CAF_Online/status/1349365911120195585', + 'only_matching': True, + }, { + # poll3choice_video card + 'url': 'https://twitter.com/SamsungMobileSA/status/1348609186725289984', + 'only_matching': True, + }, { + # poll4choice_video card + 'url': 'https://twitter.com/SouthamptonFC/status/1347577658079641604', + 'only_matching': True, }] def _real_extract(self, url): @@ -424,8 +467,7 @@ class TwitterIE(TwitterBaseIE): 'tags': tags, } - media = try_get(status, lambda x: x['extended_entities']['media'][0]) - if media and media.get('type') != 'photo': + def extract_from_video_info(media): video_info = media.get('video_info') or {} formats = [] @@ -452,6 +494,10 @@ class TwitterIE(TwitterBaseIE): 'thumbnails': thumbnails, 'duration': float_or_none(video_info.get('duration_millis'), 1000), }) + + media = try_get(status, lambda x: x['extended_entities']['media'][0]) + if media and media.get('type') != 'photo': + extract_from_video_info(media) else: card = status.get('card') if card: @@ -462,7 +508,35 @@ class TwitterIE(TwitterBaseIE): return try_get(o, lambda x: x[x['type'].lower() + '_value']) card_name = card['name'].split(':')[-1] - if card_name in ('amplify', 'promo_video_website'): + if card_name == 'player': + info.update({ + '_type': 'url', + 'url': get_binding_value('player_url'), + }) + elif card_name == 'periscope_broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('url') or get_binding_value('player_url'), + 'ie_key': PeriscopeIE.ie_key(), + }) + elif card_name == 'broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('broadcast_url'), + 'ie_key': TwitterBroadcastIE.ie_key(), + }) + elif card_name == 'summary': + info.update({ + '_type': 'url', + 'url': get_binding_value('card_url'), + }) + elif card_name == 'unified_card': + media_entities = self._parse_json(get_binding_value('unified_card'), twid)['media_entities'] + extract_from_video_info(next(iter(media_entities.values()))) + # amplify, promo_video_website, promo_video_convo, appplayer, + # video_direct_message, poll2choice_video, poll3choice_video, + # poll4choice_video, ... + else: is_amplify = card_name == 'amplify' vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) @@ -488,25 +562,6 @@ class TwitterIE(TwitterBaseIE): 'duration': int_or_none(get_binding_value( 'content_duration_seconds')), }) - elif card_name == 'player': - info.update({ - '_type': 'url', - 'url': get_binding_value('player_url'), - }) - elif card_name == 'periscope_broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('url') or get_binding_value('player_url'), - 'ie_key': PeriscopeIE.ie_key(), - }) - elif card_name == 'broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('broadcast_url'), - 'ie_key': TwitterBroadcastIE.ie_key(), - }) - else: - raise ExtractorError('Unsupported Twitter Card.') else: expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url']) if not expanded_url: diff --git a/haruhi_dl/extractor/uktvplay.py b/haruhi_dl/extractor/uktvplay.py index 2137502a1..f28fd514d 100644 --- a/haruhi_dl/extractor/uktvplay.py +++ b/haruhi_dl/extractor/uktvplay.py @@ -5,10 +5,9 @@ from .common import InfoExtractor class UKTVPlayIE(InfoExtractor): - _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/.+?\?.*?\bvideo=(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P<id>\d+)' + _TESTS = [{ 'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001', - 'md5': '', 'info_dict': { 'id': '2117008346001', 'ext': 'mp4', @@ -23,7 +22,11 @@ class UKTVPlayIE(InfoExtractor): 'skip_download': True, }, 'expected_warnings': ['Failed to download MPD manifest'] - } + }, { + 'url': 'https://uktvplay.uktv.co.uk/shows/africa/watch-online/5983349675001', + 'only_matching': True, + }] + # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/1242911124001/OrCyvJ2gyL_default/index.html?videoId=%s' BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911124001/H1xnMOqP_default/index.html?videoId=%s' def _real_extract(self, url): diff --git a/haruhi_dl/extractor/urplay.py b/haruhi_dl/extractor/urplay.py index 6030b7cb5..5452c7ca1 100644 --- a/haruhi_dl/extractor/urplay.py +++ b/haruhi_dl/extractor/urplay.py @@ -2,7 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_timestamp +from ..utils import ( + dict_get, + int_or_none, + unified_timestamp, +) class URPlayIE(InfoExtractor): @@ -15,8 +19,8 @@ class URPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd', 'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a', - 'timestamp': 1513512768, - 'upload_date': '20171217', + 'timestamp': 1513292400, + 'upload_date': '20171214', }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', @@ -25,7 +29,7 @@ class URPlayIE(InfoExtractor): 'ext': 'mp4', 'title': 'Tripp, Trapp, Träd : Sovkudde', 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', - 'timestamp': 1440093600, + 'timestamp': 1440086400, 'upload_date': '20150820', }, }, { @@ -35,37 +39,58 @@ class URPlayIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - + url = url.replace('skola.se/Produkter', 'play.se/program') webpage = self._download_webpage(url, video_id) - urplayer_data = self._parse_json(self._search_regex( - r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id) - host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] + urplayer_data = self._parse_json(self._html_search_regex( + r'data-react-class="routes/Product/components/ProgramContainer/ProgramContainer"[^>]+data-react-props="({.+?})"', + webpage, 'urplayer data'), video_id)['accessibleEpisodes'][0] + episode = urplayer_data['title'] + raw_streaming_info = urplayer_data['streamingInfo']['raw'] + host = self._download_json( + 'http://streaming-loadbalancer.ur.se/loadbalancer.json', + video_id)['redirect'] formats = [] - for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)): - file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr) + for k, v in raw_streaming_info.items(): + if not (k in ('sd', 'hd') and isinstance(v, dict)): + continue + file_http = v.get('location') if file_http: formats.extend(self._extract_wowza_formats( - 'http://%s/%splaylist.m3u8' % (host, file_http), video_id, skip_protocols=['rtmp', 'rtsp'])) + 'http://%s/%splaylist.m3u8' % (host, file_http), + video_id, skip_protocols=['f4m', 'rtmp', 'rtsp'])) self._sort_formats(formats) - subtitles = {} - for subtitle in urplayer_data.get('subtitles', []): - subtitle_url = subtitle.get('file') - kind = subtitle.get('kind') - if not subtitle_url or (kind and kind != 'captions'): - continue - subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({ - 'url': subtitle_url, - }) + image = urplayer_data.get('image') or {} + thumbnails = [] + for k, v in image.items(): + t = { + 'id': k, + 'url': v, + } + wh = k.split('x') + if len(wh) == 2: + t.update({ + 'width': int_or_none(wh[0]), + 'height': int_or_none(wh[1]), + }) + thumbnails.append(t) + + series = urplayer_data.get('series') or {} + series_title = dict_get(series, ('seriesTitle', 'title')) or dict_get(urplayer_data, ('seriesTitle', 'mainTitle')) return { 'id': video_id, - 'title': urplayer_data['title'], - 'description': self._og_search_description(webpage), - 'thumbnail': urplayer_data.get('image'), - 'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), webpage, 'timestamp')), - 'series': urplayer_data.get('series_title'), - 'subtitles': subtitles, + 'title': '%s : %s' % (series_title, episode) if series_title else episode, + 'description': urplayer_data.get('description'), + 'thumbnails': thumbnails, + 'timestamp': unified_timestamp(urplayer_data.get('publishedAt')), + 'series': series_title, 'formats': formats, + 'duration': int_or_none(urplayer_data.get('duration')), + 'categories': urplayer_data.get('categories'), + 'tags': urplayer_data.get('keywords'), + 'season': series.get('label'), + 'episode': episode, + 'episode_number': int_or_none(urplayer_data.get('episodeNumber')), } diff --git a/haruhi_dl/extractor/usanetwork.py b/haruhi_dl/extractor/usanetwork.py index 54c7495cc..e3784e55f 100644 --- a/haruhi_dl/extractor/usanetwork.py +++ b/haruhi_dl/extractor/usanetwork.py @@ -1,74 +1,24 @@ # coding: utf-8 from __future__ import unicode_literals -from .adobepass import AdobePassIE -from ..utils import ( - NO_DEFAULT, - smuggle_url, - update_url_query, -) +from .nbc import NBCIE -class USANetworkIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity', - 'md5': '33c0d2ba381571b414024440d08d57fd', +class USANetworkIE(NBCIE): + _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/[^/]+/video/[^/]+/(?P<id>\d+))' + _TESTS = [{ + 'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302', 'info_dict': { - 'id': '3086229', + 'id': '4185302', 'ext': 'mp4', - 'title': 'HPE Cybersecurity', - 'description': 'The more we digitize our world, the more vulnerable we are.', - 'upload_date': '20160818', - 'timestamp': 1471535460, - 'uploader': 'NBCU-USA', + 'title': 'Intelligence (Trailer)', + 'description': 'A maverick NSA agent enlists the help of a junior systems analyst in a workplace power grab.', + 'upload_date': '20200715', + 'timestamp': 1594785600, + 'uploader': 'NBCU-MPAT', }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - def _x(name, default=NO_DEFAULT): - return self._search_regex( - r'data-%s\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % name, - webpage, name, default=default, group='value') - - video_id = _x('mpx-guid') - title = _x('episode-title') - mpx_account_id = _x('mpx-account-id', '2304992029') - - query = { - 'mbr': 'true', - } - if _x('is-full-episode', None) == '1': - query['manifest'] = 'm3u' - - if _x('is-entitlement', None) == '1': - adobe_pass = {} - drupal_settings = self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings', fatal=False) - if drupal_settings: - drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False) - if drupal_settings: - adobe_pass = drupal_settings.get('adobePass', {}) - resource = self._get_mvpd_resource( - adobe_pass.get('adobePassResourceId', 'usa'), - title, video_id, _x('episode-rating', 'TV-14')) - query['auth'] = self._extract_mvpd_auth( - url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource) - - info = self._search_json_ld(webpage, video_id, default={}) - info.update({ - '_type': 'url_transparent', - 'url': smuggle_url(update_url_query( - 'http://link.theplatform.com/s/HNK2IC/media/guid/%s/%s' % (mpx_account_id, video_id), - query), {'force_smil_url': True}), - 'id': video_id, - 'title': title, - 'series': _x('show-title', None), - 'episode': title, - 'ie_key': 'ThePlatform', - }) - return info + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] diff --git a/haruhi_dl/extractor/videa.py b/haruhi_dl/extractor/videa.py index 5830e7fd7..0f0702852 100644 --- a/haruhi_dl/extractor/videa.py +++ b/haruhi_dl/extractor/videa.py @@ -2,15 +2,24 @@ from __future__ import unicode_literals import re +import random +import string +import struct from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, mimetype2ext, parse_codecs, xpath_element, xpath_text, ) +from ..compat import ( + compat_b64decode, + compat_ord, + compat_parse_qs, +) class VideaIE(InfoExtractor): @@ -60,15 +69,63 @@ class VideaIE(InfoExtractor): r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', webpage)] + def rc4(self, ciphertext, key): + res = b'' + + keyLen = len(key) + S = list(range(256)) + + j = 0 + for i in range(256): + j = (j + S[i] + ord(key[i % keyLen])) % 256 + S[i], S[j] = S[j], S[i] + + i = 0 + j = 0 + for m in range(len(ciphertext)): + i = (i + 1) % 256 + j = (j + S[i]) % 256 + S[i], S[j] = S[j], S[i] + k = S[(S[i] + S[j]) % 256] + res += struct.pack("B", k ^ compat_ord(ciphertext[m])) + + return res + def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id, fatal=True) + error = self._search_regex(r'<p class="error-text">([^<]+)</p>', webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) - info = self._download_xml( + video_src_params_raw = self._search_regex(r'<iframe[^>]+id="videa_player_iframe"[^>]+src="/player\?([^"]+)"', webpage, 'video_src_params') + video_src_params = compat_parse_qs(video_src_params_raw) + player_page = self._download_webpage("https://videa.hu/videojs_player?%s" % video_src_params_raw, video_id, fatal=True) + nonce = self._search_regex(r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce') + random_seed = ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(8)) + static_secret = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' + l = nonce[:32] + s = nonce[32:] + result = '' + for i in range(0, 32): + result += s[i - (static_secret.index(l[i]) - 31)] + + video_src_params['_s'] = random_seed + video_src_params['_t'] = result[:16] + encryption_key_stem = result[16:] + random_seed + + [b64_info, handle] = self._download_webpage_handle( 'http://videa.hu/videaplayer_get_xml.php', video_id, - query={'v': video_id}) + query=video_src_params, fatal=True) + + encrypted_info = compat_b64decode(b64_info) + key = encryption_key_stem + handle.info()['x-videa-xs'] + info_str = self.rc4(encrypted_info, key).decode('utf8') + info = self._parse_xml(info_str, video_id) video = xpath_element(info, './/video', 'video', fatal=True) sources = xpath_element(info, './/video_sources', 'sources', fatal=True) + hash_values = xpath_element(info, './/hash_values', 'hash_values', fatal=True) title = xpath_text(video, './title', fatal=True) @@ -77,6 +134,7 @@ class VideaIE(InfoExtractor): source_url = source.text if not source_url: continue + source_url += '?md5=%s&expires=%s' % (hash_values.find('hash_value_%s' % source.get('name')).text, source.get('exp')) f = parse_codecs(source.get('codecs')) f.update({ 'url': source_url, diff --git a/haruhi_dl/extractor/videomore.py b/haruhi_dl/extractor/videomore.py index e3eda3327..e0c10aa5b 100644 --- a/haruhi_dl/extractor/videomore.py +++ b/haruhi_dl/extractor/videomore.py @@ -4,30 +4,50 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - int_or_none, - orderedSet, - parse_duration, - str_or_none, - unified_strdate, - url_or_none, - xpath_element, - xpath_text, +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlparse, ) +from ..utils import ( + ExtractorError, + int_or_none, +) + + +class VideomoreBaseIE(InfoExtractor): + _API_BASE_URL = 'https://more.tv/api/v3/web/' + _VALID_URL_BASE = r'https?://(?:videomore\.ru|more\.tv)/' + + def _download_page_data(self, display_id): + return self._download_json( + self._API_BASE_URL + 'PageData', display_id, query={ + 'url': '/' + display_id, + })['attributes']['response']['data'] + + def _track_url_result(self, track): + track_vod = track['trackVod'] + video_url = track_vod.get('playerLink') or track_vod['link'] + return self.url_result( + video_url, VideomoreIE.ie_key(), track_vod.get('hubId')) class VideomoreIE(InfoExtractor): IE_NAME = 'videomore' _VALID_URL = r'''(?x) videomore:(?P<sid>\d+)$| - https?://(?:player\.)?videomore\.ru/ + https?:// (?: + videomore\.ru/ (?: embed| [^/]+/[^/]+ )/| - [^/]*\?.*?\btrack_id= + (?: + (?:player\.)?videomore\.ru| + siren\.more\.tv/player + )/[^/]*\?.*?\btrack_id=| + odysseus\.more.tv/player/(?P<partner_id>\d+)/ ) (?P<id>\d+) (?:[/?#&]|\.(?:xml|json)|$) @@ -47,18 +67,19 @@ class VideomoreIE(InfoExtractor): 'comment_count': int, 'age_limit': 16, }, + 'skip': 'The video is not available for viewing.', }, { 'url': 'http://videomore.ru/embed/259974', 'info_dict': { 'id': '259974', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Молодежка 2 сезон 40 серия', 'series': 'Молодежка', + 'season': '2 сезон', 'episode': '40 серия', 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 2809, + 'duration': 2789, 'view_count': int, - 'comment_count': int, 'age_limit': 16, }, 'params': { @@ -79,6 +100,7 @@ class VideomoreIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'The video is not available for viewing.', }, { 'url': 'http://videomore.ru/elki_3?track_id=364623', 'only_matching': True, @@ -100,7 +122,14 @@ class VideomoreIE(InfoExtractor): }, { 'url': 'https://player.videomore.ru/?partner_id=97&track_id=736234&autoplay=0&userToken=', 'only_matching': True, + }, { + 'url': 'https://odysseus.more.tv/player/1788/352317', + 'only_matching': True, + }, { + 'url': 'https://siren.more.tv/player/config?track_id=352317&partner_id=1788&user_token=', + 'only_matching': True, }] + _GEO_BYPASS = False @staticmethod def _extract_url(webpage): @@ -118,46 +147,73 @@ class VideomoreIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('sid') or mobj.group('id') + partner_id = mobj.group('partner_id') or compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('partner_id', [None])[0] or '97' - video = self._download_xml( - 'http://videomore.ru/video/tracks/%s.xml' % video_id, - video_id, 'Downloading video XML') + item = self._download_json( + 'https://siren.more.tv/player/config', video_id, query={ + 'partner_id': partner_id, + 'track_id': video_id, + })['data']['playlist']['items'][0] - item = xpath_element(video, './/playlist/item', fatal=True) + title = item.get('title') + series = item.get('project_name') + season = item.get('season_name') + episode = item.get('episode_name') + if not title: + title = [] + for v in (series, season, episode): + if v: + title.append(v) + title = ' '.join(title) - title = xpath_text( - item, ('./title', './episode_name'), 'title', fatal=True) + streams = item.get('streams') or [] + for protocol in ('DASH', 'HLS'): + stream_url = item.get(protocol.lower() + '_url') + if stream_url: + streams.append({'protocol': protocol, 'url': stream_url}) - video_url = xpath_text(item, './video_url', 'video url', fatal=True) - formats = self._extract_f4m_formats(video_url, video_id, f4m_id='hds') + formats = [] + for stream in streams: + stream_url = stream.get('url') + if not stream_url: + continue + protocol = stream.get('protocol') + if protocol == 'DASH': + formats.extend(self._extract_mpd_formats( + stream_url, video_id, mpd_id='dash', fatal=False)) + elif protocol == 'HLS': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif protocol == 'MSS': + formats.extend(self._extract_ism_formats( + stream_url, video_id, ism_id='mss', fatal=False)) + + if not formats: + error = item.get('error') + if error: + if error in ('Данное видео недоступно для просмотра на территории этой страны', 'Данное видео доступно для просмотра только на территории России'): + self.raise_geo_restricted(countries=['RU']) + raise ExtractorError(error, expected=True) self._sort_formats(formats) - thumbnail = xpath_text(item, './thumbnail_url') - duration = int_or_none(xpath_text(item, './duration')) - view_count = int_or_none(xpath_text(item, './views')) - comment_count = int_or_none(xpath_text(item, './count_comments')) - age_limit = int_or_none(xpath_text(item, './min_age')) - - series = xpath_text(item, './project_name') - episode = xpath_text(item, './episode_name') - return { 'id': video_id, 'title': title, 'series': series, + 'season': season, 'episode': episode, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, - 'age_limit': age_limit, + 'thumbnail': item.get('thumbnail_url'), + 'duration': int_or_none(item.get('duration')), + 'view_count': int_or_none(item.get('views')), + 'age_limit': int_or_none(item.get('min_age')), 'formats': formats, } -class VideomoreVideoIE(InfoExtractor): +class VideomoreVideoIE(VideomoreBaseIE): IE_NAME = 'videomore:video' - _VALID_URL = r'https?://videomore\.ru/(?:(?:[^/]+/){2})?(?P<id>[^/?#&]+)(?:/*|[?#&].*?)$' + _VALID_URL = VideomoreBaseIE._VALID_URL_BASE + r'(?P<id>(?:(?:[^/]+/){2})?[^/?#&]+)(?:/*|[?#&].*?)$' _TESTS = [{ # single video with og:video:iframe 'url': 'http://videomore.ru/elki_3', @@ -174,10 +230,25 @@ class VideomoreVideoIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Requires logging in', }, { # season single series with og:video:iframe 'url': 'http://videomore.ru/poslednii_ment/1_sezon/14_seriya', - 'only_matching': True, + 'info_dict': { + 'id': '352317', + 'ext': 'mp4', + 'title': 'Последний мент 1 сезон 14 серия', + 'series': 'Последний мент', + 'season': '1 сезон', + 'episode': '14 серия', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 2464, + 'age_limit': 16, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://videomore.ru/sejchas_v_seti/serii_221-240/226_vypusk', 'only_matching': True, @@ -197,9 +268,13 @@ class VideomoreVideoIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'redirects to https://more.tv/' }, { 'url': 'https://videomore.ru/molodezhka/6_sezon/29_seriya?utm_so', 'only_matching': True, + }, { + 'url': 'https://more.tv/poslednii_ment/1_sezon/14_seriya', + 'only_matching': True, }] @classmethod @@ -208,38 +283,25 @@ class VideomoreVideoIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - video_url = self._og_search_property( - 'video:iframe', webpage, 'video url', default=None) - - if not video_url: - video_id = self._search_regex( - (r'config\s*:\s*["\']https?://videomore\.ru/video/tracks/(\d+)\.xml', - r'track-id=["\'](\d+)', - r'xcnt_product_id\s*=\s*(\d+)'), webpage, 'video id') - video_url = 'videomore:%s' % video_id - else: - video_id = None - - return self.url_result( - video_url, ie=VideomoreIE.ie_key(), video_id=video_id) + return self._track_url_result(self._download_page_data(display_id)) -class VideomoreSeasonIE(InfoExtractor): +class VideomoreSeasonIE(VideomoreBaseIE): IE_NAME = 'videomore:season' - _VALID_URL = r'https?://videomore\.ru/(?!embed)(?P<id>[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$' + _VALID_URL = VideomoreBaseIE._VALID_URL_BASE + r'(?!embed)(?P<id>[^/]+/[^/?#&]+)(?:/*|[?#&].*?)$' _TESTS = [{ - 'url': 'http://videomore.ru/molodezhka/sezon_promo', + 'url': 'http://videomore.ru/molodezhka/film_o_filme', 'info_dict': { - 'id': 'molodezhka/sezon_promo', - 'title': 'Молодежка Промо', + 'id': 'molodezhka/film_o_filme', + 'title': 'Фильм о фильме', }, - 'playlist_mincount': 12, + 'playlist_mincount': 3, }, { 'url': 'http://videomore.ru/molodezhka/sezon_promo?utm_so', 'only_matching': True, + }, { + 'url': 'https://more.tv/molodezhka/film_o_filme', + 'only_matching': True, }] @classmethod @@ -249,59 +311,12 @@ class VideomoreSeasonIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - title = self._og_search_title(webpage) - - data = self._parse_json( - self._html_search_regex( - r'\bclass=["\']seasons-tracks["\'][^>]+\bdata-custom-data=(["\'])(?P<value>{.+?})\1', - webpage, 'data', default='{}', group='value'), - display_id, fatal=False) - + season = self._download_page_data(display_id) + season_id = compat_str(season['id']) + tracks = self._download_json( + self._API_BASE_URL + 'seasons/%s/tracks' % season_id, + season_id)['data'] entries = [] - - if data: - episodes = data.get('episodes') - if isinstance(episodes, list): - for ep in episodes: - if not isinstance(ep, dict): - continue - ep_id = int_or_none(ep.get('id')) - ep_url = url_or_none(ep.get('url')) - if ep_id: - e = { - 'url': 'videomore:%s' % ep_id, - 'id': compat_str(ep_id), - } - elif ep_url: - e = {'url': ep_url} - else: - continue - e.update({ - '_type': 'url', - 'ie_key': VideomoreIE.ie_key(), - 'title': str_or_none(ep.get('title')), - 'thumbnail': url_or_none(ep.get('image')), - 'duration': parse_duration(ep.get('duration')), - 'episode_number': int_or_none(ep.get('number')), - 'upload_date': unified_strdate(ep.get('date')), - }) - entries.append(e) - - if not entries: - entries = [ - self.url_result( - 'videomore:%s' % video_id, ie=VideomoreIE.ie_key(), - video_id=video_id) - for video_id in orderedSet(re.findall( - r':(?:id|key)=["\'](\d+)["\']', webpage))] - - if not entries: - entries = [ - self.url_result(item) for item in re.findall( - r'<a[^>]+href="((?:https?:)?//videomore\.ru/%s/[^/]+)"[^>]+class="widget-item-desc"' - % display_id, webpage)] - - return self.playlist_result(entries, display_id, title) + for track in tracks: + entries.append(self._track_url_result(track)) + return self.playlist_result(entries, display_id, season.get('title')) diff --git a/haruhi_dl/extractor/videopress.py b/haruhi_dl/extractor/videopress.py index 8938050a5..1f046b85b 100644 --- a/haruhi_dl/extractor/videopress.py +++ b/haruhi_dl/extractor/videopress.py @@ -4,21 +4,22 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, float_or_none, + int_or_none, parse_age_limit, qualities, random_birthday, - try_get, unified_timestamp, urljoin, ) class VideoPressIE(InfoExtractor): - _VALID_URL = r'https?://videopress\.com/embed/(?P<id>[\da-zA-Z]+)' + _ID_REGEX = r'[\da-zA-Z]{8}' + _PATH_REGEX = r'video(?:\.word)?press\.com/embed/' + _VALID_URL = r'https?://%s(?P<id>%s)' % (_PATH_REGEX, _ID_REGEX) _TESTS = [{ 'url': 'https://videopress.com/embed/kUJmAcSf', 'md5': '706956a6c875873d51010921310e4bc6', @@ -36,35 +37,36 @@ class VideoPressIE(InfoExtractor): # 17+, requires birth_* params 'url': 'https://videopress.com/embed/iH3gstfZ', 'only_matching': True, + }, { + 'url': 'https://video.wordpress.com/embed/kUJmAcSf', + 'only_matching': True, }] @staticmethod def _extract_urls(webpage, **kwargs): return re.findall( - r'<iframe[^>]+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)', + r'<iframe[^>]+src=["\']((?:https?://)?%s%s)' % (VideoPressIE._PATH_REGEX, VideoPressIE._ID_REGEX), webpage) def _real_extract(self, url): video_id = self._match_id(url) query = random_birthday('birth_year', 'birth_month', 'birth_day') + query['fields'] = 'description,duration,file_url_base,files,height,original,poster,rating,title,upload_date,width' video = self._download_json( 'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, video_id, query=query) title = video['title'] - def base_url(scheme): - return try_get( - video, lambda x: x['file_url_base'][scheme], compat_str) - - base_url = base_url('https') or base_url('http') + file_url_base = video.get('file_url_base') or {} + base_url = file_url_base.get('https') or file_url_base.get('http') QUALITIES = ('std', 'dvd', 'hd') quality = qualities(QUALITIES) formats = [] - for format_id, f in video['files'].items(): + for format_id, f in (video.get('files') or {}).items(): if not isinstance(f, dict): continue for ext, path in f.items(): @@ -75,12 +77,14 @@ class VideoPressIE(InfoExtractor): 'ext': determine_ext(path, ext), 'quality': quality(format_id), }) - original_url = try_get(video, lambda x: x['original'], compat_str) + original_url = video.get('original') if original_url: formats.append({ 'url': original_url, 'format_id': 'original', 'quality': len(QUALITIES), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), }) self._sort_formats(formats) diff --git a/haruhi_dl/extractor/vidio.py b/haruhi_dl/extractor/vidio.py index b48baf00b..b1243e847 100644 --- a/haruhi_dl/extractor/vidio.py +++ b/haruhi_dl/extractor/vidio.py @@ -4,7 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + parse_iso8601, + str_or_none, + strip_or_none, + try_get, +) class VidioIE(InfoExtractor): @@ -21,57 +27,63 @@ class VidioIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 149, 'like_count': int, + 'uploader': 'TWELVE Pic', + 'timestamp': 1444902800, + 'upload_date': '20151015', + 'uploader_id': 'twelvepictures', + 'channel': 'Cover Music Video', + 'channel_id': '280236', + 'view_count': int, + 'dislike_count': int, + 'comment_count': int, + 'tags': 'count:4', }, }, { 'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north', 'only_matching': True, }] + def _real_initialize(self): + self._api_key = self._download_json( + 'https://www.vidio.com/auth', None, data=b'')['api_key'] + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id, display_id = mobj.group('id', 'display_id') + video_id, display_id = re.match(self._VALID_URL, url).groups() + data = self._download_json( + 'https://api.vidio.com/videos/' + video_id, display_id, headers={ + 'Content-Type': 'application/vnd.api+json', + 'X-API-KEY': self._api_key, + }) + video = data['videos'][0] + title = video['title'].strip() - webpage = self._download_webpage(url, display_id) - - title = self._og_search_title(webpage) - - m3u8_url, duration, thumbnail = [None] * 3 - - clips = self._parse_json( - self._html_search_regex( - r'data-json-clips\s*=\s*(["\'])(?P<data>\[.+?\])\1', - webpage, 'video data', default='[]', group='data'), - display_id, fatal=False) - if clips: - clip = clips[0] - m3u8_url = clip.get('sources', [{}])[0].get('file') - duration = clip.get('clip_duration') - thumbnail = clip.get('image') - - m3u8_url = m3u8_url or self._search_regex( - r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'hls url', group='url') formats = self._extract_m3u8_formats( - m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') + data['clips'][0]['hls_url'], display_id, 'mp4', 'm3u8_native') self._sort_formats(formats) - duration = int_or_none(duration or self._search_regex( - r'data-video-duration=(["\'])(?P<duration>\d+)\1', webpage, - 'duration', fatal=False, group='duration')) - thumbnail = thumbnail or self._og_search_thumbnail(webpage) - - like_count = int_or_none(self._search_regex( - (r'<span[^>]+data-comment-vote-count=["\'](\d+)', - r'<span[^>]+class=["\'].*?\blike(?:__|-)count\b.*?["\'][^>]*>\s*(\d+)'), - webpage, 'like count', fatal=False)) + get_first = lambda x: try_get(data, lambda y: y[x + 's'][0], dict) or {} + channel = get_first('channel') + user = get_first('user') + username = user.get('username') + get_count = lambda x: int_or_none(video.get('total_' + x)) return { 'id': video_id, 'display_id': display_id, 'title': title, - 'description': self._og_search_description(webpage), - 'thumbnail': thumbnail, - 'duration': duration, - 'like_count': like_count, + 'description': strip_or_none(video.get('description')), + 'thumbnail': video.get('image_url_medium'), + 'duration': int_or_none(video.get('duration')), + 'like_count': get_count('likes'), 'formats': formats, + 'uploader': user.get('name'), + 'timestamp': parse_iso8601(video.get('created_at')), + 'uploader_id': username, + 'uploader_url': 'https://www.vidio.com/@' + username if username else None, + 'channel': channel.get('name'), + 'channel_id': str_or_none(channel.get('id')), + 'view_count': get_count('view_count'), + 'dislike_count': get_count('dislikes'), + 'comment_count': get_count('comments'), + 'tags': video.get('tag_list'), } diff --git a/haruhi_dl/extractor/vidzi.py b/haruhi_dl/extractor/vidzi.py deleted file mode 100644 index b95eaadc0..000000000 --- a/haruhi_dl/extractor/vidzi.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - decode_packed_codes, - js_to_json, - NO_DEFAULT, - PACKED_CODES_RE, -) - - -class VidziIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' - _TESTS = [{ - 'url': 'http://vidzi.tv/cghql9yq6emu.html', - 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', - 'info_dict': { - 'id': 'cghql9yq6emu', - 'ext': 'mp4', - 'title': 'haruhi-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.cc/cghql9yq6emu.html', - 'only_matching': True, - }, { - 'url': 'https://vidzi.si/rph9gztxj1et.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.nu/cghql9yq6emu.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://vidzi.tv/%s' % video_id, video_id) - title = self._html_search_regex( - r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') - - codes = [webpage] - codes.extend([ - decode_packed_codes(mobj.group(0)).replace('\\\'', '\'') - for mobj in re.finditer(PACKED_CODES_RE, webpage)]) - for num, code in enumerate(codes, 1): - jwplayer_data = self._parse_json( - self._search_regex( - r'setup\(([^)]+)\)', code, 'jwplayer data', - default=NO_DEFAULT if num == len(codes) else '{}'), - video_id, transform_source=lambda s: js_to_json( - re.sub(r'\s*\+\s*window\[.+?\]', '', s))) - if jwplayer_data: - break - - info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) - info_dict['title'] = title - - return info_dict diff --git a/haruhi_dl/extractor/viki.py b/haruhi_dl/extractor/viki.py index b0dcdc0e6..2e9cbf148 100644 --- a/haruhi_dl/extractor/viki.py +++ b/haruhi_dl/extractor/viki.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import hashlib import hmac import itertools @@ -9,12 +10,18 @@ import re import time from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( ExtractorError, int_or_none, parse_age_limit, parse_iso8601, sanitized_Request, + std_headers, + try_get, ) @@ -24,7 +31,7 @@ class VikiBaseIE(InfoExtractor): _API_URL_TEMPLATE = 'https://api.viki.io%s&sig=%s' _APP = '100005a' - _APP_VERSION = '2.2.5.1428709186' + _APP_VERSION = '6.0.0' _APP_SECRET = 'MM_d*yP@`&1@]@!AVrXf_o-HVEnoTnm$O-ti4[G~$JDI/Dc-&piU&z&5.;:}95=Iad' _GEO_BYPASS = False @@ -35,7 +42,7 @@ class VikiBaseIE(InfoExtractor): _ERRORS = { 'geo': 'Sorry, this content is not available in your region.', 'upcoming': 'Sorry, this content is not yet available.', - # 'paywall': 'paywall', + 'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers', } def _prepare_call(self, path, timestamp=None, post_data=None): @@ -56,7 +63,8 @@ class VikiBaseIE(InfoExtractor): def _call_api(self, path, video_id, note, timestamp=None, post_data=None): resp = self._download_json( - self._prepare_call(path, timestamp, post_data), video_id, note) + self._prepare_call(path, timestamp, post_data), video_id, note, + headers={'x-viki-app-ver': self._APP_VERSION}) error = resp.get('error') if error: @@ -76,11 +84,13 @@ class VikiBaseIE(InfoExtractor): expected=True) def _check_errors(self, data): - for reason, status in data.get('blocking', {}).items(): + for reason, status in (data.get('blocking') or {}).items(): if status and reason in self._ERRORS: message = self._ERRORS[reason] if reason == 'geo': self.raise_geo_restricted(msg=message) + elif reason == 'paywall': + self.raise_login_required(message) raise ExtractorError('%s said: %s' % ( self.IE_NAME, message), expected=True) @@ -125,13 +135,19 @@ class VikiIE(VikiBaseIE): 'info_dict': { 'id': '1023585v', 'ext': 'mp4', - 'title': 'Heirs Episode 14', - 'uploader': 'SBS', - 'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e', + 'title': 'Heirs - Episode 14', + 'uploader': 'SBS Contents Hub', + 'timestamp': 1385047627, 'upload_date': '20131121', 'age_limit': 13, + 'duration': 3570, + 'episode_number': 14, + }, + 'params': { + 'format': 'bestvideo', }, 'skip': 'Blocked in the US', + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # clip 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', @@ -147,7 +163,8 @@ class VikiIE(VikiBaseIE): 'uploader': 'Arirang TV', 'like_count': int, 'age_limit': 0, - } + }, + 'skip': 'Sorry. There was an error loading this video', }, { 'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi', 'info_dict': { @@ -165,19 +182,24 @@ class VikiIE(VikiBaseIE): }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '5fa476a902e902783ac7a4d615cdbc7a', + 'md5': '0a53dc252e6e690feccd756861495a8c', 'info_dict': { 'id': '44699v', 'ext': 'mp4', 'title': 'Boys Over Flowers - Episode 1', 'description': 'md5:b89cf50038b480b88b5b3c93589a9076', - 'duration': 4204, + 'duration': 4172, 'timestamp': 1270496524, 'upload_date': '20100405', 'uploader': 'group8', 'like_count': int, 'age_limit': 13, - } + 'episode_number': 1, + }, + 'params': { + 'format': 'bestvideo', + }, + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }, { # youtube external 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', @@ -194,14 +216,15 @@ class VikiIE(VikiBaseIE): 'uploader_id': 'ad14065n', 'like_count': int, 'age_limit': 13, - } + }, + 'skip': 'Page not found!', }, { 'url': 'http://www.viki.com/player/44699v', 'only_matching': True, }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': '1713ae35df5a521b31f6dc40730e7c9c', + 'md5': '41faaba0de90483fb4848952af7c7d0d', 'info_dict': { 'id': '158036v', 'ext': 'mp4', @@ -212,40 +235,46 @@ class VikiIE(VikiBaseIE): 'title': 'Love In Magic', 'age_limit': 13, }, + 'params': { + 'format': 'bestvideo', + }, + 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'], }] def _real_extract(self, url): video_id = self._match_id(url) - video = self._call_api( - 'videos/%s.json' % video_id, video_id, 'Downloading video JSON') + resp = self._download_json( + 'https://www.viki.com/api/videos/' + video_id, + video_id, 'Downloading video JSON', headers={ + 'x-client-user-agent': std_headers['User-Agent'], + 'x-viki-app-ver': '3.0.0', + }) + video = resp['video'] self._check_errors(video) title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False) + episode_number = int_or_none(video.get('number')) if not title: - title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id - container_titles = video.get('container', {}).get('titles', {}) + title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id + container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {} container_title = self.dict_selection(container_titles, 'en') title = '%s - %s' % (container_title, title) description = self.dict_selection(video.get('descriptions', {}), 'en') - duration = int_or_none(video.get('duration')) - timestamp = parse_iso8601(video.get('created_at')) - uploader = video.get('author') - like_count = int_or_none(video.get('likes', {}).get('count')) - age_limit = parse_age_limit(video.get('rating')) + like_count = int_or_none(try_get(video, lambda x: x['likes']['count'])) thumbnails = [] - for thumbnail_id, thumbnail in video.get('images', {}).items(): + for thumbnail_id, thumbnail in (video.get('images') or {}).items(): thumbnails.append({ 'id': thumbnail_id, 'url': thumbnail.get('url'), }) subtitles = {} - for subtitle_lang, _ in video.get('subtitle_completions', {}).items(): + for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items(): subtitles[subtitle_lang] = [{ 'ext': subtitles_format, 'url': self._prepare_call( @@ -256,66 +285,85 @@ class VikiIE(VikiBaseIE): 'id': video_id, 'title': title, 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'uploader': uploader, + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('created_at')), + 'uploader': video.get('author'), + 'uploader_url': video.get('author_url'), 'like_count': like_count, - 'age_limit': age_limit, + 'age_limit': parse_age_limit(video.get('rating')), 'thumbnails': thumbnails, 'subtitles': subtitles, + 'episode_number': episode_number, } - streams = self._call_api( - 'videos/%s/streams.json' % video_id, video_id, - 'Downloading video streams JSON') - - if 'external' in streams: - result.update({ - '_type': 'url_transparent', - 'url': streams['external']['url'], - }) - return result - formats = [] - for format_id, stream_dict in streams.items(): - height = int_or_none(self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None)) - for protocol, format_dict in stream_dict.items(): - # rtmps URLs does not seem to work - if protocol == 'rtmps': - continue - format_url = format_dict['url'] - if format_id == 'm3u8': - m3u8_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', - m3u8_id='m3u8-%s' % protocol, fatal=False) - # Despite CODECS metadata in m3u8 all video-only formats - # are actually video+audio - for f in m3u8_formats: - if f.get('acodec') == 'none' and f.get('vcodec') != 'none': - f['acodec'] = None - formats.extend(m3u8_formats) - elif format_url.startswith('rtmp'): - mobj = re.search( - r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$', - format_url) - if not mobj: + + def add_format(format_id, format_dict, protocol='http'): + # rtmps URLs does not seem to work + if protocol == 'rtmps': + return + format_url = format_dict.get('url') + if not format_url: + return + qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query) + stream = qs.get('stream', [None])[0] + if stream: + format_url = base64.b64decode(stream).decode() + if format_id in ('m3u8', 'hls'): + m3u8_formats = self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', + m3u8_id='m3u8-%s' % protocol, fatal=False) + # Despite CODECS metadata in m3u8 all video-only formats + # are actually video+audio + for f in m3u8_formats: + if '_drm/index_' in f['url']: continue - formats.append({ - 'format_id': 'rtmp-%s' % format_id, - 'ext': 'flv', - 'url': mobj.group('url'), - 'play_path': mobj.group('playpath'), - 'app': mobj.group('app'), - 'page_url': url, - }) - else: - formats.append({ - 'url': format_url, - 'format_id': '%s-%s' % (format_id, protocol), - 'height': height, - }) + if f.get('acodec') == 'none' and f.get('vcodec') != 'none': + f['acodec'] = None + formats.append(f) + elif format_id in ('mpd', 'dash'): + formats.extend(self._extract_mpd_formats( + format_url, video_id, 'mpd-%s' % protocol, fatal=False)) + elif format_url.startswith('rtmp'): + mobj = re.search( + r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$', + format_url) + if not mobj: + return + formats.append({ + 'format_id': 'rtmp-%s' % format_id, + 'ext': 'flv', + 'url': mobj.group('url'), + 'play_path': mobj.group('playpath'), + 'app': mobj.group('app'), + 'page_url': url, + }) + else: + formats.append({ + 'url': format_url, + 'format_id': '%s-%s' % (format_id, protocol), + 'height': int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)), + }) + + for format_id, format_dict in (resp.get('streams') or {}).items(): + add_format(format_id, format_dict) + if not formats: + streams = self._call_api( + 'videos/%s/streams.json' % video_id, video_id, + 'Downloading video streams JSON') + + if 'external' in streams: + result.update({ + '_type': 'url_transparent', + 'url': streams['external']['url'], + }) + return result + + for format_id, stream_dict in streams.items(): + for protocol, format_dict in stream_dict.items(): + add_format(format_id, format_dict, protocol) self._sort_formats(formats) result['formats'] = formats @@ -330,7 +378,7 @@ class VikiChannelIE(VikiBaseIE): 'info_dict': { 'id': '50c', 'title': 'Boys Over Flowers', - 'description': 'md5:ecd3cff47967fe193cff37c0bec52790', + 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59', }, 'playlist_mincount': 71, }, { @@ -341,6 +389,7 @@ class VikiChannelIE(VikiBaseIE): 'description': 'md5:05bf5471385aa8b21c18ad450e350525', }, 'playlist_count': 127, + 'skip': 'Page not found', }, { 'url': 'http://www.viki.com/news/24569c-showbiz-korea', 'only_matching': True, diff --git a/haruhi_dl/extractor/vimeo.py b/haruhi_dl/extractor/vimeo.py index e14551459..6fc7b76b4 100644 --- a/haruhi_dl/extractor/vimeo.py +++ b/haruhi_dl/extractor/vimeo.py @@ -226,10 +226,12 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'is_live': is_live, } - def _extract_original_format(self, url, video_id): + def _extract_original_format(self, url, video_id, unlisted_hash=None): + query = {'action': 'load_download_config'} + if unlisted_hash: + query['unlisted_hash'] = unlisted_hash download_data = self._download_json( - url, video_id, fatal=False, - query={'action': 'load_download_config'}, + url, video_id, fatal=False, query=query, headers={'X-Requested-With': 'XMLHttpRequest'}) if download_data: source_file = download_data.get('source_file') @@ -509,6 +511,11 @@ class VimeoIE(VimeoBaseInfoExtractor): { 'url': 'https://vimeo.com/160743502/abd0e13fb4', 'only_matching': True, + }, + { + # requires passing unlisted_hash(a52724358e) to load_download_config request + 'url': 'https://vimeo.com/392479337/a52724358e', + 'only_matching': True, } # https://gettingthingsdone.com/workflowmap/ # vimeo embed with check-password page protected by Referer header @@ -673,7 +680,8 @@ class VimeoIE(VimeoBaseInfoExtractor): if config.get('view') == 4: config = self._verify_player_video_password(redirect_url, video_id, headers) - vod = config.get('video', {}).get('vod', {}) + video = config.get('video') or {} + vod = video.get('vod') or {} def is_rented(): if '>You rented this title.<' in webpage: @@ -733,7 +741,7 @@ class VimeoIE(VimeoBaseInfoExtractor): formats = [] source_format = self._extract_original_format( - 'https://vimeo.com/' + video_id, video_id) + 'https://vimeo.com/' + video_id, video_id, video.get('unlisted_hash')) if source_format: formats.append(source_format) @@ -946,10 +954,13 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): def _real_extract(self, url): album_id = self._match_id(url) - webpage = self._download_webpage(url, album_id) - viewer = self._parse_json(self._search_regex( - r'bootstrap_data\s*=\s*({.+?})</script>', - webpage, 'bootstrap data'), album_id)['viewer'] + viewer = self._download_json( + 'https://vimeo.com/_rv/viewer', album_id, fatal=False) + if not viewer: + webpage = self._download_webpage(url, album_id) + viewer = self._parse_json(self._search_regex( + r'bootstrap_data\s*=\s*({.+?})</script>', + webpage, 'bootstrap data'), album_id)['viewer'] jwt = viewer['jwt'] album = self._download_json( 'https://api.vimeo.com/albums/' + album_id, @@ -1122,6 +1133,12 @@ class VHXEmbedIE(VimeoBaseInfoExtractor): IE_NAME = 'vhx:embed' _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P<id>\d+)' + @staticmethod + def _extract_urls(webpage, **kw): + mobjs = re.finditer( + r'<iframe[^>]+src="(https?://embed\.vhx\.tv/videos/\d+[^"]*)"', webpage) + return [unescapeHTML(mobj.group(1)) for mobj in mobjs] + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -1130,5 +1147,6 @@ class VHXEmbedIE(VimeoBaseInfoExtractor): 'ott data'), video_id, js_to_json)['config_url'] config = self._download_json(config_url, video_id) info = self._parse_config(config, video_id) + info['id'] = video_id self._vimeo_sort_formats(info['formats']) return info diff --git a/haruhi_dl/extractor/vtm.py b/haruhi_dl/extractor/vtm.py new file mode 100644 index 000000000..093f1aa69 --- /dev/null +++ b/haruhi_dl/extractor/vtm.py @@ -0,0 +1,62 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, + try_get, +) + + +class VTMIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vtm\.be/([^/?&#]+)~v(?P<id>[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12})' + _TEST = { + 'url': 'https://vtm.be/gast-vernielt-genkse-hotelkamer~ve7534523-279f-4b4d-a5c9-a33ffdbe23e1', + 'md5': '37dca85fbc3a33f2de28ceb834b071f8', + 'info_dict': { + 'id': '192445', + 'ext': 'mp4', + 'title': 'Gast vernielt Genkse hotelkamer', + 'timestamp': 1611060180, + 'upload_date': '20210119', + 'duration': 74, + # TODO: fix url _type result processing + # 'series': 'Op Interventie', + } + } + + def _real_extract(self, url): + uuid = self._match_id(url) + video = self._download_json( + 'https://omc4vm23offuhaxx6hekxtzspi.appsync-api.eu-west-1.amazonaws.com/graphql', + uuid, query={ + 'query': '''{ + getComponent(type: Video, uuid: "%s") { + ... on Video { + description + duration + myChannelsVideo + program { + title + } + publishedAt + title + } + } +}''' % uuid, + }, headers={ + 'x-api-key': 'da2-lz2cab4tfnah3mve6wiye4n77e', + })['data']['getComponent'] + + return { + '_type': 'url', + 'id': uuid, + 'title': video.get('title'), + 'url': 'http://mychannels.video/embed/%d' % video['myChannelsVideo'], + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('publishedAt')), + 'duration': int_or_none(video.get('duration')), + 'series': try_get(video, lambda x: x['program']['title']), + 'ie_key': 'Medialaan', + } diff --git a/haruhi_dl/extractor/vvvvid.py b/haruhi_dl/extractor/vvvvid.py index 6906cd2ab..7c94c4ee2 100644 --- a/haruhi_dl/extractor/vvvvid.py +++ b/haruhi_dl/extractor/vvvvid.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( ExtractorError, int_or_none, @@ -12,7 +13,8 @@ from ..utils import ( class VVVVIDIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/(?P<show_id>\d+)/[^/]+/(?P<season_id>\d+)/(?P<id>[0-9]+)' + _VALID_URL_BASE = r'https?://(?:www\.)?vvvvid\.it/(?:#!)?(?:show|anime|film|series)/' + _VALID_URL = r'%s(?P<show_id>\d+)/[^/]+/(?P<season_id>\d+)/(?P<id>[0-9]+)' % _VALID_URL_BASE _TESTS = [{ # video_type == 'video/vvvvid' 'url': 'https://www.vvvvid.it/#!show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048/ping-pong', @@ -21,6 +23,15 @@ class VVVVIDIE(InfoExtractor): 'id': '489048', 'ext': 'mp4', 'title': 'Ping Pong', + 'duration': 239, + 'series': '"Perché dovrei guardarlo?" di Dario Moccia', + 'season_id': '437', + 'episode': 'Ping Pong', + 'episode_number': 1, + 'episode_id': '3334', + 'view_count': int, + 'like_count': int, + 'repost_count': int, }, 'params': { 'skip_download': True, @@ -37,6 +48,25 @@ class VVVVIDIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # video_type == 'video/youtube' + 'url': 'https://www.vvvvid.it/show/404/one-punch-man/406/486683/trailer', + 'md5': '33e0edfba720ad73a8782157fdebc648', + 'info_dict': { + 'id': 'RzmFKUDOUgw', + 'ext': 'mp4', + 'title': 'Trailer', + 'upload_date': '20150906', + 'description': 'md5:a5e802558d35247fee285875328c0b80', + 'uploader_id': 'BandaiVisual', + 'uploader': 'BANDAI NAMCO Arts Channel', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', + 'only_matching': True }] _conn_id = None @@ -45,20 +75,39 @@ class VVVVIDIE(InfoExtractor): 'https://www.vvvvid.it/user/login', None, headers=self.geo_verification_headers())['data']['conn_id'] - def _real_extract(self, url): - show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() + def _download_info(self, show_id, path, video_id, fatal=True, query=None): + q = { + 'conn_id': self._conn_id, + } + if query: + q.update(query) response = self._download_json( - 'https://www.vvvvid.it/vvvvid/ondemand/%s/season/%s' % (show_id, season_id), - video_id, headers=self.geo_verification_headers(), query={ - 'conn_id': self._conn_id, - }) - if response['result'] == 'error': + 'https://www.vvvvid.it/vvvvid/ondemand/%s/%s' % (show_id, path), + video_id, headers=self.geo_verification_headers(), query=q, fatal=fatal) + if not (response or fatal): + return + if response.get('result') == 'error': raise ExtractorError('%s said: %s' % ( self.IE_NAME, response['message']), expected=True) + return response['data'] + + def _extract_common_video_info(self, video_data): + return { + 'thumbnail': video_data.get('thumbnail'), + 'episode_id': str_or_none(video_data.get('id')), + } + + def _real_extract(self, url): + show_id, season_id, video_id = re.match(self._VALID_URL, url).groups() + + response = self._download_info( + show_id, 'season/%s' % season_id, + video_id, query={'video_id': video_id}) vid = int(video_id) video_data = list(filter( - lambda episode: episode.get('video_id') == vid, response['data']))[0] + lambda episode: episode.get('video_id') == vid, response))[0] + title = video_data['title'] formats = [] # vvvvid embed_info decryption algorithm is reverse engineered from function $ds(h) at vvvvid.js @@ -115,14 +164,25 @@ class VVVVIDIE(InfoExtractor): return d - for quality in ('_sd', ''): + info = {} + + def metadata_from_url(r_url): + if not info and r_url: + mobj = re.search(r'_(?:S(\d+))?Ep(\d+)', r_url) + if mobj: + info['episode_number'] = int(mobj.group(2)) + season_number = mobj.group(1) + if season_number: + info['season_number'] = int(season_number) + + video_type = video_data.get('video_type') + is_youtube = False + for quality in ('', '_sd'): embed_code = video_data.get('embed_info' + quality) if not embed_code: continue embed_code = ds(embed_code) - video_type = video_data.get('video_type') if video_type in ('video/rcs', 'video/kenc'): - embed_code = re.sub(r'https?://([^/]+)/z/', r'https://\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8') if video_type == 'video/kenc': kenc = self._download_json( 'https://www.vvvvid.it/kenc', video_id, query={ @@ -133,26 +193,89 @@ class VVVVIDIE(InfoExtractor): kenc_message = kenc.get('message') if kenc_message: embed_code += '?' + ds(kenc_message) - formats.extend(self._extract_m3u8_formats( - embed_code, video_id, 'mp4', - m3u8_id='hls', fatal=False)) + formats.extend(self._extract_akamai_formats(embed_code, video_id)) + elif video_type == 'video/youtube': + info.update({ + '_type': 'url_transparent', + 'ie_key': YoutubeIE.ie_key(), + 'url': embed_code, + }) + is_youtube = True + break else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) - self._sort_formats(formats) + metadata_from_url(embed_code) - return { + if not is_youtube: + self._sort_formats(formats) + info['formats'] = formats + + metadata_from_url(video_data.get('thumbnail')) + info.update(self._extract_common_video_info(video_data)) + info.update({ 'id': video_id, - 'title': video_data['title'], - 'formats': formats, - 'thumbnail': video_data.get('thumbnail'), + 'title': title, 'duration': int_or_none(video_data.get('length')), 'series': video_data.get('show_title'), 'season_id': season_id, - 'season_number': video_data.get('season_number'), - 'episode_id': str_or_none(video_data.get('id')), - 'episode_number': int_or_none(video_data.get('number')), - 'episode_title': video_data['title'], + 'episode': title, 'view_count': int_or_none(video_data.get('views')), 'like_count': int_or_none(video_data.get('video_likes')), - } + 'repost_count': int_or_none(video_data.get('video_shares')), + }) + return info + + +class VVVVIDShowIE(VVVVIDIE): + _VALID_URL = r'(?P<base_url>%s(?P<id>\d+)(?:/(?P<show_title>[^/?&#]+))?)/?(?:[?#&]|$)' % VVVVIDIE._VALID_URL_BASE + _TESTS = [{ + 'url': 'https://www.vvvvid.it/show/156/psyco-pass', + 'info_dict': { + 'id': '156', + 'title': 'Psycho-Pass', + 'description': 'md5:94d572c0bd85894b193b8aebc9a3a806', + }, + 'playlist_count': 46, + }, { + 'url': 'https://www.vvvvid.it/show/156', + 'only_matching': True, + }] + + def _real_extract(self, url): + base_url, show_id, show_title = re.match(self._VALID_URL, url).groups() + + seasons = self._download_info( + show_id, 'seasons/', show_title) + + show_info = self._download_info( + show_id, 'info/', show_title, fatal=False) + + if not show_title: + base_url += "/title" + + entries = [] + for season in (seasons or []): + episodes = season.get('episodes') or [] + playlist_title = season.get('name') or show_info.get('title') + for episode in episodes: + if episode.get('playable') is False: + continue + season_id = str_or_none(episode.get('season_id')) + video_id = str_or_none(episode.get('video_id')) + if not (season_id and video_id): + continue + info = self._extract_common_video_info(episode) + info.update({ + '_type': 'url_transparent', + 'ie_key': VVVVIDIE.ie_key(), + 'url': '/'.join([base_url, season_id, video_id]), + 'title': episode.get('title'), + 'description': episode.get('description'), + 'season_id': season_id, + 'playlist_title': playlist_title, + }) + entries.append(info) + + return self.playlist_result( + entries, show_id, show_info.get('title'), show_info.get('description')) diff --git a/haruhi_dl/extractor/washingtonpost.py b/haruhi_dl/extractor/washingtonpost.py index 329907465..7924d80fc 100644 --- a/haruhi_dl/extractor/washingtonpost.py +++ b/haruhi_dl/extractor/washingtonpost.py @@ -4,17 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - int_or_none, - strip_jsonp, -) class WashingtonPostIE(InfoExtractor): IE_NAME = 'washingtonpost' - _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/(?:video|posttv)/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' - _TEST = { + _TESTS = [{ 'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d', 'md5': '6f537e1334b714eb15f9563bd4b9cdfa', 'info_dict': { @@ -23,10 +19,15 @@ class WashingtonPostIE(InfoExtractor): 'title': 'Egypt finds belongings, debris from plane crash', 'description': 'md5:a17ceee432f215a5371388c1f680bd86', 'upload_date': '20160520', - 'uploader': 'Reuters', - 'timestamp': 1463778452, + 'timestamp': 1463775187, }, - } + }, { + 'url': 'https://www.washingtonpost.com/video/world/egypt-finds-belongings-debris-from-plane-crash/2016/05/20/480ba4ee-1ec7-11e6-82c2-a7dcb313287d_video.html', + 'only_matching': True, + }, { + 'url': 'https://www.washingtonpost.com/posttv/world/iraq-to-track-down-antiquities-after-islamic-state-museum-rampage/2015/02/28/7c57e916-bf86-11e4-9dfb-03366e719af8_video.html', + 'only_matching': True, + }] @classmethod def _extract_urls(cls, webpage, **kwargs): @@ -35,73 +36,8 @@ class WashingtonPostIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'http://www.washingtonpost.com/posttv/c/videojson/%s?resType=jsonp' % video_id, - video_id, transform_source=strip_jsonp)[0]['contentConfig'] - title = video_data['title'] - - urls = [] - formats = [] - for s in video_data.get('streams', []): - s_url = s.get('url') - if not s_url or s_url in urls: - continue - urls.append(s_url) - video_type = s.get('type') - if video_type == 'smil': - continue - elif video_type in ('ts', 'hls') and ('_master.m3u8' in s_url or '_mobile.m3u8' in s_url): - m3u8_formats = self._extract_m3u8_formats( - s_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - for m3u8_format in m3u8_formats: - width = m3u8_format.get('width') - if not width: - continue - vbr = self._search_regex( - r'%d_%d_(\d+)' % (width, m3u8_format['height']), m3u8_format['url'], 'vbr', default=None) - if vbr: - m3u8_format.update({ - 'vbr': int_or_none(vbr), - }) - formats.extend(m3u8_formats) - else: - width = int_or_none(s.get('width')) - vbr = int_or_none(s.get('bitrate')) - has_width = width != 0 - formats.append({ - 'format_id': ( - '%s-%d-%d' % (video_type, width, vbr) - if width - else video_type), - 'vbr': vbr if has_width else None, - 'width': width, - 'height': int_or_none(s.get('height')), - 'acodec': s.get('audioCodec'), - 'vcodec': s.get('videoCodec') if has_width else 'none', - 'filesize': int_or_none(s.get('fileSize')), - 'url': s_url, - 'ext': 'mp4', - 'protocol': 'm3u8_native' if video_type in ('ts', 'hls') else None, - }) - source_media_url = video_data.get('sourceMediaURL') - if source_media_url: - formats.append({ - 'format_id': 'source_media', - 'url': source_media_url, - }) - self._sort_formats( - formats, ('width', 'height', 'vbr', 'filesize', 'tbr', 'format_id')) - - return { - 'id': video_id, - 'title': title, - 'description': video_data.get('blurb'), - 'uploader': video_data.get('credits', {}).get('source'), - 'formats': formats, - 'duration': int_or_none(video_data.get('videoDuration'), 100), - 'timestamp': int_or_none( - video_data.get('dateConfig', {}).get('dateFirstPublished'), 1000), - } + return self.url_result( + 'arcpublishing:wapo:' + video_id, 'ArcPublishing', video_id) class WashingtonPostArticleIE(InfoExtractor): @@ -121,9 +57,8 @@ class WashingtonPostArticleIE(InfoExtractor): 'title': 'Breaking Points: The Paper Mine', 'duration': 1290, 'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.', - 'uploader': 'The Washington Post', - 'timestamp': 1395527908, - 'upload_date': '20140322', + 'timestamp': 1395440416, + 'upload_date': '20140321', }, }, { 'md5': '1fff6a689d8770966df78c8cb6c8c17c', @@ -133,9 +68,8 @@ class WashingtonPostArticleIE(InfoExtractor): 'title': 'The town bureaucracy sustains', 'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.', 'duration': 2220, - 'timestamp': 1395528005, - 'upload_date': '20140322', - 'uploader': 'The Washington Post', + 'timestamp': 1395441819, + 'upload_date': '20140321', }, }], }, { @@ -151,8 +85,7 @@ class WashingtonPostArticleIE(InfoExtractor): 'ext': 'mp4', 'description': 'Washington Post transportation reporter Ashley Halsey III explains why a plane\'s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.', 'upload_date': '20141230', - 'uploader': 'The Washington Post', - 'timestamp': 1419974765, + 'timestamp': 1419972442, 'title': 'Why black boxes don’t transmit data in real time', } }] diff --git a/haruhi_dl/extractor/wat.py b/haruhi_dl/extractor/wat.py index 8ef3e0906..f1bccc2d6 100644 --- a/haruhi_dl/extractor/wat.py +++ b/haruhi_dl/extractor/wat.py @@ -1,15 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( ExtractorError, - unified_strdate, - HEADRequest, int_or_none, + try_get, + unified_strdate, ) @@ -32,6 +30,7 @@ class WatIE(InfoExtractor): 'skip_download': True, }, 'expected_warnings': ['HTTP Error 404'], + 'skip': 'This content is no longer available', }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', @@ -43,17 +42,10 @@ class WatIE(InfoExtractor): 'upload_date': '20140816', }, 'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."], + 'skip': 'This content is no longer available', }, ] - - _FORMATS = ( - (200, 416, 234), - (400, 480, 270), - (600, 640, 360), - (1200, 640, 360), - (1800, 960, 540), - (2500, 1280, 720), - ) + _GEO_BYPASS = False def _real_extract(self, url): video_id = self._match_id(url) @@ -61,97 +53,54 @@ class WatIE(InfoExtractor): # 'contentv4' is used in the website, but it also returns the related # videos, we don't need them + # video_data = self._download_json( + # 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) video_data = self._download_json( - 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) + 'https://mediainfo.tf1.fr/mediainfocombo/' + video_id, + video_id, query={'context': 'MYTF1'}) video_info = video_data['media'] error_desc = video_info.get('error_desc') if error_desc: - self.report_warning( - '%s returned error: %s' % (self.IE_NAME, error_desc)) + if video_info.get('error_code') == 'GEOBLOCKED': + self.raise_geo_restricted(error_desc, video_info.get('geoList')) + raise ExtractorError(error_desc, expected=True) - chapters = video_info['chapters'] - if chapters: - first_chapter = chapters[0] - - def video_id_for_chapter(chapter): - return chapter['tc_start'].split('-')[0] - - if video_id_for_chapter(first_chapter) != video_id: - self.to_screen('Multipart video detected') - entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] - return self.playlist_result(entries, video_id, video_info['title']) - # Otherwise we can continue and extract just one part, we have to use - # the video id for getting the video url - else: - first_chapter = video_info - - title = first_chapter['title'] - - def extract_url(path_template, url_type): - req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id) - head = self._request_webpage(HEADRequest(req_url), video_id, 'Extracting %s url' % url_type, fatal=False) - if head: - red_url = head.geturl() - if req_url != red_url: - return red_url - return None - - def remove_bitrate_limit(manifest_url): - return re.sub(r'(?:max|min)_bitrate=\d+&?', '', manifest_url) + title = video_info['title'] formats = [] - try: - alt_urls = lambda manifest_url: [re.sub(r'(?:wdv|ssm)?\.ism/', repl + '.ism/', manifest_url) for repl in ('', 'ssm')] - manifest_urls = self._download_json( - 'http://www.wat.tv/get/webhtml/' + video_id, video_id) - m3u8_url = manifest_urls.get('hls') - if m3u8_url: - m3u8_url = remove_bitrate_limit(m3u8_url) - for m3u8_alt_url in alt_urls(m3u8_url): - formats.extend(self._extract_m3u8_formats( - m3u8_alt_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls', fatal=False)) - formats.extend(self._extract_f4m_formats( - m3u8_alt_url.replace('ios', 'web').replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - mpd_url = manifest_urls.get('mpd') - if mpd_url: - mpd_url = remove_bitrate_limit(mpd_url) - for mpd_alt_url in alt_urls(mpd_url): - formats.extend(self._extract_mpd_formats( - mpd_alt_url, video_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) - except ExtractorError: - abr = 64 - for vbr, width, height in self._FORMATS: - tbr = vbr + abr - format_id = 'http-%s' % tbr - fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr) - if self._is_valid_url(fmt_url, video_id, format_id): - formats.append({ - 'format_id': format_id, - 'url': fmt_url, - 'vbr': vbr, - 'abr': abr, - 'width': width, - 'height': height, - }) - date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4') - upload_date = unified_strdate(date_diffusion) if date_diffusion else None - duration = None - files = video_info['files'] - if files: - duration = int_or_none(files[0].get('duration')) + def extract_formats(manifest_urls): + for f, f_url in manifest_urls.items(): + if not f_url: + continue + if f in ('dash', 'mpd'): + formats.extend(self._extract_mpd_formats( + f_url.replace('://das-q1.tf1.fr/', '://das-q1-ssl.tf1.fr/'), + video_id, mpd_id='dash', fatal=False)) + elif f == 'hls': + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + + delivery = video_data.get('delivery') or {} + extract_formats({delivery.get('format'): delivery.get('url')}) + if not formats: + if delivery.get('drm'): + raise ExtractorError('This video is DRM protected.', expected=True) + manifest_urls = self._download_json( + 'http://www.wat.tv/get/webhtml/' + video_id, video_id, fatal=False) + if manifest_urls: + extract_formats(manifest_urls) + + self._sort_formats(formats) return { 'id': video_id, 'title': title, - 'thumbnail': first_chapter.get('preview'), - 'description': first_chapter.get('description'), - 'view_count': int_or_none(video_info.get('views')), - 'upload_date': upload_date, - 'duration': duration, + 'thumbnail': video_info.get('preview'), + 'upload_date': unified_strdate(try_get( + video_data, lambda x: x['mediametrie']['chapters'][0]['estatS4'])), + 'duration': int_or_none(video_info.get('duration')), 'formats': formats, } diff --git a/haruhi_dl/extractor/wdr.py b/haruhi_dl/extractor/wdr.py index cf6f7c7ed..2903d189e 100644 --- a/haruhi_dl/extractor/wdr.py +++ b/haruhi_dl/extractor/wdr.py @@ -17,6 +17,7 @@ from ..utils import ( unified_strdate, update_url_query, urlhandle_detect_ext, + url_or_none, ) @@ -42,16 +43,20 @@ class WDRIE(InfoExtractor): is_live = metadata.get('mediaType') == 'live' tracker_data = metadata['trackerData'] + title = tracker_data['trackerClipTitle'] + media_resource = metadata['mediaResource'] formats = [] # check if the metadata contains a direct URL to a file - for kind, media_resource in media_resource.items(): + for kind, media in media_resource.items(): + if not isinstance(media, dict): + continue if kind not in ('dflt', 'alt'): continue - for tag_name, medium_url in media_resource.items(): + for tag_name, medium_url in media.items(): if tag_name not in ('videoURL', 'audioURL'): continue @@ -88,8 +93,16 @@ class WDRIE(InfoExtractor): 'url': caption_url, 'ext': 'ttml', }] - - title = tracker_data['trackerClipTitle'] + captions_hash = media_resource.get('captionsHash') + if isinstance(captions_hash, dict): + for ext, format_url in captions_hash.items(): + format_url = url_or_none(format_url) + if not format_url: + continue + subtitles.setdefault('de', []).append({ + 'url': format_url, + 'ext': determine_ext(format_url, None) or ext, + }) return { 'id': tracker_data.get('trackerClipId', video_id), @@ -105,7 +118,7 @@ class WDRIE(InfoExtractor): class WDRPageIE(InfoExtractor): _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html' - _VALID_URL = r'https?://(?:www\d?\.)?(?:wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL + _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL _TESTS = [ { @@ -212,7 +225,11 @@ class WDRPageIE(InfoExtractor): { 'url': 'http://www.sportschau.de/handballem2018/audio-vorschau---die-handball-em-startet-mit-grossem-favoritenfeld-100.html', 'only_matching': True, - } + }, + { + 'url': 'https://kinder.wdr.de/tv/die-sendung-mit-dem-elefanten/av/video-folge---astronaut-100.html', + 'only_matching': True, + }, ] def _real_extract(self, url): diff --git a/haruhi_dl/extractor/xboxclips.py b/haruhi_dl/extractor/xboxclips.py index d9c277bc3..25f487e1e 100644 --- a/haruhi_dl/extractor/xboxclips.py +++ b/haruhi_dl/extractor/xboxclips.py @@ -1,40 +1,55 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( int_or_none, + month_by_abbreviation, parse_filesize, - unified_strdate, ) class XboxClipsIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\w-]{36})' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?(?:xboxclips\.com|gameclips\.io)/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' + _TESTS = [{ 'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325', 'md5': 'fbe1ec805e920aeb8eced3c3e657df5d', 'info_dict': { 'id': '074a69a9-5faf-46aa-b93b-9909c1720325', 'ext': 'mp4', - 'title': 'Iabdulelah playing Titanfall', + 'title': 'iAbdulElah playing Titanfall', 'filesize_approx': 26800000, 'upload_date': '20140807', 'duration': 56, } - } + }, { + 'url': 'https://gameclips.io/iAbdulElah/074a69a9-5faf-46aa-b93b-9909c1720325', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + if '/video.php' in url: + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + url = 'https://gameclips.io/%s/%s' % (qs['gamertag'][0], qs['vid'][0]) - video_url = self._html_search_regex( - r'>(?:Link|Download): <a[^>]+href="([^"]+)"', webpage, 'video URL') - title = self._html_search_regex( - r'<title>XboxClips \| ([^<]+)', webpage, 'title') - upload_date = unified_strdate(self._html_search_regex( - r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False)) + webpage = self._download_webpage(url, video_id) + info = self._parse_html5_media_entries(url, webpage, video_id)[0] + + title = self._html_search_meta(['og:title', 'twitter:title'], webpage) + upload_date = None + mobj = re.search( + r'>Recorded: (\d{2})-(Jan|Feb|Mar|Apr|May|Ju[nl]|Aug|Sep|Oct|Nov|Dec)-(\d{4})', + webpage) + if mobj: + upload_date = '%s%.2d%s' % (mobj.group(3), month_by_abbreviation(mobj.group(2)), mobj.group(1)) filesize = parse_filesize(self._html_search_regex( r'>Size: ([^<]+)<', webpage, 'file size', fatal=False)) duration = int_or_none(self._html_search_regex( @@ -42,12 +57,12 @@ class XboxClipsIE(InfoExtractor): view_count = int_or_none(self._html_search_regex( r'>Views: (\d+)<', webpage, 'view count', fatal=False)) - return { + info.update({ 'id': video_id, - 'url': video_url, 'title': title, 'upload_date': upload_date, 'filesize_approx': filesize, 'duration': duration, 'view_count': view_count, - } + }) + return info diff --git a/haruhi_dl/extractor/xfileshare.py b/haruhi_dl/extractor/xfileshare.py index 20f7013f3..783358fe9 100644 --- a/haruhi_dl/extractor/xfileshare.py +++ b/haruhi_dl/extractor/xfileshare.py @@ -45,6 +45,7 @@ def aa_decode(aa_code): class XFileShareIE(InfoExtractor): _SITES = ( + (r'aparat\.cam', 'Aparat'), (r'clipwatching\.com', 'ClipWatching'), (r'gounlimited\.to', 'GoUnlimited'), (r'govid\.me', 'GoVid'), @@ -78,6 +79,9 @@ class XFileShareIE(InfoExtractor): 'title': 'sample', 'thumbnail': r're:http://.*\.jpg', }, + }, { + 'url': 'https://aparat.cam/n4d6dh0wvlpr', + 'only_matching': True, }] @staticmethod diff --git a/haruhi_dl/extractor/xhamster.py b/haruhi_dl/extractor/xhamster.py index 19ec98de2..3a1e83043 100644 --- a/haruhi_dl/extractor/xhamster.py +++ b/haruhi_dl/extractor/xhamster.py @@ -11,11 +11,14 @@ from ..utils import ( dict_get, extract_attributes, ExtractorError, + float_or_none, int_or_none, parse_duration, + str_or_none, try_get, unified_strdate, url_or_none, + urljoin, ) @@ -146,36 +149,89 @@ class XHamsterIE(InfoExtractor): video = initials['videoModel'] title = video['title'] formats = [] - for format_id, formats_dict in video['sources'].items(): + format_urls = set() + format_sizes = {} + sources = try_get(video, lambda x: x['sources'], dict) or {} + for format_id, formats_dict in sources.items(): if not isinstance(formats_dict, dict): continue + download_sources = try_get(sources, lambda x: x['download'], dict) or {} + for quality, format_dict in download_sources.items(): + if not isinstance(format_dict, dict): + continue + format_sizes[quality] = float_or_none(format_dict.get('size')) for quality, format_item in formats_dict.items(): if format_id == 'download': # Download link takes some time to be generated, # skipping for now continue - if not isinstance(format_item, dict): - continue - format_url = format_item.get('link') - filesize = int_or_none( - format_item.get('size'), invscale=1000000) - else: - format_url = format_item - filesize = None + format_url = format_item format_url = url_or_none(format_url) - if not format_url: + if not format_url or format_url in format_urls: continue + format_urls.add(format_url) formats.append({ 'format_id': '%s-%s' % (format_id, quality), 'url': format_url, 'ext': determine_ext(format_url, 'mp4'), 'height': get_height(quality), - 'filesize': filesize, + 'filesize': format_sizes.get(quality), 'http_headers': { 'Referer': urlh.geturl(), }, }) - self._sort_formats(formats) + xplayer_sources = try_get( + initials, lambda x: x['xplayerSettings']['sources'], dict) + if xplayer_sources: + hls_sources = xplayer_sources.get('hls') + if isinstance(hls_sources, dict): + for hls_format_key in ('url', 'fallback'): + hls_url = hls_sources.get(hls_format_key) + if not hls_url: + continue + hls_url = urljoin(url, hls_url) + if not hls_url or hls_url in format_urls: + continue + format_urls.add(hls_url) + formats.extend(self._extract_m3u8_formats( + hls_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + standard_sources = xplayer_sources.get('standard') + if isinstance(standard_sources, dict): + for format_id, formats_list in standard_sources.items(): + if not isinstance(formats_list, list): + continue + for standard_format in formats_list: + if not isinstance(standard_format, dict): + continue + for standard_format_key in ('url', 'fallback'): + standard_url = standard_format.get(standard_format_key) + if not standard_url: + continue + standard_url = urljoin(url, standard_url) + if not standard_url or standard_url in format_urls: + continue + format_urls.add(standard_url) + ext = determine_ext(standard_url, 'mp4') + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + standard_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + quality = (str_or_none(standard_format.get('quality')) + or str_or_none(standard_format.get('label')) + or '') + formats.append({ + 'format_id': '%s-%s' % (format_id, quality), + 'url': standard_url, + 'ext': ext, + 'height': get_height(quality), + 'filesize': format_sizes.get(quality), + 'http_headers': { + 'Referer': standard_url, + }, + }) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) categories_list = video.get('categories') if isinstance(categories_list, list): diff --git a/haruhi_dl/extractor/xtube.py b/haruhi_dl/extractor/xtube.py index 01b253dcb..18969058f 100644 --- a/haruhi_dl/extractor/xtube.py +++ b/haruhi_dl/extractor/xtube.py @@ -90,7 +90,7 @@ class XTubeIE(InfoExtractor): title, thumbnail, duration = [None] * 3 config = self._parse_json(self._search_regex( - r'playerConf\s*=\s*({.+?})\s*,\s*\n', webpage, 'config', + r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config', default='{}'), video_id, transform_source=js_to_json, fatal=False) if config: config = config.get('mainRoll') diff --git a/haruhi_dl/extractor/yahoo.py b/haruhi_dl/extractor/yahoo.py index e4615376c..a17b10d6e 100644 --- a/haruhi_dl/extractor/yahoo.py +++ b/haruhi_dl/extractor/yahoo.py @@ -177,46 +177,9 @@ class YahooIE(InfoExtractor): 'only_matching': True, }] - def _real_extract(self, url): - url, country, display_id = re.match(self._VALID_URL, url).groups() - if not country: - country = 'us' - else: - country = country.split('-')[0] - api_base = 'https://%s.yahoo.com/_td/api/resource/' % country - - for i, uuid in enumerate(['url=' + url, 'ymedia-alias=' + display_id]): - content = self._download_json( - api_base + 'content;getDetailView=true;uuids=["%s"]' % uuid, - display_id, 'Downloading content JSON metadata', fatal=i == 1) - if content: - item = content['items'][0] - break - - if item.get('type') != 'video': - entries = [] - - cover = item.get('cover') or {} - if cover.get('type') == 'yvideo': - cover_url = cover.get('url') - if cover_url: - entries.append(self.url_result( - cover_url, 'Yahoo', cover.get('uuid'))) - - for e in item.get('body', []): - if e.get('type') == 'videoIframe': - iframe_url = e.get('url') - if not iframe_url: - continue - entries.append(self.url_result(iframe_url)) - - return self.playlist_result( - entries, item.get('uuid'), - item.get('title'), item.get('summary')) - - video_id = item['uuid'] + def _extract_yahoo_video(self, video_id, country): video = self._download_json( - api_base + 'VideoService.videos;view=full;video_ids=["%s"]' % video_id, + 'https://%s.yahoo.com/_td/api/resource/VideoService.videos;view=full;video_ids=["%s"]' % (country, video_id), video_id, 'Downloading video JSON metadata')[0] title = video['title'] @@ -298,7 +261,6 @@ class YahooIE(InfoExtractor): 'id': video_id, 'title': self._live_title(title) if is_live else title, 'formats': formats, - 'display_id': display_id, 'thumbnails': thumbnails, 'description': clean_html(video.get('description')), 'timestamp': parse_iso8601(video.get('publish_time')), @@ -311,6 +273,44 @@ class YahooIE(InfoExtractor): 'episode_number': int_or_none(series_info.get('episode_number')), } + def _real_extract(self, url): + url, country, display_id = re.match(self._VALID_URL, url).groups() + if not country: + country = 'us' + else: + country = country.split('-')[0] + + item = self._download_json( + 'https://%s.yahoo.com/caas/content/article' % country, display_id, + 'Downloading content JSON metadata', query={ + 'url': url + })['items'][0]['data']['partnerData'] + + if item.get('type') != 'video': + entries = [] + + cover = item.get('cover') or {} + if cover.get('type') == 'yvideo': + cover_url = cover.get('url') + if cover_url: + entries.append(self.url_result( + cover_url, 'Yahoo', cover.get('uuid'))) + + for e in (item.get('body') or []): + if e.get('type') == 'videoIframe': + iframe_url = e.get('url') + if not iframe_url: + continue + entries.append(self.url_result(iframe_url)) + + return self.playlist_result( + entries, item.get('uuid'), + item.get('title'), item.get('summary')) + + info = self._extract_yahoo_video(item['uuid'], country) + info['display_id'] = display_id + return info + class YahooSearchIE(SearchInfoExtractor): IE_DESC = 'Yahoo screen search' diff --git a/haruhi_dl/extractor/yandexdisk.py b/haruhi_dl/extractor/yandexdisk.py index e8f6ae10f..6fcd8ee7e 100644 --- a/haruhi_dl/extractor/yandexdisk.py +++ b/haruhi_dl/extractor/yandexdisk.py @@ -1,23 +1,43 @@ # coding: utf-8 from __future__ import unicode_literals +import json +import re + from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( determine_ext, float_or_none, int_or_none, + mimetype2ext, try_get, - urlencode_postdata, + urljoin, ) class YandexDiskIE(InfoExtractor): - _VALID_URL = r'https?://yadi\.sk/[di]/(?P[^/?#&]+)' + _VALID_URL = r'''(?x)https?:// + (?P + yadi\.sk| + disk\.yandex\. + (?: + az| + by| + co(?:m(?:\.(?:am|ge|tr))?|\.il)| + ee| + fr| + k[gz]| + l[tv]| + md| + t[jm]| + u[az]| + ru + ) + )/(?:[di]/|public.*?\bhash=)(?P[^/?#&]+)''' _TESTS = [{ 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y', - 'md5': '33955d7ae052f15853dc41f35f17581c', + 'md5': 'a4a8d52958c8fddcf9845935070402ae', 'info_dict': { 'id': 'VdOeDou8eZs6Y', 'ext': 'mp4', @@ -27,92 +47,101 @@ class YandexDiskIE(InfoExtractor): 'uploader_id': '300043621', 'view_count': int, }, + 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce', 'only_matching': True, + }, { + 'url': 'https://yadi.sk/public?hash=5DZ296JK9GWCLp02f6jrObjnctjRxMs8L6%2B%2FuhNqk38%3D', + 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) - - status = self._download_webpage( - 'https://disk.yandex.com/auth/status', video_id, query={ - 'urlOrigin': url, - 'source': 'public', - 'md5': 'false', - }) - - sk = self._search_regex( - r'(["\'])sk(?:External)?\1\s*:\s*(["\'])(?P(?:(?!\2).)+)\2', - status, 'sk', group='value') + domain, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, video_id) + store = self._parse_json(self._search_regex( + r']+id="store-prefetch"[^>]*>\s*({.+?})\s*', + webpage, 'store'), video_id) + resource = store['resources'][store['rootResourceId']] - models = self._parse_json( - self._search_regex( - r']+id=["\']models-client[^>]+>\s*(\[.+?\])\s*[\da-f]+) + (?P(?:[\da-f]{32}|[\w-]{12})) ''' _TESTS = [{ - 'url': 'https://yandex.ru/portal/video?stream_id=4dbb262b4fe5cf15a215de4f34eee34d', - 'md5': '33955d7ae052f15853dc41f35f17581c', + 'url': 'https://yandex.ru/portal/video?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374', + 'md5': 'e02a05bfaf0d9615ef07ae3a10f4faf4', 'info_dict': { - 'id': '4dbb262b4fe5cf15a215de4f34eee34d', + 'id': '4dbb36ec4e0526d58f9f2dc8f0ecf374', 'ext': 'mp4', - 'title': 'В Нью-Йорке баржи и теплоход оторвались от причала и расплылись по Гудзону', - 'description': '', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 0, - 'duration': 30, + 'title': 'Русский Вудсток - главный рок-фест в истории СССР / вДудь', + 'description': 'md5:7d6b8d4bc4a3b9a56499916c1ea5b5fa', + 'thumbnail': r're:^https?://', + 'timestamp': 1549972939, + 'duration': 5575, 'age_limit': 18, + 'upload_date': '20190212', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, }, }, { - 'url': 'https://yandex.ru/portal/efir?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374&from=morda', + 'url': 'https://yandex.ru/portal/efir?stream_id=4dbb262b4fe5cf15a215de4f34eee34d&from=morda', 'only_matching': True, }, { 'url': 'https://yandex.ru/?stream_id=4dbb262b4fe5cf15a215de4f34eee34d', @@ -52,53 +57,88 @@ class YandexVideoIE(InfoExtractor): # DASH with DRM 'url': 'https://yandex.ru/portal/video?from=morda&stream_id=485a92d94518d73a9d0ff778e13505f8', 'only_matching': True, + }, { + 'url': 'https://yandex.ru/efir?stream_active=watching&stream_id=v7a2dZ-v5mSI&from_block=efir_newtab', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - content = self._download_json( - 'https://frontend.vh.yandex.ru/v22/player/%s.json' % video_id, - video_id, query={ - 'stream_options': 'hires', - 'disable_trackings': 1, - })['content'] + player = try_get((self._download_json( + 'https://frontend.vh.yandex.ru/graphql', video_id, data=('''{ + player(content_id: "%s") { + computed_title + content_url + description + dislikes + duration + likes + program_title + release_date + release_date_ut + release_year + restriction_age + season + start_time + streams + thumbnail + title + views_count + } +}''' % video_id).encode(), fatal=False)), lambda x: x['player']['content']) + if not player or player.get('error'): + player = self._download_json( + 'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id, + video_id, query={ + 'stream_options': 'hires', + 'disable_trackings': 1, + }) + content = player['content'] - content_url = url_or_none(content.get('content_url')) or url_or_none( - content['streams'][0]['url']) - title = content.get('title') or content.get('computed_title') + title = content.get('title') or content['computed_title'] - ext = determine_ext(content_url) - - if ext == 'm3u8': - formats = self._extract_m3u8_formats( - content_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - elif ext == 'mpd': - formats = self._extract_mpd_formats( - content_url, video_id, mpd_id='dash') - else: - formats = [{'url': content_url}] + formats = [] + streams = content.get('streams') or [] + streams.append({'url': content.get('content_url')}) + for stream in streams: + content_url = url_or_none(stream.get('url')) + if not content_url: + continue + ext = determine_ext(content_url) + if ext == 'ismc': + continue + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + content_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + content_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({'url': content_url}) self._sort_formats(formats) - description = content.get('description') - thumbnail = content.get('thumbnail') timestamp = (int_or_none(content.get('release_date')) or int_or_none(content.get('release_date_ut')) or int_or_none(content.get('start_time'))) - duration = int_or_none(content.get('duration')) - series = content.get('program_title') - age_limit = int_or_none(content.get('restriction_age')) + season = content.get('season') or {} return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'description': content.get('description'), + 'thumbnail': content.get('thumbnail'), 'timestamp': timestamp, - 'duration': duration, - 'series': series, - 'age_limit': age_limit, + 'duration': int_or_none(content.get('duration')), + 'series': content.get('program_title'), + 'age_limit': int_or_none(content.get('restriction_age')), + 'view_count': int_or_none(content.get('views_count')), + 'like_count': int_or_none(content.get('likes')), + 'dislike_count': int_or_none(content.get('dislikes')), + 'season_number': int_or_none(season.get('season_number')), + 'season_id': season.get('id'), + 'release_year': int_or_none(content.get('release_year')), 'formats': formats, } diff --git a/haruhi_dl/extractor/youporn.py b/haruhi_dl/extractor/youporn.py index c178e2f39..03880cd01 100644 --- a/haruhi_dl/extractor/youporn.py +++ b/haruhi_dl/extractor/youporn.py @@ -25,11 +25,11 @@ class YouPornIE(InfoExtractor): 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 210, 'uploader': 'Ask Dan And Jennifer', 'upload_date': '20101217', 'average_rating': int, 'view_count': int, - 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -48,7 +48,6 @@ class YouPornIE(InfoExtractor): 'upload_date': '20110418', 'average_rating': int, 'view_count': int, - 'comment_count': int, 'categories': list, 'tags': list, 'age_limit': 18, @@ -56,12 +55,16 @@ class YouPornIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': '404', }, { 'url': 'https://www.youporn.com/embed/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'only_matching': True, }, { 'url': 'http://www.youporn.com/watch/505835', 'only_matching': True, + }, { + 'url': 'https://www.youporn.com/watch/13922959/femdom-principal/', + 'only_matching': True, }] @staticmethod @@ -90,7 +93,7 @@ class YouPornIE(InfoExtractor): # Main source definitions = self._parse_json( self._search_regex( - r'mediaDefinition\s*=\s*(\[.+?\]);', webpage, + r'mediaDefinition\s*[=:]\s*(\[.+?\])\s*[;,]', webpage, 'media definitions', default='[]'), video_id, fatal=False) if definitions: @@ -102,7 +105,7 @@ class YouPornIE(InfoExtractor): links.append(video_url) # Fallback #1, this also contains extra low quality 180p format - for _, link in re.findall(r']+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage): + for _, link in re.findall(r']+href=(["\'])(http(?:(?!\1).)+\.mp4(?:(?!\1).)*)\1[^>]+title=["\']Download [Vv]ideo', webpage): links.append(link) # Fallback #2 (unavailable as at 22.06.2017) @@ -130,8 +133,9 @@ class YouPornIE(InfoExtractor): # Video URL's path looks like this: # /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 # /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4 + # /videos/201703/11/109285532/1080P_4000K_109285532.mp4 # We will benefit from it by extracting some metadata - mobj = re.search(r'(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+/', video_url) + mobj = re.search(r'(?P\d{3,4})[pP]_(?P\d+)[kK]_\d+', video_url) if mobj: height = int(mobj.group('height')) bitrate = int(mobj.group('bitrate')) @@ -151,12 +155,15 @@ class YouPornIE(InfoExtractor): thumbnail = self._search_regex( r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration', fatal=False)) uploader = self._html_search_regex( r'(?s)]+class=["\']submitByLink["\'][^>]*>(.+?)', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( - [r'Date\s+[Aa]dded:\s*([^<]+)', + [r'UPLOADED:\s*([^<]+)', + r'Date\s+[Aa]dded:\s*([^<]+)', r'(?s)]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)'], webpage, 'upload date', fatal=False)) @@ -171,7 +178,7 @@ class YouPornIE(InfoExtractor): webpage, 'view count', fatal=False, group='count')) comment_count = str_to_int(self._search_regex( r'>All [Cc]omments? \(([\d,.]+)\)', - webpage, 'comment count', fatal=False)) + webpage, 'comment count', default=None)) def extract_tag_box(regex, title): tag_box = self._search_regex(regex, webpage, title, default=None) @@ -191,6 +198,7 @@ class YouPornIE(InfoExtractor): 'title': title, 'description': description, 'thumbnail': thumbnail, + 'duration': duration, 'uploader': uploader, 'upload_date': upload_date, 'average_rating': average_rating, diff --git a/haruhi_dl/extractor/youtube.py b/haruhi_dl/extractor/youtube.py index 2430fa180..f80c82f85 100644 --- a/haruhi_dl/extractor/youtube.py +++ b/haruhi_dl/extractor/youtube.py @@ -289,14 +289,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances (?:(?:www|dev)\.)?invidio\.us/| (?:(?:www|no)\.)?invidiou\.sh/| - (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/| + (?:(?:www|fi)\.)?invidious\.snopyta\.org/| (?:www\.)?invidious\.kabi\.tk/| (?:www\.)?invidious\.13ad\.de/| (?:www\.)?invidious\.mastodon\.host/| + (?:www\.)?invidious\.zapashcanon\.fr/| + (?:www\.)?invidious\.kavin\.rocks/| + (?:www\.)?invidious\.tube/| + (?:www\.)?invidiou\.site/| + (?:www\.)?invidious\.site/| + (?:www\.)?invidious\.xyz/| (?:www\.)?invidious\.nixnet\.xyz/| (?:www\.)?invidious\.drycat\.fr/| (?:www\.)?tube\.poal\.co/| + (?:www\.)?tube\.connect\.cafe/| (?:www\.)?vid\.wxzm\.sx/| + (?:www\.)?vid\.mint\.lgbt/| (?:www\.)?yewtu\.be/| (?:www\.)?yt\.elukerio\.org/| (?:www\.)?yt\.lelux\.fi/| diff --git a/haruhi_dl/extractor/zaq1.py b/haruhi_dl/extractor/zaq1.py deleted file mode 100644 index 889aff5d8..000000000 --- a/haruhi_dl/extractor/zaq1.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_timestamp, -) - - -class Zaq1IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P[^/?#&]+)' - _TESTS = [{ - 'url': 'http://zaq1.pl/video/xev0e', - 'md5': '24a5eb3f052e604ae597c4d0d19b351e', - 'info_dict': { - 'id': 'xev0e', - 'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa', - 'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147', - 'ext': 'mp4', - 'duration': 511, - 'timestamp': 1490896361, - 'uploader': 'Anonim', - 'upload_date': '20170330', - 'view_count': int, - } - }, { - # malformed JSON-LD - 'url': 'http://zaq1.pl/video/x81vn', - 'info_dict': { - 'id': 'x81vn', - 'title': 'SEKRETNE ŻYCIE WALTERA MITTY', - 'ext': 'mp4', - 'duration': 6234, - 'timestamp': 1493494860, - 'uploader': 'Anonim', - 'upload_date': '20170429', - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Failed to parse JSON'], - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video_url = self._search_regex( - r'data-video-url=(["\'])(?P(?:(?!\1).)+)\1', webpage, - 'video url', group='url') - - info = self._search_json_ld(webpage, video_id, fatal=False) - - def extract_data(field, name, fatal=False): - return self._search_regex( - r'data-%s=(["\'])(?P(?:(?!\1).)+)\1' % field, - webpage, field, fatal=fatal, group='field') - - if not info.get('title'): - info['title'] = extract_data('file-name', 'title', fatal=True) - - if not info.get('duration'): - info['duration'] = int_or_none(extract_data('duration', 'duration')) - - if not info.get('thumbnail'): - info['thumbnail'] = extract_data('photo-url', 'thumbnail') - - if not info.get('timestamp'): - info['timestamp'] = unified_timestamp(self._html_search_meta( - 'uploadDate', webpage, 'timestamp')) - - if not info.get('interactionCount'): - info['view_count'] = int_or_none(self._html_search_meta( - 'interactionCount', webpage, 'view count')) - - uploader = self._html_search_regex( - r'Wideo dodał:\s*]*>([^<]+)', webpage, 'uploader', - fatal=False) - - width = int_or_none(self._html_search_meta( - 'width', webpage, fatal=False)) - height = int_or_none(self._html_search_meta( - 'height', webpage, fatal=False)) - - info.update({ - 'id': video_id, - 'formats': [{ - 'url': video_url, - 'width': width, - 'height': height, - 'http_headers': { - 'Referer': url, - }, - }], - 'uploader': uploader, - }) - - return info diff --git a/haruhi_dl/extractor/zdf.py b/haruhi_dl/extractor/zdf.py index 656864b2e..5ed2946c2 100644 --- a/haruhi_dl/extractor/zdf.py +++ b/haruhi_dl/extractor/zdf.py @@ -40,7 +40,7 @@ class ZDFBaseIE(InfoExtractor): class ZDFIE(ZDFBaseIE): _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?]+)\.html' - _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') + _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') _GEO_COUNTRIES = ['DE'] _TESTS = [{ @@ -119,7 +119,7 @@ class ZDFIE(ZDFBaseIE): if not ptmd_path: ptmd_path = t[ 'http://zdf.de/rels/streams/ptmd-template'].replace( - '{playerId}', 'portal') + '{playerId}', 'ngplayer_2_4') ptmd = self._call_api( urljoin(url, ptmd_path), player, url, video_id, 'metadata') diff --git a/haruhi_dl/extractor/zhihu.py b/haruhi_dl/extractor/zhihu.py new file mode 100644 index 000000000..d1ed55be3 --- /dev/null +++ b/haruhi_dl/extractor/zhihu.py @@ -0,0 +1,69 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import float_or_none, int_or_none + + +class ZhihuIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?zhihu\.com/zvideo/(?P[0-9]+)' + _TEST = { + 'url': 'https://www.zhihu.com/zvideo/1342930761977176064', + 'md5': 'c8d4c9cd72dd58e6f9bc9c2c84266464', + 'info_dict': { + 'id': '1342930761977176064', + 'ext': 'mp4', + 'title': '写春联也太难了吧!', + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': '桥半舫', + 'timestamp': 1612959715, + 'upload_date': '20210210', + 'uploader_id': '244ecb13b0fd7daf92235288c8ca3365', + 'duration': 146.333, + 'view_count': int, + 'like_count': int, + 'comment_count': int, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + zvideo = self._download_json( + 'https://www.zhihu.com/api/v4/zvideos/' + video_id, video_id) + title = zvideo['title'] + video = zvideo.get('video') or {} + + formats = [] + for format_id, q in (video.get('playlist') or {}).items(): + play_url = q.get('url') or q.get('play_url') + if not play_url: + continue + formats.append({ + 'asr': int_or_none(q.get('sample_rate')), + 'filesize': int_or_none(q.get('size')), + 'format_id': format_id, + 'fps': int_or_none(q.get('fps')), + 'height': int_or_none(q.get('height')), + 'tbr': float_or_none(q.get('bitrate')), + 'url': play_url, + 'width': int_or_none(q.get('width')), + }) + self._sort_formats(formats) + + author = zvideo.get('author') or {} + url_token = author.get('url_token') + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': video.get('thumbnail') or zvideo.get('image_url'), + 'uploader': author.get('name'), + 'timestamp': int_or_none(zvideo.get('published_at')), + 'uploader_id': author.get('id'), + 'uploader_url': 'https://www.zhihu.com/people/' + url_token if url_token else None, + 'duration': float_or_none(video.get('duration')), + 'view_count': int_or_none(zvideo.get('play_count')), + 'like_count': int_or_none(zvideo.get('liked_count')), + 'comment_count': int_or_none(zvideo.get('comment_count')), + } diff --git a/haruhi_dl/extractor/zype.py b/haruhi_dl/extractor/zype.py index f336ebdb9..2ada5dd0b 100644 --- a/haruhi_dl/extractor/zype.py +++ b/haruhi_dl/extractor/zype.py @@ -85,7 +85,18 @@ class ZypeIE(InfoExtractor): else: m3u8_url = self._search_regex( r'(["\'])(?P(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', - body, 'm3u8 url', group='url') + body, 'm3u8 url', group='url', default=None) + if not m3u8_url: + source = self._search_regex( + r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', body, 'source') + + def get_attr(key): + return self._search_regex( + r'\b%s\s*:\s*([\'"])(?P(?:(?!\1).)+)\1' % key, + source, key, group='val') + + if get_attr('integration') == 'verizon-media': + m3u8_url = 'https://content.uplynk.com/%s.m3u8' % get_attr('id') formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') text_tracks = self._search_regex( diff --git a/haruhi_dl/options.py b/haruhi_dl/options.py index acbef1584..63e11b517 100644 --- a/haruhi_dl/options.py +++ b/haruhi_dl/options.py @@ -369,7 +369,7 @@ def parseOpts(overrideArguments=None): authentication.add_option( '--video-password', dest='videopassword', metavar='PASSWORD', - help='Video password (vimeo, smotri, youku)') + help='Video password (vimeo, youku)') adobe_pass = optparse.OptionGroup(parser, 'Adobe Pass Options') adobe_pass.add_option( @@ -693,6 +693,10 @@ def parseOpts(overrideArguments=None): '-o', '--output', dest='outtmpl', metavar='TEMPLATE', help=('Output filename template, see the "OUTPUT TEMPLATE" for all the info')) + filesystem.add_option( + '--output-na-placeholder', + dest='outtmpl_na_placeholder', metavar='PLACEHOLDER', default='NA', + help=('Placeholder value for unavailable meta fields in output filename template (default is "%default")')) filesystem.add_option( '--autonumber-size', dest='autonumber_size', metavar='NUMBER', type=int, @@ -786,7 +790,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '-x', '--extract-audio', action='store_true', dest='extractaudio', default=False, - help='Convert video files to audio-only files (requires ffmpeg or avconv and ffprobe or avprobe)') + help='Convert video files to audio-only files (requires ffmpeg/avconv and ffprobe/avprobe)') postproc.add_option( '--audio-format', metavar='FORMAT', dest='audioformat', default='best', help='Specify audio format: "best", "aac", "flac", "mp3", "m4a", "opus", "vorbis", or "wav"; "%default" by default; No effect without -x') diff --git a/haruhi_dl/postprocessor/embedthumbnail.py b/haruhi_dl/postprocessor/embedthumbnail.py index 5a3359588..3990908b6 100644 --- a/haruhi_dl/postprocessor/embedthumbnail.py +++ b/haruhi_dl/postprocessor/embedthumbnail.py @@ -89,10 +89,14 @@ class EmbedThumbnailPP(FFmpegPostProcessor): os.rename(encodeFilename(temp_filename), encodeFilename(filename)) elif info['ext'] in ['m4a', 'mp4']: - if not check_executable('AtomicParsley', ['-v']): + atomicparsley = next((x + for x in ['AtomicParsley', 'atomicparsley'] + if check_executable(x, ['-v'])), None) + + if atomicparsley is None: raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.') - cmd = [encodeFilename('AtomicParsley', True), + cmd = [encodeFilename(atomicparsley, True), encodeFilename(filename, True), encodeArgument('--artwork'), encodeFilename(thumbnail_filename, True), diff --git a/haruhi_dl/utils.py b/haruhi_dl/utils.py index 6a02c6f0d..d35033b7e 100644 --- a/haruhi_dl/utils.py +++ b/haruhi_dl/utils.py @@ -3642,7 +3642,7 @@ def url_or_none(url): if not url or not isinstance(url, compat_str): return None url = url.strip() - return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None + return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None def parse_duration(s): @@ -4080,7 +4080,7 @@ def js_to_json(code): v = m.group(0) if v in ('true', 'false', 'null'): return v - elif v.startswith('/*') or v.startswith('//') or v == ',': + elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',': return "" if v[0] in ("'", '"'): @@ -4105,7 +4105,8 @@ def js_to_json(code): {comment}|,(?={skip}[\]}}])| (?:(?\d+)' + + def _real_extract(self, url): + video_id = self._match_id(url) + formats = [{ + 'format_id': 'default', + 'url': 'url:', + }] + if video_id == '0': + raise ExtractorError('foo') + if video_id == '2': + formats.append({ + 'format_id': 'extra', + 'url': TEST_URL, + }) + return { + 'id': video_id, + 'title': 'Video %s' % video_id, + 'formats': formats, + } + + class PlaylistIE(InfoExtractor): + _VALID_URL = r'playlist:' + + def _entries(self): + for n in range(3): + video_id = compat_str(n) + yield { + '_type': 'url_transparent', + 'ie_key': VideoIE.ie_key(), + 'id': video_id, + 'url': 'video:%s' % video_id, + 'title': 'Video Transparent %s' % video_id, + } + + def _real_extract(self, url): + return self.playlist_result(self._entries()) + + hdl.add_info_extractor(VideoIE(hdl)) + hdl.add_info_extractor(PlaylistIE(hdl)) + info = hdl.extract_info('playlist:') + entries = info['entries'] + self.assertEqual(len(entries), 3) + self.assertTrue(entries[0] is None) + self.assertTrue(entries[1] is None) + self.assertEqual(len(hdl.downloaded_info_dicts), 1) + downloaded = hdl.downloaded_info_dicts[0] + self.assertEqual(entries[2], downloaded) + self.assertEqual(downloaded['url'], TEST_URL) + self.assertEqual(downloaded['title'], 'Video Transparent 2') + self.assertEqual(downloaded['id'], '2') + self.assertEqual(downloaded['extractor'], 'Video') + self.assertEqual(downloaded['extractor_key'], 'Video') + if __name__ == '__main__': unittest.main() diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index a7a15b8ae..d7f42a02d 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -98,6 +98,55 @@ class TestInfoExtractor(unittest.TestCase): self.assertRaises(RegexNotFoundError, ie._html_search_meta, 'z', html, None, fatal=True) self.assertRaises(RegexNotFoundError, ie._html_search_meta, ('z', 'x'), html, None, fatal=True) + def test_search_json_ld_realworld(self): + # https://github.com/hdl-org/haruhi-dl/issues/23306 + expect_dict( + self, + self.ie._search_json_ld(r'''''', None), + { + 'title': '1 On 1 With Kleio', + 'description': 'Kleio Valentien', + 'url': 'https://gvideo.eporner.com/xN49A1cT3eB/xN49A1cT3eB.mp4', + 'timestamp': 1449347075, + 'duration': 743.0, + 'view_count': 1120958, + 'width': 1920, + 'height': 1080, + }) + def test_download_json(self): uri = encode_data_uri(b'{"foo": "blah"}', 'application/json') self.assertEqual(self.ie._download_json(uri, None), {'foo': 'blah'}) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index aadb30f84..0d4d71868 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -258,16 +258,24 @@ class TestNRKSubtitles(BaseTestSubtitles): class TestRaiPlaySubtitles(BaseTestSubtitles): - url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html' IE = RaiPlayIE - def test_allsubtitles(self): + def test_subtitles_key(self): + self.url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['it'])) self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a') + def test_subtitles_array_key(self): + self.url = 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html' + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['it'])) + self.assertEqual(md5(subtitles['it']), '4b3264186fbb103508abe5311cfcb9cd') + class TestVikiSubtitles(BaseTestSubtitles): url = 'http://www.viki.com/videos/1060846v-punch-episode-18' diff --git a/test/test_utils.py b/test/test_utils.py index a57863825..d052a23de 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -21,6 +21,7 @@ from haruhi_dl.utils import ( encode_base_n, caesar, clean_html, + clean_podcast_url, date_from_str, DateRange, detect_exe_version, @@ -554,6 +555,11 @@ class TestUtil(unittest.TestCase): self.assertEqual(url_or_none('http$://foo.de'), None) self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de') self.assertEqual(url_or_none('//foo.de'), '//foo.de') + self.assertEqual(url_or_none('s3://foo.de'), None) + self.assertEqual(url_or_none('rtmpte://foo.de'), 'rtmpte://foo.de') + self.assertEqual(url_or_none('mms://foo.de'), 'mms://foo.de') + self.assertEqual(url_or_none('rtspu://foo.de'), 'rtspu://foo.de') + self.assertEqual(url_or_none('ftps://foo.de'), 'ftps://foo.de') def test_parse_age_limit(self): self.assertEqual(parse_age_limit(None), None) @@ -937,6 +943,28 @@ class TestUtil(unittest.TestCase): self.assertEqual(d['x'], 1) self.assertEqual(d['y'], 'a') + # Just drop ! prefix for now though this results in a wrong value + on = js_to_json('''{ + a: !0, + b: !1, + c: !!0, + d: !!42.42, + e: !!![], + f: !"abc", + g: !"", + !42: 42 + }''') + self.assertEqual(json.loads(on), { + 'a': 0, + 'b': 1, + 'c': 0, + 'd': 42.42, + 'e': [], + 'f': "abc", + 'g': "", + '42': 42 + }) + on = js_to_json('["abc", "def",]') self.assertEqual(json.loads(on), ['abc', 'def']) @@ -1443,6 +1471,10 @@ Line 1 self.assertEqual(get_elements_by_attribute('class', 'foo', html), []) self.assertEqual(get_elements_by_attribute('class', 'no-such-foo', html), []) + def test_clean_podcast_url(self): + self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3') + self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3') + if __name__ == '__main__': unittest.main()