[niconico] dmc downloader and other stuff from yt-dlp (as of 40078a5)

unlicense
Lauren Liberda 2021-06-24 01:35:53 +02:00 committed by Dominika
parent 18b5da3114
commit 07b309368f
3 changed files with 432 additions and 182 deletions

haruhi_dl/downloader/__init__.py

@@ -1,5 +1,18 @@
 from __future__ import unicode_literals
 
+from ..utils import (
+    determine_protocol,
+)
+
+
+def _get_real_downloader(info_dict, protocol=None, *args, **kwargs):
+    info_copy = info_dict.copy()
+    if protocol:
+        info_copy['protocol'] = protocol
+    return get_suitable_downloader(info_copy, *args, **kwargs)
+
+
+# Some of these require _get_real_downloader
 from .common import FileDownloader
 from .f4m import F4mFD
 from .hls import HlsFD
@@ -8,16 +21,13 @@ from .rtmp import RtmpFD
 from .dash import DashSegmentsFD
 from .rtsp import RtspFD
 from .ism import IsmFD
+from .niconico import NiconicoDmcFD
 from .external import (
     get_external_downloader,
     Aria2cFD,
     FFmpegFD,
 )
 
-from ..utils import (
-    determine_protocol,
-)
-
 PROTOCOL_MAP = {
     'rtmp': RtmpFD,
     'm3u8_native': HlsFD,
@@ -28,6 +38,7 @@ PROTOCOL_MAP = {
     'http_dash_segments': DashSegmentsFD,
     'ism': IsmFD,
     'bittorrent': Aria2cFD,
+    'niconico_dmc': NiconicoDmcFD,
 }
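Note on the wiring above: PROTOCOL_MAP routes format dicts whose protocol is 'niconico_dmc' to NiconicoDmcFD, while the new `_get_real_downloader` helper lets NiconicoDmcFD delegate the actual transfer to whichever downloader fits the real content URL. A minimal sketch of both lookups (the format-id string is illustrative, the import assumes the fork's haruhi_dl.downloader module path, and get_suitable_downloader behaves as in youtube-dl):

    from haruhi_dl.downloader import PROTOCOL_MAP, _get_real_downloader

    info = {
        'url': 'niconico_dmc:sm22312215/archive_h264_600kbps_360p/archive_aac_64kbps',
        'protocol': 'niconico_dmc',
    }
    fd_cls = PROTOCOL_MAP[info['protocol']]  # -> NiconicoDmcFD
    # NiconicoDmcFD later swaps in the real content URI and picks the backend
    # for the session's protocol on a copy of the info dict:
    real_fd = _get_real_downloader(info, protocol='http')  # -> e.g. HttpFD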

haruhi_dl/downloader/niconico.py

@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import threading
+
+from .common import FileDownloader
+from ..downloader import _get_real_downloader
+from ..extractor.niconico import NiconicoIE
+from ..compat import compat_urllib_request
+
+
+class NiconicoDmcFD(FileDownloader):
+    """ Downloading niconico douga from DMC with heartbeat """
+
+    FD_NAME = 'niconico_dmc'
+
+    def real_download(self, filename, info_dict):
+        self.to_screen('[%s] Downloading from DMC' % self.FD_NAME)
+
+        ie = NiconicoIE(self.hdl)
+        info_dict, heartbeat_info_dict = ie._get_heartbeat_info(info_dict)
+
+        fd = _get_real_downloader(info_dict, params=self.params)(self.hdl, self.params)
+
+        success = download_complete = False
+        timer = [None]
+        heartbeat_lock = threading.Lock()
+        heartbeat_url = heartbeat_info_dict['url']
+        heartbeat_data = heartbeat_info_dict['data'].encode()
+        heartbeat_interval = heartbeat_info_dict.get('interval', 30)
+
+        def heartbeat():
+            try:
+                compat_urllib_request.urlopen(url=heartbeat_url, data=heartbeat_data)
+            except Exception:
+                self.to_screen('[%s] Heartbeat failed' % self.FD_NAME)
+
+            with heartbeat_lock:
+                if not download_complete:
+                    timer[0] = threading.Timer(heartbeat_interval, heartbeat)
+                    timer[0].start()
+
+        heartbeat_info_dict['ping']()
+        self.to_screen('[%s] Heartbeat with %d second interval ...' % (self.FD_NAME, heartbeat_interval))
+
+        try:
+            heartbeat()
+            if type(fd).__name__ == 'HlsFD':
+                info_dict.update(ie._extract_m3u8_formats(info_dict['url'], info_dict['id'])[0])
+            success = fd.real_download(filename, info_dict)
+        finally:
+            if heartbeat_lock:
+                with heartbeat_lock:
+                    timer[0].cancel()
+                    download_complete = True
+        return success
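The pattern worth noting here: `heartbeat()` re-arms itself with a fresh threading.Timer on every beat, always under `heartbeat_lock`, until the `finally` block cancels the pending timer and flips `download_complete`. The interval is supplied by the extractor as heartbeatLifetime divided by 3000, so e.g. a lifetime of 120000 ms yields a 40-second beat. A standalone sketch of the same re-arming-timer idiom (illustrative names, not haruhi-dl code):

    import threading

    def keep_alive(ping, interval):
        # re-arming timer in the style of NiconicoDmcFD.real_download
        state = {'done': False, 'timer': None}
        lock = threading.Lock()

        def beat():
            ping()  # the real code re-sends the DMC session and only logs failures
            with lock:
                if not state['done']:
                    state['timer'] = threading.Timer(interval, beat)
                    state['timer'].start()

        beat()

        def stop():
            with lock:
                state['timer'].cancel()
                state['done'] = True
        return stop

    # usage: stop = keep_alive(lambda: print('ping'), 30.0); ...; stop()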

haruhi_dl/extractor/niconico.py

@@ -1,25 +1,28 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import datetime
-import functools
+import re
 import json
-import math
+import datetime
 
 from .common import InfoExtractor
+from ..postprocessor.ffmpeg import FFmpegPostProcessor
 from ..compat import (
+    compat_str,
     compat_parse_qs,
     compat_urllib_parse_urlparse,
 )
 from ..utils import (
-    determine_ext,
     dict_get,
     ExtractorError,
-    float_or_none,
-    InAdvancePagedList,
     int_or_none,
+    float_or_none,
+    OnDemandPagedList,
     parse_duration,
     parse_iso8601,
+    PostProcessingError,
+    str_or_none,
     remove_start,
     try_get,
     unified_timestamp,
@@ -34,7 +37,7 @@ class NiconicoIE(InfoExtractor):
     _TESTS = [{
         'url': 'http://www.nicovideo.jp/watch/sm22312215',
-        'md5': 'd1a75c0823e2f629128c43e1212760f9',
+        'md5': 'a5bad06f1347452102953f323c69da34s',
         'info_dict': {
             'id': 'sm22312215',
             'ext': 'mp4',
@@ -162,6 +165,11 @@ class NiconicoIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'
     _NETRC_MACHINE = 'niconico'
 
+    _API_HEADERS = {
+        'X-Frontend-ID': '6',
+        'X-Frontend-Version': '0'
+    }
+
     def _real_initialize(self):
         self._login()
@@ -188,40 +196,92 @@ class NiconicoIE(InfoExtractor):
         if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login':
             login_ok = False
         if not login_ok:
-            self._downloader.report_warning('unable to log in: bad username or password')
+            self.report_warning('unable to log in: bad username or password')
         return login_ok
 
-    def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality):
-        def yesno(boolean):
-            return 'yes' if boolean else 'no'
-
-        session_api_data = api_data['video']['dmcInfo']['session_api']
-        session_api_endpoint = session_api_data['urls'][0]
-
-        format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality]))
+    def _get_heartbeat_info(self, info_dict):
+
+        video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/')
+
+        api_data = (
+            info_dict.get('_api_data')
+            or self._parse_json(
+                self._html_search_regex(
+                    'data-api-data="([^"]+)"',
+                    self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id),
+                    'API data', default='{}'),
+                video_id))
+
+        session_api_data = try_get(api_data, lambda x: x['media']['delivery']['movie']['session'])
+        session_api_endpoint = try_get(session_api_data, lambda x: x['urls'][0])
+
+        def ping():
+            status = try_get(
+                self._download_json(
+                    'https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', video_id,
+                    query={'t': try_get(api_data, lambda x: x['media']['delivery']['trackingId'])},
+                    note='Acquiring permission for downloading video',
+                    headers=self._API_HEADERS),
+                lambda x: x['meta']['status'])
+            if status != 200:
+                self.report_warning('Failed to acquire permission for playing video. The video may not download.')
+
+        yesno = lambda x: 'yes' if x else 'no'
+
+        # m3u8 (encryption)
+        if try_get(api_data, lambda x: x['media']['delivery']['encryption']) is not None:
+            protocol = 'm3u8'
+            encryption = self._parse_json(session_api_data['token'], video_id)['hls_encryption']
+            session_api_http_parameters = {
+                'parameters': {
+                    'hls_parameters': {
+                        'encryption': {
+                            encryption: {
+                                'encrypted_key': try_get(api_data, lambda x: x['media']['delivery']['encryption']['encryptedKey']),
+                                'key_uri': try_get(api_data, lambda x: x['media']['delivery']['encryption']['keyUri'])
+                            }
+                        },
+                        'transfer_preset': '',
+                        'use_ssl': yesno(session_api_endpoint['isSsl']),
+                        'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']),
+                        'segment_duration': 6000,
+                    }
+                }
+            }
+        # http
+        else:
+            protocol = 'http'
+            session_api_http_parameters = {
+                'parameters': {
+                    'http_output_download_parameters': {
+                        'use_ssl': yesno(session_api_endpoint['isSsl']),
+                        'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']),
+                    }
+                }
+            }
 
         session_response = self._download_json(
             session_api_endpoint['url'], video_id,
             query={'_format': 'json'},
             headers={'Content-Type': 'application/json'},
-            note='Downloading JSON metadata for %s' % format_id,
+            note='Downloading JSON metadata for %s' % info_dict['format_id'],
             data=json.dumps({
                 'session': {
                     'client_info': {
-                        'player_id': session_api_data['player_id'],
+                        'player_id': session_api_data.get('playerId'),
                     },
                     'content_auth': {
-                        'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]],
-                        'content_key_timeout': session_api_data['content_key_timeout'],
+                        'auth_type': try_get(session_api_data, lambda x: x['authTypes'][session_api_data['protocols'][0]]),
+                        'content_key_timeout': session_api_data.get('contentKeyTimeout'),
                         'service_id': 'nicovideo',
-                        'service_user_id': session_api_data['service_user_id']
+                        'service_user_id': session_api_data.get('serviceUserId')
                     },
-                    'content_id': session_api_data['content_id'],
+                    'content_id': session_api_data.get('contentId'),
                     'content_src_id_sets': [{
                         'content_src_ids': [{
                             'src_id_to_mux': {
-                                'audio_src_ids': [audio_quality['id']],
-                                'video_src_ids': [video_quality['id']],
+                                'audio_src_ids': [audio_src_id],
+                                'video_src_ids': [video_src_id],
                             }
                         }]
                     }],
@@ -229,52 +289,81 @@
                     'content_uri': '',
                     'keep_method': {
                         'heartbeat': {
-                            'lifetime': session_api_data['heartbeat_lifetime']
+                            'lifetime': session_api_data.get('heartbeatLifetime')
                         }
                     },
-                    'priority': session_api_data['priority'],
+                    'priority': session_api_data.get('priority'),
                     'protocol': {
                         'name': 'http',
                         'parameters': {
-                            'http_parameters': {
-                                'parameters': {
-                                    'http_output_download_parameters': {
-                                        'use_ssl': yesno(session_api_endpoint['is_ssl']),
-                                        'use_well_known_port': yesno(session_api_endpoint['is_well_known_port']),
-                                    }
-                                }
-                            }
+                            'http_parameters': session_api_http_parameters
                         }
                     },
-                    'recipe_id': session_api_data['recipe_id'],
+                    'recipe_id': session_api_data.get('recipeId'),
                     'session_operation_auth': {
                         'session_operation_auth_by_signature': {
-                            'signature': session_api_data['signature'],
-                            'token': session_api_data['token'],
+                            'signature': session_api_data.get('signature'),
+                            'token': session_api_data.get('token'),
                         }
                     },
                     'timing_constraint': 'unlimited'
                 }
             }).encode())
 
-        resolution = video_quality.get('resolution', {})
+        info_dict['url'] = session_response['data']['session']['content_uri']
+        info_dict['protocol'] = protocol
+
+        # get heartbeat info
+        heartbeat_info_dict = {
+            'url': session_api_endpoint['url'] + '/' + session_response['data']['session']['id'] + '?_format=json&_method=PUT',
+            'data': json.dumps(session_response['data']),
+            # interval: heartbeatLifetime is in milliseconds; a third of it, in seconds, leaves a buffer
+            'interval': float_or_none(session_api_data.get('heartbeatLifetime'), scale=3000),
+            'ping': ping
+        }
+
+        return info_dict, heartbeat_info_dict
+
+    def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality):
+        def parse_format_id(id_code):
+            mobj = re.match(r'''(?x)
+                    (?:archive_)?
+                    (?:(?P<codec>[^_]+)_)?
+                    (?:(?P<br>[\d]+)kbps_)?
+                    (?:(?P<res>[\d+]+)p_)?
+                ''', '%s_' % id_code)
+            return mobj.groupdict() if mobj else {}
+
+        protocol = 'niconico_dmc'
+        format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality]))
+        vdict = parse_format_id(video_quality['id'])
+        adict = parse_format_id(audio_quality['id'])
+        resolution = try_get(video_quality, lambda x: x['metadata']['resolution'], dict) or {'height': vdict.get('res')}
+        vbr = try_get(video_quality, lambda x: x['metadata']['bitrate'], float)
+
         return {
-            'url': session_response['data']['session']['content_uri'],
+            'url': '%s:%s/%s/%s' % (protocol, video_id, video_quality['id'], audio_quality['id']),
             'format_id': format_id,
+            'format_note': 'DMC %s' % try_get(video_quality, lambda x: x['metadata']['label'], compat_str),
             'ext': 'mp4',  # Session API are used in HTML5, which always serves mp4
-            'abr': float_or_none(audio_quality.get('bitrate'), 1000),
-            'vbr': float_or_none(video_quality.get('bitrate'), 1000),
-            'height': resolution.get('height'),
-            'width': resolution.get('width'),
+            'vcodec': vdict.get('codec'),
+            'acodec': adict.get('codec'),
+            'vbr': float_or_none(vbr, 1000) or float_or_none(vdict.get('br')),
+            'abr': float_or_none(audio_quality.get('bitrate'), 1000) or float_or_none(adict.get('br')),
+            'height': int_or_none(resolution.get('height', vdict.get('res'))),
+            'width': int_or_none(resolution.get('width')),
+            'quality': -2 if 'low' in format_id else -1,  # Default quality value is -1
+            'protocol': protocol,
+            'http_headers': {
+                'Origin': 'https://www.nicovideo.jp',
+                'Referer': 'https://www.nicovideo.jp/watch/' + video_id,
+            }
         }
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        # Get video webpage. We are not actually interested in it for normal
-        # cases, but need the cookies in order to be able to download the
-        # info webpage
+        # Get video webpage for API data.
         webpage, handle = self._download_webpage_handle(
             'http://www.nicovideo.jp/watch/' + video_id, video_id)
         if video_id.startswith('so'):
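Handoff between the two new methods above, for the record: `_extract_format_for_quality` no longer opens the DMC session itself; it packs the source ids into a `niconico_dmc:` pseudo-URL (and the extractor stashes `_api_data` in the info dict), and `_get_heartbeat_info` splits that URL back apart at download time. A quick round trip with illustrative id strings, plus what `parse_format_id` recovers from them:

    import re

    video_id, video_src, audio_src = (
        'sm22312215', 'archive_h264_600kbps_360p', 'archive_aac_64kbps')
    url = '%s:%s/%s/%s' % ('niconico_dmc', video_id, video_src, audio_src)
    # downloader side, as in _get_heartbeat_info:
    assert url.split(':')[1].split('/') == [video_id, video_src, audio_src]

    def parse_format_id(id_code):  # same regex as the helper in this patch
        mobj = re.match(r'''(?x)
                (?:archive_)?
                (?:(?P<codec>[^_]+)_)?
                (?:(?P<br>[\d]+)kbps_)?
                (?:(?P<res>[\d+]+)p_)?
            ''', '%s_' % id_code)
        return mobj.groupdict() if mobj else {}

    print(parse_format_id(video_src))
    # {'codec': 'h264', 'br': '600', 'res': '360'}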
@@ -284,86 +373,136 @@ class NiconicoIE(InfoExtractor):
             'data-api-data="([^"]+)"', webpage,
             'API data', default='{}'), video_id)
 
-        def _format_id_from_url(video_url):
-            return 'economy' if video_real_url.endswith('low') else 'normal'
+        def get_video_info_web(items):
+            return dict_get(api_data['video'], items)
 
-        try:
-            video_real_url = api_data['video']['smileInfo']['url']
-        except KeyError:  # Flash videos
-            # Get flv info
-            flv_info_webpage = self._download_webpage(
-                'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
-                video_id, 'Downloading flv info')
+        # Get video info
+        video_info_xml = self._download_xml(
+            'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id,
+            video_id, note='Downloading video info page')
 
-            flv_info = compat_parse_qs(flv_info_webpage)
-            if 'url' not in flv_info:
-                if 'deleted' in flv_info:
-                    raise ExtractorError('The video has been deleted.',
-                                         expected=True)
-                elif 'closed' in flv_info:
-                    raise ExtractorError('Niconico videos now require logging in',
-                                         expected=True)
-                elif 'error' in flv_info:
-                    raise ExtractorError('%s reports error: %s' % (
-                        self.IE_NAME, flv_info['error'][0]), expected=True)
-                else:
-                    raise ExtractorError('Unable to find video URL')
+        def get_video_info_xml(items):
+            if not isinstance(items, list):
+                items = [items]
+            for item in items:
+                ret = xpath_text(video_info_xml, './/' + item)
+                if ret:
+                    return ret
 
-            video_info_xml = self._download_xml(
-                'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id,
-                video_id, note='Downloading video info page')
+        if get_video_info_xml('error'):
+            error_code = get_video_info_xml('code')
 
-            def get_video_info(items):
-                if not isinstance(items, list):
-                    items = [items]
-                for item in items:
-                    ret = xpath_text(video_info_xml, './/' + item)
-                    if ret:
-                        return ret
+            if error_code == 'DELETED':
+                raise ExtractorError('The video has been deleted.',
+                                     expected=True)
+            elif error_code == 'NOT_FOUND':
+                raise ExtractorError('The video is not found.',
+                                     expected=True)
+            elif error_code == 'COMMUNITY':
+                self.to_screen('%s: The video is community members only.' % video_id)
+            else:
+                raise ExtractorError('%s reports error: %s' % (self.IE_NAME, error_code))
 
-            video_real_url = flv_info['url'][0]
+        # Start extracting video formats
+        formats = []
 
-            extension = get_video_info('movie_type')
-            if not extension:
-                extension = determine_ext(video_real_url)
+        # Get HTML5 videos info
+        quality_info = try_get(api_data, lambda x: x['media']['delivery']['movie'])
+        if not quality_info:
+            raise ExtractorError('The video can\'t be downloaded', expected=True)
 
-            formats = [{
-                'url': video_real_url,
-                'ext': extension,
-                'format_id': _format_id_from_url(video_real_url),
-            }]
-        else:
-            formats = []
+        for audio_quality in quality_info.get('audios') or {}:
+            for video_quality in quality_info.get('videos') or {}:
+                if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'):
+                    continue
+                formats.append(self._extract_format_for_quality(
+                    api_data, video_id, audio_quality, video_quality))
 
-            dmc_info = api_data['video'].get('dmcInfo')
-            if dmc_info:  # "New" HTML5 videos
-                quality_info = dmc_info['quality']
-                for audio_quality in quality_info['audios']:
-                    for video_quality in quality_info['videos']:
-                        if not audio_quality['available'] or not video_quality['available']:
-                            continue
-                        formats.append(self._extract_format_for_quality(
-                            api_data, video_id, audio_quality, video_quality))
+        # Get flv/swf info
+        timestamp = None
+        video_real_url = try_get(api_data, lambda x: x['video']['smileInfo']['url'])
+        if video_real_url:
+            is_economy = video_real_url.endswith('low')
 
-                self._sort_formats(formats)
-            else:  # "Old" HTML5 videos
-                formats = [{
-                    'url': video_real_url,
-                    'ext': 'mp4',
-                    'format_id': _format_id_from_url(video_real_url),
-                }]
+            if is_economy:
+                self.report_warning('Site is currently in economy mode! You will only have access to lower quality streams')
+
+            # Invoking ffprobe to determine resolution
+            pp = FFmpegPostProcessor(self._downloader)
+            cookies = self._get_cookies('https://nicovideo.jp').output(header='', sep='; path=/; domain=nicovideo.jp;\n')
+
+            self.to_screen('%s: %s' % (video_id, 'Checking smile format with ffprobe'))
+
+            try:
+                metadata = pp.get_metadata_object(video_real_url, ['-cookies', cookies])
+            except PostProcessingError as err:
+                raise ExtractorError(err.msg, expected=True)
+
+            v_stream = a_stream = {}
+
+            # Some complex swf files doesn't have video stream (e.g. nm4809023)
+            for stream in metadata['streams']:
+                if stream['codec_type'] == 'video':
+                    v_stream = stream
+                elif stream['codec_type'] == 'audio':
+                    a_stream = stream
+
+            # Community restricted videos seem to have issues with the thumb API not returning anything at all
+            filesize = int(
+                (get_video_info_xml('size_high') if not is_economy else get_video_info_xml('size_low'))
+                or metadata['format']['size']
+            )
+            extension = (
+                get_video_info_xml('movie_type')
+                or 'mp4' if 'mp4' in metadata['format']['format_name'] else metadata['format']['format_name']
+            )
+
+            # 'creation_time' tag on video stream of re-encoded SMILEVIDEO mp4 files are '1970-01-01T00:00:00.000000Z'.
+            timestamp = (
+                parse_iso8601(get_video_info_web('first_retrieve'))
+                or unified_timestamp(get_video_info_web('postedDateTime'))
+            )
+            metadata_timestamp = (
+                parse_iso8601(try_get(v_stream, lambda x: x['tags']['creation_time']))
+                or timestamp if extension != 'mp4' else 0
+            )
+
+            # According to compconf, smile videos from pre-2017 are always better quality than their DMC counterparts
+            smile_threshold_timestamp = parse_iso8601('2016-12-08T00:00:00+09:00')
+
+            is_source = timestamp < smile_threshold_timestamp or metadata_timestamp > 0
+
+            # If movie file size is unstable, old server movie is not source movie.
+            if filesize > 1:
+                formats.append({
+                    'url': video_real_url,
+                    'format_id': 'smile' if not is_economy else 'smile_low',
+                    'format_note': 'SMILEVIDEO source' if not is_economy else 'SMILEVIDEO low quality',
+                    'ext': extension,
+                    'container': extension,
+                    'vcodec': v_stream.get('codec_name'),
+                    'acodec': a_stream.get('codec_name'),
+                    # Some complex swf files doesn't have total bit rate metadata (e.g. nm6049209)
+                    'tbr': int_or_none(metadata['format'].get('bit_rate'), scale=1000),
+                    'vbr': int_or_none(v_stream.get('bit_rate'), scale=1000),
+                    'abr': int_or_none(a_stream.get('bit_rate'), scale=1000),
+                    'height': int_or_none(v_stream.get('height')),
+                    'width': int_or_none(v_stream.get('width')),
+                    'source_preference': 5 if not is_economy else -2,
+                    'quality': 5 if is_source and not is_economy else None,
+                    'filesize': filesize
+                })
 
-        def get_video_info(items):
-            return dict_get(api_data['video'], items)
+        self._sort_formats(formats)
 
         # Start extracting information
-        title = get_video_info('title')
-        if not title:
-            title = self._og_search_title(webpage, default=None)
-        if not title:
-            title = self._html_search_regex(
-                r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>',
-                webpage, 'video title')
+        title = (
+            get_video_info_xml('title')  # prefer to get the untranslated original title
+            or get_video_info_web(['originalTitle', 'title'])
+            or self._og_search_title(webpage, default=None)
+            or self._html_search_regex(
+                r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>',
+                webpage, 'video title'))
 
         watch_api_data_string = self._html_search_regex(
             r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>',
@@ -372,14 +511,15 @@ class NiconicoIE(InfoExtractor):
         video_detail = watch_api_data.get('videoDetail', {})
 
         thumbnail = (
-            get_video_info(['thumbnail_url', 'thumbnailURL'])
+            self._html_search_regex(r'<meta property="og:image" content="([^"]+)">', webpage, 'thumbnail data', default=None)
+            or dict_get(  # choose highest from 720p to 240p
+                get_video_info_web('thumbnail'),
+                ['ogp', 'player', 'largeUrl', 'middleUrl', 'url'])
             or self._html_search_meta('image', webpage, 'thumbnail', default=None)
             or video_detail.get('thumbnail'))
 
-        description = get_video_info('description')
+        description = get_video_info_web('description')
 
-        timestamp = (parse_iso8601(get_video_info('first_retrieve'))
-                     or unified_timestamp(get_video_info('postedDateTime')))
         if not timestamp:
             match = self._html_search_meta('datePublished', webpage, 'date published', default=None)
             if match:
@@ -388,19 +528,25 @@ class NiconicoIE(InfoExtractor):
             timestamp = parse_iso8601(
                 video_detail['postedAt'].replace('/', '-'),
                 delimiter=' ', timezone=datetime.timedelta(hours=9))
+        timestamp = timestamp or try_get(api_data, lambda x: parse_iso8601(x['video']['registeredAt']))
 
-        view_count = int_or_none(get_video_info(['view_counter', 'viewCount']))
+        view_count = int_or_none(get_video_info_web(['view_counter', 'viewCount']))
         if not view_count:
             match = self._html_search_regex(
                 r'>Views: <strong[^>]*>([^<]+)</strong>',
                 webpage, 'view count', default=None)
             if match:
                 view_count = int_or_none(match.replace(',', ''))
-        view_count = view_count or video_detail.get('viewCount')
+        view_count = (
+            view_count
+            or video_detail.get('viewCount')
+            or try_get(api_data, lambda x: x['video']['count']['view']))
 
-        comment_count = (int_or_none(get_video_info('comment_num'))
-                         or video_detail.get('commentCount')
-                         or try_get(api_data, lambda x: x['thread']['commentCount']))
+        comment_count = (
+            int_or_none(get_video_info_web('comment_num'))
+            or video_detail.get('commentCount')
+            or try_get(api_data, lambda x: x['video']['count']['comment']))
         if not comment_count:
             match = self._html_search_regex(
                 r'>Comments: <strong[^>]*>([^<]+)</strong>',
@@ -409,22 +555,41 @@
                 comment_count = int_or_none(match.replace(',', ''))
 
         duration = (parse_duration(
-            get_video_info('length')
+            get_video_info_web('length')
             or self._html_search_meta(
                 'video:duration', webpage, 'video duration', default=None))
             or video_detail.get('length')
-            or get_video_info('duration'))
+            or get_video_info_web('duration'))
 
-        webpage_url = get_video_info('watch_url') or url
+        webpage_url = get_video_info_web('watch_url') or url
+
+        # for channel movie and community movie
+        channel_id = try_get(
+            api_data,
+            (lambda x: x['channel']['globalId'],
+             lambda x: x['community']['globalId']))
+        channel = try_get(
+            api_data,
+            (lambda x: x['channel']['name'],
+             lambda x: x['community']['name']))
 
         # Note: cannot use api_data.get('owner', {}) because owner may be set to "null"
         # in the JSON, which will cause None to be returned instead of {}.
         owner = try_get(api_data, lambda x: x.get('owner'), dict) or {}
-        uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id')
-        uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname')
+        uploader_id = str_or_none(
+            get_video_info_web(['ch_id', 'user_id'])
+            or owner.get('id')
+            or channel_id
+        )
+        uploader = (
+            get_video_info_web(['ch_name', 'user_nickname'])
+            or owner.get('nickname')
+            or channel
+        )
 
         return {
             'id': video_id,
+            '_api_data': api_data,
             'title': title,
             'formats': formats,
             'thumbnail': thumbnail,
@@ -432,6 +597,8 @@
             'uploader': uploader,
             'timestamp': timestamp,
             'uploader_id': uploader_id,
+            'channel': channel,
+            'channel_id': channel_id,
             'view_count': view_count,
             'comment_count': comment_count,
             'duration': duration,
@@ -440,7 +607,7 @@
 
 class NiconicoPlaylistIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/)?mylist/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/|my/)?mylist/(?P<id>\d+)'
 
     _TESTS = [{
         'url': 'http://www.nicovideo.jp/mylist/27411728',
@@ -456,60 +623,77 @@ class NiconicoPlaylistIE(InfoExtractor):
         'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728',
         'only_matching': True,
     }]
-    _PAGE_SIZE = 100
 
-    def _call_api(self, list_id, resource, query):
-        return self._download_json(
-            'https://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id,
-            'Downloading %s JSON metatdata' % resource, query=query,
-            headers={'X-Frontend-Id': 6})['data']['mylist']
-
-    def _parse_owner(self, item):
-        owner = item.get('owner') or {}
-        if owner:
-            return {
-                'uploader': owner.get('name'),
-                'uploader_id': owner.get('id'),
-            }
-        return {}
-
-    def _fetch_page(self, list_id, page):
-        page += 1
-        items = self._call_api(list_id, 'page %d' % page, {
-            'page': page,
-            'pageSize': self._PAGE_SIZE,
-        })['items']
-        for item in items:
-            video = item.get('video') or {}
-            video_id = video.get('id')
-            if not video_id:
-                continue
-            count = video.get('count') or {}
-            get_count = lambda x: int_or_none(count.get(x))
-            info = {
-                '_type': 'url',
-                'id': video_id,
-                'title': video.get('title'),
-                'url': 'https://www.nicovideo.jp/watch/' + video_id,
-                'description': video.get('shortDescription'),
-                'duration': int_or_none(video.get('duration')),
-                'view_count': get_count('view'),
-                'comment_count': get_count('comment'),
-                'ie_key': NiconicoIE.ie_key(),
-            }
-            info.update(self._parse_owner(video))
-            yield info
+    _API_HEADERS = {
+        'X-Frontend-ID': '6',
+        'X-Frontend-Version': '0'
+    }
 
     def _real_extract(self, url):
         list_id = self._match_id(url)
-        mylist = self._call_api(list_id, 'list', {
-            'pageSize': 1,
-        })
-        entries = InAdvancePagedList(
-            functools.partial(self._fetch_page, list_id),
-            math.ceil(mylist['totalItemCount'] / self._PAGE_SIZE),
-            self._PAGE_SIZE)
-        result = self.playlist_result(
-            entries, list_id, mylist.get('name'), mylist.get('description'))
-        result.update(self._parse_owner(mylist))
-        return result
+
+        def get_page_data(pagenum, pagesize):
+            return self._download_json(
+                'http://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id,
+                query={'page': 1 + pagenum, 'pageSize': pagesize},
+                headers=self._API_HEADERS).get('data').get('mylist')
+
+        data = get_page_data(0, 1)
+        title = data.get('name')
+        description = data.get('description')
+        uploader = data.get('owner').get('name')
+        uploader_id = data.get('owner').get('id')
+
+        def pagefunc(pagenum):
+            data = get_page_data(pagenum, 25)
+            return ({
+                '_type': 'url',
+                'url': 'http://www.nicovideo.jp/watch/' + item.get('watchId'),
+            } for item in data.get('items'))
+
+        return {
+            '_type': 'playlist',
+            'id': list_id,
+            'title': title,
+            'description': description,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'entries': OnDemandPagedList(pagefunc, 25),
+        }
+
+
+class NiconicoUserIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P<id>\d+)/?(?:$|[#?])'
+    _TEST = {
+        'url': 'https://www.nicovideo.jp/user/419948',
+        'info_dict': {
+            'id': '419948',
+        },
+        'playlist_mincount': 101,
+    }
+
+    _API_URL = "https://nvapi.nicovideo.jp/v1/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s"
+    _PAGE_SIZE = 100
+
+    _API_HEADERS = {
+        'X-Frontend-ID': '6',
+        'X-Frontend-Version': '0'
+    }
+
+    def _entries(self, list_id):
+        total_count = 1
+        count = page_num = 0
+        while count < total_count:
+            json_parsed = self._download_json(
+                self._API_URL % (list_id, self._PAGE_SIZE, page_num + 1), list_id,
+                headers=self._API_HEADERS,
+                note='Downloading JSON metadata%s' % (' page %d' % page_num if page_num else ''))
+            if not page_num:
+                total_count = int_or_none(json_parsed['data'].get('totalCount'))
+            for entry in json_parsed["data"]["items"]:
+                count += 1
+                yield self.url_result('https://www.nicovideo.jp/watch/%s' % entry['id'])
+            page_num += 1
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+        return self.playlist_result(self._entries(list_id), list_id, ie=NiconicoIE.ie_key())
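One more behavioural note: the rewritten NiconicoPlaylistIE returns an OnDemandPagedList, which only invokes `pagefunc` as entries are actually consumed, so narrow playlist selections fetch only the pages they need. A minimal sketch of that contract (assuming the fork keeps youtube-dl's utils layout; the fake pagefunc stands in for the nvapi call):

    from haruhi_dl.utils import OnDemandPagedList

    def pagefunc(pagenum):
        start = pagenum * 3  # fake three-item pages for a five-item list
        return ({'id': n} for n in range(start, start + 3) if n < 5)

    pages = OnDemandPagedList(pagefunc, 3)
    print(pages.getslice(0, 5))  # only the pages covering items 0-4 are generated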