2018-10-29 19:28:09 +01:00
# coding: utf-8
from __future__ import unicode_literals
import re
from . common import InfoExtractor
from . . utils import (
ExtractorError ,
float_or_none ,
int_or_none ,
2021-01-11 17:58:24 +01:00
mimetype2ext ,
unescapeHTML ,
2018-10-29 19:28:09 +01:00
urlencode_postdata ,
2019-04-11 09:44:58 +02:00
urljoin ,
2018-10-29 19:28:09 +01:00
)
2021-01-11 17:58:24 +01:00
class LinkedInPostIE(InfoExtractor):
    """Extractor for videos embedded in LinkedIn posts.

    Accepts both ``/feed/update/urn:li:activity:<id>`` and
    ``/posts/<slug>-<id>`` URLs, where ``<id>`` is a 19-digit number.
    """

    IE_NAME = 'linkedin:post'
    _VALID_URL = r'''(?x)
                    https?://(?:www\.)?linkedin\.com/
                        (?:feed/update/urn:li:activity:
                          |posts/[^/]+?-)
                        (?P<id>\d{19})
                    '''
    _TESTS = [{
        'url': 'https://www.linkedin.com/posts/mrpreventive_amazing-drone-footage-activity-6738422973809602561-icet/',
        'info_dict': {
            'id': '6738422973809602561',
            'ext': 'mp4',
            'title': 'Amazing drone footage',
            'description': 'Amazing drone footage',
        },
    }, {
        'url': 'https://www.linkedin.com/feed/update/urn:li:activity:6741704259739426816/',
        'info_dict': {
            'id': '6741704259739426816',
            'ext': 'mp4',
            'title': 'O czym jest nasz podcast?',
            'description': 'Witamy Was serdecznie na profilu "Internet. Czas działać!" W tym krótkim zwiastunie mówimy, o czym jest nasz podcast i jakie temat w nim poruszamy....',
        },
    }]

    def _real_extract(self, url):
        """Return the info dict (id, title, description, formats) for a post URL."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The <video> tag carries its renditions as a JSON array in the
        # data-sources attribute; _html_search_regex unescapes the entities.
        sources = self._parse_json(
            self._html_search_regex(
                r'<video\b[^>]+\bdata-sources="(\[\{.+?\}\])"',
                webpage, 'video sources'),
            video_id)

        formats = []
        for source in sources:
            source_url = source.get('src')
            if not source_url:
                # A rendition without a URL is unusable; skip it instead of
                # aborting the whole extraction with a KeyError.
                continue
            mime_type = source.get('type')
            formats.append({
                'url': source_url,
                'ext': mimetype2ext(mime_type) if mime_type else None,
                # Coerce to a number so format sorting never compares str
                # with int.  NOTE(review): data-bitrate looks like bit/s,
                # hence the /1000 to get kbit/s -- confirm against a live page.
                'tbr': float_or_none(source.get('data-bitrate'), 1000),
            })
        self._sort_formats(formats)

        # og:title looks like "<author> on LinkedIn: <title> | N comments";
        # peel off the prefix and suffix when present, else keep it as is.
        title = self._og_search_title(webpage)
        title = self._search_regex(
            r'^.+? on LinkedIn: (.+)$', title,
            'actual video title', default=title)
        title = self._search_regex(
            r'^(.+?)\s*\|\s*\d+ comments?$', title,
            'actual video title', default=title)

        # double-escaped like &amp;quot;
        description = unescapeHTML(self._og_search_description(webpage))
        description = self._search_regex(
            r'^(.+?)\.\.\.\s*\d+ comments? on LinkedIn$', description,
            'actual post title', default=description)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
        }
2018-10-29 19:28:09 +01:00
class LinkedInLearningBaseIE(InfoExtractor):
    """Shared login and API helpers for the LinkedIn Learning extractors."""

    _NETRC_MACHINE = 'linkedin'
    _LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning'

    def _call_api(self, course_slug, fields, video_slug=None, resolution=None):
        """Query the detailedCourses endpoint and return its first element.

        course_slug -- slug of the course to look up
        fields      -- comma-separated list of fields to request
        video_slug  -- optional slug of a single video inside the course
        resolution  -- video height (int); only used together with video_slug

        Raises ExtractorError when no JSESSIONID cookie is available, since
        the request needs it as the CSRF token.
        """
        query = {
            'courseSlug': course_slug,
            'fields': fields,
            'q': 'slugs',
        }
        sub = ''
        if video_slug:
            query.update({
                'videoSlug': video_slug,
                'resolution': '_%s' % resolution,
            })
            sub = ' %dp' % resolution
        api_url = 'https://www.linkedin.com/learning-api/detailedCourses'

        # The session cookie doubles as the CSRF token.  Without a session
        # the lookup used to die with a bare KeyError; report it properly.
        session_cookie = self._get_cookies(api_url).get('JSESSIONID')
        if not session_cookie:
            raise ExtractorError(
                'No JSESSIONID cookie found; LinkedIn Learning requires '
                'authentication (provide account credentials or cookies)',
                expected=True)

        return self._download_json(
            api_url, video_slug or course_slug,
            'Downloading%s JSON metadata' % sub,
            headers={
                'Csrf-Token': session_cookie.value,
            }, query=query)['elements'][0]

    def _get_urn_id(self, video_data):
        """Return the trailing numeric id of a lyndaCourse URN, or None."""
        urn = video_data.get('urn')
        if not urn:
            return None
        mobj = re.search(r'urn:li:lyndaCourse:\d+,(\d+)', urn)
        return mobj.group(1) if mobj else None

    def _get_video_id(self, video_data, course_slug, video_slug):
        """Return the URN id when available, else '<course_slug>/<video_slug>'."""
        return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug)

    def _real_initialize(self):
        """Log in with the configured credentials, if any were supplied.

        Raises ExtractorError with the message shown by the site when the
        login response contains an error span.
        """
        email, password = self._get_login_info()
        if email is None:
            return

        login_page = self._download_webpage(
            self._LOGIN_URL, None, 'Downloading login page')
        # Use the form's own action when present, else the known submit URL.
        action_url = urljoin(self._LOGIN_URL, self._search_regex(
            r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url',
            default='https://www.linkedin.com/uas/login-submit', group='url'))

        # Keep the hidden inputs from the form and add the credentials.
        data = self._hidden_inputs(login_page)
        data.update({
            'session_key': email,
            'session_password': password,
        })
        login_submit_page = self._download_webpage(
            action_url, None, 'Logging in',
            data=urlencode_postdata(data))

        error = self._search_regex(
            r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>',
            login_submit_page, 'error', default=None)
        if error:
            raise ExtractorError(error, expected=True)
class LinkedInLearningIE(LinkedInLearningBaseIE):
    """Extractor for a single video of a LinkedIn Learning course."""

    IE_NAME = 'linkedin:learning'
    _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P<course_slug>[^/]+)/(?P<id>[^/?#]+)'
    _TEST = {
        'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals/welcome?autoplay=true',
        'md5': 'a1d74422ff0d5e66a792deb996693167',
        'info_dict': {
            'id': '90426',
            'ext': 'mp4',
            'title': 'Welcome',
            'timestamp': 1430396150.82,
            'upload_date': '20150430',
        },
    }

    def _real_extract(self, url):
        """Return the info dict for the video addressed by ``url``.

        The API is queried once per known resolution; each answer may add a
        progressive format.  Metadata, the audio-only format and the HLS
        formats are taken from the last answer.
        """
        course_slug, video_slug = re.match(self._VALID_URL, url).groups()

        video_data = None
        video_url_data = {}
        formats = []
        for width, height in ((640, 360), (960, 540), (1280, 720)):
            video_data = self._call_api(
                course_slug, 'selectedVideo', video_slug, height)['selectedVideo']
            video_url_data = video_data.get('url') or {}
            progressive_url = video_url_data.get('progressiveUrl')
            if progressive_url:
                formats.append({
                    'format_id': 'progressive-%dp' % height,
                    'url': progressive_url,
                    'height': height,
                    'width': width,
                    'source_preference': 1,
                })

        # Read once after the loop; previously reassigned on every pass.
        title = video_data['title']

        # 'audio' may be present but null, so do not rely on the .get default.
        audio_url = (video_data.get('audio') or {}).get('progressiveUrl')
        if audio_url:
            formats.append({
                'abr': 64,
                'ext': 'm4a',
                'format_id': 'audio',
                'url': audio_url,
                'vcodec': 'none',
            })

        streaming_url = video_url_data.get('streamingUrl')
        if streaming_url:
            formats.extend(self._extract_m3u8_formats(
                streaming_url, video_slug, 'mp4',
                'm3u8_native', m3u8_id='hls', fatal=False))

        self._sort_formats(
            formats, ('width', 'height', 'source_preference', 'tbr', 'abr'))

        return {
            'id': self._get_video_id(video_data, course_slug, video_slug),
            'title': title,
            'formats': formats,
            'thumbnail': video_data.get('defaultThumbnail'),
            # publishedOn is divided by 1000 before being used as timestamp.
            'timestamp': float_or_none(video_data.get('publishedOn'), 1000),
            'duration': int_or_none(video_data.get('durationInSeconds')),
        }
class LinkedInLearningCourseIE(LinkedInLearningBaseIE):
    """Extractor that turns a LinkedIn Learning course into a playlist."""

    IE_NAME = 'linkedin:learning:course'
    _VALID_URL = r'https?://(?:www\.)?linkedin\.com/learning/(?P<id>[^/?#]+)'
    _TEST = {
        'url': 'https://www.linkedin.com/learning/programming-foundations-fundamentals',
        'info_dict': {
            'id': 'programming-foundations-fundamentals',
            'title': 'Programming Foundations: Fundamentals',
            'description': 'md5:76e580b017694eb89dc8e8923fff5c86',
        },
        'playlist_mincount': 61,
    }

    @classmethod
    def suitable(cls, url):
        """Reject URLs the single-video extractor accepts.

        The course pattern is a prefix of the video pattern, so without this
        check video URLs would match here as well.
        """
        if LinkedInLearningIE.suitable(url):
            return False
        return super(LinkedInLearningCourseIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist with one url_transparent entry per course video."""
        course_slug = self._match_id(url)
        course_data = self._call_api(course_slug, 'chapters,description,title')

        entries = []
        # 'chapters' / 'videos' may be present but null; fall back to an
        # empty list instead of failing while iterating None.
        chapters = course_data.get('chapters') or []
        for chapter_number, chapter in enumerate(chapters, 1):
            chapter_title = chapter.get('title')
            chapter_id = self._get_urn_id(chapter)
            for video in chapter.get('videos') or []:
                video_slug = video.get('slug')
                if not video_slug:
                    # Without a slug no video URL can be built.
                    continue
                entries.append({
                    '_type': 'url_transparent',
                    'id': self._get_video_id(video, course_slug, video_slug),
                    'title': video.get('title'),
                    'url': 'https://www.linkedin.com/learning/%s/%s' % (course_slug, video_slug),
                    'chapter': chapter_title,
                    'chapter_number': chapter_number,
                    'chapter_id': chapter_id,
                    'ie_key': LinkedInLearningIE.ie_key(),
                })

        return self.playlist_result(
            entries, course_slug,
            course_data.get('title'),
            course_data.get('description'))