From 097c7bd4bae033e3c15f3bc559f5cf91bd5efe9f Mon Sep 17 00:00:00 2001 From: Laura Liberda Date: Mon, 11 Jan 2021 17:58:24 +0100 Subject: [PATCH] linkedin:post extractor --- haruhi_dl/extractor/extractors.py | 1 + haruhi_dl/extractor/linkedin.py | 59 +++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index 08afaa7e5..8f7a700f5 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -586,6 +586,7 @@ from .limelight import ( ) from .line import LineTVIE from .linkedin import ( + LinkedInPostIE, LinkedInLearningIE, LinkedInLearningCourseIE, ) diff --git a/haruhi_dl/extractor/linkedin.py b/haruhi_dl/extractor/linkedin.py index 26fc703d1..3a790c6cd 100644 --- a/haruhi_dl/extractor/linkedin.py +++ b/haruhi_dl/extractor/linkedin.py @@ -8,11 +8,70 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + mimetype2ext, + unescapeHTML, urlencode_postdata, urljoin, ) +class LinkedInPostIE(InfoExtractor): + IE_NAME = 'linkedin:post' + _VALID_URL = r'''(?x) + https?://(?:www\.)?linkedin\.com/ + (?:feed/update/urn:li:activity: + |posts/[^/]+?-) + (?P\d{19}) + ''' + _TESTS = [{ + 'url': 'https://www.linkedin.com/posts/mrpreventive_amazing-drone-footage-activity-6738422973809602561-icet/', + 'info_dict': { + 'id': '6738422973809602561', + 'ext': 'mp4', + 'title': 'Amazing drone footage', + 'description': 'Amazing drone footage', + }, + }, { + 'url': 'https://www.linkedin.com/feed/update/urn:li:activity:6741704259739426816/', + 'info_dict': { + 'id': '6741704259739426816', + 'ext': 'mp4', + 'title': 'O czym jest nasz podcast?', + 'description': 'Witamy Was serdecznie na profilu "Internet. Czas działać!" W tym krótkim zwiastunie mówimy, o czym jest nasz podcast i jakie temat w nim poruszamy....', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + sources = self._parse_json( + self._html_search_regex(r']+\bdata-sources="(\[{.+?\}])"', + webpage, 'video sources'), video_id) + formats = [] + for source in sources: + formats.append({ + 'url': source['src'], + 'ext': mimetype2ext(source['type']), + 'tbr': source.get('data-bitrate'), + }) + self._sort_formats(formats) + + title = self._og_search_title(webpage) + title = self._search_regex(r'^.+? on LinkedIn: (.+)$', title, 'actual video title', default=title) + title = self._search_regex(r'^(.+?) \| \d+ comments?$', title, 'actual video title', default=title) + + # double-escaped like " + description = unescapeHTML(self._og_search_description(webpage)) + description = self._search_regex(r'^(.+?)\.\.\. \d+ comments? on LinkedIn$', description, 'actual post title', default=description) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + } + + class LinkedInLearningBaseIE(InfoExtractor): _NETRC_MACHINE = 'linkedin' _LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning'