From 20d6ad3c33efd3c6d5f441c30a0a7ea54d39bc49 Mon Sep 17 00:00:00 2001 From: selfisekai Date: Wed, 4 Nov 2020 23:55:41 +0100 Subject: [PATCH] [onnetwork] added extractor --- haruhi_dl/extractor/extractors.py | 4 + haruhi_dl/extractor/generic.py | 17 +++++ haruhi_dl/extractor/onnetwork.py | 121 ++++++++++++++++++++++++++++++ 3 files changed, 142 insertions(+) create mode 100644 haruhi_dl/extractor/onnetwork.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index ae7079a6a..624a430e8 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -794,6 +794,10 @@ from .onet import ( OnetPlIE, ) from .onionstudios import OnionStudiosIE +from .onnetwork import ( + OnNetworkLoaderIE, + OnNetworkFrameIE, +) from .ooyala import ( OoyalaIE, OoyalaExternalIE, diff --git a/haruhi_dl/extractor/generic.py b/haruhi_dl/extractor/generic.py index a5ab9ee33..e4a34ca7a 100644 --- a/haruhi_dl/extractor/generic.py +++ b/haruhi_dl/extractor/generic.py @@ -119,6 +119,7 @@ from .expressen import ExpressenIE from .zype import ZypeIE from .odnoklassniki import OdnoklassnikiIE from .kinja import KinjaEmbedIE +from .onnetwork import OnNetworkLoaderIE class GenericIE(InfoExtractor): @@ -2151,6 +2152,17 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # OnNetwork.tv embed + 'url': 'https://wiadomosci.gazeta.pl/wiadomosci/7,114883,26377890,panstwo-polskie-nie-uznaje-takich-rodzin-jak-nasza-i-krzywdzi.html', + 'info_dict': { + 'id': '337382', + 'title': 'Rodzina+ odc. 1. Karolina i Ania', + 'ext': 'mp4', + 'age_limit': 16, + 'upload_date': '20200929', + }, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -3213,6 +3225,11 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) + onn_urls = OnNetworkLoaderIE._extract_urls(webpage) + if onn_urls: + return self.playlist_from_matches( + onn_urls, video_id, video_title, ie=OnNetworkLoaderIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: diff --git a/haruhi_dl/extractor/onnetwork.py b/haruhi_dl/extractor/onnetwork.py new file mode 100644 index 000000000..f65386813 --- /dev/null +++ b/haruhi_dl/extractor/onnetwork.py @@ -0,0 +1,121 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, +) + +import re +import datetime + + +class OnNetworkLoaderIE(InfoExtractor): + IE_NAME = 'onnetwork:loader' + _TESTS = [{ + 'url': 'https://video.onnetwork.tv/embed.php?sid=eVgsMWM3UCww&cId=onn-cid-199058', + 'only_matching': True, + }, { + 'url': 'https://video.onnetwork.tv/embed.php?sid=MTI5LDFYaTIsMA==', + 'only_matching': True, + }, { + 'url': 'https://video.onnetwork.tv/embed.php?mid=MCwxNng5LDAsMCwxNzU1LDM3MjksMSwwLDEsMzYsNSwwLDIsMCw0LDEsMCwxLDEsMiwwLDAsMSwwLDAsMCwwLC0xOy0xOzIwOzIwLDAsNTAsMA==&cId=p2f95a6a83ab9a3e55759256bec0be777&widget=524', + 'only_matching': True, + }] + _VALID_URL = r'''https?://video\.onnetwork\.tv/embed\.php\?(?:mid=(?P[^&]+))?(?:&?sid=(?P[^&\s]+))?(?:&?cId=onn-cid-(?P\d+))?(?:.+)?''' + + @staticmethod + def _extract_urls(webpage): + matches = re.finditer( + r''']*src=["'](%s.*?)["']''' % OnNetworkLoaderIE._VALID_URL, + webpage) + if matches: + matches = [match.group(1) for match in matches] + return matches + + def _real_extract(self, url): + url_mobj = re.match(self._VALID_URL, url) + cid, sid, mid = url_mobj.group('cid', 'sid', 'mid') + js_loader = self._download_webpage(url, cid or sid or mid, 'Downloading js player loader') + return { + '_type': 'url', + 'url': self._search_regex(r'frameSrc\s*:\s*"(.+?)"', js_loader, 'frame url'), + 'ie_key': 'OnNetworkFrame', + } + + +class OnNetworkFrameIE(InfoExtractor): + IE_NAME = 'onnetwork:frame' + _VALID_URL = r'https?://video\.onnetwork\.tv/frame84\.php\?(?:[^&]+&)*?mid=(?P[^&]+)&(?:[^&]+&)*?id=(?P[^&]+)' + _TESTS = [{ + 'url': 'https://video.onnetwork.tv/frame84.php?mid=MCwxNng5LDAsMCwxNzU1LDM3MjksMSwwLDEsMzYsNSwwLDIsMCw0LDEsMCwxLDEsMiwwLDAsMSwwLDAsMCwwLC0xOy0xOzIwOzIwLDAsNTAsMA==&preview=0&iid=0&e=1&widget=524&id=ffEXS991c5f8f4dbb502b540687287098d2d8', + 'only_matching': True, + }] + + _BASE_OBJECT_RE = r'''var onplayer\s*=\s*new tUIPlayer\(\s*{\s*videos\s*:\s*\[\s*{.*?''' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + vid = mobj.group('vid') + webpage = self._download_webpage(url, vid, 'Downloading video frame') + + video_id = self._search_regex( + self._BASE_OBJECT_RE + r'id\s*:\s*(\d+)', + webpage, 'video id') + m3u_url = self._search_regex( + self._BASE_OBJECT_RE + r'(?:urls\s*:\[{[^}]+}\],)?url\s*:"([^"]+)"', + webpage, 'm3u url') + title = self._search_regex( + self._BASE_OBJECT_RE + r"(?\d+),RESOLUTION=(?P\d+)x(?P\d+),NAME=(?P[^\n]+)\n(?P[^\n]+)', + m3u_content): + + formats.append({ + 'url': base_m3u_url + match.group('filename'), + 'format_id': match.group('format_name'), + 'width': int(match.group('width')), + 'height': int(match.group('height')), + 'ext': 'mp4', + 'protocol': 'm3u8', + }) + subtitles = {} + for match in re.finditer( + r'#EXT-X-MEDIA:TYPE=SUBTITLES,GROUP-ID="(?P[^"]+)",NAME="(?P[^"]+)",URI="(?P[^"]+)",LANGUAGE="(?P[^"]+)"', + m3u_content): + + subtitles[match.group('lang')] = [] + sub_list = self._download_webpage(base_m3u_url + match.group('filename'), video_id, 'Downloading subtitle list') + for sub_file in re.findall(r'(?<=\n)[^#\s]+', sub_list): + subtitles[match.group('lang')].append({ + 'url': base_m3u_url + sub_file, + }) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': thumbnail, + 'duration': int_or_none(duration), + 'age_limit': int_or_none(age_limit), + 'upload_date': upload_date, + }