[joj] Rewrite and add support for generic embeds (closes #13268)

This commit is contained in:
Sergey M․ 2017-07-09 19:05:18 +07:00
parent 256a746d21
commit 73cf76a93f
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D
2 changed files with 93 additions and 32 deletions

View file

@ -91,6 +91,7 @@ from .anvato import AnvatoIE
from .washingtonpost import WashingtonPostIE from .washingtonpost import WashingtonPostIE
from .wistia import WistiaIE from .wistia import WistiaIE
from .mediaset import MediasetIE from .mediaset import MediasetIE
from .joj import JojIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -1770,6 +1771,16 @@ class GenericIE(InfoExtractor):
}, },
'add_ie': [MediasetIE.ie_key()], 'add_ie': [MediasetIE.ie_key()],
}, },
{
# JOJ.sk embeds
'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok',
'info_dict': {
'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok',
'title': 'Slovenskom sa prehnala vlna silných búrok',
},
'playlist_mincount': 5,
'add_ie': [JojIE.ie_key()],
},
{ {
# AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video) # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video)
'url': 'https://tvrain.ru/amp/418921/', 'url': 'https://tvrain.ru/amp/418921/',
@ -2722,6 +2733,12 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches( return self.playlist_from_matches(
mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
# Look for JOJ.sk embeds
joj_urls = JojIE._extract_urls(webpage)
if joj_urls:
return self.playlist_from_matches(
joj_urls, video_id, video_title, ie=JojIE.ie_key())
def merge_dicts(dict1, dict2): def merge_dicts(dict1, dict2):
merged = {} merged = {}
for k, v in dict1.items(): for k, v in dict1.items():

View file

@ -1,56 +1,100 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor
import re import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
int_or_none,
js_to_json,
try_get,
)
class JojIE(InfoExtractor):
    """Extractor for JOJ.sk media player embeds.

    Accepts ``https://media.joj.sk/embed/<uuid>`` player URLs as well as the
    ``joj:<uuid>`` shorthand; in both cases the UUID is used as the video id.
    """

    _VALID_URL = r'''(?x)
                    (?:
                        joj:|
                        https?://media\.joj\.sk/embed/
                    )
                    (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})
                    '''
    _TESTS = [{
        'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
        'info_dict': {
            'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',
            'ext': 'mp4',
            'title': 'NOVÉ BÝVANIE',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 3118,
        }
    }, {
        'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the ``src`` URLs of all JOJ player iframes in *webpage*.

        The pattern has a single capturing group, so ``re.findall`` yields a
        plain list of URL strings. URLs may be protocol-relative (``//...``).
        """
        return re.findall(
            r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
            webpage)

    def _real_extract(self, url):
        """Build the info dict for the JOJ video identified by *url*.

        Formats are taken from the ``bitrates`` JavaScript object embedded in
        the player page; when that yields nothing, the ``Video.php`` XML
        service is queried instead.
        """
        video_id = self._match_id(url)

        # Always fetch the canonical embed page so that ``joj:<id>`` URLs
        # resolve to the same document as full embed URLs.
        webpage = self._download_webpage(
            'https://media.joj.sk/embed/%s' % video_id, video_id)

        # Prefer the player's own title, then <title>, then OpenGraph.
        title = self._search_regex(
            (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
             r'<title>(?P<title>[^<]+)'), webpage, 'title',
            default=None, group='title') or self._og_search_title(webpage)

        # Non-fatal: a missing or unparsable object falls through to the
        # XML lookup below.
        bitrates = self._parse_json(
            self._search_regex(
                r'(?s)bitrates\s*=\s*({.+?});', webpage, 'bitrates',
                default='{}'),
            video_id, transform_source=js_to_json, fatal=False)

        formats = []
        for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:
            if isinstance(format_url, compat_str):
                height = self._search_regex(
                    r'(\d+)[pP]\.', format_url, 'height', default=None)
                formats.append({
                    'url': format_url,
                    'format_id': '%sp' % height if height else None,
                    # height is None when the URL carries no "<N>p." part;
                    # int_or_none keeps such formats instead of raising.
                    'height': int_or_none(height),
                })
        if not formats:
            playlist = self._download_xml(
                'https://media.joj.sk/services/Video.php?clip=%s' % video_id,
                video_id)
            for file_el in playlist.findall('./files/file'):
                path = file_el.get('path')
                if not path:
                    continue
                format_id = file_el.get('id') or file_el.get('label')
                formats.append({
                    # Only the first 'dat/' occurrence is dropped from the
                    # path before it is appended to the storage host.
                    'url': 'http://n16.joj.sk/storage/%s' % path.replace(
                        'dat/', '', 1),
                    'format_id': format_id,
                    'height': int_or_none(self._search_regex(
                        r'(\d+)[pP]', format_id or path, 'height',
                        default=None)),
                })
        self._sort_formats(formats)

        thumbnail = self._og_search_thumbnail(webpage)

        duration = int_or_none(self._search_regex(
            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }