[ceskateleveize] Improve extraction and remove URL replacement hacks

This commit is contained in:
Sergey M․ 2017-04-08 19:41:14 +07:00
parent 78280352ca
commit e18f1da97a
No known key found for this signature in database
GPG key ID: 2C393E0F18A9236D

View file

@ -18,7 +18,7 @@ from ..utils import (
class CeskaTelevizeIE(InfoExtractor): class CeskaTelevizeIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(porady|ivysilani)/(?:[^/]+/)*(?P<id>[^/#?]+)/*(?:[#?].*)?$' _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
_TESTS = [{ _TESTS = [{
'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
'info_dict': { 'info_dict': {
@ -62,40 +62,12 @@ class CeskaTelevizeIE(InfoExtractor):
}, },
'skip': 'Georestricted to Czech Republic', 'skip': 'Georestricted to Czech Republic',
}, { }, {
# video with 18+ caution trailer 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25',
'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', 'only_matching': True,
'info_dict': {
'id': '215562210900007-bogotart',
'title': 'Queer: Bogotart',
'description': 'Alternativní průvodce současným queer světem',
},
'playlist': [{
'info_dict': {
'id': '61924494876844842',
'ext': 'mp4',
'title': 'Queer: Bogotart (Varování 18+)',
'duration': 10.2,
},
}, {
'info_dict': {
'id': '61924494877068022',
'ext': 'mp4',
'title': 'Queer: Bogotart (Queer)',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 1558.3,
},
}],
'params': {
# m3u8 download
'skip_download': True,
},
}] }]
def _real_extract(self, url): def _real_extract(self, url):
url = url.replace('/porady/', '/ivysilani/').replace('/video/', '') playlist_id = self._match_id(url)
mobj = re.match(self._VALID_URL, url)
playlist_id = mobj.group('id')
webpage = self._download_webpage(url, playlist_id) webpage = self._download_webpage(url, playlist_id)
@ -103,13 +75,28 @@ class CeskaTelevizeIE(InfoExtractor):
if '%s</p>' % NOT_AVAILABLE_STRING in webpage: if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
typ = self._html_search_regex( type_ = None
r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type') episode_id = None
playlist = self._parse_json(
self._search_regex(
r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist',
default='{}'), playlist_id)
if playlist:
type_ = playlist.get('type')
episode_id = playlist.get('id')
if not type_:
type_ = self._html_search_regex(
r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],',
webpage, 'type')
if not episode_id:
episode_id = self._html_search_regex( episode_id = self._html_search_regex(
r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id') r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],',
webpage, 'episode_id')
data = { data = {
'playlist[0][type]': typ, 'playlist[0][type]': type_,
'playlist[0][id]': episode_id, 'playlist[0][id]': episode_id,
'requestUrl': compat_urllib_parse_urlparse(url).path, 'requestUrl': compat_urllib_parse_urlparse(url).path,
'requestSource': 'iVysilani', 'requestSource': 'iVysilani',