Add extractor for teletask

This commit is contained in:
MaxReimann 2014-12-21 11:01:28 +01:00
parent 1ff30d7b79
commit ee45625290
2 changed files with 60 additions and 0 deletions

View file

@ -405,6 +405,7 @@ from .ted import TEDIE
from .telebruxelles import TeleBruxellesIE
from .telecinco import TelecincoIE
from .telemb import TeleMBIE
from .teletask import TeleTaskIE
from .tenplay import TenPlayIE
from .testurl import TestURLIE
from .tf1 import TF1IE

View file

@ -0,0 +1,59 @@
# coding: utf-8
from __future__ import unicode_literals
import re
import datetime
from .common import InfoExtractor
class TeleTaskIE(InfoExtractor):
    """Information extractor for tele-task.de HTML5 archive video pages.

    The video URL, title and date are scraped from the video page itself.
    The description is scraped from a second page that the video page links
    to (the link carrying ``itemprop="name"``).
    """

    # Accept http and https; the trailing slash after the id is optional.
    _VALID_URL = r'https?://(?:www\.)?tele-task\.de/archive/video/html5/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://www.tele-task.de/archive/video/html5/26168/',
        # TODO: add 'md5' (checksum of the first 10241 bytes of the video
        # file, obtainable with --test). The template placeholder string
        # that used to be here could never match a real checksum.
        'info_dict': {
            'id': '26168',
            'ext': 'mp4',
            'title': 'Duplicate Detection',
            # Must be keyed exactly like the dict returned by _real_extract.
            # No 'thumbnail' entry: _real_extract does not return one.
            'upload_date': '20141218',
        }
    }

    def _real_extract(self, url):
        """Scrape the video page at *url* and return its info dict.

        The video URL and the title are mandatory: a failure to find either
        raises the extractor error produced by ``_html_search_regex``.
        The description and upload date are optional and become ``None``
        when they cannot be determined.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'itemprop="name">([^"]+)</a>', webpage, 'title')
        # Kept in its own name so the ``url`` parameter is not shadowed.
        video_url = self._html_search_regex(
            r'class="speaker".*?src="([^"]+)"', webpage, 'video_url',
            flags=re.DOTALL)

        return {
            'id': video_id,
            'title': title,
            'description': self._extract_description(webpage, video_id),
            'url': video_url,
            'upload_date': self._extract_upload_date(webpage),
        }

    def _extract_description(self, webpage, video_id):
        """Return the description from the linked overview page, or None.

        Every step is non-fatal: a missing description must not abort an
        extraction that already has a playable URL and a title.
        """
        lecture_url = self._html_search_regex(
            r'href="([^"]+)" itemprop="name">', webpage, 'lecture url',
            fatal=False)
        if not lecture_url:
            return None

        # The id is only used to label the download in the output, so fall
        # back to the video id instead of failing when it is not found.
        lecture_id = self._search_regex(
            r'([0-9]+)/', lecture_url, 'lecture id', default=None)
        overview_page = self._download_webpage(
            'http://www.tele-task.de' + lecture_url,
            lecture_id or video_id, fatal=False)
        if not overview_page:
            return None

        return self._html_search_regex(
            r'Description of the series:</p>([^"]+)</div>', overview_page,
            'description', fatal=False, flags=re.DOTALL)

    def _extract_upload_date(self, webpage):
        """Return the page's ``DD.MM.YYYY`` date as ``YYYYMMDD``, or None."""
        raw_date = self._html_search_regex(
            r'<td class="label">Date:</td><td>([^"]+)</td>', webpage, 'date',
            fatal=False)
        if not raw_date:
            return None
        try:
            parsed = datetime.datetime.strptime(raw_date, '%d.%m.%Y')
        except ValueError:
            # Unexpected date format: drop the optional field, keep the video.
            return None
        return parsed.strftime('%Y%m%d')