Add extractor for teletask

2014-12-21 11:01:28 +01:00 · 2014-12-21 11:01:28 +01:00 · ee45625290
commit ee45625290
parent 1ff30d7b79
2 changed files with 60 additions and 0 deletions
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -405,6 +405,7 @@ from .ted import TEDIE
 from .telebruxelles import TeleBruxellesIE
 from .telecinco import TelecincoIE
 from .telemb import TeleMBIE
 from .teletask import TeleTaskIE
 from .tenplay import TenPlayIE
 from .testurl import TestURLIE
 from .tf1 import TF1IE
--- a/youtube_dl/extractor/teletask.py
+++ b/youtube_dl/extractor/teletask.py
@@ -0,0 +1,59 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import re 
 import datetime
 from .common import InfoExtractor
 class TeleTaskIE(InfoExtractor):
    _VALID_URL = r'http?://(?:www\.)?tele-task\.de/archive/video/html5/(?P<id>[0-9]+)/'
    _TEST = {
        'url': 'http://www.tele-task.de/archive/video/html5/26168/', 
        'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
        'info_dict': {
            'id': '26168',
            'ext': 'mp4',
            'title': 'Duplicate Detection',
            'thumbnail': 're:^https?://.*\.jpg$',
            'date': '20141218',
            # TODO more properties, either as:
            # * A value
            # * MD5 checksum; start the string with md5:
            # * A regular expression; start the string with re:
            # * Any Python type (for example int or float)
        }
    }
    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        lecture_url = self._html_search_regex(
                    r'href="([^"]+)" itemprop="name">', webpage, 'title')
        lecture_id = re.search("([0-9]+)/",lecture_url).group(1)
        overview_page = self._download_webpage("http://www.tele-task.de" + lecture_url, 
            lecture_id)
        title = self._html_search_regex(
            r'itemprop="name">([^"]+)</a>', webpage, 'title')
        url = self._html_search_regex(
            r'class="speaker".*?src="([^"]+)"', webpage, 'video_url', flags=re.DOTALL)
        description = self._html_search_regex(
            r'Description of the series:</p>([^"]+)</div>', overview_page, 
            'description',flags=re.DOTALL)
        date = self._html_search_regex(
            r'<td class="label">Date:</td><td>([^"]+)</td>', webpage, 'date')
        date = datetime.datetime.strptime(date, '%d.%m.%Y').strftime('%Y%m%d')
        return {
            'id': video_id,
            'title': title,
            'description': description,
            'url': url,
            'upload_date': date,
        }