From 2a772c54bb8cac97334a807096a2193f8410b415 Mon Sep 17 00:00:00 2001 From: Laura Liberda Date: Fri, 8 Jan 2021 21:01:36 +0100 Subject: [PATCH] the guardian extractors --- haruhi_dl/extractor/extractors.py | 4 ++ haruhi_dl/extractor/guardian.py | 75 +++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 haruhi_dl/extractor/guardian.py diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index aed03952c..4824a8f39 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -438,6 +438,10 @@ from .googlesearch import GoogleSearchIE from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE from .groupon import GrouponIE +from .guardian import ( + GuardianAudioIE, + GuardianVideoIE, +) from .hbo import HBOIE from .hearthisat import HearThisAtIE from .heise import HeiseIE diff --git a/haruhi_dl/extractor/guardian.py b/haruhi_dl/extractor/guardian.py new file mode 100644 index 000000000..f2f17da22 --- /dev/null +++ b/haruhi_dl/extractor/guardian.py @@ -0,0 +1,75 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, + parse_duration, + url_or_none, +) + + +class GuardianVideoIE(InfoExtractor): + IE_NAME = 'guardian:video' + _VALID_URL = r'https?://(?:www\.)?theguardian\.com/[^/]+/video/\d{4}/[a-z]{3}/\d{2}/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.theguardian.com/global/video/2020/dec/29/covid-from-space-the-humans-furthest-from-the-pandemic-video', + 'info_dict': { + 'id': 'nmehcVJm3Y0', + 'ext': 'mp4', + 'title': 'Covid from space: the humans furthest from the pandemic – video', + 'description': 'md5:6b6ea15d4efc75b887c3e9e8fb3cd803', + 'upload_date': '20201229', + 'uploader_id': 'TheGuardian', + 'uploader': 'The Guardian', + }, + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + 
release_date = self._html_search_meta('datePublished', webpage) + if release_date: + release_date = release_date[:4] + release_date[5:7] + release_date[8:10] + return { + '_type': 'url_transparent', + 'url': 'https://www.youtube.com/watch?v=%s' % (self._search_regex( + r'
[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.theguardian.com/news/audio/2021/jan/08/the-storming-of-the-capitol-and-the-end-of-the-trump-era', + 'info_dict': { + 'id': 'the-storming-of-the-capitol-and-the-end-of-the-trump-era', + 'ext': 'mp3', + 'title': 'The storming of the Capitol and the end of the Trump era', + 'description': 'When rioters stormed into the Capitol building in Washington DC this week, it marked a new low for the Trump presidency. David Smith and Lauren Gambino describe a week in US politics like no other', + }, + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + figure = self._search_regex(r'(
<figure[^>]*id="audio-component-container"[^>]*>)', + webpage, 'figure element') + figure_attrs = extract_attributes(figure) + return { + 'id': page_id, + 'url': figure_attrs['data-source'], + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'duration': int_or_none(figure_attrs.get('data-duration')), + 'thumbnail': self._og_search_thumbnail(webpage), + }