x-link (x-news.pl embeds) extractor

2021-01-31 01:26:49 +01:00 · 2021-01-31 01:26:49 +01:00 · 87fad4b7eb
parent a3816f69be
commit 87fad4b7eb
3 changed files with 69 additions and 0 deletions
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -1492,6 +1492,7 @@ from .ximalaya import (
    XimalayaAlbumIE
 )
 from .xminus import XMinusIE
 from .xnews import XLinkIE
 from .xnxx import XNXXIE
 from .xstream import XstreamIE
 from .xtube import XTubeUserIE, XTubeIE
--- a/haruhi_dl/extractor/generic.py
+++ b/haruhi_dl/extractor/generic.py
@@ -122,6 +122,7 @@ from .kinja import KinjaEmbedIE
 from .onnetwork import OnNetworkLoaderIE
 from .embetty import EmbettyIE
 from .rtlnl import RtlNlIE
 from .xnews import XLinkIE
 class GenericIE(InfoExtractor):
@@ -2587,6 +2588,7 @@ class GenericIE(InfoExtractor):
            TeachableIE,    # must be before Wistia
            WistiaIE,
            SVTIE,
            XLinkIE,
        ):
            try:
                ie_key = embie.ie_key()
--- a/haruhi_dl/extractor/xnews.py
+++ b/haruhi_dl/extractor/xnews.py
@@ -0,0 +1,66 @@
 # encoding: utf-8
 from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from ..utils import (
    js_to_json,
    parse_duration,
    smuggle_url,
    unsmuggle_url,
 )
 class XLinkIE(InfoExtractor):
    """Extractor for x-link player embeds hosted on get.x-link.pl.

    Embed URLs have the form
    ``https://get.x-link.pl/<uuid>,<uuid>,embed.html``; the second UUID is
    used as the video id.  The URL of the page hosting the embed is expected
    to be smuggled into the embed URL under the ``referer`` key (see
    ``_extract_urls``) so it can be sent as the ``Referer`` header.
    """

    IE_NAME = 'x-link'
    IE_DESC = 'x-news.pl embeds'
    _VALID_URL = r'https?://get\.x-link\.pl/(?:[a-f\d]{8}-(?:[a-f\d]{4}-){3}[a-f\d]{12}),(?P<id>[a-f\d]{8}-(?:[a-f\d]{4}-){3}[a-f\d]{12}),embed\.html'
    _TESTS = [{
        'url': 'https://get.x-link.pl/6fc656ab-ee92-d813-6afd-59863a7ccbdd,7186de52-4c89-5d64-7508-fca6a4f2d3b9,embed.html#__youtubedl_smuggle=%7B%22referer%22%3A+%22https%3A%2F%2Fgazetawroclawska.pl%2Fsklepy-w-galeriach-handlowych-otwarte-od-poniedzialku-w-rezimie-sanitarnym-co-trzeba-wiedziec%2Far%2Fc3-15417477%22%7D',
        'info_dict': {
            'id': '7186de52-4c89-5d64-7508-fca6a4f2d3b9',
            'ext': 'mp4',
            'title': 'Luzowanie obostrzeń: Od 1 lutego otwarte galerie handlowe i muzea, nie będzie też godzin dla seniorów',
        },
    }]

    @staticmethod
    def _extract_urls(webpage, url=None):
        """Find x-link embeds in *webpage*.

        Looks for ``<script>`` tags whose ``data-url`` attribute points at a
        get.x-link.pl embed page.

        :param webpage: HTML source of the hosting page.
        :param url: URL of the hosting page; smuggled into every result under
            the ``referer`` key so ``_real_extract`` can send it as a header.
        :return: list of smuggled embed URLs (possibly empty).
        """
        return [smuggle_url(mobj.group('url'), {'referer': url}) for mobj
                in re.finditer(r'<script\b[^>]+\bdata-url=(["\'])(?P<url>https?://get\.x-link\.pl/(?:[a-f\d]{8}-(?:[a-f\d]{4}-){3}[a-f\d]{12},){2}embed\.html)[^"\']*?\1', webpage)]

    def _real_extract(self, url):
        """Download the embed page and build the info dict for the video.

        :param url: embed URL, optionally carrying smuggled ``referer`` data.
        :return: info dict with ``id``, ``url``, ``title``, ``thumbnails``
            and ``duration``.
        """
        # Keep the cleaned URL: the smuggled payload lives in the URL fragment
        # and should not leak into id matching or the HTTP request.
        url, smuggled_data = unsmuggle_url(url, default={})
        video_id = self._match_id(url)

        headers = {}
        referer = smuggled_data.get("referer")
        if referer is None:
            self.report_warning("Referer not smuggled, will probably fail")
        else:
            # NOTE(review): the value is sent as bytes, presumably for
            # Python 2 header handling -- confirm before changing.
            headers["Referer"] = referer.encode('utf-8')

        webpage = self._download_webpage(url, video_id, headers=headers)

        # The player configuration is the first object of the array literal
        # passed to initConsent([...], ...); it is a JS object, not strict
        # JSON, hence the js_to_json pass before parsing.
        data = self._search_regex(r'initConsent\(\[({.+?})],', webpage, 'video data')
        data = self._parse_json(js_to_json(data), video_id)

        # URLs in the config are expected to be protocol-relative ("//host/...");
        # _proto_relative_url only prefixes those and leaves absolute URLs alone.
        thumbnails = [
            {'url': self._proto_relative_url(data[key], 'https:')}
            for key in ('thumbnail', 'poster')
            if data.get(key)
        ]

        return {
            'id': video_id,
            'url': self._proto_relative_url(data['src'], 'https:'),
            'title': data['title'],
            'thumbnails': thumbnails,
            'duration': parse_duration(data.get('videoDuration')),
        }