nitter extractor

This commit is contained in:
Laura Liberda 2021-02-27 01:52:19 +01:00 committed by Dominika
parent 521dc1f82c
commit 12e6f64462
2 changed files with 26 additions and 48 deletions

View file

@ -816,6 +816,7 @@ from .ninecninemedia import NineCNineMediaIE
from .ninegag import NineGagIE
from .ninenow import NineNowIE
from .nintendo import NintendoIE
from .nitter import NitterSHIE
from .njpwworld import NJPWWorldIE
from .nobelprize import NobelPrizeIE
from .noco import NocoIE

View file

@ -1,8 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_urlparse
from .common import SelfhostedInfoExtractor
from ..utils import (
parse_count,
unified_strdate,
@ -10,44 +9,22 @@ from ..utils import (
remove_end,
determine_ext,
)
import re
class NitterIE(InfoExtractor):
# Taken from https://github.com/zedeus/nitter/wiki/Instances
INSTANCES = ('nitter.net',
'nitter.snopyta.org',
'nitter.42l.fr',
'nitter.nixnet.services',
'nitter.13ad.de',
'nitter.pussthecat.org',
'nitter.mastodont.cat',
'nitter.dark.fail',
'nitter.tedomum.net',
'nitter.cattube.org',
'nitter.fdn.fr',
'nitter.1d4.us',
'nitter.kavin.rocks',
'tweet.lambda.dance',
'nitter.cc',
'nitter.weaponizedhumiliation.com',
'nitter.vxempire.xyz',
'nitter.unixfox.eu',
'nitter.domain.glass',
'nitter.himiko.cloud',
'nitter.eu',
'nitter.ethibox.fr',
'3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion')
class NitterSHIE(SelfhostedInfoExtractor):
_VALID_URL = r'nitter:(?P<host>[^:]+):(?P<id>\d+)'
_SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(?:#.)?'
_SH_VALID_CONTENT_STRINGS = (
'<meta property="og:site_name" content="Nitter" />',
'<link rel="stylesheet" type="text/css" href="/css/themes/nitter.css" />',
)
_SELFHOSTED = True
_INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')'
_VALID_URL = r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' % {'instance': _INSTANCES_RE}
current_instance = INSTANCES[0] # the test and official instance
current_instance = 'nitter.nixnet.services'
_TESTS = [
{
# GIF (wrapped in mp4)
'url': 'https://' + current_instance + '/firefox/status/1314279897502629888#m',
'url': 'nitter:' + current_instance + ':1314279897502629888',
'info_dict': {
'id': '1314279897502629888',
'ext': 'mp4',
@ -61,14 +38,14 @@ class NitterIE(InfoExtractor):
'timestamp': 1602183720,
},
}, { # normal video
'url': 'https://' + current_instance + '/Le___Doc/status/1299715685392756737#m',
'url': 'nitter:' + current_instance + ':1299715685392756737',
'info_dict': {
'id': '1299715685392756737',
'ext': 'mp4',
'title': 'Le Doc - "Je ne prédis jamais rien" D Raoult, Août 2020...',
'title': 're:.+ - "Je ne prédis jamais rien" D Raoult, Août 2020...',
'description': '"Je ne prédis jamais rien" D Raoult, Août 2020...',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Le Doc',
'uploader': str,
'uploader_id': 'Le___Doc',
'uploader_url': 'https://' + current_instance + '/Le___Doc',
'upload_date': '20200829',
@ -79,7 +56,7 @@ class NitterIE(InfoExtractor):
'comment_count': int,
},
}, { # video embed in a "Streaming Political Ads" box
'url': 'https://' + current_instance + '/mozilla/status/1321147074491092994#m',
'url': 'nitter:' + current_instance + ':1321147074491092994',
'info_dict': {
'id': '1321147074491092994',
'ext': 'mp4',
@ -95,13 +72,16 @@ class NitterIE(InfoExtractor):
},
]
def _real_extract(self, url):
video_id = self._match_id(url)
parsed_url = compat_urlparse.urlparse(url)
base_url = parsed_url.scheme + '://' + parsed_url.netloc
def _selfhosted_extract(self, url, webpage=None):
host, video_id = self._match_id_and_host(url)
base_url = ('http://' if url.startswith('http://') else 'https://') + host
self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
webpage = self._download_webpage(url, video_id)
if not webpage or '>Enable hls playback<' in webpage:
self._set_cookie(host, 'hlsPlayback', 'on')
if url.startswith('nitter:'):
url = base_url + '/hdl/status/' + video_id
webpage = self._download_webpage(url, video_id,
note='Re-downloading webpage for HLS data' if webpage else 'Downloading webpage')
video_url = base_url + self._html_search_regex(r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
ext = determine_ext(video_url)
@ -119,10 +99,7 @@ class NitterIE(InfoExtractor):
or self._html_search_regex(r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title'))
description = title
mobj = re.match(self._VALID_URL, url)
uploader_id = (
mobj.group('uploader_id')
or self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False))
uploader_id = self._html_search_regex(r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False)
if uploader_id:
uploader_url = base_url + '/' + uploader_id