nitter extractor

This commit is contained in:
Laura Liberda 2021-02-27 01:52:19 +01:00
parent 6178129851
commit 44adc8a082
2 changed files with 26 additions and 48 deletions

View file

@ -816,6 +816,7 @@ from .ninecninemedia import NineCNineMediaIE
from .ninegag import NineGagIE from .ninegag import NineGagIE
from .ninenow import NineNowIE from .ninenow import NineNowIE
from .nintendo import NintendoIE from .nintendo import NintendoIE
from .nitter import NitterSHIE
from .njpwworld import NJPWWorldIE from .njpwworld import NJPWWorldIE
from .nobelprize import NobelPrizeIE from .nobelprize import NobelPrizeIE
from .noco import NocoIE from .noco import NocoIE

View file

@ -1,8 +1,7 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import SelfhostedInfoExtractor
from ..compat import compat_urlparse
from ..utils import ( from ..utils import (
parse_count, parse_count,
unified_strdate, unified_strdate,
@ -10,44 +9,22 @@ from ..utils import (
remove_end, remove_end,
determine_ext, determine_ext,
) )
import re
class NitterIE(InfoExtractor): class NitterSHIE(SelfhostedInfoExtractor):
# Taken from https://github.com/zedeus/nitter/wiki/Instances _VALID_URL = r'nitter:(?P<host>[^:]+):(?P<id>\d+)'
INSTANCES = ('nitter.net', _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(?:#.)?'
'nitter.snopyta.org', _SH_VALID_CONTENT_STRINGS = (
'nitter.42l.fr', '<meta property="og:site_name" content="Nitter" />',
'nitter.nixnet.services', '<link rel="stylesheet" type="text/css" href="/css/themes/nitter.css" />',
'nitter.13ad.de', )
'nitter.pussthecat.org', _SELFHOSTED = True
'nitter.mastodont.cat',
'nitter.dark.fail',
'nitter.tedomum.net',
'nitter.cattube.org',
'nitter.fdn.fr',
'nitter.1d4.us',
'nitter.kavin.rocks',
'tweet.lambda.dance',
'nitter.cc',
'nitter.weaponizedhumiliation.com',
'nitter.vxempire.xyz',
'nitter.unixfox.eu',
'nitter.domain.glass',
'nitter.himiko.cloud',
'nitter.eu',
'nitter.ethibox.fr',
'3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion')
_INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')' current_instance = 'nitter.nixnet.services'
_VALID_URL = r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' % {'instance': _INSTANCES_RE}
current_instance = INSTANCES[0] # the test and official instance
_TESTS = [ _TESTS = [
{ {
# GIF (wrapped in mp4) # GIF (wrapped in mp4)
'url': 'https://' + current_instance + '/firefox/status/1314279897502629888#m', 'url': 'nitter:' + current_instance + ':1314279897502629888',
'info_dict': { 'info_dict': {
'id': '1314279897502629888', 'id': '1314279897502629888',
'ext': 'mp4', 'ext': 'mp4',
@ -61,14 +38,14 @@ class NitterIE(InfoExtractor):
'timestamp': 1602183720, 'timestamp': 1602183720,
}, },
}, { # normal video }, { # normal video
'url': 'https://' + current_instance + '/Le___Doc/status/1299715685392756737#m', 'url': 'nitter:' + current_instance + ':1299715685392756737',
'info_dict': { 'info_dict': {
'id': '1299715685392756737', 'id': '1299715685392756737',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Le Doc - "Je ne prédis jamais rien" D Raoult, Août 2020...', 'title': 're:.+ - "Je ne prédis jamais rien" D Raoult, Août 2020...',
'description': '"Je ne prédis jamais rien" D Raoult, Août 2020...', 'description': '"Je ne prédis jamais rien" D Raoult, Août 2020...',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Le Doc', 'uploader': str,
'uploader_id': 'Le___Doc', 'uploader_id': 'Le___Doc',
'uploader_url': 'https://' + current_instance + '/Le___Doc', 'uploader_url': 'https://' + current_instance + '/Le___Doc',
'upload_date': '20200829', 'upload_date': '20200829',
@ -79,7 +56,7 @@ class NitterIE(InfoExtractor):
'comment_count': int, 'comment_count': int,
}, },
}, { # video embed in a "Streaming Political Ads" box }, { # video embed in a "Streaming Political Ads" box
'url': 'https://' + current_instance + '/mozilla/status/1321147074491092994#m', 'url': 'nitter:' + current_instance + ':1321147074491092994',
'info_dict': { 'info_dict': {
'id': '1321147074491092994', 'id': '1321147074491092994',
'ext': 'mp4', 'ext': 'mp4',
@ -95,13 +72,16 @@ class NitterIE(InfoExtractor):
}, },
] ]
def _real_extract(self, url): def _selfhosted_extract(self, url, webpage=None):
video_id = self._match_id(url) host, video_id = self._match_id_and_host(url)
parsed_url = compat_urlparse.urlparse(url) base_url = ('http://' if url.startswith('http://') else 'https://') + host
base_url = parsed_url.scheme + '://' + parsed_url.netloc
self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on') if not webpage or '>Enable hls playback<' in webpage:
webpage = self._download_webpage(url, video_id) self._set_cookie(host, 'hlsPlayback', 'on')
if url.startswith('nitter:'):
url = base_url + '/hdl/status/' + video_id
webpage = self._download_webpage(url, video_id,
note='Re-downloading webpage for HLS data' if webpage else 'Downloading webpage')
video_url = base_url + self._html_search_regex(r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url') video_url = base_url + self._html_search_regex(r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
ext = determine_ext(video_url) ext = determine_ext(video_url)
@ -119,10 +99,7 @@ class NitterIE(InfoExtractor):
or self._html_search_regex(r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title')) or self._html_search_regex(r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title'))
description = title description = title
mobj = re.match(self._VALID_URL, url) uploader_id = self._html_search_regex(r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False)
uploader_id = (
mobj.group('uploader_id')
or self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False))
if uploader_id: if uploader_id:
uploader_url = base_url + '/' + uploader_id uploader_url = base_url + '/' + uploader_id