diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py index db6166a93..53e34e798 100644 --- a/haruhi_dl/extractor/extractors.py +++ b/haruhi_dl/extractor/extractors.py @@ -816,6 +816,7 @@ from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE +from .nitter import NitterSHIE from .njpwworld import NJPWWorldIE from .nobelprize import NobelPrizeIE from .noco import NocoIE diff --git a/haruhi_dl/extractor/nitter.py b/haruhi_dl/extractor/nitter.py index 37f235432..9f06e1ac2 100644 --- a/haruhi_dl/extractor/nitter.py +++ b/haruhi_dl/extractor/nitter.py @@ -1,8 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..compat import compat_urlparse +from .common import SelfhostedInfoExtractor from ..utils import ( parse_count, unified_strdate, @@ -10,44 +9,22 @@ from ..utils import ( remove_end, determine_ext, ) -import re -class NitterIE(InfoExtractor): - # Taken from https://github.com/zedeus/nitter/wiki/Instances - INSTANCES = ('nitter.net', - 'nitter.snopyta.org', - 'nitter.42l.fr', - 'nitter.nixnet.services', - 'nitter.13ad.de', - 'nitter.pussthecat.org', - 'nitter.mastodont.cat', - 'nitter.dark.fail', - 'nitter.tedomum.net', - 'nitter.cattube.org', - 'nitter.fdn.fr', - 'nitter.1d4.us', - 'nitter.kavin.rocks', - 'tweet.lambda.dance', - 'nitter.cc', - 'nitter.weaponizedhumiliation.com', - 'nitter.vxempire.xyz', - 'nitter.unixfox.eu', - 'nitter.domain.glass', - 'nitter.himiko.cloud', - 'nitter.eu', - 'nitter.ethibox.fr', - '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion', - 'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion', - 'npf37k3mtzwxreiw52ccs5ay4e6qt2fkcs2ndieurdyn2cuzzsfyfvid.onion') +class NitterSHIE(SelfhostedInfoExtractor): + _VALID_URL = r'nitter:(?P[^:]+):(?P\d+)' + _SH_VALID_URL = r'https?://(?P[^/]+)/(?P.+)/status/(?P[0-9]+)(?:#.)?' + _SH_VALID_CONTENT_STRINGS = ( + '', + '', + ) + _SELFHOSTED = True - _INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')' - _VALID_URL = r'https?://%(instance)s/(?P.+)/status/(?P[0-9]+)(#.)?' % {'instance': _INSTANCES_RE} - current_instance = INSTANCES[0] # the test and official instance + current_instance = 'nitter.nixnet.services' _TESTS = [ { # GIF (wrapped in mp4) - 'url': 'https://' + current_instance + '/firefox/status/1314279897502629888#m', + 'url': 'nitter:' + current_instance + ':1314279897502629888', 'info_dict': { 'id': '1314279897502629888', 'ext': 'mp4', @@ -61,14 +38,14 @@ class NitterIE(InfoExtractor): 'timestamp': 1602183720, }, }, { # normal video - 'url': 'https://' + current_instance + '/Le___Doc/status/1299715685392756737#m', + 'url': 'nitter:' + current_instance + ':1299715685392756737', 'info_dict': { 'id': '1299715685392756737', 'ext': 'mp4', - 'title': 'Le Doc - "Je ne prédis jamais rien" D Raoult, Août 2020...', + 'title': 're:.+ - "Je ne prédis jamais rien" D Raoult, Août 2020...', 'description': '"Je ne prédis jamais rien" D Raoult, Août 2020...', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Le Doc', + 'uploader': str, 'uploader_id': 'Le___Doc', 'uploader_url': 'https://' + current_instance + '/Le___Doc', 'upload_date': '20200829', @@ -79,7 +56,7 @@ class NitterIE(InfoExtractor): 'comment_count': int, }, }, { # video embed in a "Streaming Political Ads" box - 'url': 'https://' + current_instance + '/mozilla/status/1321147074491092994#m', + 'url': 'nitter:' + current_instance + ':1321147074491092994', 'info_dict': { 'id': '1321147074491092994', 'ext': 'mp4', @@ -95,13 +72,16 @@ class NitterIE(InfoExtractor): }, ] - def _real_extract(self, url): - video_id = self._match_id(url) - parsed_url = compat_urlparse.urlparse(url) - base_url = parsed_url.scheme + '://' + parsed_url.netloc + def _selfhosted_extract(self, url, webpage=None): + host, video_id = self._match_id_and_host(url) + base_url = ('http://' if url.startswith('http://') else 'https://') + host - self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on') - webpage = self._download_webpage(url, video_id) + if not webpage or '>Enable hls playback<' in webpage: + self._set_cookie(host, 'hlsPlayback', 'on') + if url.startswith('nitter:'): + url = base_url + '/hdl/status/' + video_id + webpage = self._download_webpage(url, video_id, + note='Re-downloading webpage for HLS data' if webpage else 'Downloading webpage') video_url = base_url + self._html_search_regex(r'(?:]+data-url|]+src)="([^"]+)"', webpage, 'video url') ext = determine_ext(video_url) @@ -119,10 +99,7 @@ class NitterIE(InfoExtractor): or self._html_search_regex(r'
]+title="([^"]+)"', webpage, 'uploader name', fatal=False)) + uploader_id = self._html_search_regex(r']+title="@([^"]+)"', webpage, 'uploader id', fatal=False) if uploader_id: uploader_url = base_url + '/' + uploader_id