diff --git a/haruhi_dl/extractor/tiktok.py b/haruhi_dl/extractor/tiktok.py index 66088b9ab..d86716355 100644 --- a/haruhi_dl/extractor/tiktok.py +++ b/haruhi_dl/extractor/tiktok.py @@ -3,86 +3,139 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - compat_str, - ExtractorError, + compat_urllib_parse_urlencode, int_or_none, + std_headers, str_or_none, try_get, url_or_none, ) +import random class TikTokBaseIE(InfoExtractor): - def _extract_aweme(self, data): - video = data['video'] - description = str_or_none(try_get(data, lambda x: x['desc'])) - width = int_or_none(try_get(data, lambda x: video['width'])) - height = int_or_none(try_get(data, lambda x: video['height'])) + _DATA_RE = r'' + + def _extract_headers(self, data): + cookie_value = data['props']['initialProps']['$wid'] + return { + 'Accept': 'video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5', + 'Cookie': 'tt_webid=%s; tt_webid_v2=%s' % (cookie_value, cookie_value), + 'Referer': data['query']['$initialProps']['$fullUrl'], + } + + def _extract_author_data(self, author): + uploader = str_or_none(author.get('nickname')) or author.get('uniqueId') + uploader_id = str_or_none(author.get('id')) + uploader_url = 'https://www.tiktok.com/@%s' % author.get('uniqueId') + + return { + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + } + + def _extract_video(self, data): + props = data['props']['pageProps']['itemInfo']['itemStruct'] + video = props['video'] + stats = props['stats'] + description = str_or_none(props['desc']) + width = int_or_none(video['width']) + height = int_or_none(video['height']) + duration = int_or_none(video['duration']) format_urls = set() formats = [] - for format_id in ( - 'play_addr_lowbr', 'play_addr', 'play_addr_h264', - 'download_addr'): - for format in try_get( - video, lambda x: x[format_id]['url_list'], list) or []: - format_url = url_or_none(format) - if not format_url: - continue - if format_url in format_urls: - continue - format_urls.add(format_url) - formats.append({ - 'url': format_url, - 'ext': 'mp4', - 'height': height, - 'width': width, - }) + for format_id in ('playAddr', 'downloadAddr'): + format_url = url_or_none(video[format_id]) + if not format_url: + continue + if format_url in format_urls: + continue + format_urls.add(format_url) + formats.append({ + 'url': format_url, + 'ext': 'mp4', + 'height': height, + 'width': width, + }) self._sort_formats(formats) - thumbnail = url_or_none(try_get( - video, lambda x: x['cover']['url_list'][0], compat_str)) - uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) - timestamp = int_or_none(data.get('create_time')) - comment_count = int_or_none(data.get('comment_count')) or int_or_none( - try_get(data, lambda x: x['statistics']['comment_count'])) - repost_count = int_or_none(try_get( - data, lambda x: x['statistics']['share_count'])) + thumbnails = [] + for key in ('originCover', 'dynamicCover', 'shareCover', 'reflowCover'): + urls = try_get(video, lambda x: x[key]) + if isinstance(urls, str): + urls = [urls] + if isinstance(urls, list): + for url in urls: + if isinstance(url, str) and len(url) > 0: + thumbnails.append({ + 'url': url, + }) - aweme_id = data['aweme_id'] + timestamp = int_or_none(props.get('createTime')) + view_count = int_or_none(stats.get('playCount')) + like_count = int_or_none(stats.get('diggCount')) + comment_count = int_or_none(stats.get('commentCount')) + repost_count = int_or_none(stats.get('shareCount')) + + author = self._extract_author_data(props['author']) + http_headers = self._extract_headers(data) return { - 'id': aweme_id, - 'title': uploader or aweme_id, + 'id': props['id'], + 'title': author['uploader'], 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, + 'duration': duration, + 'thumbnails': thumbnails, + 'uploader': author['uploader'], + 'uploader_id': author['uploader_id'], + 'uploader_url': author['uploader_url'], 'timestamp': timestamp, + 'view_count': view_count, + 'like_count': like_count, 'comment_count': comment_count, 'repost_count': repost_count, 'formats': formats, + 'http_headers': http_headers, } class TikTokIE(TikTokBaseIE): + IE_NAME = 'tiktok' _VALID_URL = r'''(?x) - https?:// - (?: - (?:m\.)?tiktok\.com/v| - (?:www\.)?tiktok\.com/share/video - ) - /(?P\d+) + (?: + https?:// + (?: + (?:m\.)?tiktok\.com/v| + (?:www\.)?tiktok\.com/(?:share|@[\w.]+)/video + )/ + |tiktok: + )(?P\d+) ''' _TESTS = [{ + 'url': 'https://www.tiktok.com/@puczirajot/video/6878766755280440578', + 'info_dict': { + 'id': '6878766755280440578', + 'ext': 'mp4', + 'title': 'Marta Puczyńska', + 'upload_date': '20201001', + 'uploader_id': '6797754125703693317', + 'description': '#lgbt #lgbtq #lgbtqmatter #poland #polska #warszawa #warsaw', + 'timestamp': 1601587695, + 'uploader': 'Marta Puczyńska', + }, + }, { 'url': 'https://m.tiktok.com/v/6606727368545406213.html', - 'md5': 'd584b572e92fcd48888051f238022420', + 'md5': '163ceff303bb52de60e6887fe399e6cd', 'info_dict': { 'id': '6606727368545406213', 'ext': 'mp4', 'title': 'Zureeal', 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', - 'thumbnail': r're:^https?://.*~noop.image', + 'thumbnail': r're:^https?://.*\.jpeg\?x-expires=.*&x-signature=.*', 'uploader': 'Zureeal', + 'uploader_id': '188294915489964032', 'timestamp': 1538248586, 'upload_date': '20180929', 'comment_count': int, @@ -95,44 +148,85 @@ class TikTokIE(TikTokBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://m.tiktok.com/v/%s.html' % video_id, video_id) + webpage = self._download_webpage('https://www.tiktok.com/share/video/%s' % video_id, video_id) data = self._parse_json(self._search_regex( - r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data'), video_id) - return self._extract_aweme(data) + self._DATA_RE, webpage, 'data'), video_id) + return self._extract_video(data) class TikTokUserIE(TikTokBaseIE): - _VALID_URL = r'''(?x) - https?:// - (?: - (?:m\.)?tiktok\.com/h5/share/usr| - (?:www\.)?tiktok\.com/share/user - ) - /(?P\d+) - ''' + _WORKING = False + IE_NAME = 'tiktok:user' + _VALID_URL = r'https?://(?:www\.)?tiktok\.com/@(?P[\w.]+)' _TESTS = [{ - 'url': 'https://m.tiktok.com/h5/share/usr/188294915489964032.html', + 'url': 'https://www.tiktok.com/@puczirajot', 'info_dict': { - 'id': '188294915489964032', + 'id': '6797754125703693317' }, - 'playlist_mincount': 24, - }, { - 'url': 'https://www.tiktok.com/share/user/188294915489964032', - 'only_matching': True, + 'playlist_mincount': 60, }] + def _get_item_list_page(self, data, page=0, max_cursor=0): + initial_props = data['query']['$initialProps'] + page_props = data['props']['pageProps'] + screen_resolution = random.choice(( + (1280, 720), + (1366, 768), + (1920, 1080), + )) + url = 'https://%s/api/item_list/?%s' % ( + try_get(initial_props, lambda x: x['$baseURL'], str) or 'm.tiktok.com', + compat_urllib_parse_urlencode({ + 'aid': '1988', + 'app_name': 'tiktok_web', + 'referer': '', + 'user_agent': std_headers['User-Agent'], + 'cookie_enabled': 'true', + 'screen_width': screen_resolution[0], + 'screen_height': screen_resolution[1], + 'browser_language': 'en-US', + 'browser_platform': 'Win32', + 'browser_name': 'Mozilla', + 'browser_version': std_headers['User-Agent'][8:], + 'browser_online': 'true', + 'ac': '', + 'timezone_name': 'Europe/Warsaw', + 'priority_region': '', + 'verifyFp': 'verify_', # needs investigation + 'appId': initial_props['$appId'], + 'region': initial_props['$region'], + 'appType': initial_props['$appType'], + 'isAndroid': initial_props['$isAndroid'], + 'isMobile': initial_props['$isMobile'], + 'isIOS': initial_props['$isIOS'], + 'OS': initial_props['$os'], + 'did': '0', # '6892477327352038917', + 'count': 30, + 'id': page_props['feedConfig']['id'], + 'secUid': page_props['feedConfig']['secUid'], + 'maxCursor': max_cursor, + 'minCursor': '0', + 'sourceType': '8', + 'language': 'en', + '_signature': '', # needs investigation + })) + return self._download_json(url, page_props['feedConfig']['id'], 'Downloading video list (page #%d)' % page) + def _real_extract(self, url): - user_id = self._match_id(url) - data = self._download_json( - 'https://m.tiktok.com/h5/share/usr/list/%s/' % user_id, user_id, - query={'_signature': '_'}) + username = self._match_id(url) + webpage = self._download_webpage( + url, username) + data = self._parse_json(self._search_regex( + self._DATA_RE, webpage, 'data'), username) + user = self._extract_author_data(data['props']['pageProps']['userInfo']['user']) + entries = [] - for aweme in data['aweme_list']: - try: - entry = self._extract_aweme(aweme) - except ExtractorError: - continue - entry['extractor_key'] = TikTokIE.ie_key() - entries.append(entry) - return self.playlist_result(entries, user_id) + videos_page = self._get_item_list_page(data) + self.to_screen(str(videos_page)) + + return { + '_type': 'playlist', + 'id': user['uploader_id'], + 'title': user['uploader'] or user['uploader_id'], + 'entries': entries, + }