[youtube] cleanup, speed up age-gated extraction, fix videos with js-like syntax

merge-requests/5/head
Lauren Liberda 2021-06-06 16:40:57 +02:00
parent 9373a2f667
commit ed273bfbf2
2 changed files with 21 additions and 363 deletions

View File

@ -28,7 +28,6 @@ from ..utils import (
float_or_none,
get_element_by_id,
int_or_none,
list_geoblocked_countres,
mimetype2ext,
parse_codecs,
parse_duration,
@ -44,6 +43,7 @@ from ..utils import (
uppercase_escape,
url_or_none,
urlencode_postdata,
GeoRestrictedError,
)
@ -567,24 +567,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'format': '141/bestaudio[ext=m4a]',
},
},
# JS player signature function name containing $
{
'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM',
'info_dict': {
'id': 'nfWlot6h_JM',
'ext': 'm4a',
'title': 'Taylor Swift - Shake It Off',
'description': 'md5:9dc0bd58efe700594b54f7d82bed0bac',
'duration': 242,
'uploader': 'TaylorSwiftVEVO',
'uploader_id': 'TaylorSwiftVEVO',
'upload_date': '20140818',
},
'params': {
'youtube_include_dash_manifest': True,
'format': '141/bestaudio[ext=m4a]',
},
},
# Normal age-gate video (No vevo, embed allowed)
{
'url': 'https://youtube.com/watch?v=HtVdAasjOgU',
@ -636,24 +618,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'DASH manifest missing',
]
},
# Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
{
'url': 'lqQg6PlCWgI',
'info_dict': {
'id': 'lqQg6PlCWgI',
'ext': 'mp4',
'duration': 6085,
'upload_date': '20150827',
'uploader_id': 'olympic',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
'uploader': 'Olympic',
'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
},
'params': {
'skip_download': 'requires avconv',
}
},
# Non-square pixels
{
'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0',
@ -879,26 +843,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': 'https://www.youtubekids.com/watch?v=BnC-cpUCdns',
'only_matching': True,
},
{
# invalid -> valid video id redirection
'url': 'DJztXj2GPfl',
'info_dict': {
'id': 'DJztXj2GPfk',
'ext': 'mp4',
'title': 'Panjabi MC - Mundian To Bach Ke (The Dictator Soundtrack)',
'description': 'md5:bf577a41da97918e94fa9798d9228825',
'upload_date': '20090125',
'uploader': 'Prochorowka',
'uploader_id': 'Prochorowka',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Prochorowka',
'artist': 'Panjabi MC',
'track': 'Beware of the Boys (Mundian to Bach Ke) - Motivo Hi-Lectro Remix',
'album': 'Beware of the Boys (Mundian To Bach Ke)',
},
'params': {
'skip_download': True,
},
},
{
# empty description results in an empty string
'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
@ -919,6 +863,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'url': 'https://youtube.com/shorts/7awd-y_DTQY',
'only_matching': True,
},
{
'url': 'https://www.youtube.com/video/2NDLF-k2PwA',
'only_matching': True,
}
]
_VALID_SIG_VALUE_RE = r'^AO[a-zA-Z0-9_-]+=*$'
@ -1132,7 +1080,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# to be implemented in future that will replace this workaround (see
# https://github.com/ytdl-org/youtube-dl/issues/7468,
# https://github.com/ytdl-org/youtube-dl/pull/7599)
r';ytplayer\.config\s*=\s*({.+?});ytplayer',
r';ytplayer\.config\s*=\s*({.+?});\s*ytplayer',
r';ytplayer\.config\s*=\s*({.+?});',
)
config = self._search_regex(
@ -1473,14 +1421,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Get video info
video_info = {}
embed_webpage = None
if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
or re.search(r'player-age-gate-content">', video_webpage) is not None):
age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id}
# this can be viewed without login into Youtube
url = proto + '://www.youtube.com/embed/%s' % video_id
embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage')
data = compat_urllib_parse_urlencode({
'video_id': video_id,
'eurl': 'https://youtube.googleapis.com/v/' + video_id,
@ -1490,8 +1435,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
try:
video_info_webpage = self._download_webpage(
video_info_url, video_id,
note='Refetching age-gated info webpage',
errnote='unable to download video info webpage')
note='Downloading age-gated video info',
errnote='unable to download video info')
except ExtractorError:
video_info_webpage = None
if video_info_webpage:
@ -1522,9 +1467,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_response = extract_player_response(args.get('player_response'), video_id)
if not player_response:
player_response = extract_player_response(
self._search_regex(
self._search_regex((
# js-like syntax
r'(?:window(?:\["|\.)|var )ytInitialPlayerResponse(?:"])?\s*=\s*({.+?(?!\\)});(?:if \(ytcsi|var [a-zA-Z\_])',
r'(?:window(?:\["|\.)|var )ytInitialPlayerResponse(?:"])?\s*=\s*({.+?(?!\\)});',
video_webpage, 'ytInitialPlayerResponse', fatal=False), video_id)
), video_webpage, 'ytInitialPlayerResponse', fatal=False), video_id)
if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
add_dash_mpd_pr(player_response)
@ -1722,24 +1669,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
ASSETS_RE = r'"jsUrl":"(/s/player/.*?/player_ias.vflset/.*?/base.js)'
player_url = self._search_regex(
ASSETS_RE,
embed_webpage if age_gate else video_webpage, '', default=player_url)
ASSETS_RE, video_webpage, '', default=player_url)
if not player_url and not age_gate:
# We need the embed website after all
if embed_webpage is None:
embed_url = proto + '://www.youtube.com/embed/%s' % video_id
embed_webpage = self._download_webpage(
embed_url, video_id, 'Downloading embed webpage')
embed_url = proto + '://www.youtube.com/embed/%s' % video_id
embed_webpage = self._download_webpage(
embed_url, video_id, 'Downloading embed webpage')
player_url = self._search_regex(
ASSETS_RE, embed_webpage, 'JS player URL')
# if player_url is None:
# player_url_json = self._search_regex(
# r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
# video_webpage, 'age gate player URL')
# player_url = json.loads(player_url_json)
if 'sig' in url_data:
url += '&signature=' + url_data['sig'][0]
elif 's' in url_data:
@ -1871,11 +1810,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
or ', who has blocked it on copyright grounds' in error_desc
or 'It is not available in your country.' in error_desc
or ', who has blocked it in your country on copyright grounds.' in error_desc):
raise ExtractorError(
list_geoblocked_countres(
self._search_regex(
r'<meta itemprop="regionsAllowed" content="((?:(?:[A-Z]{2},)*[A-Z]{2})?)">',
video_webpage, 'allowed region list').split(',')),
raise GeoRestrictedError(
error_desc,
countries=self._search_regex(
r'<meta itemprop="regionsAllowed" content="((?:(?:[A-Z]{2},)*[A-Z]{2})?)">',
video_webpage, 'allowed region list').split(','),
expected=True)
if error_desc and 'Playback on other websites has been disabled' in error_desc:
raise ExtractorError(

View File

@ -5775,284 +5775,3 @@ def clean_podcast_url(url):
st\.fm # https://podsights.com/docs/
)/e
)/''', '', url)
# http://country.io/names.json
country_list = {
"BD": "Bangladesh",
"BE": "Belgium",
"BF": "Burkina Faso",
"BG": "Bulgaria",
"BA": "Bosnia and Herzegovina",
"BB": "Barbados",
"WF": "Wallis and Futuna",
"BL": "Saint Barthelemy",
"BM": "Bermuda",
"BN": "Brunei",
"BO": "Bolivia",
"BH": "Bahrain",
"BI": "Burundi",
"BJ": "Benin",
"BT": "Bhutan",
"JM": "Jamaica",
"BV": "Bouvet Island",
"BW": "Botswana",
"WS": "Samoa",
"BQ": "Bonaire, Saint Eustatius and Saba ",
"BR": "Brazil",
"BS": "Bahamas",
"JE": "Jersey",
"BY": "Belarus",
"BZ": "Belize",
"RU": "Russia",
"RW": "Rwanda",
"RS": "Serbia",
"TL": "East Timor",
"RE": "Reunion",
"TM": "Turkmenistan",
"TJ": "Tajikistan",
"RO": "Romania",
"TK": "Tokelau",
"GW": "Guinea-Bissau",
"GU": "Guam",
"GT": "Guatemala",
"GS": "South Georgia and the South Sandwich Islands",
"GR": "Greece",
"GQ": "Equatorial Guinea",
"GP": "Guadeloupe",
"JP": "Japan",
"GY": "Guyana",
"GG": "Guernsey",
"GF": "French Guiana",
"GE": "Georgia",
"GD": "Grenada",
"GB": "United Kingdom",
"GA": "Gabon",
"SV": "El Salvador",
"GN": "Guinea",
"GM": "Gambia",
"GL": "Greenland",
"GI": "Gibraltar",
"GH": "Ghana",
"OM": "Oman",
"TN": "Tunisia",
"JO": "Jordan",
"HR": "Croatia",
"HT": "Haiti",
"HU": "Hungary",
"HK": "Hong Kong",
"HN": "Honduras",
"HM": "Heard Island and McDonald Islands",
"VE": "Venezuela",
"PR": "Puerto Rico",
"PS": "Palestinian Territory",
"PW": "Palau",
"PT": "Portugal",
"SJ": "Svalbard and Jan Mayen",
"PY": "Paraguay",
"IQ": "Iraq",
"PA": "Panama",
"PF": "French Polynesia",
"PG": "Papua New Guinea",
"PE": "Peru",
"PK": "Pakistan",
"PH": "Philippines",
"PN": "Pitcairn",
"PL": "Poland",
"PM": "Saint Pierre and Miquelon",
"ZM": "Zambia",
"EH": "Western Sahara",
"EE": "Estonia",
"EG": "Egypt",
"ZA": "South Africa",
"EC": "Ecuador",
"IT": "Italy",
"VN": "Vietnam",
"SB": "Solomon Islands",
"ET": "Ethiopia",
"SO": "Somalia",
"ZW": "Zimbabwe",
"SA": "Saudi Arabia",
"ES": "Spain",
"ER": "Eritrea",
"ME": "Montenegro",
"MD": "Moldova",
"MG": "Madagascar",
"MF": "Saint Martin",
"MA": "Morocco",
"MC": "Monaco",
"UZ": "Uzbekistan",
"MM": "Myanmar",
"ML": "Mali",
"MO": "Macao",
"MN": "Mongolia",
"MH": "Marshall Islands",
"MK": "Macedonia",
"MU": "Mauritius",
"MT": "Malta",
"MW": "Malawi",
"MV": "Maldives",
"MQ": "Martinique",
"MP": "Northern Mariana Islands",
"MS": "Montserrat",
"MR": "Mauritania",
"IM": "Isle of Man",
"UG": "Uganda",
"TZ": "Tanzania",
"MY": "Malaysia",
"MX": "Mexico",
"IL": "Israel",
"FR": "France",
"IO": "British Indian Ocean Territory",
"SH": "Saint Helena",
"FI": "Finland",
"FJ": "Fiji",
"FK": "Falkland Islands",
"FM": "Micronesia",
"FO": "Faroe Islands",
"NI": "Nicaragua",
"NL": "Netherlands",
"NO": "Norway",
"NA": "Namibia",
"VU": "Vanuatu",
"NC": "New Caledonia",
"NE": "Niger",
"NF": "Norfolk Island",
"NG": "Nigeria",
"NZ": "New Zealand",
"NP": "Nepal",
"NR": "Nauru",
"NU": "Niue",
"CK": "Cook Islands",
"XK": "Kosovo",
"CI": "Ivory Coast",
"CH": "Switzerland",
"CO": "Colombia",
"CN": "China",
"CM": "Cameroon",
"CL": "Chile",
"CC": "Cocos Islands",
"CA": "Canada",
"CG": "Republic of the Congo",
"CF": "Central African Republic",
"CD": "Democratic Republic of the Congo",
"CZ": "Czech Republic",
"CY": "Cyprus",
"CX": "Christmas Island",
"CR": "Costa Rica",
"CW": "Curacao",
"CV": "Cape Verde",
"CU": "Cuba",
"SZ": "Swaziland",
"SY": "Syria",
"SX": "Sint Maarten",
"KG": "Kyrgyzstan",
"KE": "Kenya",
"SS": "South Sudan",
"SR": "Suriname",
"KI": "Kiribati",
"KH": "Cambodia",
"KN": "Saint Kitts and Nevis",
"KM": "Comoros",
"ST": "Sao Tome and Principe",
"SK": "Slovakia",
"KR": "South Korea",
"SI": "Slovenia",
"KP": "North Korea",
"KW": "Kuwait",
"SN": "Senegal",
"SM": "San Marino",
"SL": "Sierra Leone",
"SC": "Seychelles",
"KZ": "Kazakhstan",
"KY": "Cayman Islands",
"SG": "Singapore",
"SE": "Sweden",
"SD": "Sudan",
"DO": "Dominican Republic",
"DM": "Dominica",
"DJ": "Djibouti",
"DK": "Denmark",
"VG": "British Virgin Islands",
"DE": "Germany",
"YE": "Yemen",
"DZ": "Algeria",
"US": "United States",
"UY": "Uruguay",
"YT": "Mayotte",
"UM": "United States Minor Outlying Islands",
"LB": "Lebanon",
"LC": "Saint Lucia",
"LA": "Laos",
"TV": "Tuvalu",
"TW": "Taiwan",
"TT": "Trinidad and Tobago",
"TR": "Turkey",
"LK": "Sri Lanka",
"LI": "Liechtenstein",
"LV": "Latvia",
"TO": "Tonga",
"LT": "Lithuania",
"LU": "Luxembourg",
"LR": "Liberia",
"LS": "Lesotho",
"TH": "Thailand",
"TF": "French Southern Territories",
"TG": "Togo",
"TD": "Chad",
"TC": "Turks and Caicos Islands",
"LY": "Libya",
"VA": "Vatican",
"VC": "Saint Vincent and the Grenadines",
"AE": "United Arab Emirates",
"AD": "Andorra",
"AG": "Antigua and Barbuda",
"AF": "Afghanistan",
"AI": "Anguilla",
"VI": "U.S. Virgin Islands",
"IS": "Iceland",
"IR": "Iran",
"AM": "Armenia",
"AL": "Albania",
"AO": "Angola",
"AQ": "Antarctica",
"AS": "American Samoa",
"AR": "Argentina",
"AU": "Australia",
"AT": "Austria",
"AW": "Aruba",
"IN": "India",
"AX": "Aland Islands",
"AZ": "Azerbaijan",
"IE": "Ireland",
"ID": "Indonesia",
"UA": "Ukraine",
"QA": "Qatar",
"MZ": "Mozambique"
}
def list_countries():
return country_list.keys()
def list_geoblocked_countres(allowed_countries, reverse=False):
geoblocked = []
geounlocked = []
for country in list_countries():
if country in allowed_countries or (country not in allowed_countries and reverse is True):
geounlocked.append(country)
elif country not in allowed_countries or (country in allowed_countries and reverse is True):
geoblocked.append(country)
if len(geounlocked) == 0:
return 'This video is blocked in all countries'
if len(geoblocked) <= 10:
return 'This video is blocked in these countries: %s' % ', '.join(sorted(country_list[cnt] for cnt in geoblocked))
if len(geounlocked) <= 10:
geoblocked.sort()
return 'This video is only available in these countries: %s' % ', '.join(country_list[cnt] for cnt in geounlocked)
if len(geoblocked) >= len(geounlocked):
geounlocked.sort()
return 'This video is only available in these countries: %s' % ', '.join(geounlocked)
geoblocked.sort()
return 'This video is blocked in these countries: %s' % ', '.join(geoblocked)