Compare commits

...

35 commits

Author SHA1 Message Date
Lauren Liberda 2f375d447c fix/speedup ci 2021-09-09 12:38:11 +02:00
Lauren Liberda d464b29113 vider support 2021-09-06 22:34:06 +02:00
Lauren Liberda 19602fb3f5 [polskieradio] fix PR4 audition shit 2021-08-31 20:25:12 +02:00
Lauren Liberda a550e21b8c [ipla] state the DRM requirement clearly 2021-08-07 02:23:28 +02:00
Lauren Liberda 1ae67712e8 [ipla] error handling 2021-08-07 01:08:07 +02:00
Dominika Liberda a96bf110da * version 2021.08.01 2021-08-01 17:44:07 +02:00
Lauren Liberda 973652cf4d [youtube] fix age gate for *some* videos 2021-08-01 17:39:30 +02:00
Lauren Liberda d81137a604 [peertube] pt 3.3+ url scheme support, fix tests, minor fixes 2021-07-30 20:40:19 +02:00
Lauren Liberda a0d52ce5be [niconico] dmc downloader and other stuff from yt-dlp (as of 40078a5) 2021-06-26 14:40:02 +02:00
Dominika Liberda 81b5018d99 * version 2021.06.24.1 2021-06-24 14:01:25 +02:00
Dominika Liberda 31b7bf5bdb * fixes crash if signature decryption code isn't packed with artifacts 2021-06-24 13:58:36 +02:00
Dominika Liberda a0cb1b40a2 * fix in release script 2021-06-24 13:18:36 +02:00
Dominika Liberda c3e48f4934 * version 2021.06.24 2021-06-24 13:07:07 +02:00
Dominika Liberda ca6cbb6234 * fixes youtube list extractor 2021-06-24 12:27:39 +02:00
Lauren Liberda 7858dc7b9f fix app crash/tests 2021-06-22 03:17:30 +02:00
Lauren Liberda 2234b1100c [liveleak] remove for real 2021-06-22 03:02:52 +02:00
Lauren Liberda 75442522b2 [soundcloud] prerelease client id fetching 2021-06-22 02:43:50 +02:00
Lauren Liberda f4070e6fe4 prerelease artifact generator, for youtube sig 2021-06-21 23:01:02 +02:00
Lauren Liberda b30cd7afbb [liveleak] remove extractor 2021-06-21 20:43:52 +02:00
Lauren Liberda 29389b4935 [pornhub] Add support for pornhubthbh7ap3u.onion
Original author: dstftw <dstftw@gmail.com>
2021-06-21 20:26:48 +02:00
Sergey M․ 3fc2d04e08 [pornhub] Detect geo restriction 2021-06-21 20:22:14 +02:00
Sergey M․ 30a3fb457e [pornhub] Dismiss tbr extracted from download URLs (closes #28927)
No longer reliable
2021-06-21 20:22:07 +02:00
Sergey M․ 69813b6be8 [curiositystream:collection] Extend _VALID_URL (closes #26326, closes #29117)
2021-06-21 20:22:00 +02:00
Tianyi Shi f1a365faf8 [bilibili] Strip uploader name (#29202) 2021-06-21 20:21:17 +02:00
Logan B 86c90f7d47 [umg:de] Update GraphQL API URL (#29304)
Previous one no longer resolves

Co-authored-by: Sergey M. <dstftw@gmail.com>
2021-06-21 20:20:56 +02:00
Sergey M․ a33a92ba4b [nrk] Switch psapi URL to https (closes #29344)
Catalog calls no longer work via http
2021-06-21 20:20:49 +02:00
kikuyan 6057163d97 [postprocessor/ffmpeg] Show ffmpeg output on error (refs #22680) (#29336) 2021-06-21 20:20:43 +02:00
kikuyan aad8936157 [egghead] Add support for app.egghead.io (closes #28404) (#29303)
Co-authored-by: Sergey M. <dstftw@gmail.com>
2021-06-21 20:20:36 +02:00
kikuyan 18dd355e39 [appleconnect] Fix extraction (#29208) 2021-06-21 20:20:29 +02:00
kikuyan e628fc3794 [orf:tvthek] Add support for MPD formats (closes #28672) (#29236) 2021-06-21 20:20:18 +02:00
Sergey M․ ac99e96a1e [facebook] Improve login required detection 2021-06-21 20:19:41 +02:00
Sergey M․ 93131809f2 [youporn] Fix formats and view count extraction (closes #29216) 2021-06-21 20:19:35 +02:00
Sergey M․ 9cced7b3d2 [orf:tvthek] Fix thumbnails extraction (closes #29217) 2021-06-21 20:19:28 +02:00
Remita Amine b526b67bc1 [formula1] fix extraction(closes #29206) 2021-06-21 20:19:20 +02:00
Lauren Liberda e676b759d1 [youtube] fix the fancy georestricted error 2021-06-20 23:00:58 +02:00
31 changed files with 819 additions and 721 deletions

.gitignore (vendored)

@@ -15,6 +15,7 @@ haruhi-dl.1
 haruhi-dl.bash-completion
 haruhi-dl.fish
 haruhi_dl/extractor/lazy_extractors.py
+haruhi_dl/extractor_artifacts/
 haruhi-dl
 haruhi-dl.exe
 haruhi-dl.tar.gz

.gitlab-ci.yml

@@ -1,5 +1,6 @@
 default:
   before_script:
+    - sed -i "s@dl-cdn.alpinelinux.org@alpine.sakamoto.pl@g" /etc/apk/repositories
     - apk add bash
     - pip install nose

ChangeLog

@@ -1,3 +1,9 @@
+version 2021.08.01
+Extractor
+* [youtube] fixed agegate
+* [niconico] dmc downloader from youtube-dlp
+* [peertube] new URL schemas
+
 version 2021.06.20
 Core
 * [playwright] fixed headlessness

devscripts/prerelease_codegen.py

@@ -0,0 +1,32 @@
+# this is intended to speed-up some extractors,
+# which sometimes need to extract some data that doesn't change very much often,
+# but it does on random times, like youtube's signature "crypto" or soundcloud's client id
+
+import os
+from os.path import dirname as dirn
+import sys
+
+sys.path.insert(0, dirn(dirn((os.path.abspath(__file__)))))
+from haruhi_dl import HaruhiDL
+from haruhi_dl.utils import (
+    ExtractorError,
+)
+
+hdl = HaruhiDL(params={
+    'quiet': True,
+})
+
+artifact_dir = os.path.join(dirn(dirn((os.path.abspath(__file__)))), 'haruhi_dl', 'extractor_artifacts')
+
+if not os.path.exists(artifact_dir):
+    os.mkdir(artifact_dir)
+
+for ie_name in (
+    'Youtube',
+    'Soundcloud',
+):
+    ie = hdl.get_info_extractor(ie_name)
+    try:
+        file_contents = ie._generate_prerelease_file()
+        with open(os.path.join(artifact_dir, ie_name.lower() + '.py'), 'w') as file:
+            file.write(file_contents)
+    except ExtractorError as err:
+        print(err)

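The script above writes one module per extractor into haruhi_dl/extractor_artifacts/. A minimal sketch of how such an artifact might be consumed at runtime (the helper name and the fallback behaviour are assumptions, not taken from this diff):

import importlib

def load_prerelease_artifact(ie_name):
    # e.g. haruhi_dl/extractor_artifacts/youtube.py, written by the script above
    try:
        return importlib.import_module(
            'haruhi_dl.extractor_artifacts.%s' % ie_name.lower())
    except ImportError:
        # no artifact packaged: the extractor would fall back to the slow path,
        # i.e. extracting the signature code / client id at request time
        return None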
devscripts/release.sh

@@ -1,141 +1,24 @@
 #!/bin/bash
-# IMPORTANT: the following assumptions are made
-# * the GH repo is on the origin remote
-# * the gh-pages branch is named so locally
-# * the git config user.signingkey is properly set
-# You will need
-# pip install coverage nose rsa wheel
-# TODO
-# release notes
-# make hash on local files
-
-set -e
-
-skip_tests=true
-gpg_sign_commits=""
-buildserver='localhost:8142'
-
-while true
-do
-case "$1" in
-    --run-tests)
-        skip_tests=false
-        shift
-    ;;
-    --gpg-sign-commits|-S)
-        gpg_sign_commits="-S"
-        shift
-    ;;
-    --buildserver)
-        buildserver="$2"
-        shift 2
-    ;;
-    --*)
-        echo "ERROR: unknown option $1"
-        exit 1
-    ;;
-    *)
-        break
-    ;;
-esac
-done
-
-if [ -z "$1" ]; then echo "ERROR: specify version number like this: $0 1994.09.06"; exit 1; fi
-version="$1"
-major_version=$(echo "$version" | sed -n 's#^\([0-9]*\.[0-9]*\.[0-9]*\).*#\1#p')
-if test "$major_version" '!=' "$(date '+%Y.%m.%d')"; then
-    echo "$version does not start with today's date!"
-    exit 1
-fi
-
-if [ ! -z "`git tag | grep "$version"`" ]; then echo 'ERROR: version already present'; exit 1; fi
-if [ ! -z "`git status --porcelain | grep -v CHANGELOG`" ]; then echo 'ERROR: the working directory is not clean; commit or stash changes'; exit 1; fi
-useless_files=$(find haruhi_dl -type f -not -name '*.py')
-if [ ! -z "$useless_files" ]; then echo "ERROR: Non-.py files in haruhi_dl: $useless_files"; exit 1; fi
-if [ ! -f "updates_key.pem" ]; then echo 'ERROR: updates_key.pem missing'; exit 1; fi
-if ! type pandoc >/dev/null 2>/dev/null; then echo 'ERROR: pandoc is missing'; exit 1; fi
-if ! python3 -c 'import rsa' 2>/dev/null; then echo 'ERROR: python3-rsa is missing'; exit 1; fi
-if ! python3 -c 'import wheel' 2>/dev/null; then echo 'ERROR: wheel is missing'; exit 1; fi
-read -p "Is ChangeLog up to date? (y/n) " -n 1
-if [[ ! $REPLY =~ ^[Yy]$ ]]; then exit 1; fi
-
-/bin/echo -e "\n### First of all, testing..."
-make clean
-if $skip_tests ; then
-    echo 'SKIPPING TESTS'
-else
-    nosetests --verbose --with-coverage --cover-package=haruhi_dl --cover-html test --stop || exit 1
-fi
-
-/bin/echo -e "\n### Changing version in version.py..."
-sed -i "s/__version__ = '.*'/__version__ = '$version'/" haruhi_dl/version.py
-
-/bin/echo -e "\n### Changing version in ChangeLog..."
-sed -i "s/<unreleased>/$version/" ChangeLog
-
-/bin/echo -e "\n### Committing documentation, templates and haruhi_dl/version.py..."
-make README.md CONTRIBUTING.md issuetemplates supportedsites
-git add README.md CONTRIBUTING.md .github/ISSUE_TEMPLATE/1_broken_site.md .github/ISSUE_TEMPLATE/2_site_support_request.md .github/ISSUE_TEMPLATE/3_site_feature_request.md .github/ISSUE_TEMPLATE/4_bug_report.md .github/ISSUE_TEMPLATE/5_feature_request.md .github/ISSUE_TEMPLATE/6_question.md docs/supportedsites.md haruhi_dl/version.py ChangeLog devscripts/wine-py2exe.sh setup.py
-git commit $gpg_sign_commits -m "release $version"
-
-/bin/echo -e "\n### Now tagging, signing and pushing..."
-git tag -s -m "Release $version" "$version"
-git show "$version"
-read -p "Is it good, can I push? (y/n) " -n 1
-if [[ ! $REPLY =~ ^[Yy]$ ]]; then exit 1; fi
-echo
-MASTER=$(git rev-parse --abbrev-ref HEAD)
-git push origin $MASTER:master
-git push origin "$version"
-
-/bin/echo -e "\n### OK, now it is time to build the binaries..."
-REV=$(git rev-parse HEAD)
-make haruhi-dl haruhi-dl.tar.gz
-read -p "VM running? (y/n) " -n 1
-wget "http://$buildserver/build/ytdl-org/haruhi-dl/haruhi-dl.exe?rev=$REV" -O haruhi-dl.exe
-mkdir -p "build/$version"
-mv haruhi-dl haruhi-dl.exe "build/$version"
-mv haruhi-dl.tar.gz "build/$version/haruhi-dl-$version.tar.gz"
-RELEASE_FILES="haruhi-dl haruhi-dl.exe haruhi-dl-$version.tar.gz"
-(cd build/$version/ && md5sum $RELEASE_FILES > MD5SUMS)
-(cd build/$version/ && sha1sum $RELEASE_FILES > SHA1SUMS)
-(cd build/$version/ && sha256sum $RELEASE_FILES > SHA2-256SUMS)
-(cd build/$version/ && sha512sum $RELEASE_FILES > SHA2-512SUMS)
-
-/bin/echo -e "\n### Signing and uploading the new binaries to GitHub..."
-for f in $RELEASE_FILES; do gpg --passphrase-repeat 5 --detach-sig "build/$version/$f"; done
-ROOT=$(pwd)
-python devscripts/create-github-release.py ChangeLog $version "$ROOT/build/$version"
-
-#ssh ytdl@yt-dl.org "sh html/update_latest.sh $version"
-
-/bin/echo -e "\n### Now switching to gh-pages..."
-git clone --branch gh-pages --single-branch . build/gh-pages
-(
-    set -e
-    ORIGIN_URL=$(git config --get remote.origin.url)
-    cd build/gh-pages
-    "$ROOT/devscripts/gh-pages/add-version.py" $version
-    "$ROOT/devscripts/gh-pages/update-feed.py"
-    "$ROOT/devscripts/gh-pages/sign-versions.py" < "$ROOT/updates_key.pem"
-    "$ROOT/devscripts/gh-pages/generate-download.py"
-    "$ROOT/devscripts/gh-pages/update-copyright.py"
-    "$ROOT/devscripts/gh-pages/update-sites.py"
-    git add *.html *.html.in update
-    git commit $gpg_sign_commits -m "release $version"
-    git push "$ROOT" gh-pages
-    git push "$ORIGIN_URL" gh-pages
-)
-rm -rf build
-
-make pypi-files
-echo "Uploading to PyPi ..."
-python setup.py sdist bdist_wheel upload
-make clean
-
-/bin/echo -e "\n### DONE!"
+if [[ "$(basename $(pwd))" == 'devscripts' ]]; then
+    cd ..
+fi
+
+v="$(date "+%Y.%m.%d")"
+
+if [[ "$(grep "'$v" haruhi_dl/version.py)" != '' ]]; then #' is this the first release of the day?
+    if [[ "$(grep -Poh '[0-9]{4}\.[0-9]{2}\.[0-9]{2}\.[0-9]' haruhi_dl/version.py)" != '' ]]; then # so, 2nd or nth?
+        v="$v.$(($(cat haruhi_dl/version.py | grep -Poh '[0-9]{4}\.[0-9]{2}\.[0-9]{2}\.[0-9]' | grep -Poh '[0-9]+$')+1))"
+    else
+        v="$v.1"
+    fi
+fi
+
+sed "s/__version__ = '.*'/__version__ = '$v'/g" -i haruhi_dl/version.py
+
+python3 setup.py build_lazy_extractors
+python3 devscripts/prerelease_codegen.py
+rm -R build dist
+python3 setup.py sdist bdist_wheel
+python3 -m twine upload dist/*

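The rewritten script derives the version from the date and appends a counter for same-day re-releases. A minimal Python rendering of that logic, for illustration only (the function name is ours):

import datetime
import re

def next_version(version_py):
    """First release of the day is YYYY.MM.DD; later ones get .1, .2, ..."""
    today = datetime.date.today().strftime('%Y.%m.%d')
    if "'" + today not in version_py:   # nothing released today yet
        return today
    m = re.search(r'\d{4}\.\d{2}\.\d{2}\.(\d+)', version_py)
    if m:                               # already a 2nd-or-later release today
        return '%s.%d' % (today, int(m.group(1)) + 1)
    return today + '.1'                 # 2nd release of the day

# e.g. on 2021-06-24, next_version("__version__ = '2021.06.24'") == '2021.06.24.1'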
haruhi_dl/downloader/__init__.py

@@ -1,5 +1,18 @@
 from __future__ import unicode_literals
 
+from ..utils import (
+    determine_protocol,
+)
+
+
+def _get_real_downloader(info_dict, protocol=None, *args, **kwargs):
+    info_copy = info_dict.copy()
+    if protocol:
+        info_copy['protocol'] = protocol
+    return get_suitable_downloader(info_copy, *args, **kwargs)
+
+
+# Some of these require _get_real_downloader
 from .common import FileDownloader
 from .f4m import F4mFD
 from .hls import HlsFD
@@ -8,16 +21,13 @@ from .rtmp import RtmpFD
 from .dash import DashSegmentsFD
 from .rtsp import RtspFD
 from .ism import IsmFD
+from .niconico import NiconicoDmcFD
 from .external import (
     get_external_downloader,
     Aria2cFD,
     FFmpegFD,
 )
-from ..utils import (
-    determine_protocol,
-)
 
 PROTOCOL_MAP = {
     'rtmp': RtmpFD,
     'm3u8_native': HlsFD,
@@ -28,6 +38,7 @@ PROTOCOL_MAP = {
     'http_dash_segments': DashSegmentsFD,
     'ism': IsmFD,
     'bittorrent': Aria2cFD,
+    'niconico_dmc': NiconicoDmcFD,
 }

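PROTOCOL_MAP now routes the synthetic 'niconico_dmc' protocol to the new downloader, and _get_real_downloader lets a wrapper downloader look up whichever FD would handle the inner protocol. A toy, self-contained rendering of that dispatch pattern (classes and map contents reduced to the bare minimum):

class HttpFD: ...
class NiconicoDmcFD: ...

PROTOCOL_MAP = {'niconico_dmc': NiconicoDmcFD}

def get_suitable_downloader(info_dict):
    # fall back to plain HTTP when no protocol-specific FD is registered
    return PROTOCOL_MAP.get(info_dict.get('protocol'), HttpFD)

def _get_real_downloader(info_dict, protocol=None):
    # same idea as the helper above: rewrite the protocol, then dispatch
    info_copy = dict(info_dict)
    if protocol:
        info_copy['protocol'] = protocol
    return get_suitable_downloader(info_copy)

# the wrapper asks for the downloader of the *inner* protocol:
assert _get_real_downloader({'protocol': 'niconico_dmc'}, protocol='http') is HttpFD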
haruhi_dl/downloader/niconico.py

@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import threading
+
+from .common import FileDownloader
+from ..downloader import _get_real_downloader
+from ..extractor.niconico import NiconicoIE
+from ..compat import compat_urllib_request
+
+
+class NiconicoDmcFD(FileDownloader):
+    """ Downloading niconico douga from DMC with heartbeat """
+
+    FD_NAME = 'niconico_dmc'
+
+    def real_download(self, filename, info_dict):
+        self.to_screen('[%s] Downloading from DMC' % self.FD_NAME)
+
+        ie = NiconicoIE(self.hdl)
+        info_dict, heartbeat_info_dict = ie._get_heartbeat_info(info_dict)
+
+        fd = _get_real_downloader(info_dict, params=self.params)(self.hdl, self.params)
+
+        success = download_complete = False
+        timer = [None]
+        heartbeat_lock = threading.Lock()
+        heartbeat_url = heartbeat_info_dict['url']
+        heartbeat_data = heartbeat_info_dict['data'].encode()
+        heartbeat_interval = heartbeat_info_dict.get('interval', 30)
+
+        def heartbeat():
+            try:
+                compat_urllib_request.urlopen(url=heartbeat_url, data=heartbeat_data)
+            except Exception:
+                self.to_screen('[%s] Heartbeat failed' % self.FD_NAME)
+
+            with heartbeat_lock:
+                if not download_complete:
+                    timer[0] = threading.Timer(heartbeat_interval, heartbeat)
+                    timer[0].start()
+
+        heartbeat_info_dict['ping']()
+        self.to_screen('[%s] Heartbeat with %d second interval ...' % (self.FD_NAME, heartbeat_interval))
+
+        try:
+            heartbeat()
+            if type(fd).__name__ == 'HlsFD':
+                info_dict.update(ie._extract_m3u8_formats(info_dict['url'], info_dict['id'])[0])
+            success = fd.real_download(filename, info_dict)
+        finally:
+            if heartbeat_lock:
+                with heartbeat_lock:
+                    timer[0].cancel()
+                    download_complete = True
+
+        return success

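The heartbeat above is a self-rearming threading.Timer guarded by a lock, so a late timer cannot fire after the download finishes. Stripped of the niconico specifics, the pattern reduces to this sketch (all names here are ours):

import threading
import time

def start_heartbeat(ping, interval):
    """Call ping(), then keep re-arming a timer until stop() is invoked."""
    lock = threading.Lock()
    state = {'stopped': False, 'timer': None}

    def beat():
        ping()
        with lock:
            if not state['stopped']:
                state['timer'] = threading.Timer(interval, beat)
                state['timer'].start()

    def stop():
        with lock:
            state['stopped'] = True
            if state['timer'] is not None:
                state['timer'].cancel()

    beat()
    return stop

stop = start_heartbeat(lambda: print('ping'), 0.1)
time.sleep(0.35)  # roughly three or four pings
stop()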
haruhi_dl/extractor/appleconnect.py

@@ -9,10 +9,10 @@ from ..utils import (
 
 class AppleConnectIE(InfoExtractor):
-    _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)'
-    _TEST = {
+    _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/(?:id)?sa\.(?P<id>[\w-]+)'
+    _TESTS = [{
         'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
-        'md5': 'e7c38568a01ea45402570e6029206723',
+        'md5': 'c1d41f72c8bcaf222e089434619316e4',
         'info_dict': {
             'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
             'ext': 'm4v',
@@ -22,7 +22,10 @@ class AppleConnectIE(InfoExtractor):
             'upload_date': '20150710',
             'timestamp': 1436545535,
         },
-    }
+    }, {
+        'url': 'https://itunes.apple.com/us/post/sa.0fe0229f-2457-11e5-9f40-1bb645f2d5d9',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -36,7 +39,7 @@ class AppleConnectIE(InfoExtractor):
         video_data = self._parse_json(video_json, video_id)
 
         timestamp = str_to_int(self._html_search_regex(r'data-timestamp="(\d+)"', webpage, 'timestamp'))
-        like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count'))
+        like_count = str_to_int(self._html_search_regex(r'(\d+) Loves', webpage, 'like count', default=None))
 
         return {
             'id': video_id,

haruhi_dl/extractor/bilibili.py

@@ -233,7 +233,7 @@ class BiliBiliIE(InfoExtractor):
             webpage)
         if uploader_mobj:
             info.update({
-                'uploader': uploader_mobj.group('name'),
+                'uploader': uploader_mobj.group('name').strip(),
                 'uploader_id': uploader_mobj.group('id'),
             })
         if not info.get('uploader'):

haruhi_dl/extractor/curiositystream.py

@@ -145,7 +145,7 @@ class CuriosityStreamIE(CuriosityStreamBaseIE):
 
 class CuriosityStreamCollectionIE(CuriosityStreamBaseIE):
     IE_NAME = 'curiositystream:collection'
-    _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collection|series)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collections?|series)/(?P<id>\d+)'
     _TESTS = [{
         'url': 'https://app.curiositystream.com/collection/2',
         'info_dict': {
@@ -157,6 +157,9 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE):
     }, {
         'url': 'https://curiositystream.com/series/2',
         'only_matching': True,
+    }, {
+        'url': 'https://curiositystream.com/collections/36',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):

haruhi_dl/extractor/egghead.py

@@ -22,16 +22,19 @@ class EggheadBaseIE(InfoExtractor):
 
 class EggheadCourseIE(EggheadBaseIE):
     IE_DESC = 'egghead.io course'
     IE_NAME = 'egghead:course'
-    _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)'
-    _TEST = {
+    _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:course|playlist)s/(?P<id>[^/?#&]+)'
+    _TESTS = [{
         'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript',
         'playlist_count': 29,
         'info_dict': {
-            'id': '72',
+            'id': '432655',
             'title': 'Professor Frisby Introduces Composable Functional JavaScript',
             'description': 're:(?s)^This course teaches the ubiquitous.*You\'ll start composing functionality before you know it.$',
         },
-    }
+    }, {
+        'url': 'https://app.egghead.io/playlists/professor-frisby-introduces-composable-functional-javascript',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         playlist_id = self._match_id(url)
@@ -65,7 +68,7 @@ class EggheadCourseIE(EggheadBaseIE):
 class EggheadLessonIE(EggheadBaseIE):
     IE_DESC = 'egghead.io lesson'
     IE_NAME = 'egghead:lesson'
-    _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)'
+    _VALID_URL = r'https://(?:app\.)?egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)'
     _TESTS = [{
         'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
         'info_dict': {
@@ -88,6 +91,9 @@ class EggheadLessonIE(EggheadBaseIE):
     }, {
         'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application',
         'only_matching': True,
+    }, {
+        'url': 'https://app.egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):

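The loosened patterns above can be sanity-checked in isolation; a quick standalone script (not part of the change) exercising the new course/playlist regex:

import re

_VALID_URL = r'https://(?:app\.)?egghead\.io/(?:course|playlist)s/(?P<id>[^/?#&]+)'

for url in (
    'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript',
    'https://app.egghead.io/playlists/professor-frisby-introduces-composable-functional-javascript',
):
    m = re.match(_VALID_URL, url)
    assert m is not None
    print(m.group('id'))  # same slug for both URL shapes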
haruhi_dl/extractor/extractors.py

@@ -643,10 +643,6 @@ from .linkedin import (
 from .linuxacademy import LinuxAcademyIE
 from .litv import LiTVIE
 from .livejournal import LiveJournalIE
-from .liveleak import (
-    LiveLeakIE,
-    LiveLeakEmbedIE,
-)
 from .livestream import (
     LivestreamIE,
     LivestreamOriginalIE,
@@ -1517,6 +1513,7 @@ from .videomore import (
 )
 from .videopress import VideoPressIE
 from .videotarget import VideoTargetIE
+from .vider import ViderIE
 from .vidio import VidioIE
 from .vidlii import VidLiiIE
 from .vidme import (

haruhi_dl/extractor/facebook.py

@@ -521,7 +521,10 @@ class FacebookIE(InfoExtractor):
             raise ExtractorError(
                 'The video is not available, Facebook said: "%s"' % m_msg.group(1),
                 expected=True)
-        elif '>You must log in to continue' in webpage:
+        elif any(p in webpage for p in (
+                '>You must log in to continue',
+                'id="login_form"',
+                'id="loginbutton"')):
             self.raise_login_required()
 
         if not video_data and '/watchparty/' in url:

haruhi_dl/extractor/formula1.py

@@ -5,29 +5,23 @@ from .common import InfoExtractor
 
 class Formula1IE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?formula1\.com/(?:content/fom-website/)?en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html'
-    _TESTS = [{
-        'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html',
-        'md5': '8c79e54be72078b26b89e0e111c0502b',
+    _VALID_URL = r'https?://(?:www\.)?formula1\.com/en/latest/video\.[^.]+\.(?P<id>\d+)\.html'
+    _TEST = {
+        'url': 'https://www.formula1.com/en/latest/video.race-highlights-spain-2016.6060988138001.html',
+        'md5': 'be7d3a8c2f804eb2ab2aa5d941c359f8',
         'info_dict': {
-            'id': 'JvYXJpMzE6pArfHWm5ARp5AiUmD-gibV',
+            'id': '6060988138001',
             'ext': 'mp4',
             'title': 'Race highlights - Spain 2016',
+            'timestamp': 1463332814,
+            'upload_date': '20160515',
+            'uploader_id': '6057949432001',
         },
-        'params': {
-            # m3u8 download
-            'skip_download': True,
-        },
-        'add_ie': ['Ooyala'],
-    }, {
-        'url': 'http://www.formula1.com/en/video/2016/5/Race_highlights_-_Spain_2016.html',
-        'only_matching': True,
-    }]
+        'add_ie': ['BrightcoveNew'],
+    }
+    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/6057949432001/S1WMrhjlh_default/index.html?videoId=%s'
 
     def _real_extract(self, url):
-        display_id = self._match_id(url)
-        webpage = self._download_webpage(url, display_id)
-        ooyala_embed_code = self._search_regex(
-            r'data-videoid="([^"]+)"', webpage, 'ooyala embed code')
+        bc_id = self._match_id(url)
         return self.url_result(
-            'ooyala:%s' % ooyala_embed_code, 'Ooyala', ooyala_embed_code)
+            self.BRIGHTCOVE_URL_TEMPLATE % bc_id, 'BrightcoveNew', bc_id)

haruhi_dl/extractor/generic.py

@@ -84,7 +84,6 @@ from .jwplatform import JWPlatformIE
 from .digiteka import DigitekaIE
 from .arkena import ArkenaIE
 from .instagram import InstagramIE
-from .liveleak import LiveLeakIE
 from .threeqsdn import ThreeQSDNIE
 from .theplatform import ThePlatformIE
 from .kaltura import KalturaIE
@@ -1640,34 +1639,6 @@ class GenericIE(InfoExtractor):
                 'upload_date': '20160409',
             },
         },
-        # LiveLeak embed
-        {
-            'url': 'http://www.wykop.pl/link/3088787/',
-            'md5': '7619da8c820e835bef21a1efa2a0fc71',
-            'info_dict': {
-                'id': '874_1459135191',
-                'ext': 'mp4',
-                'title': 'Man shows poor quality of new apartment building',
-                'description': 'The wall is like a sand pile.',
-                'uploader': 'Lake8737',
-            },
-            'add_ie': [LiveLeakIE.ie_key()],
-            'params': {
-                'force_generic_extractor': True,
-            },
-        },
-        # Another LiveLeak embed pattern (#13336)
-        {
-            'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/',
-            'info_dict': {
-                'id': '2eb_1496309988',
-                'ext': 'mp4',
-                'title': 'Thief robs place where everyone was armed',
-                'description': 'md5:694d73ee79e535953cf2488562288eee',
-                'uploader': 'brazilwtf',
-            },
-            'add_ie': [LiveLeakIE.ie_key()],
-        },
         # Duplicated embedded video URLs
         {
             'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443',
@@ -2744,7 +2715,6 @@ class GenericIE(InfoExtractor):
             SoundcloudEmbedIE,
             TuneInBaseIE,
             JWPlatformIE,
-            LiveLeakIE,
             DBTVIE,
             VideaIE,
             TwentyMinutenIE,

haruhi_dl/extractor/ipla.py

@@ -8,6 +8,7 @@ from .common import InfoExtractor
 from ..utils import (
     int_or_none,
     url_or_none,
+    ExtractorError,
 )
 
@@ -79,7 +80,11 @@ class IplaIE(InfoExtractor):
             'Content-type': 'application/json'
         }
-        res = self._download_json('http://b2c-mobile.redefine.pl/rpc/navigation/', media_id, data=req, headers=headers)
+        res = self._download_json('https://b2c-mobile.redefine.pl/rpc/navigation/', media_id, data=req, headers=headers)
+        if not res.get('result'):
+            if res['error']['code'] == 13404:
+                raise ExtractorError('Video requires DRM protection', expected=True)
+            raise ExtractorError(f"Ipla said: {res['error']['message']} - {res['error']['data']['userMessage']}")
         return res['result']['mediaItem']
 
     def get_url(self, media_id, source_id):
@@ -93,4 +98,6 @@ class IplaIE(InfoExtractor):
         }
         res = self._download_json('https://b2c-mobile.redefine.pl/rpc/drm/', media_id, data=req, headers=headers)
+        if not res.get('result'):
+            raise ExtractorError(f"Ipla said: {res['error']['message']} - {res['error']['data']['userMessage']}")
         return res['result']['url']

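Both RPC calls now fail fast on the same response contract: a missing 'result' key means the 'error' object explains why. A condensed standalone version of the check (the sample payload is illustrative; only the keys the diff actually reads are assumed):

def check_rpc(res):
    if not res.get('result'):
        if res['error']['code'] == 13404:
            raise ValueError('Video requires DRM protection')
        raise ValueError('Ipla said: %s - %s' % (
            res['error']['message'], res['error']['data']['userMessage']))
    return res['result']

sample = {'error': {'code': 13404, 'message': 'drm', 'data': {'userMessage': '...'}}}
try:
    check_rpc(sample)
except ValueError as err:
    print(err)  # Video requires DRM protection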
haruhi_dl/extractor/liveleak.py (deleted)

@@ -1,191 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import int_or_none
-
-
-class LiveLeakIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?.*?\b[it]=(?P<id>[\w_]+)'
-    _TESTS = [{
-        'url': 'http://www.liveleak.com/view?i=757_1364311680',
-        'md5': '0813c2430bea7a46bf13acf3406992f4',
-        'info_dict': {
-            'id': '757_1364311680',
-            'ext': 'mp4',
-            'description': 'extremely bad day for this guy..!',
-            'uploader': 'ljfriel2',
-            'title': 'Most unlucky car accident',
-            'thumbnail': r're:^https?://.*\.jpg$'
-        }
-    }, {
-        'url': 'http://www.liveleak.com/view?i=f93_1390833151',
-        'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf',
-        'info_dict': {
-            'id': 'f93_1390833151',
-            'ext': 'mp4',
-            'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.',
-            'uploader': 'ARD_Stinkt',
-            'title': 'German Television does first Edward Snowden Interview (ENGLISH)',
-            'thumbnail': r're:^https?://.*\.jpg$'
-        }
-    }, {
-        # Prochan embed
-        'url': 'http://www.liveleak.com/view?i=4f7_1392687779',
-        'md5': '42c6d97d54f1db107958760788c5f48f',
-        'info_dict': {
-            'id': '4f7_1392687779',
-            'ext': 'mp4',
-            'description': "The guy with the cigarette seems amazingly nonchalant about the whole thing... I really hope my friends' reactions would be a bit stronger.\r\n\r\nAction-go to 0:55.",
-            'uploader': 'CapObveus',
-            'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
-            'age_limit': 18,
-        },
-        'skip': 'Video is dead',
-    }, {
-        # Covers https://github.com/ytdl-org/youtube-dl/pull/5983
-        # Multiple resolutions
-        'url': 'http://www.liveleak.com/view?i=801_1409392012',
-        'md5': 'c3a449dbaca5c0d1825caecd52a57d7b',
-        'info_dict': {
-            'id': '801_1409392012',
-            'ext': 'mp4',
-            'description': 'Happened on 27.7.2014. \r\nAt 0:53 you can see people still swimming at near beach.',
-            'uploader': 'bony333',
-            'title': 'Crazy Hungarian tourist films close call waterspout in Croatia',
-            'thumbnail': r're:^https?://.*\.jpg$'
-        }
-    }, {
-        # Covers https://github.com/ytdl-org/youtube-dl/pull/10664#issuecomment-247439521
-        'url': 'http://m.liveleak.com/view?i=763_1473349649',
-        'add_ie': ['Youtube'],
-        'info_dict': {
-            'id': '763_1473349649',
-            'ext': 'mp4',
-            'title': 'Reporters and public officials ignore epidemic of black on asian violence in Sacramento | Colin Flaherty',
-            'description': 'Colin being the warrior he is and showing the injustice Asians in Sacramento are being subjected to.',
-            'uploader': 'Ziz',
-            'upload_date': '20160908',
-            'uploader_id': 'UCEbta5E_jqlZmEJsriTEtnw'
-        },
-        'params': {
-            'skip_download': True,
-        },
-    }, {
-        'url': 'https://www.liveleak.com/view?i=677_1439397581',
-        'info_dict': {
-            'id': '677_1439397581',
-            'title': 'Fuel Depot in China Explosion caught on video',
-        },
-        'playlist_count': 3,
-    }, {
-        'url': 'https://www.liveleak.com/view?t=HvHi_1523016227',
-        'only_matching': True,
-    }, {
-        # No original video
-        'url': 'https://www.liveleak.com/view?t=C26ZZ_1558612804',
-        'only_matching': True,
-    }]
-
-    @staticmethod
-    def _extract_urls(webpage, **kwargs):
-        return re.findall(
-            r'<iframe[^>]+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[ift]=[\w_]+[^"]+)"',
-            webpage)
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
-        video_description = self._og_search_description(webpage)
-        video_uploader = self._html_search_regex(
-            r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False)
-        age_limit = int_or_none(self._search_regex(
-            r'you confirm that you are ([0-9]+) years and over.',
-            webpage, 'age limit', default=None))
-        video_thumbnail = self._og_search_thumbnail(webpage)
-
-        entries = self._parse_html5_media_entries(url, webpage, video_id)
-        if not entries:
-            # Maybe an embed?
-            embed_url = self._search_regex(
-                r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"',
-                webpage, 'embed URL')
-            return {
-                '_type': 'url_transparent',
-                'url': embed_url,
-                'id': video_id,
-                'title': video_title,
-                'description': video_description,
-                'uploader': video_uploader,
-                'age_limit': age_limit,
-            }
-
-        for idx, info_dict in enumerate(entries):
-            formats = []
-            for a_format in info_dict['formats']:
-                if not a_format.get('height'):
-                    a_format['height'] = int_or_none(self._search_regex(
-                        r'([0-9]+)p\.mp4', a_format['url'], 'height label',
-                        default=None))
-                formats.append(a_format)
-
-                # Removing '.*.mp4' gives the raw video, which is essentially
-                # the same video without the LiveLeak logo at the top (see
-                # https://github.com/ytdl-org/youtube-dl/pull/4768)
-                orig_url = re.sub(r'\.mp4\.[^.]+', '', a_format['url'])
-                if a_format['url'] != orig_url:
-                    format_id = a_format.get('format_id')
-                    format_id = 'original' + ('-' + format_id if format_id else '')
-                    if self._is_valid_url(orig_url, video_id, format_id):
-                        formats.append({
-                            'format_id': format_id,
-                            'url': orig_url,
-                            'preference': 1,
-                        })
-            self._sort_formats(formats)
-            info_dict['formats'] = formats
-
-            # Don't append entry ID for one-video pages to keep backward compatibility
-            if len(entries) > 1:
-                info_dict['id'] = '%s_%s' % (video_id, idx + 1)
-            else:
-                info_dict['id'] = video_id
-
-            info_dict.update({
-                'title': video_title,
-                'description': video_description,
-                'uploader': video_uploader,
-                'age_limit': age_limit,
-                'thumbnail': video_thumbnail,
-            })
-
-        return self.playlist_result(entries, video_id, video_title)
-
-
-class LiveLeakEmbedIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[ift])=(?P<id>[\w_]+)'
-
-    # See generic.py for actual test cases
-    _TESTS = [{
-        'url': 'https://www.liveleak.com/ll_embed?i=874_1459135191',
-        'only_matching': True,
-    }, {
-        'url': 'https://www.liveleak.com/ll_embed?f=ab065df993c1',
-        'only_matching': True,
-    }]
-
-    def _real_extract(self, url):
-        kind, video_id = re.match(self._VALID_URL, url).groups()
-
-        if kind == 'f':
-            webpage = self._download_webpage(url, video_id)
-            liveleak_url = self._search_regex(
-                r'(?:logourl\s*:\s*|window\.open\()(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL,
-                webpage, 'LiveLeak URL', group='url')
-        else:
-            liveleak_url = 'http://www.liveleak.com/view?%s=%s' % (kind, video_id)
-
-        return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key())

View file

@ -1,25 +1,28 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import datetime import re
import functools
import json import json
import math import datetime
from .common import InfoExtractor from .common import InfoExtractor
from ..postprocessor.ffmpeg import FFmpegPostProcessor
from ..compat import ( from ..compat import (
compat_str,
compat_parse_qs, compat_parse_qs,
compat_urllib_parse_urlparse, compat_urllib_parse_urlparse,
) )
from ..utils import ( from ..utils import (
determine_ext,
dict_get, dict_get,
ExtractorError, ExtractorError,
float_or_none,
InAdvancePagedList,
int_or_none, int_or_none,
float_or_none,
OnDemandPagedList,
parse_duration, parse_duration,
parse_iso8601, parse_iso8601,
PostProcessingError,
str_or_none,
remove_start, remove_start,
try_get, try_get,
unified_timestamp, unified_timestamp,
@ -34,7 +37,7 @@ class NiconicoIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
'url': 'http://www.nicovideo.jp/watch/sm22312215', 'url': 'http://www.nicovideo.jp/watch/sm22312215',
'md5': 'd1a75c0823e2f629128c43e1212760f9', 'md5': 'a5bad06f1347452102953f323c69da34s',
'info_dict': { 'info_dict': {
'id': 'sm22312215', 'id': 'sm22312215',
'ext': 'mp4', 'ext': 'mp4',
@ -162,6 +165,11 @@ class NiconicoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'
_NETRC_MACHINE = 'niconico' _NETRC_MACHINE = 'niconico'
_API_HEADERS = {
'X-Frontend-ID': '6',
'X-Frontend-Version': '0'
}
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
@ -188,40 +196,92 @@ class NiconicoIE(InfoExtractor):
if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login': if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login':
login_ok = False login_ok = False
if not login_ok: if not login_ok:
self._downloader.report_warning('unable to log in: bad username or password') self.report_warning('unable to log in: bad username or password')
return login_ok return login_ok
def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality): def _get_heartbeat_info(self, info_dict):
def yesno(boolean):
return 'yes' if boolean else 'no'
session_api_data = api_data['video']['dmcInfo']['session_api'] video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/')
session_api_endpoint = session_api_data['urls'][0]
format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality])) api_data = (
info_dict.get('_api_data')
or self._parse_json(
self._html_search_regex(
'data-api-data="([^"]+)"',
self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id),
'API data', default='{}'),
video_id))
session_api_data = try_get(api_data, lambda x: x['media']['delivery']['movie']['session'])
session_api_endpoint = try_get(session_api_data, lambda x: x['urls'][0])
def ping():
status = try_get(
self._download_json(
'https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', video_id,
query={'t': try_get(api_data, lambda x: x['media']['delivery']['trackingId'])},
note='Acquiring permission for downloading video',
headers=self._API_HEADERS),
lambda x: x['meta']['status'])
if status != 200:
self.report_warning('Failed to acquire permission for playing video. The video may not download.')
yesno = lambda x: 'yes' if x else 'no'
# m3u8 (encryption)
if try_get(api_data, lambda x: x['media']['delivery']['encryption']) is not None:
protocol = 'm3u8'
encryption = self._parse_json(session_api_data['token'], video_id)['hls_encryption']
session_api_http_parameters = {
'parameters': {
'hls_parameters': {
'encryption': {
encryption: {
'encrypted_key': try_get(api_data, lambda x: x['media']['delivery']['encryption']['encryptedKey']),
'key_uri': try_get(api_data, lambda x: x['media']['delivery']['encryption']['keyUri'])
}
},
'transfer_preset': '',
'use_ssl': yesno(session_api_endpoint['isSsl']),
'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']),
'segment_duration': 6000,
}
}
}
# http
else:
protocol = 'http'
session_api_http_parameters = {
'parameters': {
'http_output_download_parameters': {
'use_ssl': yesno(session_api_endpoint['isSsl']),
'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']),
}
}
}
session_response = self._download_json( session_response = self._download_json(
session_api_endpoint['url'], video_id, session_api_endpoint['url'], video_id,
query={'_format': 'json'}, query={'_format': 'json'},
headers={'Content-Type': 'application/json'}, headers={'Content-Type': 'application/json'},
note='Downloading JSON metadata for %s' % format_id, note='Downloading JSON metadata for %s' % info_dict['format_id'],
data=json.dumps({ data=json.dumps({
'session': { 'session': {
'client_info': { 'client_info': {
'player_id': session_api_data['player_id'], 'player_id': session_api_data.get('playerId'),
}, },
'content_auth': { 'content_auth': {
'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]], 'auth_type': try_get(session_api_data, lambda x: x['authTypes'][session_api_data['protocols'][0]]),
'content_key_timeout': session_api_data['content_key_timeout'], 'content_key_timeout': session_api_data.get('contentKeyTimeout'),
'service_id': 'nicovideo', 'service_id': 'nicovideo',
'service_user_id': session_api_data['service_user_id'] 'service_user_id': session_api_data.get('serviceUserId')
}, },
'content_id': session_api_data['content_id'], 'content_id': session_api_data.get('contentId'),
'content_src_id_sets': [{ 'content_src_id_sets': [{
'content_src_ids': [{ 'content_src_ids': [{
'src_id_to_mux': { 'src_id_to_mux': {
'audio_src_ids': [audio_quality['id']], 'audio_src_ids': [audio_src_id],
'video_src_ids': [video_quality['id']], 'video_src_ids': [video_src_id],
} }
}] }]
}], }],
@ -229,52 +289,81 @@ class NiconicoIE(InfoExtractor):
'content_uri': '', 'content_uri': '',
'keep_method': { 'keep_method': {
'heartbeat': { 'heartbeat': {
'lifetime': session_api_data['heartbeat_lifetime'] 'lifetime': session_api_data.get('heartbeatLifetime')
} }
}, },
'priority': session_api_data['priority'], 'priority': session_api_data.get('priority'),
'protocol': { 'protocol': {
'name': 'http', 'name': 'http',
'parameters': { 'parameters': {
'http_parameters': { 'http_parameters': session_api_http_parameters
'parameters': {
'http_output_download_parameters': {
'use_ssl': yesno(session_api_endpoint['is_ssl']),
'use_well_known_port': yesno(session_api_endpoint['is_well_known_port']),
}
}
}
} }
}, },
'recipe_id': session_api_data['recipe_id'], 'recipe_id': session_api_data.get('recipeId'),
'session_operation_auth': { 'session_operation_auth': {
'session_operation_auth_by_signature': { 'session_operation_auth_by_signature': {
'signature': session_api_data['signature'], 'signature': session_api_data.get('signature'),
'token': session_api_data['token'], 'token': session_api_data.get('token'),
} }
}, },
'timing_constraint': 'unlimited' 'timing_constraint': 'unlimited'
} }
}).encode()) }).encode())
resolution = video_quality.get('resolution', {}) info_dict['url'] = session_response['data']['session']['content_uri']
info_dict['protocol'] = protocol
# get heartbeat info
heartbeat_info_dict = {
'url': session_api_endpoint['url'] + '/' + session_response['data']['session']['id'] + '?_format=json&_method=PUT',
'data': json.dumps(session_response['data']),
# interval, convert milliseconds to seconds, then halve to make a buffer.
'interval': float_or_none(session_api_data.get('heartbeatLifetime'), scale=3000),
'ping': ping
}
return info_dict, heartbeat_info_dict
def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality):
def parse_format_id(id_code):
mobj = re.match(r'''(?x)
(?:archive_)?
(?:(?P<codec>[^_]+)_)?
(?:(?P<br>[\d]+)kbps_)?
(?:(?P<res>[\d+]+)p_)?
''', '%s_' % id_code)
return mobj.groupdict() if mobj else {}
protocol = 'niconico_dmc'
format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality]))
vdict = parse_format_id(video_quality['id'])
adict = parse_format_id(audio_quality['id'])
resolution = try_get(video_quality, lambda x: x['metadata']['resolution'], dict) or {'height': vdict.get('res')}
vbr = try_get(video_quality, lambda x: x['metadata']['bitrate'], float)
return { return {
'url': session_response['data']['session']['content_uri'], 'url': '%s:%s/%s/%s' % (protocol, video_id, video_quality['id'], audio_quality['id']),
'format_id': format_id, 'format_id': format_id,
'format_note': 'DMC %s' % try_get(video_quality, lambda x: x['metadata']['label'], compat_str),
'ext': 'mp4', # Session API are used in HTML5, which always serves mp4 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4
'abr': float_or_none(audio_quality.get('bitrate'), 1000), 'vcodec': vdict.get('codec'),
'vbr': float_or_none(video_quality.get('bitrate'), 1000), 'acodec': adict.get('codec'),
'height': resolution.get('height'), 'vbr': float_or_none(vbr, 1000) or float_or_none(vdict.get('br')),
'width': resolution.get('width'), 'abr': float_or_none(audio_quality.get('bitrate'), 1000) or float_or_none(adict.get('br')),
'height': int_or_none(resolution.get('height', vdict.get('res'))),
'width': int_or_none(resolution.get('width')),
'quality': -2 if 'low' in format_id else -1, # Default quality value is -1
'protocol': protocol,
'http_headers': {
'Origin': 'https://www.nicovideo.jp',
'Referer': 'https://www.nicovideo.jp/watch/' + video_id,
}
} }
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
# Get video webpage. We are not actually interested in it for normal # Get video webpage for API data.
# cases, but need the cookies in order to be able to download the
# info webpage
webpage, handle = self._download_webpage_handle( webpage, handle = self._download_webpage_handle(
'http://www.nicovideo.jp/watch/' + video_id, video_id) 'http://www.nicovideo.jp/watch/' + video_id, video_id)
if video_id.startswith('so'): if video_id.startswith('so'):
@ -284,86 +373,136 @@ class NiconicoIE(InfoExtractor):
'data-api-data="([^"]+)"', webpage, 'data-api-data="([^"]+)"', webpage,
'API data', default='{}'), video_id) 'API data', default='{}'), video_id)
def _format_id_from_url(video_url): def get_video_info_web(items):
return 'economy' if video_real_url.endswith('low') else 'normal' return dict_get(api_data['video'], items)
try: # Get video info
video_real_url = api_data['video']['smileInfo']['url'] video_info_xml = self._download_xml(
except KeyError: # Flash videos 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id,
# Get flv info video_id, note='Downloading video info page')
flv_info_webpage = self._download_webpage(
'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
video_id, 'Downloading flv info')
flv_info = compat_parse_qs(flv_info_webpage) def get_video_info_xml(items):
if 'url' not in flv_info: if not isinstance(items, list):
if 'deleted' in flv_info: items = [items]
raise ExtractorError('The video has been deleted.', for item in items:
expected=True) ret = xpath_text(video_info_xml, './/' + item)
elif 'closed' in flv_info: if ret:
raise ExtractorError('Niconico videos now require logging in', return ret
expected=True)
elif 'error' in flv_info:
raise ExtractorError('%s reports error: %s' % (
self.IE_NAME, flv_info['error'][0]), expected=True)
else:
raise ExtractorError('Unable to find video URL')
video_info_xml = self._download_xml( if get_video_info_xml('error'):
'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, error_code = get_video_info_xml('code')
video_id, note='Downloading video info page')
def get_video_info(items): if error_code == 'DELETED':
if not isinstance(items, list): raise ExtractorError('The video has been deleted.',
items = [items] expected=True)
for item in items: elif error_code == 'NOT_FOUND':
ret = xpath_text(video_info_xml, './/' + item) raise ExtractorError('The video is not found.',
if ret: expected=True)
return ret elif error_code == 'COMMUNITY':
self.to_screen('%s: The video is community members only.' % video_id)
else:
raise ExtractorError('%s reports error: %s' % (self.IE_NAME, error_code))
video_real_url = flv_info['url'][0] # Start extracting video formats
formats = []
extension = get_video_info('movie_type') # Get HTML5 videos info
if not extension: quality_info = try_get(api_data, lambda x: x['media']['delivery']['movie'])
extension = determine_ext(video_real_url) if not quality_info:
raise ExtractorError('The video can\'t be downloaded', expected=True)
formats = [{ for audio_quality in quality_info.get('audios') or {}:
'url': video_real_url, for video_quality in quality_info.get('videos') or {}:
'ext': extension, if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'):
'format_id': _format_id_from_url(video_real_url), continue
}] formats.append(self._extract_format_for_quality(
else: api_data, video_id, audio_quality, video_quality))
formats = []
dmc_info = api_data['video'].get('dmcInfo') # Get flv/swf info
if dmc_info: # "New" HTML5 videos timestamp = None
quality_info = dmc_info['quality'] video_real_url = try_get(api_data, lambda x: x['video']['smileInfo']['url'])
for audio_quality in quality_info['audios']: if video_real_url:
for video_quality in quality_info['videos']: is_economy = video_real_url.endswith('low')
if not audio_quality['available'] or not video_quality['available']:
continue
formats.append(self._extract_format_for_quality(
api_data, video_id, audio_quality, video_quality))
self._sort_formats(formats) if is_economy:
else: # "Old" HTML5 videos self.report_warning('Site is currently in economy mode! You will only have access to lower quality streams')
formats = [{
# Invoking ffprobe to determine resolution
pp = FFmpegPostProcessor(self._downloader)
cookies = self._get_cookies('https://nicovideo.jp').output(header='', sep='; path=/; domain=nicovideo.jp;\n')
self.to_screen('%s: %s' % (video_id, 'Checking smile format with ffprobe'))
try:
metadata = pp.get_metadata_object(video_real_url, ['-cookies', cookies])
except PostProcessingError as err:
raise ExtractorError(err.msg, expected=True)
v_stream = a_stream = {}
# Some complex swf files doesn't have video stream (e.g. nm4809023)
for stream in metadata['streams']:
if stream['codec_type'] == 'video':
v_stream = stream
elif stream['codec_type'] == 'audio':
a_stream = stream
# Community restricted videos seem to have issues with the thumb API not returning anything at all
filesize = int(
(get_video_info_xml('size_high') if not is_economy else get_video_info_xml('size_low'))
or metadata['format']['size']
)
extension = (
get_video_info_xml('movie_type')
or 'mp4' if 'mp4' in metadata['format']['format_name'] else metadata['format']['format_name']
)
# 'creation_time' tag on video stream of re-encoded SMILEVIDEO mp4 files are '1970-01-01T00:00:00.000000Z'.
timestamp = (
parse_iso8601(get_video_info_web('first_retrieve'))
or unified_timestamp(get_video_info_web('postedDateTime'))
)
metadata_timestamp = (
parse_iso8601(try_get(v_stream, lambda x: x['tags']['creation_time']))
or timestamp if extension != 'mp4' else 0
)
# According to compconf, smile videos from pre-2017 are always better quality than their DMC counterparts
smile_threshold_timestamp = parse_iso8601('2016-12-08T00:00:00+09:00')
is_source = timestamp < smile_threshold_timestamp or metadata_timestamp > 0
# If movie file size is unstable, old server movie is not source movie.
if filesize > 1:
formats.append({
'url': video_real_url, 'url': video_real_url,
'ext': 'mp4', 'format_id': 'smile' if not is_economy else 'smile_low',
'format_id': _format_id_from_url(video_real_url), 'format_note': 'SMILEVIDEO source' if not is_economy else 'SMILEVIDEO low quality',
}] 'ext': extension,
'container': extension,
'vcodec': v_stream.get('codec_name'),
'acodec': a_stream.get('codec_name'),
# Some complex swf files doesn't have total bit rate metadata (e.g. nm6049209)
'tbr': int_or_none(metadata['format'].get('bit_rate'), scale=1000),
'vbr': int_or_none(v_stream.get('bit_rate'), scale=1000),
'abr': int_or_none(a_stream.get('bit_rate'), scale=1000),
'height': int_or_none(v_stream.get('height')),
'width': int_or_none(v_stream.get('width')),
'source_preference': 5 if not is_economy else -2,
'quality': 5 if is_source and not is_economy else None,
'filesize': filesize
})
def get_video_info(items): self._sort_formats(formats)
return dict_get(api_data['video'], items)
# Start extracting information # Start extracting information
title = get_video_info('title') title = (
if not title: get_video_info_xml('title') # prefer to get the untranslated original title
title = self._og_search_title(webpage, default=None) or get_video_info_web(['originalTitle', 'title'])
if not title: or self._og_search_title(webpage, default=None)
title = self._html_search_regex( or self._html_search_regex(
r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>', r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>',
webpage, 'video title') webpage, 'video title'))
watch_api_data_string = self._html_search_regex( watch_api_data_string = self._html_search_regex(
r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>', r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>',
@ -372,14 +511,15 @@ class NiconicoIE(InfoExtractor):
video_detail = watch_api_data.get('videoDetail', {}) video_detail = watch_api_data.get('videoDetail', {})
thumbnail = ( thumbnail = (
get_video_info(['thumbnail_url', 'thumbnailURL']) self._html_search_regex(r'<meta property="og:image" content="([^"]+)">', webpage, 'thumbnail data', default=None)
or dict_get( # choose highest from 720p to 240p
get_video_info_web('thumbnail'),
['ogp', 'player', 'largeUrl', 'middleUrl', 'url'])
or self._html_search_meta('image', webpage, 'thumbnail', default=None) or self._html_search_meta('image', webpage, 'thumbnail', default=None)
or video_detail.get('thumbnail')) or video_detail.get('thumbnail'))
description = get_video_info('description') description = get_video_info_web('description')
timestamp = (parse_iso8601(get_video_info('first_retrieve'))
or unified_timestamp(get_video_info('postedDateTime')))
if not timestamp: if not timestamp:
match = self._html_search_meta('datePublished', webpage, 'date published', default=None) match = self._html_search_meta('datePublished', webpage, 'date published', default=None)
if match: if match:
@ -388,19 +528,25 @@ class NiconicoIE(InfoExtractor):
timestamp = parse_iso8601( timestamp = parse_iso8601(
video_detail['postedAt'].replace('/', '-'), video_detail['postedAt'].replace('/', '-'),
delimiter=' ', timezone=datetime.timedelta(hours=9)) delimiter=' ', timezone=datetime.timedelta(hours=9))
timestamp = timestamp or try_get(api_data, lambda x: parse_iso8601(x['video']['registeredAt']))
view_count = int_or_none(get_video_info(['view_counter', 'viewCount'])) view_count = int_or_none(get_video_info_web(['view_counter', 'viewCount']))
if not view_count: if not view_count:
match = self._html_search_regex( match = self._html_search_regex(
r'>Views: <strong[^>]*>([^<]+)</strong>', r'>Views: <strong[^>]*>([^<]+)</strong>',
webpage, 'view count', default=None) webpage, 'view count', default=None)
if match: if match:
view_count = int_or_none(match.replace(',', '')) view_count = int_or_none(match.replace(',', ''))
view_count = view_count or video_detail.get('viewCount') view_count = (
view_count
or video_detail.get('viewCount')
or try_get(api_data, lambda x: x['video']['count']['view']))
comment_count = (
int_or_none(get_video_info_web('comment_num'))
or video_detail.get('commentCount')
or try_get(api_data, lambda x: x['video']['count']['comment']))
comment_count = (int_or_none(get_video_info('comment_num'))
or video_detail.get('commentCount')
or try_get(api_data, lambda x: x['thread']['commentCount']))
if not comment_count: if not comment_count:
match = self._html_search_regex( match = self._html_search_regex(
r'>Comments: <strong[^>]*>([^<]+)</strong>', r'>Comments: <strong[^>]*>([^<]+)</strong>',
@ -409,22 +555,41 @@ class NiconicoIE(InfoExtractor):
comment_count = int_or_none(match.replace(',', '')) comment_count = int_or_none(match.replace(',', ''))
duration = (parse_duration( duration = (parse_duration(
get_video_info('length') get_video_info_web('length')
or self._html_search_meta( or self._html_search_meta(
'video:duration', webpage, 'video duration', default=None)) 'video:duration', webpage, 'video duration', default=None))
or video_detail.get('length') or video_detail.get('length')
or get_video_info('duration')) or get_video_info_web('duration'))
webpage_url = get_video_info('watch_url') or url webpage_url = get_video_info_web('watch_url') or url
# for channel movie and community movie
channel_id = try_get(
api_data,
(lambda x: x['channel']['globalId'],
lambda x: x['community']['globalId']))
channel = try_get(
api_data,
(lambda x: x['channel']['name'],
lambda x: x['community']['name']))
# Note: cannot use api_data.get('owner', {}) because owner may be set to "null" # Note: cannot use api_data.get('owner', {}) because owner may be set to "null"
# in the JSON, which will cause None to be returned instead of {}. # in the JSON, which will cause None to be returned instead of {}.
owner = try_get(api_data, lambda x: x.get('owner'), dict) or {} owner = try_get(api_data, lambda x: x.get('owner'), dict) or {}
uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id') uploader_id = str_or_none(
uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname') get_video_info_web(['ch_id', 'user_id'])
or owner.get('id')
or channel_id
)
uploader = (
get_video_info_web(['ch_name', 'user_nickname'])
or owner.get('nickname')
or channel
)
return { return {
'id': video_id, 'id': video_id,
'_api_data': api_data,
'title': title, 'title': title,
'formats': formats, 'formats': formats,
'thumbnail': thumbnail, 'thumbnail': thumbnail,
@ -432,6 +597,8 @@ class NiconicoIE(InfoExtractor):
'uploader': uploader, 'uploader': uploader,
'timestamp': timestamp, 'timestamp': timestamp,
'uploader_id': uploader_id, 'uploader_id': uploader_id,
'channel': channel,
'channel_id': channel_id,
'view_count': view_count, 'view_count': view_count,
'comment_count': comment_count, 'comment_count': comment_count,
'duration': duration, 'duration': duration,
@ -440,7 +607,7 @@ class NiconicoIE(InfoExtractor):
class NiconicoPlaylistIE(InfoExtractor): class NiconicoPlaylistIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/)?mylist/(?P<id>\d+)' _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/(?:user/\d+/|my/)?mylist/(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'http://www.nicovideo.jp/mylist/27411728', 'url': 'http://www.nicovideo.jp/mylist/27411728',
@ -456,60 +623,77 @@ class NiconicoPlaylistIE(InfoExtractor):
'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728', 'url': 'https://www.nicovideo.jp/user/805442/mylist/27411728',
'only_matching': True, 'only_matching': True,
}] }]
-    _PAGE_SIZE = 100
-
-    def _call_api(self, list_id, resource, query):
-        return self._download_json(
-            'https://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id,
-            'Downloading %s JSON metatdata' % resource, query=query,
-            headers={'X-Frontend-Id': 6})['data']['mylist']
-
-    def _parse_owner(self, item):
-        owner = item.get('owner') or {}
-        if owner:
-            return {
-                'uploader': owner.get('name'),
-                'uploader_id': owner.get('id'),
-            }
-        return {}
-
-    def _fetch_page(self, list_id, page):
-        page += 1
-        items = self._call_api(list_id, 'page %d' % page, {
-            'page': page,
-            'pageSize': self._PAGE_SIZE,
-        })['items']
-        for item in items:
-            video = item.get('video') or {}
-            video_id = video.get('id')
-            if not video_id:
-                continue
-            count = video.get('count') or {}
-            get_count = lambda x: int_or_none(count.get(x))
-            info = {
-                '_type': 'url',
-                'id': video_id,
-                'title': video.get('title'),
-                'url': 'https://www.nicovideo.jp/watch/' + video_id,
-                'description': video.get('shortDescription'),
-                'duration': int_or_none(video.get('duration')),
-                'view_count': get_count('view'),
-                'comment_count': get_count('comment'),
-                'ie_key': NiconicoIE.ie_key(),
-            }
-            info.update(self._parse_owner(video))
-            yield info
+    _API_HEADERS = {
+        'X-Frontend-ID': '6',
+        'X-Frontend-Version': '0'
+    }

     def _real_extract(self, url):
         list_id = self._match_id(url)
-        mylist = self._call_api(list_id, 'list', {
-            'pageSize': 1,
-        })
-        entries = InAdvancePagedList(
-            functools.partial(self._fetch_page, list_id),
-            math.ceil(mylist['totalItemCount'] / self._PAGE_SIZE),
-            self._PAGE_SIZE)
-        result = self.playlist_result(
-            entries, list_id, mylist.get('name'), mylist.get('description'))
-        result.update(self._parse_owner(mylist))
-        return result
+
+        def get_page_data(pagenum, pagesize):
+            return self._download_json(
+                'http://nvapi.nicovideo.jp/v2/mylists/' + list_id, list_id,
+                query={'page': 1 + pagenum, 'pageSize': pagesize},
+                headers=self._API_HEADERS).get('data').get('mylist')
+
+        data = get_page_data(0, 1)
+        title = data.get('name')
+        description = data.get('description')
+        uploader = data.get('owner').get('name')
+        uploader_id = data.get('owner').get('id')
+
+        def pagefunc(pagenum):
+            data = get_page_data(pagenum, 25)
+            return ({
+                '_type': 'url',
+                'url': 'http://www.nicovideo.jp/watch/' + item.get('watchId'),
+            } for item in data.get('items'))
+
+        return {
+            '_type': 'playlist',
+            'id': list_id,
+            'title': title,
+            'description': description,
+            'uploader': uploader,
+            'uploader_id': uploader_id,
+            'entries': OnDemandPagedList(pagefunc, 25),
+        }
+
+
+class NiconicoUserIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/user/(?P<id>\d+)/?(?:$|[#?])'
+    _TEST = {
+        'url': 'https://www.nicovideo.jp/user/419948',
+        'info_dict': {
+            'id': '419948',
+        },
+        'playlist_mincount': 101,
+    }
+
+    _API_URL = "https://nvapi.nicovideo.jp/v1/users/%s/videos?sortKey=registeredAt&sortOrder=desc&pageSize=%s&page=%s"
+    _PAGE_SIZE = 100
+
+    _API_HEADERS = {
+        'X-Frontend-ID': '6',
+        'X-Frontend-Version': '0'
+    }
+
+    def _entries(self, list_id, ):
+        total_count = 1
+        count = page_num = 0
+        while count < total_count:
+            json_parsed = self._download_json(
+                self._API_URL % (list_id, self._PAGE_SIZE, page_num + 1), list_id,
+                headers=self._API_HEADERS,
+                note='Downloading JSON metadata%s' % (' page %d' % page_num if page_num else ''))
+            if not page_num:
+                total_count = int_or_none(json_parsed['data'].get('totalCount'))
+            for entry in json_parsed["data"]["items"]:
+                count += 1
+                yield self.url_result('https://www.nicovideo.jp/watch/%s' % entry['id'])
+            page_num += 1
+
+    def _real_extract(self, url):
+        list_id = self._match_id(url)
+        return self.playlist_result(self._entries(list_id), list_id, ie=NiconicoIE.ie_key())
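Both new Niconico list extractors page through nvapi 25 or 100 items at a time; `OnDemandPagedList` only fetches a page when its slice of entries is actually consumed. A standalone sketch of the same lazy pattern, with a fake API standing in for `get_page_data()`:

```python
import itertools

PAGE_SIZE = 25

def fake_mylist_api(page):
    # Stand-in for get_page_data() above; a real call hits
    # nvapi.nicovideo.jp with the X-Frontend-* headers.
    items = [{'watchId': 'sm%d' % i} for i in range(60)]
    return items[page * PAGE_SIZE:(page + 1) * PAGE_SIZE]

def entries():
    # One page per iteration, stop on the first empty page -- the same
    # shape as pagefunc() wrapped in OnDemandPagedList.
    for page in itertools.count():
        items = fake_mylist_api(page)
        if not items:
            return
        for item in items:
            yield 'http://www.nicovideo.jp/watch/' + item['watchId']

print(sum(1 for _ in entries()))  # 60
```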


@@ -58,7 +58,7 @@ class NRKBaseIE(InfoExtractor):
     def _call_api(self, path, video_id, item=None, note=None, fatal=True, query=None):
         return self._download_json(
-            urljoin('http://psapi.nrk.no/', path),
+            urljoin('https://psapi.nrk.no/', path),
             video_id, note or 'Downloading %s JSON' % item,
             fatal=fatal, query=query,
             headers={'Accept-Encoding': 'gzip, deflate, br'})


@@ -98,6 +98,9 @@ class ORFTVthekIE(InfoExtractor):
                 elif ext == 'f4m':
                     formats.extend(self._extract_f4m_formats(
                         src, video_id, f4m_id=format_id, fatal=False))
+                elif ext == 'mpd':
+                    formats.extend(self._extract_mpd_formats(
+                        src, video_id, mpd_id=format_id, fatal=False))
                 else:
                     formats.append({
                         'format_id': format_id,
@@ -140,6 +143,25 @@ class ORFTVthekIE(InfoExtractor):
                 })

             upload_date = unified_strdate(sd.get('created_date'))
+
+            thumbnails = []
+            preview = sd.get('preview_image_url')
+            if preview:
+                thumbnails.append({
+                    'id': 'preview',
+                    'url': preview,
+                    'preference': 0,
+                })
+            image = sd.get('image_full_url')
+            if not image and len(data_jsb) == 1:
+                image = self._og_search_thumbnail(webpage)
+            if image:
+                thumbnails.append({
+                    'id': 'full',
+                    'url': image,
+                    'preference': 1,
+                })
+
             entries.append({
                 '_type': 'video',
                 'id': video_id,
@@ -149,7 +171,7 @@ class ORFTVthekIE(InfoExtractor):
                 'description': sd.get('description'),
                 'duration': int_or_none(sd.get('duration_in_seconds')),
                 'upload_date': upload_date,
-                'thumbnail': sd.get('image_full_url'),
+                'thumbnails': thumbnails,
             })

         return {


@@ -21,7 +21,7 @@ from ..utils import (

 class PeerTubeBaseExtractor(SelfhostedInfoExtractor):
-    _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'
+    _UUID_RE = r'[\da-zA-Z]{22}|[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'
     _API_BASE = 'https://%s/api/v1/%s/%s/%s'
     _SH_VALID_CONTENT_STRINGS = (
         '<title>PeerTube<',
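The widened `_UUID_RE` is what makes the PeerTube 3.3+ `/w/` URLs below work: new-style ids are 22 base62 characters, while classic UUIDs keep matching the second alternative. A quick standalone check (both ids are taken from the tests added further down in this diff):

```python
import re

_UUID_RE = r'[\da-zA-Z]{22}|[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'

for video_id in ('3fbif9S3WmtTP8gGsC5HBd',                 # PeerTube >= 3.3 short id
                 '122d093a-1ede-43bd-bd34-59d2931ffc5e'):  # classic UUID
    assert re.fullmatch(_UUID_RE, video_id)
print('both id styles match')
```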
@@ -180,16 +180,16 @@ class PeerTubeBaseExtractor(SelfhostedInfoExtractor):

 class PeerTubeSHIE(PeerTubeBaseExtractor):
     _VALID_URL = r'peertube:(?P<host>[^:]+):(?P<id>%s)' % (PeerTubeBaseExtractor._UUID_RE)
-    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:videos/(?:watch|embed)|api/v\d/videos)/(?P<id>%s)' % (PeerTubeBaseExtractor._UUID_RE)
+    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:videos/(?:watch|embed)|api/v\d/videos|w)/(?P<id>%s)' % (PeerTubeBaseExtractor._UUID_RE)

     _TESTS = [{
         'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d',
-        'md5': '9bed8c0137913e17b86334e5885aacff',
+        'md5': '8563064d245a4be5705bddb22bb00a28',
         'info_dict': {
             'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d',
             'ext': 'mp4',
             'title': 'What is PeerTube?',
-            'description': 'md5:3fefb8dde2b189186ce0719fda6f7b10',
+            'description': 'md5:96adbaf219b4d41747bfc5937df0b017',
             'thumbnail': r're:https?://.*\.(?:jpg|png)',
             'timestamp': 1538391166,
             'upload_date': '20181001',
@@ -220,6 +220,27 @@ class PeerTubeSHIE(PeerTubeBaseExtractor):
             'upload_date': '20200420',
             'uploader': 'Drew DeVault',
         }
+    }, {
+        # new url scheme since PeerTube 3.3
+        'url': 'https://peertube2.cpy.re/w/3fbif9S3WmtTP8gGsC5HBd',
+        'info_dict': {
+            'id': '122d093a-1ede-43bd-bd34-59d2931ffc5e',
+            'ext': 'mp4',
+            'title': 'E2E tests',
+            'uploader_id': '37855',
+            'timestamp': 1589276219,
+            'upload_date': '20200512',
+            'uploader': 'chocobozzz',
+        },
+    }, {
+        'url': 'https://peertube2.cpy.re/w/122d093a-1ede-43bd-bd34-59d2931ffc5e',
+        'only_matching': True,
+    }, {
+        'url': 'https://peertube2.cpy.re/api/v1/videos/3fbif9S3WmtTP8gGsC5HBd',
+        'only_matching': True,
+    }, {
+        'url': 'peertube:peertube2.cpy.re:3fbif9S3WmtTP8gGsC5HBd',
+        'only_matching': True,
     }, {
         'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44',
         'only_matching': True,
@@ -289,7 +310,7 @@ class PeerTubeSHIE(PeerTubeBaseExtractor):

         description = None
         if webpage:
-            description = self._og_search_description(webpage)
+            description = self._og_search_description(webpage, default=None)
         if not description:
             full_description = self._call_api(
                 host, 'videos', video_id, 'description', note='Downloading description JSON',
@@ -305,7 +326,7 @@ class PeerTubeSHIE(PeerTubeBaseExtractor):

 class PeerTubePlaylistSHIE(PeerTubeBaseExtractor):
     _VALID_URL = r'peertube:playlist:(?P<host>[^:]+):(?P<id>.+)'
-    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:videos/(?:watch|embed)/playlist|api/v\d/video-playlists)/(?P<id>%s)' % (PeerTubeBaseExtractor._UUID_RE)
+    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:videos/(?:watch|embed)/playlist|api/v\d/video-playlists|w/p)/(?P<id>%s)' % (PeerTubeBaseExtractor._UUID_RE)

     _TESTS = [{
         'url': 'https://video.internet-czas-dzialac.pl/videos/watch/playlist/3c81b894-acde-4539-91a2-1748b208c14c?playlistPosition=1',
@@ -316,6 +337,9 @@ class PeerTubePlaylistSHIE(PeerTubeBaseExtractor):
             'uploader': 'Internet. Czas działać!',
         },
         'playlist_mincount': 14,
+    }, {
+        'url': 'https://peertube2.cpy.re/w/p/hrAdcvjkMMkHJ28upnoN21',
+        'only_matching': True,
     }]

     def _selfhosted_extract(self, url, webpage=None):
@@ -352,18 +376,21 @@ class PeerTubePlaylistSHIE(PeerTubeBaseExtractor):

 class PeerTubeChannelSHIE(PeerTubeBaseExtractor):
     _VALID_URL = r'peertube:channel:(?P<host>[^:]+):(?P<id>.+)'
-    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:api/v\d/)?video-channels/(?P<id>[^/?#]+)(?:/videos)?'
+    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:(?:api/v\d/)?video-channels|c)/(?P<id>[^/?#]+)(?:/videos)?'

     _TESTS = [{
         'url': 'https://video.internet-czas-dzialac.pl/video-channels/internet_czas_dzialac/videos',
         'info_dict': {
             'id': '2',
-            'title': 'internet_czas_dzialac',
-            'description': 'md5:4d2e215ea0d9ae4501a556ef6e9a5308',
+            'title': 'Internet. Czas działać!',
+            'description': 'md5:ac35d70f6625b04b189e0b4b76e62e17',
             'uploader_id': 3,
             'uploader': 'Internet. Czas działać!',
         },
         'playlist_mincount': 14,
+    }, {
+        'url': 'https://video.internet-czas-dzialac.pl/c/internet_czas_dzialac',
+        'only_matching': True,
     }]

     def _selfhosted_extract(self, url, webpage=None):
@@ -401,18 +428,21 @@ class PeerTubeChannelSHIE(PeerTubeBaseExtractor):

 class PeerTubeAccountSHIE(PeerTubeBaseExtractor):
     _VALID_URL = r'peertube:account:(?P<host>[^:]+):(?P<id>.+)'
-    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:api/v\d/)?accounts/(?P<id>[^/?#]+)(?:/video(?:s|-channels))?'
+    _SH_VALID_URL = r'https?://(?P<host>[^/]+)/(?:(?:api/v\d/)?accounts|a)/(?P<id>[^/?#]+)(?:/video(?:s|-channels))?'

     _TESTS = [{
         'url': 'https://video.internet-czas-dzialac.pl/accounts/icd/video-channels',
         'info_dict': {
             'id': '3',
-            'description': 'md5:ab3c9b934dd39030eea1c9fe76079870',
+            'description': 'md5:ac35d70f6625b04b189e0b4b76e62e17',
             'uploader': 'Internet. Czas działać!',
             'title': 'Internet. Czas działać!',
             'uploader_id': 3,
         },
         'playlist_mincount': 14,
+    }, {
+        'url': 'https://video.internet-czas-dzialac.pl/a/icd',
+        'only_matching': True,
     }]

     def _selfhosted_extract(self, url, webpage=None):


@@ -91,6 +91,14 @@ class PolskieRadioIE(PolskieRadioBaseExtractor):
             'upload_date': '20201116',
         },
     }]
+    }, {
+        # PR4 audition - other frontend
+        'url': 'https://www.polskieradio.pl/10/6071/Artykul/2610977,Poglos-29-pazdziernika-godz-2301',
+        'info_dict': {
+            'id': '2610977',
+            'ext': 'mp3',
+            'title': 'Pogłos 29 października godz. 23:01',
+        },
     }, {
         'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis',
         'only_matching': True,
@@ -113,24 +121,34 @@ class PolskieRadioIE(PolskieRadioBaseExtractor):

         content = self._search_regex(
             r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>',
-            webpage, 'content')
+            webpage, 'content', default=None)

         timestamp = unified_timestamp(self._html_search_regex(
             r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>',
-            webpage, 'timestamp', fatal=False))
+            webpage, 'timestamp', default=None))

-        thumbnail_url = self._og_search_thumbnail(webpage)
+        thumbnail_url = self._og_search_thumbnail(webpage, default=None)

         title = self._og_search_title(webpage).strip()

+        description = strip_or_none(self._og_search_description(webpage, default=None))
+
+        if not content:
+            return {
+                'id': playlist_id,
+                'url': 'https:' + self._search_regex(r"source:\s*'(//static\.prsa\.pl/[^']+)'", webpage, 'audition record url'),
+                'title': title,
+                'description': description,
+                'timestamp': timestamp,
+                'thumbnail': thumbnail_url,
+            }
+
         entries = self._extract_webpage_player_entries(content, playlist_id, {
             'title': title,
             'timestamp': timestamp,
             'thumbnail': thumbnail_url,
         })

-        description = strip_or_none(self._og_search_description(webpage))
-
         return self.playlist_result(entries, playlist_id, title, description)


@@ -31,6 +31,7 @@ from ..utils import (

 class PornHubBaseIE(InfoExtractor):
     _REQUIRES_PLAYWRIGHT = True
     _NETRC_MACHINE = 'pornhub'
+    _PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubthbh7ap3u\.onion)'

     def _download_webpage_handle(self, *args, **kwargs):
         def dl(*args, **kwargs):
@@ -125,11 +126,13 @@ class PornHubIE(PornHubBaseIE):
     _VALID_URL = r'''(?x)
                     https?://
                     (?:
-                        (?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
+                        (?:[^/]+\.)?
+                        %s
+                        /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                         (?:www\.)?thumbzilla\.com/video/
                     )
                     (?P<id>[\da-z]+)
-                    '''
+                    ''' % PornHubBaseIE._PORNHUB_HOST_RE
     _TESTS = [{
         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
         'info_dict': {
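With the host pattern factored into `_PORNHUB_HOST_RE`, every `_VALID_URL` in the file picks up the .onion mirror via `%` substitution. Note that the onion alternative sits outside the named `host` group, so `host` is `None` for onion URLs. A standalone check against two of the test URLs:

```python
import re

_PORNHUB_HOST_RE = r'(?:(?P<host>pornhub(?:premium)?\.(?:com|net|org))|pornhubthbh7ap3u\.onion)'
_VALID_URL = r'''(?x)
    https?://
    (?:
        (?:[^/]+\.)?
        %s
        /(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
        (?:www\.)?thumbzilla\.com/video/
    )
    (?P<id>[\da-z]+)
''' % _PORNHUB_HOST_RE

for url in ('http://www.pornhub.com/view_video.php?viewkey=648719015',
            'http://pornhubthbh7ap3u.onion/view_video.php?viewkey=ph5a9813bfa7156'):
    m = re.match(_VALID_URL, url)
    print(m.group('id'), m.group('host'))  # host is None for the onion mirror
```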
@@ -238,6 +241,13 @@ class PornHubIE(PornHubBaseIE):
     }, {
         'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5f75b0f4b18e3',
         'only_matching': True,
+    }, {
+        # geo restricted
+        'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5a9813bfa7156',
+        'only_matching': True,
+    }, {
+        'url': 'http://pornhubthbh7ap3u.onion/view_video.php?viewkey=ph5a9813bfa7156',
+        'only_matching': True,
     }]

     @staticmethod
@@ -277,6 +287,11 @@ class PornHubIE(PornHubBaseIE):
                 'PornHub said: %s' % error_msg,
                 expected=True, video_id=video_id)

+        if any(re.search(p, webpage) for p in (
+                r'class=["\']geoBlocked["\']',
+                r'>\s*This content is unavailable in your country')):
+            self.raise_geo_restricted()
+
         # video_title from flashvars contains whitespace instead of non-ASCII (see
         # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
         # on that anymore.
@@ -410,17 +425,14 @@ class PornHubIE(PornHubBaseIE):
                     format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                     m3u8_id='hls', fatal=False))
                 return
-            tbr = None
-            mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', format_url)
-            if mobj:
-                if not height:
-                    height = int(mobj.group('height'))
-                tbr = int(mobj.group('tbr'))
+            if not height:
+                height = int_or_none(self._search_regex(
+                    r'(?P<height>\d+)[pP]?_\d+[kK]', format_url, 'height',
+                    default=None))
             formats.append({
                 'url': format_url,
                 'format_id': '%dp' % height if height else None,
                 'height': height,
-                'tbr': tbr,
             })

         for video_url, height in video_urls:
@@ -442,7 +454,8 @@ class PornHubIE(PornHubBaseIE):
                 add_format(video_url, height)
                 continue
             add_format(video_url)
-        self._sort_formats(formats)
+        self._sort_formats(
+            formats, field_preference=('height', 'width', 'fps', 'format_id'))

         video_uploader = self._html_search_regex(
             r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
@@ -511,7 +524,7 @@ class PornHubPlaylistBaseIE(PornHubBaseIE):

 class PornHubUserIE(PornHubPlaylistBaseIE):
-    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)' % PornHubBaseIE._PORNHUB_HOST_RE
     _TESTS = [{
         'url': 'https://www.pornhub.com/model/zoe_ph',
         'playlist_mincount': 118,
@@ -540,6 +553,9 @@ class PornHubUserIE(PornHubPlaylistBaseIE):
         # Same as before, multi page
         'url': 'https://www.pornhubpremium.com/pornstar/lily-labeau',
         'only_matching': True,
+    }, {
+        'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
@@ -615,7 +631,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):

 class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
-    _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)'
+    _VALID_URL = r'https?://(?:[^/]+\.)?%s/(?P<id>(?:[^/]+/)*[^/?#&]+)' % PornHubBaseIE._PORNHUB_HOST_RE
     _TESTS = [{
         'url': 'https://www.pornhub.com/model/zoe_ph/videos',
         'only_matching': True,
@@ -720,6 +736,9 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
     }, {
         'url': 'https://de.pornhub.com/playlist/4667351',
         'only_matching': True,
+    }, {
+        'url': 'https://pornhubthbh7ap3u.onion/model/zoe_ph/videos',
+        'only_matching': True,
     }]

     @classmethod
@@ -730,7 +749,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):

 class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
-    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net|org))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?%s/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)' % PornHubBaseIE._PORNHUB_HOST_RE
     _TESTS = [{
         'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
         'info_dict': {
@@ -740,4 +759,7 @@ class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
     }, {
         'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
         'only_matching': True,
+    }, {
+        'url': 'http://pornhubthbh7ap3u.onion/pornstar/jenny-blighe/videos/upload',
+        'only_matching': True,
     }]


@@ -30,6 +30,10 @@ from ..utils import (
     url_or_none,
     urlhandle_detect_ext,
 )
+try:
+    from ..extractor_artifacts.soundcloud import prerelease_client_id
+except ImportError:
+    prerelease_client_id = None


 class SoundcloudEmbedIE(InfoExtractor):
@@ -289,6 +293,10 @@ class SoundcloudIE(InfoExtractor):
                 return
         raise ExtractorError('Unable to extract client id')

+    def _generate_prerelease_file(self):
+        self._update_client_id()
+        return 'prerelease_client_id = {!r}\n'.format(self._CLIENT_ID)
+
     def _download_json(self, *args, **kwargs):
         non_fatal = kwargs.get('fatal') is False
         if non_fatal:
@@ -310,7 +318,7 @@ class SoundcloudIE(InfoExtractor):
             raise

     def _real_initialize(self):
-        self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk'
+        self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or prerelease_client_id or 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk'

     @classmethod
     def _resolv_url(cls, url):
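The `extractor_artifacts` try/except is the release-time artifact pattern: `_generate_prerelease_file()` writes a module containing a freshly scraped client id, and a plain source checkout simply falls back. The same lookup chain in isolation (`load_cached_client_id()` is a stand-in for the downloader cache call):

```python
try:
    # Generated at release time; absent in a bare source checkout.
    from extractor_artifacts.soundcloud import prerelease_client_id
except ImportError:
    prerelease_client_id = None

def load_cached_client_id():
    # Stand-in for self._downloader.cache.load('soundcloud', 'client_id')
    return None

CLIENT_ID = (load_cached_client_id()
             or prerelease_client_id
             or 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk')
print(CLIENT_ID)
```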


@@ -28,7 +28,7 @@ class UMGDeIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
        video_data = self._download_json(
-            'https://api.universal-music.de/graphql',
+            'https://graphql.universal-music.de/',
             video_id, query={
                 'query': '''{
   universalMusic(channel:16) {
@@ -56,11 +56,9 @@ class UMGDeIE(InfoExtractor):
         formats = []

         def add_m3u8_format(format_id):
-            m3u8_formats = self._extract_m3u8_formats(
+            formats.extend(self._extract_m3u8_formats(
                 hls_url_template % format_id, video_id, 'mp4',
-                'm3u8_native', m3u8_id='hls', fatal='False')
-            if m3u8_formats and m3u8_formats[0].get('height'):
-                formats.extend(m3u8_formats)
+                'm3u8_native', m3u8_id='hls', fatal=False))

         for f in video_data.get('formats', []):
             f_url = f.get('url')


@@ -0,0 +1,37 @@
+from .common import InfoExtractor
+
+
+class ViderIE(InfoExtractor):
+    _VALID_URL = r'https?://vider\.(?:pl|info)/(?:vid/\+f|embed/video/)(?P<id>[a-z\d]+)'
+    _TESTS = [{
+        'url': 'https://vider.info/vid/+fsx51se',
+        'info_dict': {
+            'id': 'sx51se',
+            'ext': 'mp4',
+            'title': 'Big Buck Bunny',
+            'upload_date': '20210906',
+            'timestamp': 1630927351,
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(f'https://vider.info/vid/+f{video_id}', video_id)
+
+        json_ld = self._parse_json(
+            self._search_regex(
+                r'(?s)<script type="application/ld\+json">(.+?)</script>',
+                webpage, 'JSON-LD'), video_id)
+        info_dict = self._json_ld(json_ld, video_id)
+
+        # generated SEO junk
+        info_dict['description'] = None
+        info_dict['id'] = video_id
+        info_dict['formats'] = [{
+            'url': self._search_regex(r'\?file=(.+)', json_ld['embedUrl'], 'video url'),
+            'http_headers': {
+                'Referer': 'https://vider.info/',
+            },
+        }]
+
+        return info_dict
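The new extractor takes the direct video URL from the `?file=` query of the JSON-LD `embedUrl` rather than probing the page further, roughly like this (the `embedUrl` value below is invented for illustration):

```python
import re

embed_url = 'https://vider.info/embed/video/sx51se?file=https://stream.vider.info/upload/sx51se.mp4'
video_url = re.search(r'\?file=(.+)', embed_url).group(1)
print(video_url)  # everything after ?file=, fetched with a vider.info Referer
```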


@@ -4,13 +4,12 @@ import re

 from .common import InfoExtractor
 from ..utils import (
+    extract_attributes,
     int_or_none,
     str_to_int,
-    unescapeHTML,
     unified_strdate,
     url_or_none,
 )
-from ..aes import aes_decrypt_text


 class YouPornIE(InfoExtractor):
@@ -34,6 +33,7 @@ class YouPornIE(InfoExtractor):
             'tags': list,
             'age_limit': 18,
         },
+        'skip': 'This video has been disabled',
     }, {
         # Unknown uploader
         'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4',
@@ -78,6 +78,40 @@ class YouPornIE(InfoExtractor):
         video_id = mobj.group('id')
         display_id = mobj.group('display_id') or video_id

+        definitions = self._download_json(
+            'https://www.youporn.com/api/video/media_definitions/%s/' % video_id,
+            display_id)
+
+        formats = []
+        for definition in definitions:
+            if not isinstance(definition, dict):
+                continue
+            video_url = url_or_none(definition.get('videoUrl'))
+            if not video_url:
+                continue
+            f = {
+                'url': video_url,
+                'filesize': int_or_none(definition.get('videoSize')),
+            }
+            height = int_or_none(definition.get('quality'))
+            # Video URL's path looks like this:
+            #  /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
+            #  /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
+            #  /videos/201703/11/109285532/1080P_4000K_109285532.mp4
+            # We will benefit from it by extracting some metadata
+            mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url)
+            if mobj:
+                if not height:
+                    height = int(mobj.group('height'))
+                bitrate = int(mobj.group('bitrate'))
+                f.update({
+                    'format_id': '%dp-%dk' % (height, bitrate),
+                    'tbr': bitrate,
+                })
+            f['height'] = height
+            formats.append(f)
+        self._sort_formats(formats)
+
         webpage = self._download_webpage(
             'http://www.youporn.com/watch/%s' % video_id, display_id,
             headers={'Cookie': 'age_verified=1'})
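The height/bitrate regex relies on the path layout documented in the comment above. Run standalone against abbreviated versions of those sample paths:

```python
import re

for path in ('/201012/17/505835/720p_1500k_505835/video.mp4',
             '/videos/201703/11/109285532/1080P_4000K_109285532.mp4'):
    mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', path)
    print(int(mobj.group('height')), int(mobj.group('bitrate')))
# -> 720 1500, then 1080 4000
```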
@@ -88,65 +122,6 @@ class YouPornIE(InfoExtractor):
             webpage, default=None) or self._html_search_meta(
             'title', webpage, fatal=True)

-        links = []
-
-        # Main source
-        definitions = self._parse_json(
-            self._search_regex(
-                r'mediaDefinition\s*[=:]\s*(\[.+?\])\s*[;,]', webpage,
-                'media definitions', default='[]'),
-            video_id, fatal=False)
-        if definitions:
-            for definition in definitions:
-                if not isinstance(definition, dict):
-                    continue
-                video_url = url_or_none(definition.get('videoUrl'))
-                if video_url:
-                    links.append(video_url)
-
-        # Fallback #1, this also contains extra low quality 180p format
-        for _, link in re.findall(r'<a[^>]+href=(["\'])(http(?:(?!\1).)+\.mp4(?:(?!\1).)*)\1[^>]+title=["\']Download [Vv]ideo', webpage):
-            links.append(link)
-
-        # Fallback #2 (unavailable as at 22.06.2017)
-        sources = self._search_regex(
-            r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None)
-        if sources:
-            for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources):
-                links.append(link)
-
-        # Fallback #3 (unavailable as at 22.06.2017)
-        for _, link in re.findall(
-                r'(?:videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage):
-            links.append(link)
-
-        # Fallback #4, encrypted links (unavailable as at 22.06.2017)
-        for _, encrypted_link in re.findall(
-                r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage):
-            links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8'))
-
-        formats = []
-        for video_url in set(unescapeHTML(link) for link in links):
-            f = {
-                'url': video_url,
-            }
-            # Video URL's path looks like this:
-            #  /201012/17/505835/720p_1500k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
-            #  /201012/17/505835/vl_240p_240k_505835/YouPorn%20-%20Sex%20Ed%20Is%20It%20Safe%20To%20Masturbate%20Daily.mp4
-            #  /videos/201703/11/109285532/1080P_4000K_109285532.mp4
-            # We will benefit from it by extracting some metadata
-            mobj = re.search(r'(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url)
-            if mobj:
-                height = int(mobj.group('height'))
-                bitrate = int(mobj.group('bitrate'))
-                f.update({
-                    'format_id': '%dp-%dk' % (height, bitrate),
-                    'height': height,
-                    'tbr': bitrate,
-                })
-            formats.append(f)
-        self._sort_formats(formats)
-
         description = self._html_search_regex(
             r'(?s)<div[^>]+\bid=["\']description["\'][^>]*>(.+?)</div>',
             webpage, 'description',
@@ -169,13 +144,12 @@ class YouPornIE(InfoExtractor):

         age_limit = self._rta_search(webpage)

-        average_rating = int_or_none(self._search_regex(
-            r'<div[^>]+class=["\']videoRatingPercentage["\'][^>]*>(\d+)%</div>',
-            webpage, 'average rating', fatal=False))
-
-        view_count = str_to_int(self._search_regex(
-            r'(?s)<div[^>]+class=(["\']).*?\bvideoInfoViews\b.*?\1[^>]*>.*?(?P<count>[\d,.]+)<',
-            webpage, 'view count', fatal=False, group='count'))
+        view_count = None
+        views = self._search_regex(
+            r'(<div[^>]+\bclass=["\']js_videoInfoViews["\']>)', webpage,
+            'views', default=None)
+        if views:
+            view_count = str_to_int(extract_attributes(views).get('data-value'))

         comment_count = str_to_int(self._search_regex(
             r'>All [Cc]omments? \(([\d,.]+)\)',
             webpage, 'comment count', default=None))
@@ -201,7 +175,6 @@ class YouPornIE(InfoExtractor):
             'duration': duration,
             'uploader': uploader,
             'upload_date': upload_date,
-            'average_rating': average_rating,
             'view_count': view_count,
             'comment_count': comment_count,
             'categories': categories,
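View counts now come from a `data-value` attribute instead of scraping digits out of the element text. A simplified sketch of that path (both helpers are cut-down stand-ins for the real `haruhi_dl.utils` functions, and the HTML snippet is invented):

```python
import re

snippet = '<div class="js_videoInfoViews" data-value="1,234,567">'

def extract_attributes(tag):
    # Simplified stand-in for ..utils.extract_attributes
    return dict(re.findall(r'([a-z-]+)="([^"]*)"', tag))

def str_to_int(s):
    # Simplified stand-in for ..utils.str_to_int
    return int(re.sub(r'[,.]', '', s)) if s else None

print(str_to_int(extract_attributes(snippet).get('data-value')))  # 1234567
```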


@@ -4,6 +4,7 @@ from __future__ import unicode_literals

 from datetime import datetime
 import json
 import hashlib
+from inspect import getsource
 import random
 import re
 import time
@@ -45,6 +46,10 @@ from ..utils import (
     urlencode_postdata,
     GeoRestrictedError,
 )
+try:
+    from ..extractor_artifacts.youtube import _decrypt_signature_protected
+except ImportError:
+    _decrypt_signature_protected = None


 class YoutubeBaseInfoExtractor(InfoExtractor):
@@ -901,7 +906,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             raise ExtractorError('Cannot identify player %r' % player_url)
         return id_m.group('id')

-    def _extract_signature_function(self, video_id, player_url, example_sig):
+    def _extract_signature_function(self, video_id, player_url):
         player_id = self._extract_player_info(player_url)

         # Read from filesystem cache
@@ -1012,31 +1017,45 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 '    return %s\n') % (signature_id_tuple, expr_code)
         self.to_screen('Extracted signature function:\n' + code)

-    def mess(self, a, b):
+    @staticmethod
+    def mess(a, b):
         c = a[0]
         a[0] = a[b % len(a)]
         a[b % len(a)] = c
         return a

-    def _decrypt_signature_protected(self, s):
-        a = list(s)
-        a = self.mess(a, 49)
-        a = self.mess(a, 26)
-        a.reverse()
-        a = self.mess(a, 62)
-        a.reverse()
-        a = a[2:]
-        return "".join(a)
-
     def _full_signature_handling(self, sig, player_url, video_id):
-        signature = self._decrypt_signature_protected(sig)
-        if re.match(self._VALID_SIG_VALUE_RE, signature):
-            return signature
+        if _decrypt_signature_protected:
+            signature = _decrypt_signature_protected(sig)
+            if re.match(self._VALID_SIG_VALUE_RE, signature):
+                return signature
         if self._downloader.params.get('verbose'):
             self.to_screen("Built-in signature decryption failed, trying dynamic")
-        sig_decrypt_stack = self._extract_signature_function(video_id, player_url, sig)
+        sig_decrypt_stack = self._extract_signature_function(video_id, player_url)
         return self._do_decrypt_signature(sig, sig_decrypt_stack)

+    def _generate_prerelease_file(self):
+        # It's Monday, so I'm in a bad mood, but at least my sailor uniform is super cute!
+        video_id = 'ieQ1rAIjzXc'
+        self._set_consent()
+        webpage = self._download_webpage('https://www.youtube.com/watch?v=%s' % video_id, video_id)
+        player_url = self._search_regex(r'"jsUrl":"(/s/player/.*?/player_ias.vflset/.*?/base.js)', webpage, 'player url')
+        sig_decrypt_stack = self._extract_signature_function(video_id, player_url)
+
+        func = re.sub(r'(?m)^    ', '', getsource(self.mess).replace('@staticmethod', ''))
+        func += '\n\ndef _decrypt_signature_protected(sig):\n'
+        stack = ['a = list(sig)']
+        for fun in sig_decrypt_stack:
+            if fun[0] == 'splice':
+                stack.append(f'a = a[{fun[1]}:]')
+            elif fun[0] == 'reverse':
+                stack.append('a.reverse()')
+            elif fun[0] == 'mess':
+                stack.append(f'a = mess(a, {fun[1]})')
+            else:
+                raise ExtractorError('Unknown stack action: %s' % (fun[0]))
+        stack.append("return ''.join(a)")
+        return func + '\n'.join(map(lambda x: ' ' * 4 + x, stack)) + '\n'
+
     def _get_subtitles(self, video_id, webpage):
         try:
             subs_doc = self._download_xml(
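`_generate_prerelease_file()` compiles the abstract operation stack returned by `_extract_signature_function()` into plain Python source, which ships as the `extractor_artifacts.youtube` module imported at the top of the file. The round trip in isolation (the stack below is made up; real stacks are parsed out of YouTube's `base.js`):

```python
def mess(a, b):
    # Same swap primitive as YoutubeIE.mess above.
    c = a[0]
    a[0] = a[b % len(a)]
    a[b % len(a)] = c
    return a

def compile_stack(stack):
    # Mirrors the body of _generate_prerelease_file(): turn the abstract
    # operation list into source text for the artifact module.
    lines = ['a = list(sig)']
    for op, arg in stack:
        if op == 'splice':
            lines.append('a = a[%d:]' % arg)
        elif op == 'reverse':
            lines.append('a.reverse()')
        elif op == 'mess':
            lines.append('a = mess(a, %d)' % arg)
        else:
            raise ValueError('Unknown stack action: %s' % op)
    lines.append("return ''.join(a)")
    return ('def _decrypt_signature_protected(sig):\n'
            + '\n'.join('    ' + line for line in lines) + '\n')

source = compile_stack([('mess', 49), ('reverse', None), ('splice', 2)])
print(source)

# Executing the generated source yields a working decryption function:
namespace = {'mess': mess}
exec(source, namespace)
print(namespace['_decrypt_signature_protected']('abcdefghij'))  # hgfedcbj
```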
@@ -1422,29 +1441,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
                 or re.search(r'player-age-gate-content">', video_webpage) is not None):
             age_gate = True
-            # We simulate the access to the video from www.youtube.com/v/{video_id}
-            # this can be viewed without login into Youtube
-            data = compat_urllib_parse_urlencode({
-                'video_id': video_id,
-                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
-                'html5': 1,
-                'c': 'TVHTML5',
-                'cver': '6.20180913',
-            })
-            video_info_url = proto + '://www.youtube.com/get_video_info?' + data
             try:
-                video_info_webpage = self._download_webpage(
-                    video_info_url, video_id,
-                    note='Downloading age-gated video info',
+                yti1_player = self._download_webpage(
+                    proto + '://www.youtube.com/youtubei/v1/player', video_id,
+                    headers={
+                        'User-Agent': 'Mozilla/5.0 (SMART-TV; Linux; Tizen 4.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.0 Safari/537.36',
+                        'Content-Type': 'application/json',
+                        'X-Goog-Api-Key': self._YOUTUBE_API_KEY,
+                    },
+                    data=bytes(json.dumps({
+                        'context': {
+                            'client': {
+                                'clientName': 'WEB',
+                                'clientVersion': '2.20210721.00.00',
+                                'clientScreen': 'EMBED',
+                            },
+                        },
+                        'videoId': video_id,
+                    }).encode('utf-8')),
+                    note='Downloading age-gated player info',
                     errnote='unable to download video info')
             except ExtractorError:
-                video_info_webpage = None
-            if video_info_webpage:
-                video_info = compat_parse_qs(video_info_webpage)
-                pl_response = video_info.get('player_response', [None])[0]
-                player_response = extract_player_response(pl_response, video_id)
+                yti1_player = None
+            if yti1_player:
+                player_response = extract_player_response(yti1_player, video_id)
                 add_dash_mpd(video_info)
-                view_count = extract_view_count(video_info)
+                view_count = extract_view_count(video_id)
             else:
                 age_gate = False
                 # Try looking directly into the video webpage
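The age-gate fallback now POSTs to the innertube `youtubei/v1/player` endpoint instead of the retired `get_video_info`; the `clientScreen: 'EMBED'` context is what unlocks *some* age-gated videos, per the commit message. The request body in isolation (placeholder video id):

```python
import json

payload = {
    'context': {
        'client': {
            'clientName': 'WEB',
            'clientVersion': '2.20210721.00.00',
            'clientScreen': 'EMBED',  # pretend to be an embedded player
        },
    },
    'videoId': 'ieQ1rAIjzXc',  # placeholder
}
# Sent as the POST body, with Content-Type: application/json and the
# X-Goog-Api-Key header, exactly as in the diff above.
body = json.dumps(payload).encode('utf-8')
print(body[:60])
```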
@@ -1814,8 +1836,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     error_desc,
                     countries=self._search_regex(
                         r'<meta itemprop="regionsAllowed" content="((?:(?:[A-Z]{2},)*[A-Z]{2})?)">',
-                        video_webpage, 'allowed region list').split(','),
-                    expected=True)
+                        video_webpage, 'allowed region list').split(','))

             if error_desc and 'Playback on other websites has been disabled' in error_desc:
                 raise ExtractorError(
                     'Embeds disabled for this video, account (with passed credit card or photo ID check, if in EU/EEA/CH/UK) is required',
@@ -2224,8 +2245,9 @@ class YoutubeBaseListInfoExtractor(YoutubeBaseInfoExtractor):
         webpage = self._download_webpage(url, list_id,
                                          note='Downloading %s page #1 (webpage)' % (self._LIST_NAME))
         return self._parse_json(
-            self._search_regex(
-                r'(?:window(?:\["|\.)|var )ytInitialData(?:"])?\s*=\s*({.+});',
+            self._search_regex((
+                r'(?:window(?:\["|\.)|var )ytInitialData(?:"])?\s*=\s*({.+});</script>',
+                r'(?:window(?:\["|\.)|var )ytInitialData(?:"])?\s*=\s*({.+});'),
                 webpage, 'initial data JSON'), 'initial data JSON'), webpage


@@ -231,7 +231,10 @@ class FFmpegPostProcessor(PostProcessor):
         stdout, stderr = p.communicate()
         if p.returncode != 0:
             stderr = stderr.decode('utf-8', 'replace')
-            msg = stderr.strip().split('\n')[-1]
+            msgs = stderr.strip().split('\n')
+            msg = msgs[-1]
+            if self._downloader.params.get('verbose', False):
+                self._downloader.to_screen('[debug] ' + '\n'.join(msgs[:-1]))
             raise FFmpegPostProcessorError(msg)
         self.try_utime(out_path, oldest_mtime, oldest_mtime)
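The postprocessor change keeps the last stderr line as the error message but no longer discards the rest of ffmpeg's output in verbose mode. The same pattern as a standalone helper (assumes `ffmpeg` is on PATH):

```python
import subprocess

def run_ffmpeg(args, verbose=False):
    # Surface the full stderr in verbose mode; raise with only the last
    # line, which is usually the actual error.
    p = subprocess.Popen(
        ['ffmpeg'] + args,
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode != 0:
        msgs = stderr.decode('utf-8', 'replace').strip().split('\n')
        if verbose:
            print('[debug] ' + '\n'.join(msgs[:-1]))
        raise RuntimeError(msgs[-1])
    return stdout

# run_ffmpeg(['-i', 'in.mp4', 'out.mkv'], verbose=True)
```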


@@ -1,6 +1,6 @@
 from __future__ import unicode_literals

-__version__ = '2021.06.20'
+__version__ = '2021.08.01'

 if __name__ == '__main__':
     print(__version__)


@@ -115,7 +115,7 @@ setup(
     packages=[
         'haruhi_dl',
         'haruhi_dl.extractor', 'haruhi_dl.downloader',
-        'haruhi_dl.postprocessor'],
+        'haruhi_dl.postprocessor', 'haruhi_dl.extractor_artifacts'],

     # Provokes warning on most systems (why?!)
     # test_suite = 'nose.collector',