facebook.py 29.5 KB
Newer Older
Sergey M․'s avatar
Sergey M․ committed
1
# coding: utf-8
Philipp Hagemeister's avatar
Philipp Hagemeister committed
2
3
from __future__ import unicode_literals

4
import json
5
6
7
8
import re
import socket

from .common import InfoExtractor
9
from ..compat import (
10
    compat_etree_fromstring,
11
    compat_http_client,
12
    compat_str,
13
    compat_urllib_error,
14
    compat_urllib_parse_unquote,
15
    compat_urllib_parse_unquote_plus,
16
17
)
from ..utils import (
18
    clean_html,
19
    error_to_compat_str,
20
    ExtractorError,
21
    float_or_none,
22
    get_element_by_id,
23
    int_or_none,
24
    js_to_json,
25
    limit_length,
26
    parse_count,
27
    qualities,
28
    sanitized_Request,
29
    try_get,
30
    urlencode_postdata,
31
    urljoin,
32
33
34
35
)


class FacebookIE(InfoExtractor):
    # Extractor for Facebook video pages. The URL pattern below accepts
    # http(s) URLs on facebook.com / facebookcorewwwi.onion (with an optional
    # subdomain) as well as the internal "facebook:<id>" shorthand; in every
    # case the numeric video/post id is captured as the "id" group.
    _VALID_URL = r'''(?x)
                (?:
                    https?://
                        (?:[\w-]+\.)?(?:facebook\.com|facebookcorewwwi\.onion)/
                        (?:[^#]*?\#!/)?
                        (?:
                            (?:
                                video/video\.php|
                                photo\.php|
                                video\.php|
                                video/embed|
                                story\.php|
                                watch(?:/live)?/?
                            )\?(?:.*?)(?:v|video_id|story_fbid)=|
                            [^/]+/videos/(?:[^/]+/)?|
                            [^/]+/posts/|
                            groups/[^/]+/permalink/|
                            watchparty/
                        )|
                    facebook:
                )
                (?P<id>[0-9]+)
                '''
    # Endpoints used by _login()
    _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1'
    _CHECKPOINT_URL = 'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = 'facebook'

    # Canonical video page, used for "facebook:<id>" URLs and playlist entries
    _VIDEO_PAGE_TEMPLATE = 'https://www.facebook.com/video/video.php?v=%s'
    # Secondary ("tahoe") request used when the first page carries no video data
    _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary'

    _TESTS = [{
        'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf',
        'md5': '6a40d33c0eccbb1af76cf0485a052659',
        'info_dict': {
            'id': '637842556329505',
            'ext': 'mp4',
            'title': 're:Did you know Kei Nishikori is the first Asian man to ever reach a Grand Slam',
            'uploader': 'Tennis on Facebook',
            'upload_date': '20140908',
            'timestamp': 1410199200,
        },
        'skip': 'Requires logging in',
    }, {
        # data.video
        'url': 'https://www.facebook.com/video.php?v=274175099429670',
        'info_dict': {
            'id': '274175099429670',
            'ext': 'mp4',
            'title': 're:^Asif Nawab Butt posted a video',
            'uploader': 'Asif Nawab Butt',
            'upload_date': '20140506',
            'timestamp': 1399398998,
            'thumbnail': r're:^https?://.*',
        },
        'expected_warnings': [
            'title'
        ]
    }, {
        'note': 'Video with DASH manifest',
        'url': 'https://www.facebook.com/video.php?v=957955867617029',
        'md5': 'b2c28d528273b323abe5c6ab59f0f030',
        'info_dict': {
            'id': '957955867617029',
            'ext': 'mp4',
            'title': 'When you post epic content on instagram.com/433 8 million followers, this is ...',
            'uploader': 'Demy de Zeeuw',
            'upload_date': '20160110',
            'timestamp': 1452431627,
        },
        'skip': 'Requires logging in',
    }, {
        'url': 'https://www.facebook.com/maxlayn/posts/10153807558977570',
        'md5': '037b1fa7f3c2d02b7a0d7bc16031ecc6',
        'info_dict': {
            'id': '544765982287235',
            'ext': 'mp4',
            'title': '"What are you doing running in the snow?"',
            'uploader': 'FailArmy',
        },
        'skip': 'Video gone',
    }, {
        'url': 'https://m.facebook.com/story.php?story_fbid=1035862816472149&id=116132035111903',
        'md5': '1deb90b6ac27f7efcf6d747c8a27f5e3',
        'info_dict': {
            'id': '1035862816472149',
            'ext': 'mp4',
            'title': 'What the Flock Is Going On In New Zealand  Credit: ViralHog',
            'uploader': 'S. Saint',
        },
        'skip': 'Video gone',
    }, {
        'note': 'swf params escaped',
        'url': 'https://www.facebook.com/barackobama/posts/10153664894881749',
        'md5': '97ba073838964d12c70566e0085c2b91',
        'info_dict': {
            'id': '10153664894881749',
            'ext': 'mp4',
            'title': 'Average time to confirm recent Supreme Court nominees: 67 days Longest it\'s t...',
            'thumbnail': r're:^https?://.*',
            'timestamp': 1456259628,
            'upload_date': '20160223',
            'uploader': 'Barack Obama',
        },
    }, {
        # have 1080P, but only up to 720p in swf params
        # data.video.story.attachments[].media
        'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
        'md5': '9571fae53d4165bbbadb17a94651dcdc',
        'info_dict': {
            'id': '10155529876156509',
            'ext': 'mp4',
            'title': 'She survived the holocaust — and years later, she’s getting her citizenship s...',
            'timestamp': 1477818095,
            'upload_date': '20161030',
            'uploader': 'CNN',
            'thumbnail': r're:^https?://.*',
            'view_count': int,
        },
    }, {
        # bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
        # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
        'url': 'https://www.facebook.com/yaroslav.korpan/videos/1417995061575415/',
        'info_dict': {
            'id': '1417995061575415',
            'ext': 'mp4',
            'title': 'md5:1db063d6a8c13faa8da727817339c857',
            'timestamp': 1486648217,
            'upload_date': '20170209',
            'uploader': 'Yaroslav Korpan',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.facebook.com/LaGuiaDelVaron/posts/1072691702860471',
        'info_dict': {
            'id': '1072691702860471',
            'ext': 'mp4',
            'title': 'md5:ae2d22a93fbb12dad20dc393a869739d',
            'timestamp': 1477305000,
            'upload_date': '20161024',
            'uploader': 'La Guía Del Varón',
            'thumbnail': r're:^https?://.*',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
        'url': 'https://www.facebook.com/groups/1024490957622648/permalink/1396382447100162/',
        'info_dict': {
            'id': '1396382447100162',
            'ext': 'mp4',
            'title': 'md5:19a428bbde91364e3de815383b54a235',
            'timestamp': 1486035494,
            'upload_date': '20170202',
            'uploader': 'Elisabeth Ahtn',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.facebook.com/video.php?v=10204634152394104',
        'only_matching': True,
    }, {
        'url': 'https://www.facebook.com/amogood/videos/1618742068337349/?fref=nf',
        'only_matching': True,
    }, {
        # data.mediaset.currMedia.edges
        'url': 'https://www.facebook.com/ChristyClarkForBC/videos/vb.22819070941/10153870694020942/?type=2&theater',
        'only_matching': True,
    }, {
        # data.video.story.attachments[].media
        'url': 'facebook:544765982287235',
        'only_matching': True,
    }, {
        # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.media
        'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/',
        'only_matching': True,
    }, {
        # data.video.creation_story.attachments[].media
        'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/',
        'only_matching': True,
    }, {
        # data.video
        'url': 'https://www.facebookcorewwwi.onion/video.php?v=274175099429670',
        'only_matching': True,
    }, {
        # no title
        'url': 'https://www.facebook.com/onlycleverentertainment/videos/1947995502095005/',
        'only_matching': True,
    }, {
        # data.video
        'url': 'https://www.facebook.com/WatchESLOne/videos/359649331226507/',
        'info_dict': {
            'id': '359649331226507',
            'ext': 'mp4',
            'title': '#ESLOne VoD - Birmingham Finals Day#1 Fnatic vs. @Evil Geniuses',
            'uploader': 'ESL One Dota 2',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
        'url': 'https://www.facebook.com/100033620354545/videos/106560053808006/',
        'info_dict': {
            'id': '106560053808006',
        },
        'playlist_count': 2,
    }, {
        # data.video.story.attachments[].media
        'url': 'https://www.facebook.com/watch/?v=647537299265662',
        'only_matching': True,
    }, {
        # data.node.comet_sections.content.story.attachments[].style_type_renderer.attachment.all_subattachments.nodes[].media
        'url': 'https://www.facebook.com/PankajShahLondon/posts/10157667649866271',
        'info_dict': {
            'id': '10157667649866271',
        },
        'playlist_count': 3,
    }, {
        # data.nodes[].comet_sections.content.story.attachments[].style_type_renderer.attachment.media
        'url': 'https://m.facebook.com/Alliance.Police.Department/posts/4048563708499330',
        'info_dict': {
            'id': '117576630041613',
            'ext': 'mp4',
            # TODO: title can be extracted from video page
            'title': 'Facebook video #117576630041613',
            'uploader_id': '189393014416438',
            'upload_date': '20201123',
            'timestamp': 1606162592,
        },
        'skip': 'Requires logging in',
    }, {
        # node.comet_sections.content.story.attached_story.attachments.style_type_renderer.attachment.media
        'url': 'https://www.facebook.com/groups/ateistiskselskab/permalink/10154930137678856/',
        'info_dict': {
            'id': '211567722618337',
            'ext': 'mp4',
            'title': 'Facebook video #211567722618337',
            'uploader_id': '127875227654254',
            'upload_date': '20161122',
            'timestamp': 1479793574,
        },
    }, {
        # data.video.creation_story.attachments[].media
        'url': 'https://www.facebook.com/watch/live/?v=1823658634322275',
        'only_matching': True,
    }, {
        'url': 'https://www.facebook.com/watchparty/211641140192478',
        'info_dict': {
            'id': '211641140192478',
        },
        'playlist_count': 1,
        'skip': 'Requires logging in',
    }]
    # Pagelet ids whose bigPipe.onPageletArrive payloads are searched for video data
    _SUPPORTED_PAGLETS_REGEX = r'(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_[0-9a-f]+)'
    # Default API config; replaced per instance when the page defines
    # RelayAPIConfigDefaults (see the watchparty branch of _extract_from_url)
    _api_config = {
        'graphURI': '/api/graphql/'
    }
298

299
    @staticmethod
300
    def _extract_urls(webpage, **kwargs):
301
302
303
304
305
        urls = []
        for mobj in re.finditer(
                r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1',
                webpage):
            urls.append(mobj.group('url'))
306
307
        # Facebook API embed
        # see https://developers.facebook.com/docs/plugins/embedded-video-player
308
        for mobj in re.finditer(r'''(?x)<div[^>]+
309
                class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+
310
311
312
                data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage):
            urls.append(mobj.group('url'))
        return urls
313

314
    def _login(self):
        """Log in with the configured credentials, if any.

        Does nothing when no user e-mail is configured. Otherwise the login
        page is fetched to obtain the hidden ``lsd`` / ``lgnrnd`` form values,
        the login form is posted and, when the response carries the
        ``fb_dtsg`` and ``h`` fields, a follow-up "checkpoint" form is posted
        with ``name_action_selected=dont_save``.

        Raises ExtractorError when the response contains an explicit login
        error box. Other login failures are only reported as warnings.
        """
        useremail, password = self._get_login_info()
        if useremail is None:
            return

        login_page_req = sanitized_Request(self._LOGIN_URL)
        self._set_cookie('facebook.com', 'locale', 'en_US')
        login_page = self._download_webpage(login_page_req, None,
                                            note='Downloading login page',
                                            errnote='Unable to download login page')
        # Hidden form fields that have to be echoed back with the credentials
        lsd = self._search_regex(
            r'<input type="hidden" name="lsd" value="([^"]*)"',
            login_page, 'lsd')
        lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd')

        login_form = {
            'email': useremail,
            'pass': password,
            'lsd': lsd,
            'lgnrnd': lgnrnd,
            'next': 'http://facebook.com/home.php',
            'default_persistent': '0',
            'legacy_return': '1',
            'timezone': '-60',
            'trynum': '1',
        }
        request = sanitized_Request(self._LOGIN_URL, urlencode_postdata(login_form))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        try:
            login_results = self._download_webpage(request, None,
                                                   note='Logging in', errnote='unable to fetch login page')
            # Getting the login form back means the login did not succeed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                error = self._html_search_regex(
                    r'(?s)<div[^>]+class=(["\']).*?login_error_box.*?\1[^>]*><div[^>]*>.*?</div><div[^>]*>(?P<error>.+?)</div>',
                    login_results, 'login error', default=None, group='error')
                if error:
                    raise ExtractorError('Unable to login: %s' % error, expected=True)
                self._downloader.report_warning('unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return

            fb_dtsg = self._search_regex(
                r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg', default=None)
            h = self._search_regex(
                r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h', default=None)

            # Without both values there is no checkpoint form to submit
            if not fb_dtsg or not h:
                return

            check_form = {
                'fb_dtsg': fb_dtsg,
                'h': h,
                'name_action_selected': 'dont_save',
            }
            check_req = sanitized_Request(self._CHECKPOINT_URL, urlencode_postdata(check_form))
            check_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
            check_response = self._download_webpage(check_req, None,
                                                    note='Confirming login')
            if re.search(r'id="checkpointSubmitButton"', check_response) is not None:
                self._downloader.report_warning('Unable to confirm login, you have to login in your browser and authorize the login.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning('unable to log in: %s' % error_to_compat_str(err))
            return

377
378
379
    def _real_initialize(self):
        # Initialization consists solely of the login attempt made by _login()
        self._login()

380
    def _extract_from_url(self, url, video_id):
        """Download the page at *url* and extract the video(s) it contains.

        The page is probed for video data in several places, in this order:
          1. the ``handleServerJS`` / ``s.handle`` payload;
          2. ``bigPipe.onPageletArrive`` payloads of the supported pagelets;
          3. Relay prefetched (GraphQL) data - returns a playlist result;
          4. for ``/watchparty/`` URLs, a GraphQL API request - returns a
             playlist of URL results;
          5. a secondary request to the "tahoe" player endpoint.

        Returns an info dict for a single video, or a playlist result.
        Raises ExtractorError when Facebook reports the video as unavailable,
        when no video data can be parsed or when no formats are found; calls
        ``raise_login_required`` when the page shows a login prompt.
        """
        webpage = self._download_webpage(
            url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id)

        video_data = None

        def extract_video_data(instances):
            # Collect the videoData of every 'VideoConfig' instance that has a video_id
            video_data = []
            for item in instances:
                if try_get(item, lambda x: x[1][0]) == 'VideoConfig':
                    video_item = item[2][0]
                    if video_item.get('video_id'):
                        video_data.append(video_item['videoData'])
            return video_data

        server_js_data = self._parse_json(self._search_regex(
            [r'handleServerJS\(({.+})(?:\);|,")', r'\bs\.handle\(({.+?})\);'],
            webpage, 'server js data', default='{}'), video_id, fatal=False)

        if server_js_data:
            video_data = extract_video_data(server_js_data.get('instances', []))

        def extract_from_jsmods_instances(js_data):
            if js_data:
                return extract_video_data(try_get(
                    js_data, lambda x: x['jsmods']['instances'], list) or [])

        def extract_dash_manifest(video, formats):
            # The manifest is stored URL-quoted inside the JSON data
            dash_manifest = video.get('dash_manifest')
            if dash_manifest:
                formats.extend(self._parse_mpd_formats(
                    compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))

        def process_formats(formats):
            # Downloads with browser's User-Agent are rate limited. Working around
            # with non-browser User-Agent.
            for f in formats:
                f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'

            self._sort_formats(formats)

        def extract_relay_data(_filter):
            # JSON argument of the handleWithCustomApplyEach(...) call that matches _filter
            return self._parse_json(self._search_regex(
                r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter,
                webpage, 'replay data', default='{}'), video_id, fatal=False) or {}

        def extract_relay_prefetched_data(_filter):
            # Returns None when there is no RelayPrefetchedStreamCache entry
            replay_data = extract_relay_data(_filter)
            for require in (replay_data.get('require') or []):
                if require[0] == 'RelayPrefetchedStreamCache':
                    return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {}

        if not video_data:
            server_js_data = self._parse_json(self._search_regex([
                r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+' + self._SUPPORTED_PAGLETS_REGEX,
                r'bigPipe\.onPageletArrive\(({.*?id\s*:\s*"%s".*?})\);' % self._SUPPORTED_PAGLETS_REGEX
            ], webpage, 'js data', default='{}'), video_id, js_to_json, False)
            video_data = extract_from_jsmods_instances(server_js_data)

        if not video_data:
            data = extract_relay_prefetched_data(
                r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"')
            if data:
                entries = []

                def parse_graphql_video(video):
                    # Build an info dict from a GraphQL video node and append it to entries
                    formats = []
                    q = qualities(['sd', 'hd'])
                    for (suffix, format_id) in [('', 'sd'), ('_quality_hd', 'hd')]:
                        playable_url = video.get('playable_url' + suffix)
                        if not playable_url:
                            continue
                        formats.append({
                            'format_id': format_id,
                            'quality': q(format_id),
                            'url': playable_url,
                        })
                    extract_dash_manifest(video, formats)
                    process_formats(formats)
                    v_id = video.get('videoId') or video.get('id') or video_id
                    info = {
                        'id': v_id,
                        'formats': formats,
                        'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']),
                        'uploader_id': try_get(video, lambda x: x['owner']['id']),
                        'timestamp': int_or_none(video.get('publish_time')),
                        'duration': float_or_none(video.get('playable_duration_in_ms'), 1000),
                    }
                    description = try_get(video, lambda x: x['savable_description']['text'])
                    title = video.get('name')
                    if title:
                        info.update({
                            'title': title,
                            'description': description,
                        })
                    else:
                        info['title'] = description or 'Facebook video #%s' % v_id
                    entries.append(info)

                def parse_attachment(attachment, key='media'):
                    # attachment may be None (no renderer attachment found) or
                    # lack a dict under key; try_get turns both into "no media"
                    # instead of raising
                    media = try_get(attachment, lambda x: x[key], dict) or {}
                    if media.get('__typename') == 'Video':
                        return parse_graphql_video(media)

                nodes = data.get('nodes') or []
                node = data.get('node') or {}
                if not nodes and node:
                    nodes.append(node)
                for node in nodes:
                    story = try_get(node, lambda x: x['comet_sections']['content']['story'], dict) or {}
                    attachments = try_get(story, [
                        lambda x: x['attached_story']['attachments'],
                        lambda x: x['attachments']
                    ], list) or []
                    for attachment in attachments:
                        attachment = try_get(attachment, lambda x: x['style_type_renderer']['attachment'], dict)
                        ns = try_get(attachment, lambda x: x['all_subattachments']['nodes'], list) or []
                        for n in ns:
                            parse_attachment(n)
                        parse_attachment(attachment)

                edges = try_get(data, lambda x: x['mediaset']['currMedia']['edges'], list) or []
                for edge in edges:
                    parse_attachment(edge, key='node')

                video = data.get('video') or {}
                if video:
                    attachments = try_get(video, [
                        lambda x: x['story']['attachments'],
                        lambda x: x['creation_story']['attachments']
                    ], list) or []
                    for attachment in attachments:
                        parse_attachment(attachment)
                    if not entries:
                        parse_graphql_video(video)

                return self.playlist_result(entries, video_id)

        if not video_data:
            m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
            if m_msg is not None:
                raise ExtractorError(
                    'The video is not available, Facebook said: "%s"' % m_msg.group(1),
                    expected=True)
            elif any(p in webpage for p in (
                    '>You must log in to continue',
                    'id="login_form"',
                    'id="loginbutton"')):
                self.raise_login_required()

        if not video_data and '/watchparty/' in url:
            post_data = {
                'doc_id': 3731964053542869,
                'variables': json.dumps({
                    'livingRoomID': video_id,
                }),
            }

            prefetched_data = extract_relay_prefetched_data(r'"login_data"\s*:\s*{')
            if prefetched_data:
                lsd = try_get(prefetched_data, lambda x: x['login_data']['lsd'], dict)
                if lsd:
                    post_data[lsd['name']] = lsd['value']

            relay_data = extract_relay_data(r'\[\s*"RelayAPIConfigDefaults"\s*,')
            for define in (relay_data.get('define') or []):
                if define[0] == 'RelayAPIConfigDefaults':
                    self._api_config = define[2]

            living_room = self._download_json(
                urljoin(url, self._api_config['graphURI']), video_id,
                data=urlencode_postdata(post_data))['data']['living_room']

            entries = []
            for edge in (try_get(living_room, lambda x: x['recap']['watched_content']['edges']) or []):
                video = try_get(edge, lambda x: x['node']['video']) or {}
                v_id = video.get('id')
                if not v_id:
                    continue
                v_id = compat_str(v_id)
                entries.append(self.url_result(
                    self._VIDEO_PAGE_TEMPLATE % v_id,
                    self.ie_key(), v_id, video.get('name')))

            return self.playlist_result(entries, video_id)

        if not video_data:
            # Video info not in first request, do a secondary request using
            # tahoe player specific URL
            tahoe_data = self._download_webpage(
                self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id,
                data=urlencode_postdata({
                    '__a': 1,
                    '__pc': self._search_regex(
                        r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage,
                        'pkg cohort', default='PHASED:DEFAULT'),
                    '__rev': self._search_regex(
                        r'client_revision["\']\s*:\s*(\d+),', webpage,
                        'client revision', default='3944515'),
                    'fb_dtsg': self._search_regex(
                        r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"',
                        webpage, 'dtsg token', default=''),
                }),
                headers={
                    'Content-Type': 'application/x-www-form-urlencoded',
                })
            # The JSON payload follows a "for (;;);" prefix
            tahoe_js_data = self._parse_json(
                self._search_regex(
                    r'for\s+\(\s*;\s*;\s*\)\s*;(.+)', tahoe_data,
                    'tahoe js data', default='{}'),
                video_id, fatal=False)
            video_data = extract_from_jsmods_instances(tahoe_js_data)

        if not video_data:
            raise ExtractorError('Cannot parse data')

        # Several videos on one page: delegate each of them as a playlist entry
        if len(video_data) > 1:
            entries = []
            for v in video_data:
                video_url = v[0].get('video_url')
                if not video_url:
                    continue
                entries.append(self.url_result(urljoin(
                    url, video_url), self.ie_key(), v[0].get('video_id')))
            return self.playlist_result(entries, video_id)
        video_data = video_data[0]

        formats = []
        subtitles = {}
        for f in video_data:
            # Normalise every entry to a non-empty list whose first item is
            # the stream dict. This has to happen before 'stream_type' is read,
            # otherwise a None or list entry raises instead of being skipped.
            if f and isinstance(f, dict):
                f = [f]
            if not f or not isinstance(f, list):
                continue
            stream = f[0]
            if not isinstance(stream, dict):
                continue
            format_id = stream['stream_type']
            for quality in ('sd', 'hd'):
                for src_type in ('src', 'src_no_ratelimit'):
                    src = stream.get('%s_%s' % (quality, src_type))
                    if src:
                        preference = -10 if format_id == 'progressive' else 0
                        if quality == 'hd':
                            preference += 5
                        formats.append({
                            'format_id': '%s_%s_%s' % (format_id, quality, src_type),
                            'url': src,
                            'preference': preference,
                        })
            extract_dash_manifest(stream, formats)
            subtitles_src = stream.get('subtitles_src')
            if subtitles_src:
                subtitles.setdefault('en', []).append({'url': subtitles_src})
        if not formats:
            raise ExtractorError('Cannot find video formats')

        process_formats(formats)

        # Title: header title, then photo caption, then meta description
        video_title = self._html_search_regex(
            r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage,
            'title', default=None)
        if not video_title:
            video_title = self._html_search_regex(
                r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
                webpage, 'alternative title', default=None)
        if not video_title:
            video_title = self._html_search_meta(
                'description', webpage, 'title', default=None)
        if video_title:
            video_title = limit_length(video_title, 80)
        else:
            video_title = 'Facebook video #%s' % video_id
        uploader = clean_html(get_element_by_id(
            'fbPhotoPageAuthorName', webpage)) or self._search_regex(
            r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader',
            default=None) or self._og_search_title(webpage, fatal=False)
        timestamp = int_or_none(self._search_regex(
            r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
            'timestamp', default=None))
        thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage)

        view_count = parse_count(self._search_regex(
            r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
            default=None))

        info_dict = {
            'id': video_id,
            'title': video_title,
            'formats': formats,
            'uploader': uploader,
            'timestamp': timestamp,
            'thumbnail': thumbnail,
            'view_count': view_count,
            'subtitles': subtitles,
        }

        return info_dict
675
676

    def _real_extract(self, url):
677
678
679
        video_id = self._match_id(url)

        real_url = self._VIDEO_PAGE_TEMPLATE % video_id if url.startswith('facebook:') else url
680
        return self._extract_from_url(real_url, video_id)
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709


class FacebookPluginsVideoIE(InfoExtractor):
    # Handles plugins/video.php embed URLs: the "id" group captures everything
    # after "href=" (the target URL, possibly percent-encoded), which is then
    # handed over to FacebookIE.
    _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/plugins/video\.php\?.*?\bhref=(?P<id>https.+)'

    _TESTS = [{
        'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fgov.sg%2Fvideos%2F10154383743583686%2F&show_text=0&width=560',
        'md5': '5954e92cdfe51fe5782ae9bda7058a07',
        'info_dict': {
            'id': '10154383743583686',
            'ext': 'mp4',
            'title': 'What to do during the haze?',
            'uploader': 'Gov.sg',
            'upload_date': '20160826',
            'timestamp': 1472184808,
        },
        'add_ie': [FacebookIE.ie_key()],
    }, {
        'url': 'https://www.facebook.com/plugins/video.php?href=https%3A%2F%2Fwww.facebook.com%2Fvideo.php%3Fv%3D10204634152394104',
        'only_matching': True,
    }, {
        'url': 'https://www.facebook.com/plugins/video.php?href=https://www.facebook.com/gov.sg/videos/10154383743583686/&show_text=0&width=560',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a URL result that delegates the embedded URL to FacebookIE."""
        # Undo percent-encoding of the URL captured from the href parameter
        embedded_url = compat_urllib_parse_unquote(self._match_id(url))
        return self.url_result(embedded_url, FacebookIE.ie_key())