albicla extractor

This commit is contained in:
Laura Liberda 2021-01-30 19:47:13 +01:00
parent f3b5985cc3
commit b1c1d64de0
2 changed files with 96 additions and 0 deletions

View file

@ -0,0 +1,95 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
clean_html,
int_or_none,
)
from ..compat import compat_urllib_parse_urlencode
class AlbiclaIE(InfoExtractor):
    """Extractor for single posts on albicla.com.

    A post is scraped from its HTML page and returned as a one-entry
    playlist whose entry is the YouTube embed found in the post; the
    actual media extraction is delegated to the ``Youtube`` extractor.
    """

    _VALID_URL = r'https?://albicla\.com/[a-zA-Z\d]+/post/(?P<id>\d+)'
    _LOGIN_REQUIRED = True
    _NETRC_MACHINE = 'albicla'
    _TESTS = [{
        'url': 'https://albicla.com/PolandDailycom/post/1000270222',
        'info_dict': {
            'id': '1000270222',
            'uploader': 'PolandDailycom',
        },
        'playlist_count': 1,
        # NOTE(review): account credentials are committed in plain text
        # here; consider supplying them out-of-band for test runs.
        'params': {
            'username': 'albicla@haruhi.download',
            'password': 'fedupwithallthis',
            'extract_flat': True,
        },
    }]

    def _login(self):
        """Submit the Albicla login form with the configured credentials.

        When no credentials are configured a warning is reported and no
        login request is made (posting the form without them would only
        send the literal strings ``None`` as email and password).
        """
        email, password = self._get_login_info()
        if not email:
            self.report_warning('No Albicla login data found; use --username and --password or --netrc to provide them')
            return

        # TODO(review): the original carried an unfinished
        # `if not self._downloader.cookiejar` check at this point --
        # presumably meant to skip the login when session cookies are
        # already available; confirm and implement if so.
        login_form = compat_urllib_parse_urlencode({
            'email': email,
            'pass': password,
            'remember': 'remember-me',
            'signin': 'zaloguj',
        }).encode('utf-8')
        self._download_webpage(
            'https://albicla.com/login', 'login', 'Logging in',
            data=login_form, headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Origin': 'https://albicla.com',
                'Referer': 'https://albicla.com/login',
            })

    def _real_initialize(self):
        """Log in before any extraction is attempted."""
        self._login()

    def _real_extract(self, url):
        """Scrape a post page and return it as a one-entry playlist.

        Raises:
            ExtractorError: if the post markup cannot be matched, or the
                post does not contain a YouTube embed.
        """
        post_id = self._match_id(url)
        webpage = self._download_webpage(url, post_id)

        # Verbose-mode pattern: whitespace between tokens is ignored, so
        # literal spaces inside attribute values are spelled as `[ ]`.
        # The embed and the three counters are optional groups.
        post = re.search(r'''(?xs)
            <div\b[^>]+\bclass="post-item">.+?
            <p\b[^>]+>@(?P<username>[a-zA-Z\d]+).+?
            <span\b[^>]+\bdata-timestamp="(?P<timestamp>\d+)".+?
            <div\b[^>]+\bclass="user-post">\s+
            <p\b[^>]*>(?P<content>[^<]*)</p>\s+
            (?:<div\b[^>]+\bclass="card-full[ ]yt"[^>]*>
            <iframe\b[^>]+\bsrc="(?P<yt_url>https?://(?:www\.)?youtube(?:-nocookie)?\.com/embed/[a-zA-Z\d_-]{11})"[^>]*>\s*</iframe>)?
            (?:.+?<i\b[^>]+\bclass="fa[ ]fa-comment[^"]*"></i>\s*(?P<comments>\d+)</button>)?
            (?:.+?<i\b[^>]+\bclass="fa[ ]fa-retweet"></i>\s*<span[^>]+>\s*(?P<forwards>\d+)</span>)?
            (?:.+?<i\b[^>]+\bclass="fa[ ]fa-heart"></i>\s*<span[^>]+>\s*(?P<likes>\d+)</span>)?
            ''', webpage)
        if not post:
            raise ExtractorError('Could not extract post content')

        # The `timestamp` group must still match for the pattern to
        # succeed, but its value is not returned (its unit is not evident
        # from this file).
        content, yt_url, comment_count, repost_count, like_count, uploader = post.group(
            'content', 'yt_url', 'comments', 'forwards', 'likes', 'username')

        if not yt_url:
            # A post without a video is a property of the post, not an
            # extractor failure, so do not ask the user to file a bug.
            raise ExtractorError('Could not find youtube embed in the post', expected=True)

        return {
            '_type': 'playlist',
            'id': post_id,
            'title': clean_html(content),
            'entries': [{
                '_type': 'url',
                'url': yt_url,
                'ie_key': 'Youtube',
            }],
            'uploader': uploader,
            'uploader_url': 'https://albicla.com/%s' % uploader,
            'comment_count': int_or_none(comment_count),
            'repost_count': int_or_none(repost_count),
            'like_count': int_or_none(like_count),
        }

View file

@ -40,6 +40,7 @@ from .agora import (
WyborczaVideoIE,
)
from .airmozilla import AirMozillaIE
from .albicla import AlbiclaIE
from .aljazeera import AlJazeeraIE
from .alphaporno import AlphaPornoIE
from .amcnetworks import AMCNetworksIE