From b1c1d64de0b19be27a6f2878e07625729b36d805 Mon Sep 17 00:00:00 2001
From: Laura Liberda
Date: Sat, 30 Jan 2021 19:47:13 +0100
Subject: [PATCH] albicla extractor

---
 haruhi_dl/extractor/albicla.py    | 127 ++++++++++++++++++++++++++++++
 haruhi_dl/extractor/extractors.py |   1 +
 2 files changed, 128 insertions(+)
 create mode 100644 haruhi_dl/extractor/albicla.py

diff --git a/haruhi_dl/extractor/albicla.py b/haruhi_dl/extractor/albicla.py
new file mode 100644
index 000000000..159c6b303
--- /dev/null
+++ b/haruhi_dl/extractor/albicla.py
@@ -0,0 +1,127 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    clean_html,
+    int_or_none,
+)
+from ..compat import compat_urllib_parse_urlencode
+
+
+class AlbiclaIE(InfoExtractor):
+    """Extract the YouTube video embedded in a single Albicla post.
+
+    The post page is fetched after logging in, scraped with a regular
+    expression, and returned as a one-entry playlist whose entry is
+    handed over to the Youtube extractor.
+    """
+
+    _VALID_URL = r'https?://albicla\.com/[a-zA-Z\d]+/post/(?P<id>\d+)'
+    _LOGIN_REQUIRED = True
+    _NETRC_MACHINE = 'albicla'
+    # NOTE(review): the test logs in with credentials committed to the
+    # repository -- presumably a throwaway account; confirm.
+    _TESTS = [{
+        'url': 'https://albicla.com/PolandDailycom/post/1000270222',
+        'info_dict': {
+            'id': '1000270222',
+            'uploader': 'PolandDailycom',
+        },
+        'playlist_count': 1,
+        'params': {
+            'username': 'albicla@haruhi.download',
+            'password': 'fedupwithallthis',
+            'extract_flat': True,
+        },
+    }]
+
+    def _login(self):
+        """Log in to Albicla with the configured credentials.
+
+        When no credentials are available only a warning is emitted and
+        no request is made, so that the string "None" is never posted
+        as the e-mail or password.
+        """
+        email, password = self._get_login_info()
+
+        if not email:
+            self.report_warning('No Albicla login data found; use --username and --password or --netrc to provide them')
+            return
+
+        # TODO(review): the initial version of this method carried a
+        # commented-out stub here (`if not self._downloader.cookiejar`),
+        # presumably meant to skip the request when a session cookie is
+        # already present.
+
+        login_form = compat_urllib_parse_urlencode({
+            'email': email,
+            'pass': password,
+            'remember': 'remember-me',
+            'signin': 'zaloguj',
+        })
+        self._download_webpage(
+            'https://albicla.com/login', 'login', 'Logging in',
+            data=login_form.encode('utf-8'), headers={
+                'Content-Type': 'application/x-www-form-urlencoded',
+                'Origin': 'https://albicla.com',
+                'Referer': 'https://albicla.com/login',
+            })
+
+    def _real_initialize(self):
+        """Run the login step."""
+        self._login()
+
+    def _real_extract(self, url):
+        """Return a one-entry playlist for the YouTube embed of a post.
+
+        Raises ExtractorError when the post markup is not recognised or
+        when the post carries no YouTube embed.
+        """
+        post_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, post_id)
+
+        # NOTE(review): element names and closing tags in this pattern
+        # could not be recovered from the reviewed copy of the patch, so
+        # any tag name is accepted; tighten against the live markup.
+        post = re.search(r'''(?xs)
+            <\w+\b[^>]+\bclass="post-item">.+?
+            <\w+\b[^>]+>@(?P<username>[a-zA-Z\d]+).+?
+            <\w+\b[^>]+\bdata-timestamp="(?P<timestamp>\d+)".+?
+            <\w+\b[^>]+\bclass="user-post">\s+
+            <\w+\b[^>]*>(?P<content>[^<]*)</\w+>\s+
+            (?:<\w+\b[^>]+\bclass="card-full[ ]yt"[^>]*>
+                <\w+\b[^>]+\bsrc="(?P<yt_url>https?://(?:www\.)?youtube(?:-nocookie)?\.com/embed/[a-zA-Z\d_-]{11})"[^>]*>\s*)?
+            (?:.+?<\w+\b[^>]+\bclass="fa[ ]fa-comment[^"]*">(?:\s*</\w+>)?\s*(?P<comments>\d+))?
+            (?:.+?<\w+\b[^>]+\bclass="fa[ ]fa-retweet">(?:\s*</\w+>)?\s*<\w+\b[^>]+>\s*(?P<forwards>\d+))?
+            (?:.+?<\w+\b[^>]+\bclass="fa[ ]fa-heart">(?:\s*</\w+>)?\s*<\w+\b[^>]+>\s*(?P<likes>\d+))?
+            ''', webpage)
+
+        if not post:
+            raise ExtractorError('Could not extract post content')
+
+        # The timestamp group is matched but not exported: the unit of
+        # `data-timestamp` is unconfirmed.
+        content, yt_url, comment_count, repost_count, like_count, uploader = post.group(
+            'content', 'yt_url', 'comments', 'forwards', 'likes', 'username')
+        if not yt_url:
+            raise ExtractorError('Could not find youtube embed in the post')
+
+        return {
+            '_type': 'playlist',
+            'id': post_id,
+            'title': clean_html(content),
+            'entries': [{
+                '_type': 'url',
+                'url': yt_url,
+                'ie_key': 'Youtube',
+            }],
+            'uploader': uploader,
+            'uploader_url': 'https://albicla.com/%s' % uploader,
+            'comment_count': int_or_none(comment_count),
+            'repost_count': int_or_none(repost_count),
+            'like_count': int_or_none(like_count),
+        }
diff --git a/haruhi_dl/extractor/extractors.py b/haruhi_dl/extractor/extractors.py
index a68090c79..296bf7bef 100644
--- a/haruhi_dl/extractor/extractors.py
+++ b/haruhi_dl/extractor/extractors.py
@@ -40,6 +40,7 @@ from .agora import (
     WyborczaVideoIE,
 )
 from .airmozilla import AirMozillaIE
+from .albicla import AlbiclaIE
 from .aljazeera import AlJazeeraIE
 from .alphaporno import AlphaPornoIE
 from .amcnetworks import AMCNetworksIE