# -*- coding: utf-8 -*-

# Copyright 2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for Nitter instances"""

from .common import BaseExtractor, Message
from .. import text


class NitterExtractor(BaseExtractor):
    """Base class for nitter extractors"""
    basecategory = "nitter"
    directory_fmt = ("{category}", "{user[name]}")
    filename_fmt = "{tweet_id}_{num}.{extension}"
    archive_fmt = "{tweet_id}_{num}"

    def __init__(self, match):
        """Initialize the base extractor and store the requested user.

        The user is read from the last capture group that took part in
        'match' (match.lastindex).
        """
        BaseExtractor.__init__(self, match)
        self.user = match.group(match.lastindex)

    def items(self):
        """Yield a Directory message per tweet, then one Url per attachment.

        Tweets come from self.tweets(), which is not defined in the visible
        part of this class -- presumably supplied by subclasses; verify.
        Each tweet dict gets a "count" of its attachments, and every
        attachment is numbered from 1 in "num" before it is yielded.
        """
        for tweet_html in self.tweets():
            tweet = self._tweet_from_html(tweet_html)

            # "_attach" is a private helper entry; remove it from the
            # metadata before the tweet dict is handed out.
            attachments_html = tweet.pop("_attach", "")
            if attachments_html:
                attachments = list(text.extract_iter(
                    attachments_html, 'href="', '"'))
                attachments.extend(text.extract_iter(
                    attachments_html, 'data-url="', '"'))
            else:
                attachments = ()
            tweet["count"] = len(attachments)

            yield Message.Directory, tweet
            for tweet["num"], url in enumerate(attachments, 1):
                # startswith() rather than url[0]: an empty attribute value
                # must not raise IndexError.
                if url.startswith("/"):
                    # relative link -> prefix with the instance root
                    url = self.root + url
                if "/video/" in url:
                    url = "ytdl:" + url
                    # name = text after the last "%2F", up to the first "."
                    tweet["filename"] = url.rpartition(
                        "%2F")[2].partition(".")[0]
                    tweet["extension"] = "mp4"
                else:
                    text.nameext_from_url(url, tweet)
                yield Message.Url, url, tweet

    # ------------------------------------------------------------------
    # NOTE(review): the rest of this class arrived damaged.  Parts of the
    # string literals in `_tweet_from_html` look like they were stripped
    # (unbalanced quotes, empty '' end markers), and the text breaks off
    # inside a string literal in `_pagination`.  The surviving text is
    # kept verbatim below (only re-wrapped) instead of being guessed at.
    # TODO: restore both methods from a pristine copy of this file;
    # `items` above depends on `_tweet_from_html`.
    #
    # def _tweet_from_html(self, html):
    #     extr = text.extract_from(html)
    #     user = {
    #         "name": extr('class="fullname" href="/', '"'),
    #         "nick": extr('title="', '"'),
    #     }
    #     extr('")[2],                          <-- damaged: text missing
    #         "_attach": extr('class="attachments', 'class="tweet-stats'),
    #         "comments": text.parse_int(extr(
    #             'class="icon-comment', '').rpartition(">")[2]),
    #         "retweets": text.parse_int(extr(
    #             'class="icon-retweet', '').rpartition(">")[2]),
    #         "quotes" : text.parse_int(extr(
    #             'class="icon-quote', '').rpartition(">")[2]),
    #         "likes" : text.parse_int(extr(
    #             'class="icon-heart', '').rpartition(">")[2]),
    #     }
    #
    # def _pagination(self, path):
    #     base_url = url = self.root + path
    #     while True:
    #         page = self.request(url).text
    #         yield from page.split('           <-- text ends here
    # ------------------------------------------------------------------