From 6c153750fa9b787f0fd6ee59bd00f591e1520eca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 15 Nov 2022 11:44:16 +0100 Subject: [PATCH] [nitter] add extractors for Nitter instances (#2696) --- docs/supportedsites.md | 40 +++++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/nitter.py | 256 +++++++++++++++++++++++++++++++ gallery_dl/extractor/twitter.py | 10 +- scripts/supportedsites.py | 1 + 5 files changed, 299 insertions(+), 9 deletions(-) create mode 100644 gallery_dl/extractor/nitter.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 2d0d983b..422d8680 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1056,6 +1056,46 @@ Consider all sites to be NSFW unless otherwise known. Required + + Nitter Instances + + + Nitter.net + https://nitter.net/ + Media Files, Replies, Search Results, Tweets + + + + Nitter.lacontrevoie.fr + https://nitter.lacontrevoie.fr/ + Media Files, Replies, Search Results, Tweets + + + + Nitter.pussthecat.org + https://nitter.pussthecat.org/ + Media Files, Replies, Search Results, Tweets + + + + Nitter.1d4.us + https://nitter.1d4.us/ + Media Files, Replies, Search Results, Tweets + + + + Nitter.kavin.rocks + https://nitter.kavin.rocks/ + Media Files, Replies, Search Results, Tweets + + + + Nitter.unixfox.eu + https://nitter.unixfox.eu/ + Media Files, Replies, Search Results, Tweets + + + Philomena Instances diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 6998a21e..a563bfd6 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -97,6 +97,7 @@ modules = [ "newgrounds", "nhentai", "nijie", + "nitter", "nozomi", "nsfwalbum", "paheal", diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py new file mode 100644 index 00000000..1ba8253b --- /dev/null +++ b/gallery_dl/extractor/nitter.py @@ -0,0 +1,256 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Nitter instances""" + +from .common import BaseExtractor, Message +from .. import text + + +class NitterExtractor(BaseExtractor): + """Base class for nitter extractors""" + basecategory = "nitter" + directory_fmt = ("{category}", "{user[name]}") + filename_fmt = "{tweet_id}_{num}.{extension}" + archive_fmt = "{tweet_id}_{num}" + + def __init__(self, match): + BaseExtractor.__init__(self, match) + self.user = match.group(match.lastindex) + + def items(self): + for tweet_html in self.tweets(): + tweet = self._tweet_from_html(tweet_html) + + attachments_html = tweet.pop("_attach", "") + if attachments_html: + attachments = list(text.extract_iter( + attachments_html, 'href="', '"')) + attachments.extend(text.extract_iter( + attachments_html, 'data-url="', '"')) + else: + attachments = () + tweet["count"] = len(attachments) + + yield Message.Directory, tweet + for tweet["num"], url in enumerate(attachments, 1): + if url[0] == "/": + url = self.root + url + if "/video/" in url: + url = "ytdl:" + url + tweet["filename"] = url.rpartition( + "%2F")[2].partition(".")[0] + tweet["extension"] = "mp4" + else: + text.nameext_from_url(url, tweet) + yield Message.Url, url, tweet + + def _tweet_from_html(self, html): + extr = text.extract_from(html) + user = { + "name": extr('class="fullname" href="/', '"'), + "nick": extr('title="', '"'), + } + extr('")[2], + "_attach": extr('class="attachments', 'class="tweet-stats'), + "comments": text.parse_int(extr( + 'class="icon-comment', '').rpartition(">")[2]), + "retweets": text.parse_int(extr( + 'class="icon-retweet', '').rpartition(">")[2]), + "quotes" : text.parse_int(extr( + 'class="icon-quote', '').rpartition(">")[2]), + "likes" : text.parse_int(extr( + 'class="icon-heart', '').rpartition(">")[2]), + } + + def _pagination(self, path): + base_url = url = self.root + path + + while True: + page = self.request(url).text + + yield from page.split('