From 6cb12f513bfa4b64138f1e19d77aefa0efa38ffc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 26 Nov 2022 11:23:03 +0100 Subject: [PATCH] [nitter] support quoted Tweets - distinguish between regular and quoted Tweets and media - add 'quoted' option and metadata field --- docs/configuration.rst | 10 ++++ docs/gallery-dl.conf | 1 + gallery_dl/extractor/nitter.py | 96 +++++++++++++++++++++++++++++----- 3 files changed, 93 insertions(+), 14 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index c36b1700..896ccc4c 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1860,6 +1860,16 @@ Description You can use ``"all"`` instead of listing all values separately. +extractor.nitter.quoted +----------------------- +Type + ``bool`` +Default + ``false`` +Description + Fetch media from quoted Tweets. + + extractor.nitter.retweets ------------------------- Type diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index f76cbdc8..6b12721a 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -205,6 +205,7 @@ "include": "illustration,doujin" }, "nitter": { + "quoted": false, "retweets": false, "videos": true }, diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py index cb51cf14..8e33eb62 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -107,6 +107,28 @@ class NitterExtractor(BaseExtractor): "likes" : text.parse_int(extr( 'class="icon-heart', '').rpartition(">")[2]), "retweet" : 'class="retweet-header' in html, + "quoted": False, + } + + def _tweet_from_quote(self, html): + extr = text.extract_from(html) + author = { + "name": extr('class="fullname" href="/', '"'), + "nick": extr('title="', '"'), + } + extr('")[2], + "_attach" : extr('class="attachments', ''' + '''), + "retweet" : False, + "quoted": True, } def _user_from_html(self, html): @@ -123,18 +145,26 @@ class NitterExtractor(BaseExtractor): "date" : text.parse_datetime( extr('class="profile-joindate">', '<').replace(",", ""), - "friends_count" : extr( - 'class="profile-stat-num">', '<').replace(",", ""), - "followers_count" : extr( - 'class="profile-stat-num">', '<').replace(",", ""), - "favourites_count": extr( - 'class="profile-stat-num">', '<').replace(",", ""), + "statuses_count" : text.parse_int(extr( + 'class="profile-stat-num">', '<').replace(",", "")), + "friends_count" : text.parse_int(extr( + 'class="profile-stat-num">', '<').replace(",", "")), + "followers_count" : text.parse_int(extr( + 'class="profile-stat-num">', '<').replace(",", "")), + "favourites_count": text.parse_int(extr( + 'class="profile-stat-num">', '<').replace(",", "")), "verified" : 'title="Verified account"' in html, } + def _extract_quote(self, html): + html, _, quote = html.partition('class="quote') + if quote: + quote, _, tail = quote.partition('class="tweet-published') + return (html + tail, quote) + return (html, None) + def _pagination(self, path): + quoted = self.config("quoted", False) base_url = url = self.root + path while True: @@ -144,8 +174,10 @@ class NitterExtractor(BaseExtractor): if self.user_obj is None: self.user_obj = self._user_from_html(tweets_html[0]) - for html in tweets_html[1:]: + for html, quote in map(self._extract_quote, tweets_html[1:]): yield self._tweet_from_html(html) + if quoted and quote: + yield self._tweet_from_quote(quote) more = text.extr( tweets_html[-1], '