diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 9213357a..e07dab47 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -11,13 +11,14 @@ from .common import Extractor, Message from .. import text, exception from ..cache import cache, memcache +import json import re class TwitterExtractor(Extractor): """Base class for twitter extractors""" category = "twitter" - directory_fmt = ("{category}", "{user}") + directory_fmt = ("{category}", "{user[name]}") filename_fmt = "{tweet_id}_{num}.{extension}" archive_fmt = "{tweet_id}_{retweet_id}_{num}" root = "https://twitter.com" @@ -26,6 +27,7 @@ class TwitterExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) + self._user_dict = None self.logged_in = False self.retweets = self.config("retweets", True) self.content = self.config("content", False) @@ -37,23 +39,18 @@ class TwitterExtractor(Extractor): def items(self): self.login() + metadata = self.metadata() yield Message.Version, 1 - yield Message.Directory, self.metadata() for tweet in self.tweets(): data = self._data_from_tweet(tweet) - - if not self.retweets and data["retweet_id"]: + if not data or not self.retweets and data["retweet_id"]: continue - - images = text.extract_iter( - tweet, 'data-image-url="', '"') - for data["num"], url in enumerate(images, 1): - text.nameext_from_url(url, data) - urls = [url + size for size in self.sizes] - yield Message.Urllist, urls, data + data.update(metadata) if self.videos and "-videoContainer" in tweet: + yield Message.Directory, data + if self.videos == "ytdl": data["extension"] = None url = "ytdl:{}/{}/status/{}".format( @@ -70,9 +67,19 @@ class TwitterExtractor(Extractor): data["num"] = 1 yield Message.Url, url, data + elif "data-image-url=" in tweet: + yield Message.Directory, data + + images = text.extract_iter( + tweet, 'data-image-url="', '"') + for data["num"], url in enumerate(images, 1): + text.nameext_from_url(url, data) + urls = [url + size for size in self.sizes] + yield Message.Urllist, urls, data + def metadata(self): """Return general metadata""" - return {"user": self.user} + return {} def tweets(self): """Yield HTML content of all relevant tweets""" @@ -113,11 +120,33 @@ class TwitterExtractor(Extractor): "tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')), "retweet_id": text.parse_int(extr('data-retweet-id="', '"')), "retweeter" : extr('data-retweeter="' , '"'), - "user" : extr('data-screen-name="', '"'), - "username" : extr('data-name="' , '"'), - "user_id" : text.parse_int(extr('data-user-id="' , '"')), - "date" : text.parse_timestamp(extr('data-time="', '"')), + "author" : { + "name" : extr('data-screen-name="', '"'), + "nick" : text.unescape(extr('data-name="' , '"')), + "id" : text.parse_int(extr('data-user-id="' , '"')), + }, } + + if not self._user_dict: + if data["retweet_id"]: + for user in json.loads(text.unescape(extr( + 'data-reply-to-users-json="', '"'))): + if user["screen_name"] == data["retweeter"]: + break + else: + self.log.warning("Unable to extract user info") + return None + self._user_dict = { + "name": user["screen_name"], + "nick": text.unescape(user["name"]), + "id" : text.parse_int(user["id_str"]), + } + else: + self._user_dict = data["author"] + + data["user"] = self._user_dict + data["date"] = text.parse_timestamp(extr('data-time="', '"')) + if self.content: content = extr('