diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 4c7b757c..f0e02254 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -11,114 +11,96 @@ from .common import Extractor, Message from .. import text, exception from ..cache import cache, memcache -import json -import re +import hashlib +import time class TwitterExtractor(Extractor): """Base class for twitter extractors""" category = "twitter" - directory_fmt = ("{category}", "{user[name]}") - filename_fmt = "{tweet_id}_{num}.{extension}" - archive_fmt = "{tweet_id}_{retweet_id}_{num}" + directory_fmt = ("{category}", "{user[screen_name]}") + filename_fmt = "{id_str}_{num}.{extension}" + archive_fmt = "{id_str}_{num}" cookiedomain = ".twitter.com" root = "https://twitter.com" sizes = (":orig", ":large", ":medium", ":small") - user_agent = ("Mozilla/5.0 (Windows NT 6.1; WOW64; " - "Trident/7.0; rv:11.0) like Gecko") def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) - self._user_dict = None - self.logged_in = False self.retweets = self.config("retweets", True) self.replies = self.config("replies", True) self.twitpic = self.config("twitpic", False) - self.content = self.config("content", False) self.videos = self.config("videos", True) - if self.content: - self._emoji_sub = re.compile( - r']*>').sub - def items(self): self.login() metadata = self.metadata() yield Message.Version, 1 for tweet in self.tweets(): - data = self._data_from_tweet(tweet) - if not data or \ - not self.retweets and data["retweet_id"] or \ - not self.replies and data["reply"]: + if not self.retweets and "retweeted_status_id_str" in tweet or \ + not self.replies and "in_reply_to_user_id_str" in tweet: continue - data.update(metadata) - if self.videos and "-videoContainer" in tweet: - yield Message.Directory, data + if "extended_entities" not in tweet: + continue + + tweet.update(metadata) + yield Message.Directory, tweet + for tweet["num"], media in enumerate( + tweet["extended_entities"]["media"], 1): + tweet.update(media["original_info"]) + + if "video_info" in media and self.videos: + + if self.videos == "ytdl": + url = "ytdl:{}/i/web/status/{}".format( + self.root, tweet["id_str"]) + tweet["extension"] = None + yield Message.Url, url, tweet + + else: + video_info = media["video_info"] + variant = max( + video_info["variants"], + key=lambda v: v.get("bitrate", 0), + ) + tweet["duration"] = video_info.get( + "duration_millis", 0) / 1000 + tweet["bitrate"] = variant.get("bitrate", 0) + + url = variant["url"] + text.nameext_from_url(url, tweet) + yield Message.Url, url, tweet - if self.videos == "ytdl": - data["extension"] = None - url = "ytdl:{}/i/web/status/{}".format( - self.root, data["tweet_id"]) else: - url = self._video_from_tweet(data["tweet_id"]) - if not url: - continue - text.nameext_from_url(url, data) - if data["extension"] == "m3u8": - url = "ytdl:" + url - data["extension"] = "mp4" - data["_ytdl_extra"] = {"protocol": "m3u8_native"} - data["num"] = 1 - yield Message.Url, url, data - - elif "data-image-url=" in tweet: - yield Message.Directory, data - - images = text.extract_iter( - tweet, 'data-image-url="', '"') - for data["num"], url in enumerate(images, 1): - text.nameext_from_url(url, data) + url = media["media_url_https"] urls = [url + size for size in self.sizes] - yield Message.Urllist, urls, data - - if self.twitpic and "//twitpic.com/" in tweet: - urls = [ - url for url in text.extract_iter( - tweet, 'data-expanded-url="', '"') - if "//twitpic.com/" in url - ] - - if "num" not in data: - if urls: - yield Message.Directory, data - data["num"] = 0 - - for data["num"], url in enumerate(urls, data["num"]+1): - response = self.request(url, fatal=False) - if response.status_code >= 400: - continue - url = text.extract( - response.text, 'name="twitter:image" value="', '"')[0] - yield Message.Url, url, text.nameext_from_url(url, data) + text.nameext_from_url(url, tweet) + yield Message.Urllist, urls, tweet def metadata(self): """Return general metadata""" return {} def tweets(self): - """Yield HTML content of all relevant tweets""" + """Yield all relevant tweet objects""" def login(self): username, password = self._get_auth_info() if username: self._update_cookies(self._login_impl(username, password)) - self.logged_in = True + # self.logged_in = True @cache(maxage=360*24*3600, keyarg=1) def _login_impl(self, username, password): + self.log.warning( + 'Logging in with username and password is currently not possible. ' + 'Use cookies from your browser session instead.') + return {} + + """ self.log.info("Logging in as %s", username) headers = {"User-Agent": self.user_agent} @@ -130,7 +112,7 @@ class TwitterExtractor(Extractor): data = { "session[username_or_email]": username, "session[password]" : password, - "authenticity_token" : token, + # "authenticity_token" : token, "ui_metrics" : '{"rf":{},"s":""}', "scribe_log" : "", "redirect_after_login" : "", @@ -145,138 +127,7 @@ class TwitterExtractor(Extractor): for cookie in self.session.cookies if cookie.domain and "twitter.com" in cookie.domain } - - def _data_from_tweet(self, tweet): - extr = text.extract_from(tweet) - data = { - "tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')), - "reply" : bool(extr('data-is-reply-to="' , '"')), - "retweet_id": text.parse_int(extr('data-retweet-id="', '"')), - "retweeter" : extr('data-retweeter="' , '"'), - "author" : { - "name" : extr('data-screen-name="', '"'), - "nick" : text.unescape(extr('data-name="' , '"')), - "id" : text.parse_int(extr('data-user-id="' , '"')), - }, - } - - if not self._user_dict: - if data["retweet_id"]: - for user in json.loads(text.unescape(extr( - 'data-reply-to-users-json="', '"'))): - if user["screen_name"] == data["retweeter"]: - break - else: - self.log.warning("Unable to extract user info") - return None - self._user_dict = { - "name": user["screen_name"], - "nick": text.unescape(user["name"]), - "id" : text.parse_int(user["id_str"]), - } - else: - self._user_dict = data["author"] - - data["user"] = self._user_dict - data["date"] = text.parse_timestamp(extr('data-time="', '"')) - - if self.content: - content = extr('
', '\n
') - if '')[2]), - } - - return data - - def _video_from_tweet(self, tweet_id): - url = "https://api.twitter.com/1.1/videos/tweet/config/{}.json".format( - tweet_id) - cookies = None - headers = { - "Origin" : self.root, - "Referer" : "{}/i/web/status/{}".format(self.root, tweet_id), - "x-csrf-token" : self.session.cookies.get("ct0"), - "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekM" - "xqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28N" - "HfOPqkca3qaAxGfsyKCs0wRbw", - } - - if self.logged_in: - headers["x-twitter-auth-type"] = "OAuth2Session" - else: - token = _guest_token(self, headers) - cookies = {"gt": token} - headers["x-guest-token"] = token - - response = self.request( - url, cookies=cookies, headers=headers, fatal=None) - - if response.status_code == 429 or \ - response.headers.get("x-rate-limit-remaining") == "0": - if self.logged_in: - self.wait(until=response.headers.get("x-rate-limit-reset")) - else: - _guest_token.invalidate() - return self._video_from_tweet(tweet_id) - - elif response.status_code >= 400: - self.log.warning("Unable to fetch video data for %s ('%s %s')", - tweet_id, response.status_code, response.reason) - return None - - return response.json()["track"]["playbackUrl"] - - def _tweets_from_api(self, url, max_position=None): - params = { - "include_available_features": "1", - "include_entities": "1", - "max_position": max_position, - "reset_error_state": "false", - "lang": "en", - } - headers = { - "X-Requested-With": "XMLHttpRequest", - "X-Twitter-Active-User": "yes", - "Referer": self.root + "/", - } - - while True: - data = self.request(url, params=params, headers=headers).json() - if "inner" in data: - data = data["inner"] - - for tweet in text.extract_iter( - data["items_html"], '
= max_position: - return - params["max_position"] = max_position = position + """ class TwitterTimelineExtractor(TwitterExtractor): @@ -288,15 +139,12 @@ class TwitterTimelineExtractor(TwitterExtractor): ("https://twitter.com/supernaturepics", { "range": "1-40", "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4", - "keyword": "4a3d28cc9f7a39e27333d56f3fe19e6e07ee979e", }), ("https://mobile.twitter.com/supernaturepics?p=i"), ) def tweets(self): - url = "{}/i/profiles/show/{}/timeline/tweets".format( - self.root, self.user) - return self._tweets_from_api(url) + return TwitterAPI(self).timeline_profile(self.user) class TwitterMediaExtractor(TwitterExtractor): @@ -313,9 +161,7 @@ class TwitterMediaExtractor(TwitterExtractor): ) def tweets(self): - url = "{}/i/profiles/show/{}/media_timeline".format( - self.root, self.user) - return self._tweets_from_api(url) + return TwitterAPI(self).timeline_media(self.user) class TwitterSearchExtractor(TwitterExtractor): @@ -333,9 +179,7 @@ class TwitterSearchExtractor(TwitterExtractor): return {"search": self.user} def tweets(self): - url = "{}/i/search/timeline?f=tweets&q={}".format( - self.root, self.user) - return self._tweets_from_api(url, "-1") + return TwitterAPI(self).search(self.user) class TwitterTweetExtractor(TwitterExtractor): @@ -346,23 +190,20 @@ class TwitterTweetExtractor(TwitterExtractor): test = ( ("https://twitter.com/supernaturepics/status/604341487988576256", { "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580", - "keyword": "76e018cf3f4c8b82d3bdd425e01e28078c98373b", "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab", }), # 4 images ("https://twitter.com/perrypumas/status/894001459754180609", { "url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6", - "keyword": "c9251b1fd79d547b0c6b4577f06c937d0e9b63d2", }), # video ("https://twitter.com/perrypumas/status/1065692031626829824", { "options": (("videos", True),), - "pattern": r"ytdl:https://video.twimg.com/ext_tw_video/.*.m3u8", + "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5", }), # content with emoji, newlines, hashtags (#338) ("https://twitter.com/playpokemon/status/1263832915173048321", { - "options": (("content", True),), - "keyword": {"content": ( + "keyword": {"full_text": ( r"re:Gear up for #PokemonSwordShieldEX with special Mystery " "Gifts! \n\nYou’ll be able to receive four Galarian form " "Pokémon with Hidden Abilities, plus some very useful items. " @@ -386,10 +227,6 @@ class TwitterTweetExtractor(TwitterExtractor): # quoted tweet (#526) ("https://twitter.com/Pistachio/status/1222690391817932803", { "pattern": r"https://pbs\.twimg\.com/media/EPfMfDUU8AAnByO\.jpg", - "keyword": { - "author": {"name": "Afro_Herper", "id": 786047748508221440}, - "user" : {"name": "Pistachio" , "id": 3533231}, - }, }), # TwitPic embeds (#579) ("https://twitter.com/i/web/status/112900228289540096", { @@ -404,18 +241,7 @@ class TwitterTweetExtractor(TwitterExtractor): self.tweet_id = match.group(2) def tweets(self): - url = "{}/i/web/status/{}".format(self.root, self.tweet_id) - cookies = {"app_shell_visited": "1"} - headers = {"User-Agent": self.user_agent, "Referer": url} - - response = self.request(url, cookies=cookies, headers=headers) - if response.history and response.url == self.root + "/": - raise exception.AuthorizationError() - page = response.text - - end = page.index('class="js-tweet-stats-container') - beg = page.rindex('
= 400: - raise exception.StopExtraction(response.text) - data = response.json() + cursor = None + data = self._call(endpoint, params) tweets = data["globalObjects"]["tweets"] + users = data["globalObjects"]["users"] + instr = data["timeline"]["instructions"][0] - if not tweets: + for entry in instr["addEntries"]["entries"]: + + if entry["entryId"].startswith(entry_tweet): + tid = entry["content"]["item"]["content"]["tweet"]["id"] + tweet = tweets[tid] + tweet["user"] = users[tweet["user_id_str"]] + + if "quoted_status_id_str" in tweet: + quoted = tweets[tweet["quoted_status_id_str"]] + tweet["author"] = tweet["user"] + if "extended_entities" in quoted: + tweet["extended_entities"] = \ + quoted["extended_entities"] + elif "retweeted_status_id_str" in tweet: + retweet = tweets[tweet["retweeted_status_id_str"]] + tweet["author"] = users[retweet["user_id_str"]] + else: + tweet["author"] = tweet["user"] + + yield tweet + + elif entry["entryId"].startswith(entry_cursor): + cursor = entry["content"]["operation"]["cursor"]["value"] + + if not cursor or params["cursor"] == cursor: return - for tweet_id, tweet_data in tweets.items(): - tweet_url = "{}/i/web/status/{}".format(self.root, tweet_id) - tweet_data["_extractor"] = TwitterTweetExtractor - yield Message.Queue, tweet_url, tweet_data - - inst = data["timeline"]["instructions"][0] - for entry in inst["addEntries"]["entries"]: - if entry["entryId"].startswith("cursor-bottom-"): - params["cursor"] = \ - entry["content"]["operation"]["cursor"]["value"] - break + params["cursor"] = cursor -@memcache() +@cache(maxage=3600) def _guest_token(extr, headers): return extr.request( "https://api.twitter.com/1.1/guest/activate.json", diff --git a/gallery_dl/version.py b/gallery_dl/version.py index dd6f373e..5580df1e 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.14.0" +__version__ = "1.14.1-dev"