[twitter] rewrite; use new interface (#740, #806)

Everything except logging in with username & password and TwitPic
embeds should be working again.

Metadata per Tweet is massively different than before (mostly raw API
responses - might need some cleaning up) and the default 'archive_fmt'
changed.
This commit is contained in:
Mike Fährmann
2020-06-03 20:51:29 +02:00
parent f1ef9082bb
commit a10f31dde5
2 changed files with 179 additions and 269 deletions

View File

@@ -11,114 +11,96 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, exception from .. import text, exception
from ..cache import cache, memcache from ..cache import cache, memcache
import json import hashlib
import re import time
class TwitterExtractor(Extractor): class TwitterExtractor(Extractor):
"""Base class for twitter extractors""" """Base class for twitter extractors"""
category = "twitter" category = "twitter"
directory_fmt = ("{category}", "{user[name]}") directory_fmt = ("{category}", "{user[screen_name]}")
filename_fmt = "{tweet_id}_{num}.{extension}" filename_fmt = "{id_str}_{num}.{extension}"
archive_fmt = "{tweet_id}_{retweet_id}_{num}" archive_fmt = "{id_str}_{num}"
cookiedomain = ".twitter.com" cookiedomain = ".twitter.com"
root = "https://twitter.com" root = "https://twitter.com"
sizes = (":orig", ":large", ":medium", ":small") sizes = (":orig", ":large", ":medium", ":small")
user_agent = ("Mozilla/5.0 (Windows NT 6.1; WOW64; "
"Trident/7.0; rv:11.0) like Gecko")
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.user = match.group(1) self.user = match.group(1)
self._user_dict = None
self.logged_in = False
self.retweets = self.config("retweets", True) self.retweets = self.config("retweets", True)
self.replies = self.config("replies", True) self.replies = self.config("replies", True)
self.twitpic = self.config("twitpic", False) self.twitpic = self.config("twitpic", False)
self.content = self.config("content", False)
self.videos = self.config("videos", True) self.videos = self.config("videos", True)
if self.content:
self._emoji_sub = re.compile(
r'<img class="Emoji [^>]+ alt="([^"]+)"[^>]*>').sub
def items(self): def items(self):
self.login() self.login()
metadata = self.metadata() metadata = self.metadata()
yield Message.Version, 1 yield Message.Version, 1
for tweet in self.tweets(): for tweet in self.tweets():
data = self._data_from_tweet(tweet) if not self.retweets and "retweeted_status_id_str" in tweet or \
if not data or \ not self.replies and "in_reply_to_user_id_str" in tweet:
not self.retweets and data["retweet_id"] or \
not self.replies and data["reply"]:
continue continue
data.update(metadata)
if self.videos and "-videoContainer" in tweet: if "extended_entities" not in tweet:
yield Message.Directory, data continue
tweet.update(metadata)
yield Message.Directory, tweet
for tweet["num"], media in enumerate(
tweet["extended_entities"]["media"], 1):
tweet.update(media["original_info"])
if "video_info" in media and self.videos:
if self.videos == "ytdl":
url = "ytdl:{}/i/web/status/{}".format(
self.root, tweet["id_str"])
tweet["extension"] = None
yield Message.Url, url, tweet
else:
video_info = media["video_info"]
variant = max(
video_info["variants"],
key=lambda v: v.get("bitrate", 0),
)
tweet["duration"] = video_info.get(
"duration_millis", 0) / 1000
tweet["bitrate"] = variant.get("bitrate", 0)
url = variant["url"]
text.nameext_from_url(url, tweet)
yield Message.Url, url, tweet
if self.videos == "ytdl":
data["extension"] = None
url = "ytdl:{}/i/web/status/{}".format(
self.root, data["tweet_id"])
else: else:
url = self._video_from_tweet(data["tweet_id"]) url = media["media_url_https"]
if not url:
continue
text.nameext_from_url(url, data)
if data["extension"] == "m3u8":
url = "ytdl:" + url
data["extension"] = "mp4"
data["_ytdl_extra"] = {"protocol": "m3u8_native"}
data["num"] = 1
yield Message.Url, url, data
elif "data-image-url=" in tweet:
yield Message.Directory, data
images = text.extract_iter(
tweet, 'data-image-url="', '"')
for data["num"], url in enumerate(images, 1):
text.nameext_from_url(url, data)
urls = [url + size for size in self.sizes] urls = [url + size for size in self.sizes]
yield Message.Urllist, urls, data text.nameext_from_url(url, tweet)
yield Message.Urllist, urls, tweet
if self.twitpic and "//twitpic.com/" in tweet:
urls = [
url for url in text.extract_iter(
tweet, 'data-expanded-url="', '"')
if "//twitpic.com/" in url
]
if "num" not in data:
if urls:
yield Message.Directory, data
data["num"] = 0
for data["num"], url in enumerate(urls, data["num"]+1):
response = self.request(url, fatal=False)
if response.status_code >= 400:
continue
url = text.extract(
response.text, 'name="twitter:image" value="', '"')[0]
yield Message.Url, url, text.nameext_from_url(url, data)
def metadata(self): def metadata(self):
"""Return general metadata""" """Return general metadata"""
return {} return {}
def tweets(self): def tweets(self):
"""Yield HTML content of all relevant tweets""" """Yield all relevant tweet objects"""
def login(self): def login(self):
username, password = self._get_auth_info() username, password = self._get_auth_info()
if username: if username:
self._update_cookies(self._login_impl(username, password)) self._update_cookies(self._login_impl(username, password))
self.logged_in = True # self.logged_in = True
@cache(maxage=360*24*3600, keyarg=1) @cache(maxage=360*24*3600, keyarg=1)
def _login_impl(self, username, password): def _login_impl(self, username, password):
self.log.warning(
'Logging in with username and password is currently not possible. '
'Use cookies from your browser session instead.')
return {}
"""
self.log.info("Logging in as %s", username) self.log.info("Logging in as %s", username)
headers = {"User-Agent": self.user_agent} headers = {"User-Agent": self.user_agent}
@@ -130,7 +112,7 @@ class TwitterExtractor(Extractor):
data = { data = {
"session[username_or_email]": username, "session[username_or_email]": username,
"session[password]" : password, "session[password]" : password,
"authenticity_token" : token, # "authenticity_token" : token,
"ui_metrics" : '{"rf":{},"s":""}', "ui_metrics" : '{"rf":{},"s":""}',
"scribe_log" : "", "scribe_log" : "",
"redirect_after_login" : "", "redirect_after_login" : "",
@@ -145,138 +127,7 @@ class TwitterExtractor(Extractor):
for cookie in self.session.cookies for cookie in self.session.cookies
if cookie.domain and "twitter.com" in cookie.domain if cookie.domain and "twitter.com" in cookie.domain
} }
"""
def _data_from_tweet(self, tweet):
extr = text.extract_from(tweet)
data = {
"tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')),
"reply" : bool(extr('data-is-reply-to="' , '"')),
"retweet_id": text.parse_int(extr('data-retweet-id="', '"')),
"retweeter" : extr('data-retweeter="' , '"'),
"author" : {
"name" : extr('data-screen-name="', '"'),
"nick" : text.unescape(extr('data-name="' , '"')),
"id" : text.parse_int(extr('data-user-id="' , '"')),
},
}
if not self._user_dict:
if data["retweet_id"]:
for user in json.loads(text.unescape(extr(
'data-reply-to-users-json="', '"'))):
if user["screen_name"] == data["retweeter"]:
break
else:
self.log.warning("Unable to extract user info")
return None
self._user_dict = {
"name": user["screen_name"],
"nick": text.unescape(user["name"]),
"id" : text.parse_int(user["id_str"]),
}
else:
self._user_dict = data["author"]
data["user"] = self._user_dict
data["date"] = text.parse_timestamp(extr('data-time="', '"'))
if self.content:
content = extr('<div class="js-tweet-text-container">', '\n</div>')
if '<img class="Emoji ' in content:
content = self._emoji_sub(r"\1", content)
content = text.unescape(text.remove_html(content, "", ""))
cl, _, cr = content.rpartition("pic.twitter.com/")
data["content"] = cl if cl and len(cr) < 16 else content
if extr('<div class="QuoteTweet', '>'):
data["retweet_id"] = text.parse_int(extr('data-item-id="', '"'))
data["retweeter"] = data["user"]["name"]
data["author"] = {
"name" : extr('data-screen-name="', '"'),
"id" : text.parse_int(extr('data-user-id="' , '"')),
"nick" : text.unescape(extr(
'QuoteTweet-fullname', '<').partition('>')[2]),
}
return data
def _video_from_tweet(self, tweet_id):
url = "https://api.twitter.com/1.1/videos/tweet/config/{}.json".format(
tweet_id)
cookies = None
headers = {
"Origin" : self.root,
"Referer" : "{}/i/web/status/{}".format(self.root, tweet_id),
"x-csrf-token" : self.session.cookies.get("ct0"),
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekM"
"xqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28N"
"HfOPqkca3qaAxGfsyKCs0wRbw",
}
if self.logged_in:
headers["x-twitter-auth-type"] = "OAuth2Session"
else:
token = _guest_token(self, headers)
cookies = {"gt": token}
headers["x-guest-token"] = token
response = self.request(
url, cookies=cookies, headers=headers, fatal=None)
if response.status_code == 429 or \
response.headers.get("x-rate-limit-remaining") == "0":
if self.logged_in:
self.wait(until=response.headers.get("x-rate-limit-reset"))
else:
_guest_token.invalidate()
return self._video_from_tweet(tweet_id)
elif response.status_code >= 400:
self.log.warning("Unable to fetch video data for %s ('%s %s')",
tweet_id, response.status_code, response.reason)
return None
return response.json()["track"]["playbackUrl"]
def _tweets_from_api(self, url, max_position=None):
params = {
"include_available_features": "1",
"include_entities": "1",
"max_position": max_position,
"reset_error_state": "false",
"lang": "en",
}
headers = {
"X-Requested-With": "XMLHttpRequest",
"X-Twitter-Active-User": "yes",
"Referer": self.root + "/",
}
while True:
data = self.request(url, params=params, headers=headers).json()
if "inner" in data:
data = data["inner"]
for tweet in text.extract_iter(
data["items_html"], '<div class="tweet ', '\n</li>'):
yield tweet
if data.get("min_position") is None:
if data["has_more_items"] and "min_position" not in data:
pass
else:
return
if "min_position" in data:
position = data["min_position"]
if position == max_position or position is None:
return
else:
position = text.parse_int(text.extract(
tweet, 'data-tweet-id="', '"')[0])
if max_position and position >= max_position:
return
params["max_position"] = max_position = position
class TwitterTimelineExtractor(TwitterExtractor): class TwitterTimelineExtractor(TwitterExtractor):
@@ -288,15 +139,12 @@ class TwitterTimelineExtractor(TwitterExtractor):
("https://twitter.com/supernaturepics", { ("https://twitter.com/supernaturepics", {
"range": "1-40", "range": "1-40",
"url": "0106229d408f4111d9a52c8fd2ad687f64842aa4", "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
"keyword": "4a3d28cc9f7a39e27333d56f3fe19e6e07ee979e",
}), }),
("https://mobile.twitter.com/supernaturepics?p=i"), ("https://mobile.twitter.com/supernaturepics?p=i"),
) )
def tweets(self): def tweets(self):
url = "{}/i/profiles/show/{}/timeline/tweets".format( return TwitterAPI(self).timeline_profile(self.user)
self.root, self.user)
return self._tweets_from_api(url)
class TwitterMediaExtractor(TwitterExtractor): class TwitterMediaExtractor(TwitterExtractor):
@@ -313,9 +161,7 @@ class TwitterMediaExtractor(TwitterExtractor):
) )
def tweets(self): def tweets(self):
url = "{}/i/profiles/show/{}/media_timeline".format( return TwitterAPI(self).timeline_media(self.user)
self.root, self.user)
return self._tweets_from_api(url)
class TwitterSearchExtractor(TwitterExtractor): class TwitterSearchExtractor(TwitterExtractor):
@@ -333,9 +179,7 @@ class TwitterSearchExtractor(TwitterExtractor):
return {"search": self.user} return {"search": self.user}
def tweets(self): def tweets(self):
url = "{}/i/search/timeline?f=tweets&q={}".format( return TwitterAPI(self).search(self.user)
self.root, self.user)
return self._tweets_from_api(url, "-1")
class TwitterTweetExtractor(TwitterExtractor): class TwitterTweetExtractor(TwitterExtractor):
@@ -346,23 +190,20 @@ class TwitterTweetExtractor(TwitterExtractor):
test = ( test = (
("https://twitter.com/supernaturepics/status/604341487988576256", { ("https://twitter.com/supernaturepics/status/604341487988576256", {
"url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580", "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580",
"keyword": "76e018cf3f4c8b82d3bdd425e01e28078c98373b",
"content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab", "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
}), }),
# 4 images # 4 images
("https://twitter.com/perrypumas/status/894001459754180609", { ("https://twitter.com/perrypumas/status/894001459754180609", {
"url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6", "url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6",
"keyword": "c9251b1fd79d547b0c6b4577f06c937d0e9b63d2",
}), }),
# video # video
("https://twitter.com/perrypumas/status/1065692031626829824", { ("https://twitter.com/perrypumas/status/1065692031626829824", {
"options": (("videos", True),), "options": (("videos", True),),
"pattern": r"ytdl:https://video.twimg.com/ext_tw_video/.*.m3u8", "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5",
}), }),
# content with emoji, newlines, hashtags (#338) # content with emoji, newlines, hashtags (#338)
("https://twitter.com/playpokemon/status/1263832915173048321", { ("https://twitter.com/playpokemon/status/1263832915173048321", {
"options": (("content", True),), "keyword": {"full_text": (
"keyword": {"content": (
r"re:Gear up for #PokemonSwordShieldEX with special Mystery " r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
"Gifts! \n\nYoull be able to receive four Galarian form " "Gifts! \n\nYoull be able to receive four Galarian form "
"Pokémon with Hidden Abilities, plus some very useful items. " "Pokémon with Hidden Abilities, plus some very useful items. "
@@ -386,10 +227,6 @@ class TwitterTweetExtractor(TwitterExtractor):
# quoted tweet (#526) # quoted tweet (#526)
("https://twitter.com/Pistachio/status/1222690391817932803", { ("https://twitter.com/Pistachio/status/1222690391817932803", {
"pattern": r"https://pbs\.twimg\.com/media/EPfMfDUU8AAnByO\.jpg", "pattern": r"https://pbs\.twimg\.com/media/EPfMfDUU8AAnByO\.jpg",
"keyword": {
"author": {"name": "Afro_Herper", "id": 786047748508221440},
"user" : {"name": "Pistachio" , "id": 3533231},
},
}), }),
# TwitPic embeds (#579) # TwitPic embeds (#579)
("https://twitter.com/i/web/status/112900228289540096", { ("https://twitter.com/i/web/status/112900228289540096", {
@@ -404,18 +241,7 @@ class TwitterTweetExtractor(TwitterExtractor):
self.tweet_id = match.group(2) self.tweet_id = match.group(2)
def tweets(self): def tweets(self):
url = "{}/i/web/status/{}".format(self.root, self.tweet_id) return TwitterAPI(self).tweet(self.tweet_id)
cookies = {"app_shell_visited": "1"}
headers = {"User-Agent": self.user_agent, "Referer": url}
response = self.request(url, cookies=cookies, headers=headers)
if response.history and response.url == self.root + "/":
raise exception.AuthorizationError()
page = response.text
end = page.index('class="js-tweet-stats-container')
beg = page.rindex('<div class="tweet ', 0, end)
return (page[beg:end],)
class TwitterBookmarkExtractor(TwitterExtractor): class TwitterBookmarkExtractor(TwitterExtractor):
@@ -424,15 +250,26 @@ class TwitterBookmarkExtractor(TwitterExtractor):
pattern = r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/i/bookmarks()" pattern = r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/i/bookmarks()"
test = ("https://twitter.com/i/bookmarks",) test = ("https://twitter.com/i/bookmarks",)
def items(self): def tweets(self):
self.login() return TwitterAPI(self).bookmarks()
if not self.logged_in:
raise exception.AuthorizationError("Login required")
for cookie in self.session.cookies:
cookie.expires = None
url = "https://api.twitter.com/2/timeline/bookmark.json"
params = { class TwitterAPI():
def __init__(self, extractor):
self.extractor = extractor
self.headers = {
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
"COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
"4FA33AGWWjCpTnA",
"x-guest-token": None,
"x-twitter-client-language": "en",
"x-twitter-active-user": "yes",
"x-csrf-token": None,
"Origin": "https://twitter.com",
"Referer": "https://twitter.com/",
}
self.params = {
"include_profile_interstitial_type": "1", "include_profile_interstitial_type": "1",
"include_blocking": "1", "include_blocking": "1",
"include_blocked_by": "1", "include_blocked_by": "1",
@@ -453,47 +290,120 @@ class TwitterBookmarkExtractor(TwitterExtractor):
"include_ext_media_color": "true", "include_ext_media_color": "true",
"include_ext_media_availability": "true", "include_ext_media_availability": "true",
"send_error_codes": "true", "send_error_codes": "true",
"simple_quoted_tweets": "true", "simple_quoted_tweet": "true",
# "count": "20",
"count": "100", "count": "100",
"cursor": None, "cursor": None,
"ext": "mediaStats%2CcameraMoment", "ext": "mediaStats%2ChighlightedLabel%2CcameraMoment",
"include_quote_count": "true",
} }
headers = {
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR" cookies = self.extractor.session.cookies
"COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
"4FA33AGWWjCpTnA", # CSRF
"Origin": self.root, csrf = hashlib.md5(str(time.time()).encode()).hexdigest()
"Referer": self.root + "/i/bookmarks", self.headers["x-csrf-token"] = csrf
"x-csrf-token": self.session.cookies.get("ct0"), cookies.set("ct0", csrf, domain=".twitter.com")
"x-twitter-active-user": "yes",
"x-twitter-auth-type": "OAuth2Session", if cookies.get("auth_token", domain=".twitter.com"):
"x-twitter-client-language": "en", self.headers["x-twitter-auth-type"] = "OAuth2Session"
else:
# guest token
guest_token = _guest_token(self.extractor, self.headers)
self.headers["x-guest-token"] = guest_token
cookies.set("gt", guest_token, domain=".twitter.com")
def tweet(self, tweet_id):
endpoint = "2/timeline/conversation/{}.json".format(tweet_id)
for tweet in self._pagination(endpoint):
if tweet["id_str"] == tweet_id:
return (tweet,)
return ()
def timeline_profile(self, screen_name):
user = self.user_by_screen_name(screen_name)
endpoint = "2/timeline/profile/{}.json".format(user["rest_id"])
return self._pagination(endpoint)
def timeline_media(self, screen_name):
user = self.user_by_screen_name(screen_name)
endpoint = "2/timeline/media/{}.json".format(user["rest_id"])
return self._pagination(endpoint)
def search(self, query):
endpoint = "2/search/adaptive.json"
params = self.params.copy()
params["q"] = query
return self._pagination(
endpoint, params, "sq-I-t-", "sq-cursor-bottom")
def bookmarks(self):
endpoint = "2/timeline/bookmark.json"
return self._pagination(endpoint)
@memcache()
def user_by_screen_name(self, screen_name):
endpoint = "graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName"
params = {
"variables": '{"screen_name":"' + screen_name + '"'
',"withHighlightedLabel":true}'
} }
return self._call(endpoint, params)["data"]["user"]
def _call(self, endpoint, params):
url = "https://api.twitter.com/" + endpoint
response = self.extractor.request(
url, params=params, headers=self.headers, fatal=None)
if response.status_code < 400:
return response.json()
if response.status_code == 429:
self.extractor.wait(until=response.headers["x-rate-limit-reset"])
return self._call(endpoint, params)
raise exception.StopExtraction(
"%s %s (%s)", response.status_code, response.reason, response.text)
def _pagination(self, endpoint, params=None,
entry_tweet="tweet-", entry_cursor="cursor-bottom-"):
if params is None:
params = self.params.copy()
while True: while True:
response = self.request( cursor = None
url, params=params, headers=headers, fatal=False) data = self._call(endpoint, params)
if response.status_code >= 400:
raise exception.StopExtraction(response.text)
data = response.json()
tweets = data["globalObjects"]["tweets"] tweets = data["globalObjects"]["tweets"]
users = data["globalObjects"]["users"]
instr = data["timeline"]["instructions"][0]
if not tweets: for entry in instr["addEntries"]["entries"]:
if entry["entryId"].startswith(entry_tweet):
tid = entry["content"]["item"]["content"]["tweet"]["id"]
tweet = tweets[tid]
tweet["user"] = users[tweet["user_id_str"]]
if "quoted_status_id_str" in tweet:
quoted = tweets[tweet["quoted_status_id_str"]]
tweet["author"] = tweet["user"]
if "extended_entities" in quoted:
tweet["extended_entities"] = \
quoted["extended_entities"]
elif "retweeted_status_id_str" in tweet:
retweet = tweets[tweet["retweeted_status_id_str"]]
tweet["author"] = users[retweet["user_id_str"]]
else:
tweet["author"] = tweet["user"]
yield tweet
elif entry["entryId"].startswith(entry_cursor):
cursor = entry["content"]["operation"]["cursor"]["value"]
if not cursor or params["cursor"] == cursor:
return return
for tweet_id, tweet_data in tweets.items(): params["cursor"] = cursor
tweet_url = "{}/i/web/status/{}".format(self.root, tweet_id)
tweet_data["_extractor"] = TwitterTweetExtractor
yield Message.Queue, tweet_url, tweet_data
inst = data["timeline"]["instructions"][0]
for entry in inst["addEntries"]["entries"]:
if entry["entryId"].startswith("cursor-bottom-"):
params["cursor"] = \
entry["content"]["operation"]["cursor"]["value"]
break
@memcache() @cache(maxage=3600)
def _guest_token(extr, headers): def _guest_token(extr, headers):
return extr.request( return extr.request(
"https://api.twitter.com/1.1/guest/activate.json", "https://api.twitter.com/1.1/guest/activate.json",

View File

@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
__version__ = "1.14.0" __version__ = "1.14.1-dev"