[twitter] metadata cleanup #2
- remove useless clutter by creating new tweet-data dicts instead of
reusing the original Tweet objects
- rename fields to how they were named before
('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'
This commit is contained in:
@@ -18,9 +18,9 @@ import time
|
|||||||
class TwitterExtractor(Extractor):
|
class TwitterExtractor(Extractor):
|
||||||
"""Base class for twitter extractors"""
|
"""Base class for twitter extractors"""
|
||||||
category = "twitter"
|
category = "twitter"
|
||||||
directory_fmt = ("{category}", "{user[screen_name]}")
|
directory_fmt = ("{category}", "{user[name]}")
|
||||||
filename_fmt = "{id_str}_{num}.{extension}"
|
filename_fmt = "{tweet_id}_{num}.{extension}"
|
||||||
archive_fmt = "{id_str}_{num}"
|
archive_fmt = "{tweet_id}_{retweet_id}_{num}"
|
||||||
cookiedomain = ".twitter.com"
|
cookiedomain = ".twitter.com"
|
||||||
root = "https://twitter.com"
|
root = "https://twitter.com"
|
||||||
sizes = (":orig", ":large", ":medium", ":small")
|
sizes = (":orig", ":large", ":medium", ":small")
|
||||||
@@ -32,6 +32,7 @@ class TwitterExtractor(Extractor):
|
|||||||
self.replies = self.config("replies", True)
|
self.replies = self.config("replies", True)
|
||||||
self.twitpic = self.config("twitpic", False)
|
self.twitpic = self.config("twitpic", False)
|
||||||
self.videos = self.config("videos", True)
|
self.videos = self.config("videos", True)
|
||||||
|
self._user_cache = {}
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
self.login()
|
self.login()
|
||||||
@@ -49,26 +50,23 @@ class TwitterExtractor(Extractor):
|
|||||||
if "extended_entities" not in tweet:
|
if "extended_entities" not in tweet:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
tweet.update(metadata)
|
tdata = self._transform_tweet(tweet)
|
||||||
tweet["date"] = text.parse_datetime(
|
tdata.update(metadata)
|
||||||
tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
|
||||||
entities = tweet["extended_entities"]
|
|
||||||
del tweet["extended_entities"]
|
|
||||||
del tweet["entities"]
|
|
||||||
|
|
||||||
yield Message.Directory, tweet
|
yield Message.Directory, tdata
|
||||||
for tweet["num"], media in enumerate(entities["media"], 1):
|
for tdata["num"], media in enumerate(
|
||||||
|
tweet["extended_entities"]["media"], 1):
|
||||||
|
|
||||||
tweet["width"] = media["original_info"].get("width", 0)
|
tdata["width"] = media["original_info"].get("width", 0)
|
||||||
tweet["height"] = media["original_info"].get("height", 0)
|
tdata["height"] = media["original_info"].get("height", 0)
|
||||||
|
|
||||||
if "video_info" in media and self.videos:
|
if "video_info" in media and self.videos:
|
||||||
|
|
||||||
if self.videos == "ytdl":
|
if self.videos == "ytdl":
|
||||||
url = "ytdl:{}/i/web/status/{}".format(
|
url = "ytdl:{}/i/web/status/{}".format(
|
||||||
self.root, tweet["id_str"])
|
self.root, tweet["id_str"])
|
||||||
tweet["extension"] = None
|
tdata["extension"] = None
|
||||||
yield Message.Url, url, tweet
|
yield Message.Url, url, tdata
|
||||||
|
|
||||||
else:
|
else:
|
||||||
video_info = media["video_info"]
|
video_info = media["video_info"]
|
||||||
@@ -76,24 +74,24 @@ class TwitterExtractor(Extractor):
|
|||||||
video_info["variants"],
|
video_info["variants"],
|
||||||
key=lambda v: v.get("bitrate", 0),
|
key=lambda v: v.get("bitrate", 0),
|
||||||
)
|
)
|
||||||
tweet["duration"] = video_info.get(
|
tdata["duration"] = video_info.get(
|
||||||
"duration_millis", 0) / 1000
|
"duration_millis", 0) / 1000
|
||||||
tweet["bitrate"] = variant.get("bitrate", 0)
|
tdata["bitrate"] = variant.get("bitrate", 0)
|
||||||
|
|
||||||
url = variant["url"]
|
url = variant["url"]
|
||||||
text.nameext_from_url(url, tweet)
|
text.nameext_from_url(url, tdata)
|
||||||
yield Message.Url, url, tweet
|
yield Message.Url, url, tdata
|
||||||
|
|
||||||
elif "media_url_https" in media:
|
elif "media_url_https" in media:
|
||||||
url = media["media_url_https"]
|
url = media["media_url_https"]
|
||||||
urls = [url + size for size in self.sizes]
|
urls = [url + size for size in self.sizes]
|
||||||
text.nameext_from_url(url, tweet)
|
text.nameext_from_url(url, tdata)
|
||||||
yield Message.Urllist, urls, tweet
|
yield Message.Urllist, urls, tdata
|
||||||
|
|
||||||
else:
|
else:
|
||||||
url = media["media_url"]
|
url = media["media_url"]
|
||||||
text.nameext_from_url(url, tweet)
|
text.nameext_from_url(url, tdata)
|
||||||
yield Message.Url, url, tweet
|
yield Message.Url, url, tdata
|
||||||
|
|
||||||
def _extract_twitpic(self, tweet):
|
def _extract_twitpic(self, tweet):
|
||||||
twitpics = []
|
twitpics = []
|
||||||
@@ -115,6 +113,73 @@ class TwitterExtractor(Extractor):
|
|||||||
else:
|
else:
|
||||||
tweet["extended_entities"] = {"media": twitpics}
|
tweet["extended_entities"] = {"media": twitpics}
|
||||||
|
|
||||||
|
def _transform_tweet(self, tweet):
    """Build a new, reduced metadata dict from a raw tweet object

    Only the fields selected below are copied over, under their
    renamed keys ('id_str' -> 'tweet_id', 'full_text' -> 'content', ...).
    The given 'tweet' itself is not modified.
    """
    entities = tweet["entities"]
    to_int = text.parse_int

    tdata = {"tweet_id": to_int(tweet["id_str"])}

    # IDs of related tweets; these keys may be absent from 'tweet',
    # in which case None is handed to text.parse_int()
    # NOTE(review): presumably parse_int() maps None to a default - confirm
    for key, source in (
        ("retweet_id", "retweeted_status_id_str"),
        ("quote_id", "quoted_status_id_str"),
        ("reply_id", "in_reply_to_status_id_str"),
    ):
        tdata[key] = to_int(tweet.get(source))

    tdata["date"] = text.parse_datetime(
        tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
    tdata["user"] = self._transform_user(tweet["user"])
    tdata["lang"] = tweet["lang"]
    tdata["content"] = tweet["full_text"]

    # counters are required fields and keep their original names
    for key in ("favorite_count", "quote_count",
                "reply_count", "retweet_count"):
        tdata[key] = tweet[key]

    # optional fields: only added when there is something to report
    tags = entities.get("hashtags")
    if tags:
        tdata["hashtags"] = [tag["text"] for tag in tags]

    mentioned = entities.get("user_mentions")
    if mentioned:
        mentions = []
        for user in mentioned:
            mentions.append({
                "id": to_int(user["id_str"]),
                "name": user["screen_name"],
                "nick": user["name"],
            })
        tdata["mentions"] = mentions

    if "full_text_quoted" in tweet:
        tdata["content_quoted"] = tweet["full_text_quoted"]

    if "author" in tweet:
        tdata["author"] = self._transform_user(tweet["author"])

    return tdata
|
||||||
|
|
||||||
|
def _transform_user(self, user):
    """Return the reduced metadata dict for a raw user object

    Results are memoized in 'self._user_cache' under the user's
    'id_str', so repeated calls for the same user return the very
    same dict object instead of rebuilding it.
    """
    uid = user["id_str"]
    known = self._user_cache

    udata = known.get(uid)
    if udata is not None:
        return udata

    udata = {
        "id": text.parse_int(uid),
        "name": user["screen_name"],
        "nick": user["name"],
        "description": user["description"],
        "location": user["location"],
        "date": text.parse_datetime(
            user["created_at"], "%a %b %d %H:%M:%S %z %Y"),
        "verified": user.get("verified", False),
        "profile_banner": user.get("profile_banner_url", ""),
        # drop the "_normal" suffix from the profile image URL
        "profile_image": user.get(
            "profile_image_url_https", "").replace("_normal.", "."),
    }

    # counters are required fields and keep their original names
    for key in ("favourites_count", "followers_count", "friends_count",
                "listed_count", "media_count", "statuses_count"):
        udata[key] = user[key]

    # store only after the dict was built completely
    known[uid] = udata
    return udata
|
||||||
|
|
||||||
def metadata(self):
|
def metadata(self):
|
||||||
"""Return general metadata"""
|
"""Return general metadata"""
|
||||||
return {}
|
return {}
|
||||||
@@ -235,7 +300,7 @@ class TwitterTweetExtractor(TwitterExtractor):
|
|||||||
}),
|
}),
|
||||||
# content with emoji, newlines, hashtags (#338)
|
# content with emoji, newlines, hashtags (#338)
|
||||||
("https://twitter.com/playpokemon/status/1263832915173048321", {
|
("https://twitter.com/playpokemon/status/1263832915173048321", {
|
||||||
"keyword": {"full_text": (
|
"keyword": {"content": (
|
||||||
r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
|
r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
|
||||||
"Gifts! \n\nYou’ll be able to receive four Galarian form "
|
"Gifts! \n\nYou’ll be able to receive four Galarian form "
|
||||||
"Pokémon with Hidden Abilities, plus some very useful items. "
|
"Pokémon with Hidden Abilities, plus some very useful items. "
|
||||||
@@ -418,16 +483,16 @@ class TwitterAPI():
|
|||||||
tweet["user"] = users[tweet["user_id_str"]]
|
tweet["user"] = users[tweet["user_id_str"]]
|
||||||
|
|
||||||
if "quoted_status_id_str" in tweet:
|
if "quoted_status_id_str" in tweet:
|
||||||
quoted = tweets[tweet["quoted_status_id_str"]]
|
quoted = tweets.get(tweet["quoted_status_id_str"])
|
||||||
tweet["author"] = tweet["user"]
|
if quoted:
|
||||||
if "extended_entities" in quoted:
|
tweet["full_text_quoted"] = quoted["full_text"]
|
||||||
tweet["extended_entities"] = \
|
if "extended_entities" in quoted:
|
||||||
quoted["extended_entities"]
|
tweet["extended_entities"] = \
|
||||||
|
quoted["extended_entities"]
|
||||||
elif "retweeted_status_id_str" in tweet:
|
elif "retweeted_status_id_str" in tweet:
|
||||||
retweet = tweets[tweet["retweeted_status_id_str"]]
|
retweet = tweets.get(tweet["retweeted_status_id_str"])
|
||||||
tweet["author"] = users[retweet["user_id_str"]]
|
if retweet:
|
||||||
else:
|
tweet["author"] = users[retweet["user_id_str"]]
|
||||||
tweet["author"] = tweet["user"]
|
|
||||||
|
|
||||||
yield tweet
|
yield tweet
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user