[twitter] metadata cleanup #2
- remove useless clutter by creating new tweet-data dicts instead of
reusing the original Tweet objects
- rename fields to how they were named before
('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'
This commit is contained in:
@@ -18,9 +18,9 @@ import time
|
||||
class TwitterExtractor(Extractor):
|
||||
"""Base class for twitter extractors"""
|
||||
category = "twitter"
|
||||
directory_fmt = ("{category}", "{user[screen_name]}")
|
||||
filename_fmt = "{id_str}_{num}.{extension}"
|
||||
archive_fmt = "{id_str}_{num}"
|
||||
directory_fmt = ("{category}", "{user[name]}")
|
||||
filename_fmt = "{tweet_id}_{num}.{extension}"
|
||||
archive_fmt = "{tweet_id}_{retweet_id}_{num}"
|
||||
cookiedomain = ".twitter.com"
|
||||
root = "https://twitter.com"
|
||||
sizes = (":orig", ":large", ":medium", ":small")
|
||||
@@ -32,6 +32,7 @@ class TwitterExtractor(Extractor):
|
||||
self.replies = self.config("replies", True)
|
||||
self.twitpic = self.config("twitpic", False)
|
||||
self.videos = self.config("videos", True)
|
||||
self._user_cache = {}
|
||||
|
||||
def items(self):
|
||||
self.login()
|
||||
@@ -49,26 +50,23 @@ class TwitterExtractor(Extractor):
|
||||
if "extended_entities" not in tweet:
|
||||
continue
|
||||
|
||||
tweet.update(metadata)
|
||||
tweet["date"] = text.parse_datetime(
|
||||
tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
|
||||
entities = tweet["extended_entities"]
|
||||
del tweet["extended_entities"]
|
||||
del tweet["entities"]
|
||||
tdata = self._transform_tweet(tweet)
|
||||
tdata.update(metadata)
|
||||
|
||||
yield Message.Directory, tweet
|
||||
for tweet["num"], media in enumerate(entities["media"], 1):
|
||||
yield Message.Directory, tdata
|
||||
for tdata["num"], media in enumerate(
|
||||
tweet["extended_entities"]["media"], 1):
|
||||
|
||||
tweet["width"] = media["original_info"].get("width", 0)
|
||||
tweet["height"] = media["original_info"].get("height", 0)
|
||||
tdata["width"] = media["original_info"].get("width", 0)
|
||||
tdata["height"] = media["original_info"].get("height", 0)
|
||||
|
||||
if "video_info" in media and self.videos:
|
||||
|
||||
if self.videos == "ytdl":
|
||||
url = "ytdl:{}/i/web/status/{}".format(
|
||||
self.root, tweet["id_str"])
|
||||
tweet["extension"] = None
|
||||
yield Message.Url, url, tweet
|
||||
tdata["extension"] = None
|
||||
yield Message.Url, url, tdata
|
||||
|
||||
else:
|
||||
video_info = media["video_info"]
|
||||
@@ -76,24 +74,24 @@ class TwitterExtractor(Extractor):
|
||||
video_info["variants"],
|
||||
key=lambda v: v.get("bitrate", 0),
|
||||
)
|
||||
tweet["duration"] = video_info.get(
|
||||
tdata["duration"] = video_info.get(
|
||||
"duration_millis", 0) / 1000
|
||||
tweet["bitrate"] = variant.get("bitrate", 0)
|
||||
tdata["bitrate"] = variant.get("bitrate", 0)
|
||||
|
||||
url = variant["url"]
|
||||
text.nameext_from_url(url, tweet)
|
||||
yield Message.Url, url, tweet
|
||||
text.nameext_from_url(url, tdata)
|
||||
yield Message.Url, url, tdata
|
||||
|
||||
elif "media_url_https" in media:
|
||||
url = media["media_url_https"]
|
||||
urls = [url + size for size in self.sizes]
|
||||
text.nameext_from_url(url, tweet)
|
||||
yield Message.Urllist, urls, tweet
|
||||
text.nameext_from_url(url, tdata)
|
||||
yield Message.Urllist, urls, tdata
|
||||
|
||||
else:
|
||||
url = media["media_url"]
|
||||
text.nameext_from_url(url, tweet)
|
||||
yield Message.Url, url, tweet
|
||||
text.nameext_from_url(url, tdata)
|
||||
yield Message.Url, url, tdata
|
||||
|
||||
def _extract_twitpic(self, tweet):
|
||||
twitpics = []
|
||||
@@ -115,6 +113,73 @@ class TwitterExtractor(Extractor):
|
||||
else:
|
||||
tweet["extended_entities"] = {"media": twitpics}
|
||||
|
||||
def _transform_tweet(self, tweet):
|
||||
entities = tweet["entities"]
|
||||
tdata = {
|
||||
"tweet_id" : text.parse_int(tweet["id_str"]),
|
||||
"retweet_id" : text.parse_int(
|
||||
tweet.get("retweeted_status_id_str")),
|
||||
"quote_id" : text.parse_int(
|
||||
tweet.get("quoted_status_id_str")),
|
||||
"reply_id" : text.parse_int(
|
||||
tweet.get("in_reply_to_status_id_str")),
|
||||
"date" : text.parse_datetime(
|
||||
tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
|
||||
"user" : self._transform_user(tweet["user"]),
|
||||
"lang" : tweet["lang"],
|
||||
"content" : tweet["full_text"],
|
||||
"favorite_count": tweet["favorite_count"],
|
||||
"quote_count" : tweet["quote_count"],
|
||||
"reply_count" : tweet["reply_count"],
|
||||
"retweet_count" : tweet["retweet_count"],
|
||||
}
|
||||
|
||||
hashtags = entities.get("hashtags")
|
||||
if hashtags:
|
||||
tdata["hashtags"] = [t["text"] for t in hashtags]
|
||||
|
||||
mentions = entities.get("user_mentions")
|
||||
if mentions:
|
||||
tdata["mentions"] = [{
|
||||
"id": text.parse_int(u["id_str"]),
|
||||
"name": u["screen_name"],
|
||||
"nick": u["name"],
|
||||
} for u in mentions]
|
||||
|
||||
if "full_text_quoted" in tweet:
|
||||
tdata["content_quoted"] = tweet["full_text_quoted"]
|
||||
|
||||
if "author" in tweet:
|
||||
tdata["author"] = self._transform_user(tweet["author"])
|
||||
|
||||
return tdata
|
||||
|
||||
def _transform_user(self, user):
|
||||
uid = user["id_str"]
|
||||
cache = self._user_cache
|
||||
|
||||
if uid not in cache:
|
||||
cache[uid] = {
|
||||
"id" : text.parse_int(uid),
|
||||
"name" : user["screen_name"],
|
||||
"nick" : user["name"],
|
||||
"description" : user["description"],
|
||||
"location" : user["location"],
|
||||
"date" : text.parse_datetime(
|
||||
user["created_at"], "%a %b %d %H:%M:%S %z %Y"),
|
||||
"verified" : user.get("verified", False),
|
||||
"profile_banner" : user.get("profile_banner_url", ""),
|
||||
"profile_image" : user.get(
|
||||
"profile_image_url_https", "").replace("_normal.", "."),
|
||||
"favourites_count": user["favourites_count"],
|
||||
"followers_count" : user["followers_count"],
|
||||
"friends_count" : user["friends_count"],
|
||||
"listed_count" : user["listed_count"],
|
||||
"media_count" : user["media_count"],
|
||||
"statuses_count" : user["statuses_count"],
|
||||
}
|
||||
return cache[uid]
|
||||
|
||||
def metadata(self):
|
||||
"""Return general metadata"""
|
||||
return {}
|
||||
@@ -235,7 +300,7 @@ class TwitterTweetExtractor(TwitterExtractor):
|
||||
}),
|
||||
# content with emoji, newlines, hashtags (#338)
|
||||
("https://twitter.com/playpokemon/status/1263832915173048321", {
|
||||
"keyword": {"full_text": (
|
||||
"keyword": {"content": (
|
||||
r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
|
||||
"Gifts! \n\nYou’ll be able to receive four Galarian form "
|
||||
"Pokémon with Hidden Abilities, plus some very useful items. "
|
||||
@@ -418,16 +483,16 @@ class TwitterAPI():
|
||||
tweet["user"] = users[tweet["user_id_str"]]
|
||||
|
||||
if "quoted_status_id_str" in tweet:
|
||||
quoted = tweets[tweet["quoted_status_id_str"]]
|
||||
tweet["author"] = tweet["user"]
|
||||
if "extended_entities" in quoted:
|
||||
tweet["extended_entities"] = \
|
||||
quoted["extended_entities"]
|
||||
quoted = tweets.get(tweet["quoted_status_id_str"])
|
||||
if quoted:
|
||||
tweet["full_text_quoted"] = quoted["full_text"]
|
||||
if "extended_entities" in quoted:
|
||||
tweet["extended_entities"] = \
|
||||
quoted["extended_entities"]
|
||||
elif "retweeted_status_id_str" in tweet:
|
||||
retweet = tweets[tweet["retweeted_status_id_str"]]
|
||||
tweet["author"] = users[retweet["user_id_str"]]
|
||||
else:
|
||||
tweet["author"] = tweet["user"]
|
||||
retweet = tweets.get(tweet["retweeted_status_id_str"])
|
||||
if retweet:
|
||||
tweet["author"] = users[retweet["user_id_str"]]
|
||||
|
||||
yield tweet
|
||||
|
||||
|
||||
Reference in New Issue
Block a user