[twitter] metadata cleanup #2

- remove useless clutter by creating new tweet-data dicts instead of
  reusing the original Tweet objects
- rename fields to how they were named before
  ('id_str' -> 'tweet_id', etc.)
- only include 'author' if it would differ from 'user'
- restore 'archive_fmt'
This commit is contained in:
Mike Fährmann
2020-06-06 23:51:54 +02:00
parent 1fcf938f9c
commit 5bc1097f9d

View File

@@ -18,9 +18,9 @@ import time
class TwitterExtractor(Extractor):
"""Base class for twitter extractors"""
category = "twitter"
directory_fmt = ("{category}", "{user[screen_name]}")
filename_fmt = "{id_str}_{num}.{extension}"
archive_fmt = "{id_str}_{num}"
directory_fmt = ("{category}", "{user[name]}")
filename_fmt = "{tweet_id}_{num}.{extension}"
archive_fmt = "{tweet_id}_{retweet_id}_{num}"
cookiedomain = ".twitter.com"
root = "https://twitter.com"
sizes = (":orig", ":large", ":medium", ":small")
@@ -32,6 +32,7 @@ class TwitterExtractor(Extractor):
self.replies = self.config("replies", True)
self.twitpic = self.config("twitpic", False)
self.videos = self.config("videos", True)
self._user_cache = {}
def items(self):
self.login()
@@ -49,26 +50,23 @@ class TwitterExtractor(Extractor):
if "extended_entities" not in tweet:
continue
tweet.update(metadata)
tweet["date"] = text.parse_datetime(
tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
entities = tweet["extended_entities"]
del tweet["extended_entities"]
del tweet["entities"]
tdata = self._transform_tweet(tweet)
tdata.update(metadata)
yield Message.Directory, tweet
for tweet["num"], media in enumerate(entities["media"], 1):
yield Message.Directory, tdata
for tdata["num"], media in enumerate(
tweet["extended_entities"]["media"], 1):
tweet["width"] = media["original_info"].get("width", 0)
tweet["height"] = media["original_info"].get("height", 0)
tdata["width"] = media["original_info"].get("width", 0)
tdata["height"] = media["original_info"].get("height", 0)
if "video_info" in media and self.videos:
if self.videos == "ytdl":
url = "ytdl:{}/i/web/status/{}".format(
self.root, tweet["id_str"])
tweet["extension"] = None
yield Message.Url, url, tweet
tdata["extension"] = None
yield Message.Url, url, tdata
else:
video_info = media["video_info"]
@@ -76,24 +74,24 @@ class TwitterExtractor(Extractor):
video_info["variants"],
key=lambda v: v.get("bitrate", 0),
)
tweet["duration"] = video_info.get(
tdata["duration"] = video_info.get(
"duration_millis", 0) / 1000
tweet["bitrate"] = variant.get("bitrate", 0)
tdata["bitrate"] = variant.get("bitrate", 0)
url = variant["url"]
text.nameext_from_url(url, tweet)
yield Message.Url, url, tweet
text.nameext_from_url(url, tdata)
yield Message.Url, url, tdata
elif "media_url_https" in media:
url = media["media_url_https"]
urls = [url + size for size in self.sizes]
text.nameext_from_url(url, tweet)
yield Message.Urllist, urls, tweet
text.nameext_from_url(url, tdata)
yield Message.Urllist, urls, tdata
else:
url = media["media_url"]
text.nameext_from_url(url, tweet)
yield Message.Url, url, tweet
text.nameext_from_url(url, tdata)
yield Message.Url, url, tdata
def _extract_twitpic(self, tweet):
twitpics = []
@@ -115,6 +113,73 @@ class TwitterExtractor(Extractor):
else:
tweet["extended_entities"] = {"media": twitpics}
def _transform_tweet(self, tweet):
    """Build a fresh, trimmed-down metadata dict from a raw tweet object.

    Only the fields listed below are copied, so none of the remaining
    keys of the original tweet object end up in the result.

    Always present:
        tweet_id, retweet_id, quote_id, reply_id (each passed through
        text.parse_int), date (parsed from 'created_at'), user (via
        self._transform_user), lang, content (the tweet's 'full_text'),
        favorite_count, quote_count, reply_count, retweet_count.

    Present only when the source data provides them:
        hashtags       -- list of hashtag texts (non-empty only)
        mentions       -- list of {id, name, nick} dicts (non-empty only)
        content_quoted -- value of 'full_text_quoted'
        author         -- transformed 'author' user object

    Raises KeyError if one of the mandatory tweet fields is missing.
    """
    entities = tweet["entities"]
    to_int = text.parse_int
    optional = tweet.get

    tdata = {
        "tweet_id"  : to_int(tweet["id_str"]),
        # these ID fields may be absent, hence .get()
        "retweet_id": to_int(optional("retweeted_status_id_str")),
        "quote_id"  : to_int(optional("quoted_status_id_str")),
        "reply_id"  : to_int(optional("in_reply_to_status_id_str")),
        "date"      : text.parse_datetime(
            tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
        "user"      : self._transform_user(tweet["user"]),
        "lang"      : tweet["lang"],
        "content"   : tweet["full_text"],
    }

    # counters are copied verbatim under their original names
    for counter in ("favorite_count", "quote_count",
                    "reply_count", "retweet_count"):
        tdata[counter] = tweet[counter]

    tags = entities.get("hashtags")
    if tags:
        tdata["hashtags"] = [tag["text"] for tag in tags]

    mentioned = entities.get("user_mentions")
    if mentioned:
        accounts = []
        for account in mentioned:
            accounts.append({
                "id"  : to_int(account["id_str"]),
                "name": account["screen_name"],
                "nick": account["name"],
            })
        tdata["mentions"] = accounts

    if "full_text_quoted" in tweet:
        tdata["content_quoted"] = tweet["full_text_quoted"]

    if "author" in tweet:
        tdata["author"] = self._transform_user(tweet["author"])

    return tdata
def _transform_user(self, user):
    """Return the trimmed-down metadata dict for a raw user object.

    Results are memoized in self._user_cache, keyed by the user's
    'id_str', so repeated calls for the same user return the very same
    dict object instead of rebuilding it.

    Raises KeyError if a mandatory user field is missing.
    """
    uid = user["id_str"]

    try:
        return self._user_cache[uid]
    except KeyError:
        pass  # first time this user is seen; build the entry below

    udata = {
        "id"            : text.parse_int(uid),
        "name"          : user["screen_name"],
        "nick"          : user["name"],
        "description"   : user["description"],
        "location"      : user["location"],
        "date"          : text.parse_datetime(
            user["created_at"], "%a %b %d %H:%M:%S %z %Y"),
        "verified"      : user.get("verified", False),
        "profile_banner": user.get("profile_banner_url", ""),
        # remove the '_normal' marker from the profile image URL
        "profile_image" : user.get(
            "profile_image_url_https", "").replace("_normal.", "."),
    }

    # counters are copied verbatim under their original names
    for counter in ("favourites_count", "followers_count",
                    "friends_count", "listed_count",
                    "media_count", "statuses_count"):
        udata[counter] = user[counter]

    self._user_cache[uid] = udata
    return udata
def metadata(self):
    """Return general metadata.

    This implementation has nothing to contribute and returns a new,
    empty dict on every call.
    """
    general = {}
    return general
@@ -235,7 +300,7 @@ class TwitterTweetExtractor(TwitterExtractor):
}),
# content with emoji, newlines, hashtags (#338)
("https://twitter.com/playpokemon/status/1263832915173048321", {
"keyword": {"full_text": (
"keyword": {"content": (
r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
"Gifts! \n\nYoull be able to receive four Galarian form "
"Pokémon with Hidden Abilities, plus some very useful items. "
@@ -418,16 +483,16 @@ class TwitterAPI():
tweet["user"] = users[tweet["user_id_str"]]
if "quoted_status_id_str" in tweet:
quoted = tweets[tweet["quoted_status_id_str"]]
tweet["author"] = tweet["user"]
if "extended_entities" in quoted:
tweet["extended_entities"] = \
quoted["extended_entities"]
quoted = tweets.get(tweet["quoted_status_id_str"])
if quoted:
tweet["full_text_quoted"] = quoted["full_text"]
if "extended_entities" in quoted:
tweet["extended_entities"] = \
quoted["extended_entities"]
elif "retweeted_status_id_str" in tweet:
retweet = tweets[tweet["retweeted_status_id_str"]]
tweet["author"] = users[retweet["user_id_str"]]
else:
tweet["author"] = tweet["user"]
retweet = tweets.get(tweet["retweeted_status_id_str"])
if retweet:
tweet["author"] = users[retweet["user_id_str"]]
yield tweet