[twitter] add extractor for media-tweet timelines (#96)

For example "https://twitter.com/PicturesEarth/media".
Media-tweet timelines differ from normal timelines in that they do not
contain any (re)tweets from other users, and they feature all media the
user has ever posted, including media posted in replies to other tweets.
This commit is contained in:
Mike Fährmann
2018-08-19 20:36:33 +02:00
parent f45c9f2141
commit e9dd2eff1d
4 changed files with 46 additions and 30 deletions

View File

@@ -76,7 +76,7 @@ SmugMug https://www.smugmug.com/ |Albums, individ-5|
Subapics https://subapics.com/ Chapters, Manga
The /b/ Archive https://thebarchive.com/ Threads
Tumblr https://www.tumblr.com/ Images from Users, Likes, Posts, Tag-Searches Optional (OAuth)
Twitter https://twitter.com/ Timelines, Tweets
Twitter https://twitter.com/ Timelines, Tweets, Media Tweets
Warosu https://warosu.org/ Threads
World Three http://www.slide.world-three.org/ Chapters, Manga
XVideos https://www.xvideos.com/ Images from Users, Galleries

View File

@@ -81,7 +81,7 @@ class SmugmugImageExtractor(SmugmugExtractor):
pattern = [BASE_PATTERN + r"(?:/[^/?&#]+)+/i-([^/?&#]+)"]
test = [("https://acapella.smugmug.com/Micro-Macro/Drops/i-g2Dmf9z", {
"url": "ab0d7aa001a53ff3fd228622070b39005b6fc179",
"keyword": "4fcc02599d180321b22a7f7238102c48d5410c05",
"keyword": "a116167929c22338e6067b81c5d3bee641df3af3",
"content": "64a8f69a1d824921eebbdf2420087937adfa45cd",
})]

View File

@@ -20,10 +20,10 @@ class TwitterExtractor(Extractor):
archive_fmt = "{tweet_id}_{retweet_id}_{num}"
root = "https://twitter.com"
def __init__(self):
def __init__(self, match):
Extractor.__init__(self)
self.user = None
self.retweets = True
self.user = match.group(1)
self.retweets = self.config("retweets", True)
def items(self):
yield Message.Version, 1
@@ -45,9 +45,11 @@ class TwitterExtractor(Extractor):
def metadata(self):
    """Return general metadata"""
    # The only general metadata is the user name stored on the extractor.
    info = {}
    info["user"] = self.user
    return info
def tweets(self):
    """Yield HTML content of all relevant tweets"""
    # Base implementation: an empty tuple, i.e. nothing to iterate over.
    no_tweets = ()
    return no_tweets
@staticmethod
def _data_from_tweet(tweet):
@@ -64,29 +66,7 @@ class TwitterExtractor(Extractor):
data["retweeter"] = data["retweeter"] or ""
return data
class TwitterTimelineExtractor(TwitterExtractor):
"""Extractor for all tweeted images from a user's timeline"""
subcategory = "timeline"
pattern = [r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
r"/([^/?&#]+)/?$"]
test = [("https://twitter.com/PicturesEarth", {
"range": (1, 40),
"url": "2f4d51cbba81e56c1c755677b3ad58fc167c9771",
"keyword": "cbae53b6f4ba133078bb13c95dbd3cbb4fa40b9f",
})]
def __init__(self, match):
TwitterExtractor.__init__(self)
self.user = match.group(1)
self.retweets = self.config("retweets", True)
def metadata(self):
return {"user": self.user}
def tweets(self):
url = "{}/i/profiles/show/{}/timeline/tweets".format(
self.root, self.user)
def _tweets_from_api(self, url):
params = {
"include_available_features": "1",
"include_entities": "1",
@@ -112,6 +92,39 @@ class TwitterTimelineExtractor(TwitterExtractor):
tweet, 'data-tweet-id="', '"')[0]
class TwitterTimelineExtractor(TwitterExtractor):
    """Extractor for all images from a user's timeline"""
    subcategory = "timeline"
    # Single capture group: the first path segment after the host. Only an
    # optional trailing slash may follow it before the end of the URL.
    pattern = [
        r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
        r"/([^/?&#]+)/?$",
    ]
    test = [
        ("https://twitter.com/PicturesEarth", {
            "range": (1, 40),
            "url": "2f4d51cbba81e56c1c755677b3ad58fc167c9771",
            "keyword": "cbae53b6f4ba133078bb13c95dbd3cbb4fa40b9f",
        }),
    ]

    def tweets(self):
        """Delegate to `_tweets_from_api` with this user's timeline URL."""
        endpoint = "{}/i/profiles/show/{}/timeline/tweets"
        return self._tweets_from_api(endpoint.format(self.root, self.user))
class TwitterMediaExtractor(TwitterExtractor):
    """Extractor for all images from a user's Media Tweets"""
    subcategory = "media"
    # Single capture group: the path segment before "/media". The negative
    # lookahead rejects URLs where "/media" is followed by a word character.
    pattern = [
        r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
        r"/([^/?&#]+)/media(?!\w)",
    ]
    test = [
        ("https://twitter.com/PicturesEarth/media", {
            "range": (1, 40),
            "url": "2f4d51cbba81e56c1c755677b3ad58fc167c9771",
        }),
    ]

    def tweets(self):
        """Delegate to `_tweets_from_api` with this user's media URL."""
        endpoint = "{}/i/profiles/show/{}/media_timeline"
        return self._tweets_from_api(endpoint.format(self.root, self.user))
class TwitterTweetExtractor(TwitterExtractor):
"""Extractor for images from individual tweets"""
subcategory = "tweet"
@@ -130,8 +143,8 @@ class TwitterTweetExtractor(TwitterExtractor):
]
def __init__(self, match):
TwitterExtractor.__init__(self)
self.user, self.tweet_id = match.groups()
TwitterExtractor.__init__(self, match)
self.tweet_id = match.group(2)
def metadata(self):
    """Return the user name and tweet id stored on this extractor."""
    return dict(user=self.user, tweet_id=self.tweet_id)

View File

@@ -68,6 +68,7 @@ SUBCATEGORY_MAP = {
"issue" : "Comic-Issues",
"manga" : "Manga",
"me" : "pixiv.me Links",
"media" : "Media Tweets",
"path" : "Images from Users and Folders",
"pinit" : "pin.it Links",
"popular": "Popular Images",
@@ -226,6 +227,8 @@ def category_key(extrlist):
def subcategory_key(cls):
    """Return the key string for an extractor class's subcategory.

    "user" and "issue" map to "A", "media" maps to "z", and every other
    subcategory is returned unchanged.
    """
    # NOTE(review): presumably used as a sort key so that "user"/"issue"
    # come first and "media" comes late -- confirm against the caller.
    name = cls.subcategory
    if name == "user" or name == "issue":
        return "A"
    return "z" if name == "media" else name