[twitter] add extractor for media-tweet timelines (#96)
Media-tweet timelines are pages such as "https://twitter.com/PicturesEarth/media". They differ from normal timelines in that they do not contain any tweets or retweets from other users, and they feature all media the user has ever posted, including replies to other tweets.
This commit is contained in:
@@ -76,7 +76,7 @@ SmugMug https://www.smugmug.com/ |Albums, individ-5|
|
||||
Subapics https://subapics.com/ Chapters, Manga
|
||||
The /b/ Archive https://thebarchive.com/ Threads
|
||||
Tumblr https://www.tumblr.com/ Images from Users, Likes, Posts, Tag-Searches Optional (OAuth)
|
||||
Twitter https://twitter.com/ Timelines, Tweets
|
||||
Twitter https://twitter.com/ Timelines, Tweets, Media Tweets
|
||||
Warosu https://warosu.org/ Threads
|
||||
World Three http://www.slide.world-three.org/ Chapters, Manga
|
||||
XVideos https://www.xvideos.com/ Images from Users, Galleries
|
||||
|
||||
@@ -81,7 +81,7 @@ class SmugmugImageExtractor(SmugmugExtractor):
|
||||
pattern = [BASE_PATTERN + r"(?:/[^/?&#]+)+/i-([^/?&#]+)"]
|
||||
test = [("https://acapella.smugmug.com/Micro-Macro/Drops/i-g2Dmf9z", {
|
||||
"url": "ab0d7aa001a53ff3fd228622070b39005b6fc179",
|
||||
"keyword": "4fcc02599d180321b22a7f7238102c48d5410c05",
|
||||
"keyword": "a116167929c22338e6067b81c5d3bee641df3af3",
|
||||
"content": "64a8f69a1d824921eebbdf2420087937adfa45cd",
|
||||
})]
|
||||
|
||||
|
||||
@@ -20,10 +20,10 @@ class TwitterExtractor(Extractor):
|
||||
archive_fmt = "{tweet_id}_{retweet_id}_{num}"
|
||||
root = "https://twitter.com"
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, match):
|
||||
Extractor.__init__(self)
|
||||
self.user = None
|
||||
self.retweets = True
|
||||
self.user = match.group(1)
|
||||
self.retweets = self.config("retweets", True)
|
||||
|
||||
def items(self):
|
||||
yield Message.Version, 1
|
||||
@@ -45,9 +45,11 @@ class TwitterExtractor(Extractor):
|
||||
|
||||
def metadata(self):
|
||||
"""Return general metadata"""
|
||||
return {"user": self.user}
|
||||
|
||||
def tweets(self):
|
||||
"""Yield HTML content of all relevant tweets"""
|
||||
return ()
|
||||
|
||||
@staticmethod
|
||||
def _data_from_tweet(tweet):
|
||||
@@ -64,29 +66,7 @@ class TwitterExtractor(Extractor):
|
||||
data["retweeter"] = data["retweeter"] or ""
|
||||
return data
|
||||
|
||||
|
||||
class TwitterTimelineExtractor(TwitterExtractor):
|
||||
"""Extractor for all tweeted images from a user's timeline"""
|
||||
subcategory = "timeline"
|
||||
pattern = [r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
|
||||
r"/([^/?&#]+)/?$"]
|
||||
test = [("https://twitter.com/PicturesEarth", {
|
||||
"range": (1, 40),
|
||||
"url": "2f4d51cbba81e56c1c755677b3ad58fc167c9771",
|
||||
"keyword": "cbae53b6f4ba133078bb13c95dbd3cbb4fa40b9f",
|
||||
})]
|
||||
|
||||
def __init__(self, match):
|
||||
TwitterExtractor.__init__(self)
|
||||
self.user = match.group(1)
|
||||
self.retweets = self.config("retweets", True)
|
||||
|
||||
def metadata(self):
|
||||
return {"user": self.user}
|
||||
|
||||
def tweets(self):
|
||||
url = "{}/i/profiles/show/{}/timeline/tweets".format(
|
||||
self.root, self.user)
|
||||
def _tweets_from_api(self, url):
|
||||
params = {
|
||||
"include_available_features": "1",
|
||||
"include_entities": "1",
|
||||
@@ -112,6 +92,39 @@ class TwitterTimelineExtractor(TwitterExtractor):
|
||||
tweet, 'data-tweet-id="', '"')[0]
|
||||
|
||||
|
||||
class TwitterTimelineExtractor(TwitterExtractor):
|
||||
"""Extractor for all images from a user's timeline"""
|
||||
subcategory = "timeline"
|
||||
pattern = [r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
|
||||
r"/([^/?&#]+)/?$"]
|
||||
test = [("https://twitter.com/PicturesEarth", {
|
||||
"range": (1, 40),
|
||||
"url": "2f4d51cbba81e56c1c755677b3ad58fc167c9771",
|
||||
"keyword": "cbae53b6f4ba133078bb13c95dbd3cbb4fa40b9f",
|
||||
})]
|
||||
|
||||
def tweets(self):
|
||||
url = "{}/i/profiles/show/{}/timeline/tweets".format(
|
||||
self.root, self.user)
|
||||
return self._tweets_from_api(url)
|
||||
|
||||
|
||||
class TwitterMediaExtractor(TwitterExtractor):
|
||||
"""Extractor for all images from a user's Media Tweets"""
|
||||
subcategory = "media"
|
||||
pattern = [r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
|
||||
r"/([^/?&#]+)/media(?!\w)"]
|
||||
test = [("https://twitter.com/PicturesEarth/media", {
|
||||
"range": (1, 40),
|
||||
"url": "2f4d51cbba81e56c1c755677b3ad58fc167c9771",
|
||||
})]
|
||||
|
||||
def tweets(self):
|
||||
url = "{}/i/profiles/show/{}/media_timeline".format(
|
||||
self.root, self.user)
|
||||
return self._tweets_from_api(url)
|
||||
|
||||
|
||||
class TwitterTweetExtractor(TwitterExtractor):
|
||||
"""Extractor for images from individual tweets"""
|
||||
subcategory = "tweet"
|
||||
@@ -130,8 +143,8 @@ class TwitterTweetExtractor(TwitterExtractor):
|
||||
]
|
||||
|
||||
def __init__(self, match):
|
||||
TwitterExtractor.__init__(self)
|
||||
self.user, self.tweet_id = match.groups()
|
||||
TwitterExtractor.__init__(self, match)
|
||||
self.tweet_id = match.group(2)
|
||||
|
||||
def metadata(self):
|
||||
return {"user": self.user, "tweet_id": self.tweet_id}
|
||||
|
||||
@@ -68,6 +68,7 @@ SUBCATEGORY_MAP = {
|
||||
"issue" : "Comic-Issues",
|
||||
"manga" : "Manga",
|
||||
"me" : "pixiv.me Links",
|
||||
"media" : "Media Tweets",
|
||||
"path" : "Images from Users and Folders",
|
||||
"pinit" : "pin.it Links",
|
||||
"popular": "Popular Images",
|
||||
@@ -226,6 +227,8 @@ def category_key(extrlist):
|
||||
def subcategory_key(cls):
|
||||
if cls.subcategory in ("user", "issue"):
|
||||
return "A"
|
||||
if cls.subcategory in ("media",):
|
||||
return "z"
|
||||
return cls.subcategory
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user