[twitter] small improvements to search extractor
- put search results in separate directories
- set 'max_position' to '-1' for the first request to prevent duplicate results
- add a test
- flake8 cleanup
This commit is contained in:
@@ -115,17 +115,18 @@ class TwitterExtractor(Extractor):
|
|||||||
data["content"] = cl if cl and len(cr) < 16 else content
|
data["content"] = cl if cl and len(cr) < 16 else content
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def _tweets_from_api(self, url):
|
def _tweets_from_api(self, url, max_position=None):
|
||||||
params = {
|
params = {
|
||||||
"include_available_features": "1",
|
"include_available_features": "1",
|
||||||
"include_entities": "1",
|
"include_entities": "1",
|
||||||
|
"max_position": max_position,
|
||||||
"reset_error_state": "false",
|
"reset_error_state": "false",
|
||||||
"lang": "en",
|
"lang": "en",
|
||||||
}
|
}
|
||||||
headers = {
|
headers = {
|
||||||
"X-Requested-With": "XMLHttpRequest",
|
"X-Requested-With": "XMLHttpRequest",
|
||||||
"X-Twitter-Active-User": "yes",
|
"X-Twitter-Active-User": "yes",
|
||||||
"Referer": "{}/{}".format(self.root, self.user)
|
"Referer": self.root + "/",
|
||||||
}
|
}
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
@@ -142,21 +143,21 @@ class TwitterExtractor(Extractor):
|
|||||||
|
|
||||||
if "min_position" in data:
|
if "min_position" in data:
|
||||||
position = data["min_position"]
|
position = data["min_position"]
|
||||||
if "max_position" in params and position == params["max_position"]:
|
if position == max_position:
|
||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
position = text.parse_int(text.extract(
|
position = text.parse_int(text.extract(
|
||||||
tweet, 'data-tweet-id="', '"')[0])
|
tweet, 'data-tweet-id="', '"')[0])
|
||||||
if "max_position" in params and position >= params["max_position"]:
|
if max_position and position >= max_position:
|
||||||
return
|
return
|
||||||
params["max_position"] = position
|
params["max_position"] = max_position = position
|
||||||
|
|
||||||
|
|
||||||
class TwitterTimelineExtractor(TwitterExtractor):
|
class TwitterTimelineExtractor(TwitterExtractor):
|
||||||
"""Extractor for all images from a user's timeline"""
|
"""Extractor for all images from a user's timeline"""
|
||||||
subcategory = "timeline"
|
subcategory = "timeline"
|
||||||
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
|
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
|
||||||
r"/((?!search)[^/?&#]+)/?(?:$|[?#])")
|
r"/(?!search)([^/?&#]+)/?(?:$|[?#])")
|
||||||
test = (
|
test = (
|
||||||
("https://twitter.com/supernaturepics", {
|
("https://twitter.com/supernaturepics", {
|
||||||
"range": "1-40",
|
"range": "1-40",
|
||||||
@@ -167,6 +168,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def tweets(self):
|
def tweets(self):
|
||||||
|
|
||||||
url = "{}/i/profiles/show/{}/timeline/tweets".format(
|
url = "{}/i/profiles/show/{}/timeline/tweets".format(
|
||||||
self.root, self.user)
|
self.root, self.user)
|
||||||
return self._tweets_from_api(url)
|
return self._tweets_from_api(url)
|
||||||
@@ -176,7 +178,7 @@ class TwitterMediaExtractor(TwitterExtractor):
|
|||||||
"""Extractor for all images from a user's Media Tweets"""
|
"""Extractor for all images from a user's Media Tweets"""
|
||||||
subcategory = "media"
|
subcategory = "media"
|
||||||
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
|
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
|
||||||
r"/((?!search)[^/?&#]+)/media(?!\w)")
|
r"/(?!search)([^/?&#]+)/media(?!\w)")
|
||||||
test = (
|
test = (
|
||||||
("https://twitter.com/supernaturepics/media", {
|
("https://twitter.com/supernaturepics/media", {
|
||||||
"range": "1-40",
|
"range": "1-40",
|
||||||
@@ -190,17 +192,26 @@ class TwitterMediaExtractor(TwitterExtractor):
|
|||||||
self.root, self.user)
|
self.root, self.user)
|
||||||
return self._tweets_from_api(url)
|
return self._tweets_from_api(url)
|
||||||
|
|
||||||
|
|
||||||
class TwitterSearchExtractor(TwitterExtractor):
|
class TwitterSearchExtractor(TwitterExtractor):
|
||||||
"""Extractor for all images from a search timeline"""
|
"""Extractor for all images from a search timeline"""
|
||||||
subcategory = "search"
|
subcategory = "search"
|
||||||
|
directory_fmt = ("{category}", "Search", "{search}")
|
||||||
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
|
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
|
||||||
r"/search[^q]+q=([^/?&#]+)(?:$|&)")
|
r"/search/?\?(?:[^&#]+&)*q=([^&#]+)")
|
||||||
test = ()
|
test = ("https://twitter.com/search?q=nature", {
|
||||||
|
"range": "1-40",
|
||||||
|
"count": 40,
|
||||||
|
})
|
||||||
|
|
||||||
|
def metadata(self):
|
||||||
|
return {"search": self.user}
|
||||||
|
|
||||||
def tweets(self):
|
def tweets(self):
|
||||||
url = "{}/i/search/timeline?f=tweets&q={}".format(
|
url = "{}/i/search/timeline?f=tweets&q={}".format(
|
||||||
self.root, self.user)
|
self.root, self.user)
|
||||||
return self._tweets_from_api(url)
|
return self._tweets_from_api(url, "-1")
|
||||||
|
|
||||||
|
|
||||||
class TwitterTweetExtractor(TwitterExtractor):
|
class TwitterTweetExtractor(TwitterExtractor):
|
||||||
"""Extractor for images from individual tweets"""
|
"""Extractor for images from individual tweets"""
|
||||||
|
|||||||
Reference in New Issue
Block a user