[twitter] small improvements to search extractor

- put search results in separate directories, one per search query
  (directory_fmt is now "{category}/Search/{search}")
- set 'max_position' to '-1' for the first request
  -> prevents duplicate results
- add a test
- fix flake8 warnings
This commit is contained in:
Mike Fährmann
2019-10-17 18:34:07 +02:00
parent c3042978b8
commit 1c03a389df

View File

@@ -115,17 +115,18 @@ class TwitterExtractor(Extractor):
data["content"] = cl if cl and len(cr) < 16 else content data["content"] = cl if cl and len(cr) < 16 else content
return data return data
def _tweets_from_api(self, url): def _tweets_from_api(self, url, max_position=None):
params = { params = {
"include_available_features": "1", "include_available_features": "1",
"include_entities": "1", "include_entities": "1",
"max_position": max_position,
"reset_error_state": "false", "reset_error_state": "false",
"lang": "en", "lang": "en",
} }
headers = { headers = {
"X-Requested-With": "XMLHttpRequest", "X-Requested-With": "XMLHttpRequest",
"X-Twitter-Active-User": "yes", "X-Twitter-Active-User": "yes",
"Referer": "{}/{}".format(self.root, self.user) "Referer": self.root + "/",
} }
while True: while True:
@@ -142,21 +143,21 @@ class TwitterExtractor(Extractor):
if "min_position" in data: if "min_position" in data:
position = data["min_position"] position = data["min_position"]
if "max_position" in params and position == params["max_position"]: if position == max_position:
return return
else: else:
position = text.parse_int(text.extract( position = text.parse_int(text.extract(
tweet, 'data-tweet-id="', '"')[0]) tweet, 'data-tweet-id="', '"')[0])
if "max_position" in params and position >= params["max_position"]: if max_position and position >= max_position:
return return
params["max_position"] = position params["max_position"] = max_position = position
class TwitterTimelineExtractor(TwitterExtractor): class TwitterTimelineExtractor(TwitterExtractor):
"""Extractor for all images from a user's timeline""" """Extractor for all images from a user's timeline"""
subcategory = "timeline" subcategory = "timeline"
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
r"/((?!search)[^/?&#]+)/?(?:$|[?#])") r"/(?!search)([^/?&#]+)/?(?:$|[?#])")
test = ( test = (
("https://twitter.com/supernaturepics", { ("https://twitter.com/supernaturepics", {
"range": "1-40", "range": "1-40",
@@ -167,6 +168,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
) )
def tweets(self): def tweets(self):
url = "{}/i/profiles/show/{}/timeline/tweets".format( url = "{}/i/profiles/show/{}/timeline/tweets".format(
self.root, self.user) self.root, self.user)
return self._tweets_from_api(url) return self._tweets_from_api(url)
@@ -176,7 +178,7 @@ class TwitterMediaExtractor(TwitterExtractor):
"""Extractor for all images from a user's Media Tweets""" """Extractor for all images from a user's Media Tweets"""
subcategory = "media" subcategory = "media"
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
r"/((?!search)[^/?&#]+)/media(?!\w)") r"/(?!search)([^/?&#]+)/media(?!\w)")
test = ( test = (
("https://twitter.com/supernaturepics/media", { ("https://twitter.com/supernaturepics/media", {
"range": "1-40", "range": "1-40",
@@ -190,17 +192,26 @@ class TwitterMediaExtractor(TwitterExtractor):
self.root, self.user) self.root, self.user)
return self._tweets_from_api(url) return self._tweets_from_api(url)
class TwitterSearchExtractor(TwitterExtractor): class TwitterSearchExtractor(TwitterExtractor):
"""Extractor for all images from a search timeline""" """Extractor for all images from a search timeline"""
subcategory = "search" subcategory = "search"
directory_fmt = ("{category}", "Search", "{search}")
pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
r"/search[^q]+q=([^/?&#]+)(?:$|&)") r"/search/?\?(?:[^&#]+&)*q=([^&#]+)")
test = () test = ("https://twitter.com/search?q=nature", {
"range": "1-40",
"count": 40,
})
def metadata(self):
return {"search": self.user}
def tweets(self): def tweets(self):
url = "{}/i/search/timeline?f=tweets&q={}".format( url = "{}/i/search/timeline?f=tweets&q={}".format(
self.root, self.user) self.root, self.user)
return self._tweets_from_api(url) return self._tweets_from_api(url, "-1")
class TwitterTweetExtractor(TwitterExtractor): class TwitterTweetExtractor(TwitterExtractor):
"""Extractor for images from individual tweets""" """Extractor for images from individual tweets"""