[twitter] implement 'pagination-search' option (#8173)
This commit is contained in:
@@ -5631,6 +5631,22 @@ Description
|
|||||||
Logout and retry as guest when access to another user's Tweets is blocked.
|
Logout and retry as guest when access to another user's Tweets is blocked.
|
||||||
|
|
||||||
|
|
||||||
|
extractor.twitter.pagination-search
|
||||||
|
-----------------------------------
|
||||||
|
Type
|
||||||
|
``string``
|
||||||
|
Default
|
||||||
|
``"cursor"``
|
||||||
|
Description
|
||||||
|
Selects how to paginate over search results.
|
||||||
|
|
||||||
|
``"cursor"``
|
||||||
|
Use ``cursor`` values provided by the API
|
||||||
|
``"id"`` | ``"max_id"`` | ``"tweet_id"``
|
||||||
|
Update the ``max_id`` search query parameter
|
||||||
|
to the Tweet ID value of the last retrieved Tweet.
|
||||||
|
|
||||||
|
|
||||||
extractor.twitter.pinned
|
extractor.twitter.pinned
|
||||||
------------------------
|
------------------------
|
||||||
Type
|
Type
|
||||||
|
|||||||
@@ -812,6 +812,7 @@
|
|||||||
"include" : ["timeline"],
|
"include" : ["timeline"],
|
||||||
"locked" : "abort",
|
"locked" : "abort",
|
||||||
"logout" : true,
|
"logout" : true,
|
||||||
|
"pagination-search": "cursor",
|
||||||
"pinned" : false,
|
"pinned" : false,
|
||||||
"quoted" : false,
|
"quoted" : false,
|
||||||
"ratelimit" : "wait",
|
"ratelimit" : "wait",
|
||||||
|
|||||||
@@ -1459,10 +1459,17 @@ class TwitterAPI():
|
|||||||
"product": product,
|
"product": product,
|
||||||
"withGrokTranslatedBio": False,
|
"withGrokTranslatedBio": False,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if self.extractor.config("pagination-search") in (
|
||||||
|
"id", "tweet_id", "max_id"):
|
||||||
|
update_variables = self._update_variables_search
|
||||||
|
else:
|
||||||
|
update_variables = None
|
||||||
|
|
||||||
return self._pagination_tweets(
|
return self._pagination_tweets(
|
||||||
endpoint, variables,
|
endpoint, variables,
|
||||||
("search_by_raw_query", "search_timeline", "timeline"),
|
("search_by_raw_query", "search_timeline", "timeline"),
|
||||||
stop_tweets=3)
|
stop_tweets=3, update_variables=update_variables)
|
||||||
|
|
||||||
def community_query(self, community_id):
|
def community_query(self, community_id):
|
||||||
endpoint = "/graphql/2W09l7nD7ZbxGQHXvfB22w/CommunityQuery"
|
endpoint = "/graphql/2W09l7nD7ZbxGQHXvfB22w/CommunityQuery"
|
||||||
@@ -1872,7 +1879,7 @@ class TwitterAPI():
|
|||||||
params["cursor"] = extr._update_cursor(cursor)
|
params["cursor"] = extr._update_cursor(cursor)
|
||||||
|
|
||||||
def _pagination_tweets(self, endpoint, variables,
|
def _pagination_tweets(self, endpoint, variables,
|
||||||
path=None, stop_tweets=0,
|
path=None, stop_tweets=0, update_variables=None,
|
||||||
features=None, field_toggles=None):
|
features=None, field_toggles=None):
|
||||||
extr = self.extractor
|
extr = self.extractor
|
||||||
original_retweets = (extr.retweets == "original")
|
original_retweets = (extr.retweets == "original")
|
||||||
@@ -2072,6 +2079,7 @@ class TwitterAPI():
|
|||||||
|
|
||||||
if tweet:
|
if tweet:
|
||||||
stop_tweets = stop_tweets_max
|
stop_tweets = stop_tweets_max
|
||||||
|
last_tweet = tweet
|
||||||
else:
|
else:
|
||||||
if stop_tweets <= 0:
|
if stop_tweets <= 0:
|
||||||
return extr._update_cursor(None)
|
return extr._update_cursor(None)
|
||||||
@@ -2079,9 +2087,14 @@ class TwitterAPI():
|
|||||||
"No Tweet results (%s/%s)",
|
"No Tweet results (%s/%s)",
|
||||||
stop_tweets_max - stop_tweets + 1, stop_tweets_max)
|
stop_tweets_max - stop_tweets + 1, stop_tweets_max)
|
||||||
stop_tweets -= 1
|
stop_tweets -= 1
|
||||||
|
|
||||||
if not cursor or cursor == variables.get("cursor"):
|
if not cursor or cursor == variables.get("cursor"):
|
||||||
return extr._update_cursor(None)
|
return extr._update_cursor(None)
|
||||||
variables["cursor"] = extr._update_cursor(cursor)
|
|
||||||
|
if update_variables is None:
|
||||||
|
variables["cursor"] = extr._update_cursor(cursor)
|
||||||
|
else:
|
||||||
|
variables = update_variables(variables, cursor, last_tweet)
|
||||||
|
|
||||||
def _pagination_users(self, endpoint, variables, path=None):
|
def _pagination_users(self, endpoint, variables, path=None):
|
||||||
extr = self.extractor
|
extr = self.extractor
|
||||||
@@ -2150,6 +2163,30 @@ class TwitterAPI():
|
|||||||
|
|
||||||
self.log.debug("Skipping %s ('%s')", tweet_id, text)
|
self.log.debug("Skipping %s ('%s')", tweet_id, text)
|
||||||
|
|
||||||
|
def _update_variables_search(self, variables, cursor, tweet):
    """Advance a search by rewriting the 'max_id' filter in its raw query

    Derives a 'max_id:<N>' term from the ID of 'tweet' (its 'id_str'
    value, or 'legacy' -> 'id_str' when that is missing or empty), with
    N being that ID minus one. Every existing 'max_id:<digits>' term in
    variables["rawQuery"] is replaced by it; when the query contains
    none, the new term is appended after a single space.

    On success, variables["cursor"] is set to None and, if the
    extractor has a truthy '_cursor_prefix', that prefix is rebuilt
    from its part before the first underscore plus the Tweet ID.

    Any exception raised along the way is logged at debug level and
    the regular cursor update is applied instead.

    Returns the same (mutated) 'variables' mapping.
    """
    try:
        tweet_id = tweet.get("id_str")
        if not tweet_id:
            tweet_id = tweet["legacy"]["id_str"]
        # presumably 'max_id' is an inclusive bound, hence the '-1'
        # to not receive the same Tweet again -- TODO confirm
        max_id = f"max_id:{int(tweet_id)-1}"

        pattern = text.re(r"\bmax_id:\d+")
        rewritten, count = pattern.subn(max_id, variables["rawQuery"])
        # 'rewritten' equals the unmodified query when count == 0
        variables["rawQuery"] = (
            rewritten if count else f"{rewritten} {max_id}")

        prefix = self.extractor._cursor_prefix
        if prefix:
            # keep everything before the first '_',
            # then attach the ID of the last Tweet
            head = prefix.split("_", 1)[0]
            self.extractor._cursor_prefix = f"{head}_{tweet_id}/"

        # the API cursor is not sent along with the rewritten query
        variables["cursor"] = None
    except Exception as exc:
        self.extractor.log.debug(
            "Failed to update 'max_id' search query (%s: %s). Falling "
            "back to 'cursor' pagination", exc.__class__.__name__, exc)
        variables["cursor"] = self.extractor._update_cursor(cursor)

    return variables
|
||||||
|
|
||||||
|
|
||||||
@cache(maxage=365*86400, keyarg=1)
|
@cache(maxage=365*86400, keyarg=1)
|
||||||
def _login_impl(extr, username, password):
|
def _login_impl(extr, username, password):
|
||||||
|
|||||||
Reference in New Issue
Block a user