[twitter] implement 'pagination-search' option (#8173)

This commit is contained in:
Mike Fährmann
2025-09-07 21:03:29 +02:00
parent f94eedbe1d
commit d182749f45
3 changed files with 57 additions and 3 deletions

View File

@@ -5631,6 +5631,22 @@ Description
Logout and retry as guest when access to another user's Tweets is blocked. Logout and retry as guest when access to another user's Tweets is blocked.
extractor.twitter.pagination-search
-----------------------------------
Type
``string``
Default
``"cursor"``
Description
Selects how to paginate over search results.
``"cursor"``
Use ``cursor`` values provided by the API
``"id"`` | ``"max_id"`` | ``"tweet_id"``
Update the ``max_id`` search query parameter
to one less than the Tweet ID of the last retrieved Tweet.
extractor.twitter.pinned extractor.twitter.pinned
------------------------ ------------------------
Type Type

View File

@@ -812,6 +812,7 @@
"include" : ["timeline"], "include" : ["timeline"],
"locked" : "abort", "locked" : "abort",
"logout" : true, "logout" : true,
"pagination-search": "cursor",
"pinned" : false, "pinned" : false,
"quoted" : false, "quoted" : false,
"ratelimit" : "wait", "ratelimit" : "wait",

View File

@@ -1459,10 +1459,17 @@ class TwitterAPI():
"product": product, "product": product,
"withGrokTranslatedBio": False, "withGrokTranslatedBio": False,
} }
if self.extractor.config("pagination-search") in (
"id", "tweet_id", "max_id"):
update_variables = self._update_variables_search
else:
update_variables = None
return self._pagination_tweets( return self._pagination_tweets(
endpoint, variables, endpoint, variables,
("search_by_raw_query", "search_timeline", "timeline"), ("search_by_raw_query", "search_timeline", "timeline"),
stop_tweets=3) stop_tweets=3, update_variables=update_variables)
def community_query(self, community_id): def community_query(self, community_id):
endpoint = "/graphql/2W09l7nD7ZbxGQHXvfB22w/CommunityQuery" endpoint = "/graphql/2W09l7nD7ZbxGQHXvfB22w/CommunityQuery"
@@ -1872,7 +1879,7 @@ class TwitterAPI():
params["cursor"] = extr._update_cursor(cursor) params["cursor"] = extr._update_cursor(cursor)
def _pagination_tweets(self, endpoint, variables, def _pagination_tweets(self, endpoint, variables,
path=None, stop_tweets=0, path=None, stop_tweets=0, update_variables=None,
features=None, field_toggles=None): features=None, field_toggles=None):
extr = self.extractor extr = self.extractor
original_retweets = (extr.retweets == "original") original_retweets = (extr.retweets == "original")
@@ -2072,6 +2079,7 @@ class TwitterAPI():
if tweet: if tweet:
stop_tweets = stop_tweets_max stop_tweets = stop_tweets_max
last_tweet = tweet
else: else:
if stop_tweets <= 0: if stop_tweets <= 0:
return extr._update_cursor(None) return extr._update_cursor(None)
@@ -2079,9 +2087,14 @@ class TwitterAPI():
"No Tweet results (%s/%s)", "No Tweet results (%s/%s)",
stop_tweets_max - stop_tweets + 1, stop_tweets_max) stop_tweets_max - stop_tweets + 1, stop_tweets_max)
stop_tweets -= 1 stop_tweets -= 1
if not cursor or cursor == variables.get("cursor"): if not cursor or cursor == variables.get("cursor"):
return extr._update_cursor(None) return extr._update_cursor(None)
variables["cursor"] = extr._update_cursor(cursor)
if update_variables is None:
variables["cursor"] = extr._update_cursor(cursor)
else:
variables = update_variables(variables, cursor, last_tweet)
def _pagination_users(self, endpoint, variables, path=None): def _pagination_users(self, endpoint, variables, path=None):
extr = self.extractor extr = self.extractor
@@ -2150,6 +2163,30 @@ class TwitterAPI():
self.log.debug("Skipping %s ('%s')", tweet_id, text) self.log.debug("Skipping %s ('%s')", tweet_id, text)
def _update_variables_search(self, variables, cursor, tweet):
    """Advance search pagination by rewriting 'max_id' in the raw query

    Reads the ID of 'tweet' ("id_str", or "legacy" -> "id_str" as a
    fallback) and puts a 'max_id:<ID - 1>' term into
    variables["rawQuery"]: an existing 'max_id:<digits>' term is
    replaced, otherwise the new term is appended after a space.
    On success, variables["cursor"] is set to None and, when the
    extractor has a non-empty '_cursor_prefix', that prefix is rebuilt
    from its part before the first '_' plus the Tweet ID.

    If any step raises, the failure is logged at debug level and
    variables["cursor"] is set from 'cursor' via the extractor's
    _update_cursor() instead.

    Returns the same 'variables' dict, modified in place.
    """
    extr = self.extractor

    try:
        tweet_id = tweet.get("id_str") or tweet["legacy"]["id_str"]
        previous_id = int(tweet_id) - 1
        max_id_term = f"max_id:{previous_id}"

        # swap out an already present 'max_id:...' term;
        # 'count' is 0 when the query did not contain one
        rewritten, count = text.re(r"\bmax_id:\d+").subn(
            max_id_term, variables["rawQuery"])
        variables["rawQuery"] = (
            rewritten if count else f"{rewritten} {max_id_term}")

        prefix = extr._cursor_prefix
        if prefix:
            head = prefix.partition('_')[0]
            extr._cursor_prefix = f"{head}_{tweet_id}/"

        # the updated query replaces the API cursor for the next request
        variables["cursor"] = None
    except Exception as exc:
        extr.log.debug(
            "Failed to update 'max_id' search query (%s: %s). Falling "
            "back to 'cursor' pagination", exc.__class__.__name__, exc)
        variables["cursor"] = extr._update_cursor(cursor)

    return variables
@cache(maxage=365*86400, keyarg=1) @cache(maxage=365*86400, keyarg=1)
def _login_impl(extr, username, password): def _login_impl(extr, username, password):