[twitter] implement 'pagination-search' option (#8173)

This commit is contained in:
Mike Fährmann
2025-09-07 21:03:29 +02:00
parent f94eedbe1d
commit d182749f45
3 changed files with 57 additions and 3 deletions

View File

@@ -1459,10 +1459,17 @@ class TwitterAPI():
"product": product,
"withGrokTranslatedBio": False,
}
if self.extractor.config("pagination-search") in (
"id", "tweet_id", "max_id"):
update_variables = self._update_variables_search
else:
update_variables = None
return self._pagination_tweets(
endpoint, variables,
("search_by_raw_query", "search_timeline", "timeline"),
stop_tweets=3)
stop_tweets=3, update_variables=update_variables)
def community_query(self, community_id):
endpoint = "/graphql/2W09l7nD7ZbxGQHXvfB22w/CommunityQuery"
@@ -1872,7 +1879,7 @@ class TwitterAPI():
params["cursor"] = extr._update_cursor(cursor)
def _pagination_tweets(self, endpoint, variables,
path=None, stop_tweets=0,
path=None, stop_tweets=0, update_variables=None,
features=None, field_toggles=None):
extr = self.extractor
original_retweets = (extr.retweets == "original")
@@ -2072,6 +2079,7 @@ class TwitterAPI():
if tweet:
stop_tweets = stop_tweets_max
last_tweet = tweet
else:
if stop_tweets <= 0:
return extr._update_cursor(None)
@@ -2079,9 +2087,14 @@ class TwitterAPI():
"No Tweet results (%s/%s)",
stop_tweets_max - stop_tweets + 1, stop_tweets_max)
stop_tweets -= 1
if not cursor or cursor == variables.get("cursor"):
return extr._update_cursor(None)
variables["cursor"] = extr._update_cursor(cursor)
if update_variables is None:
variables["cursor"] = extr._update_cursor(cursor)
else:
variables = update_variables(variables, cursor, last_tweet)
def _pagination_users(self, endpoint, variables, path=None):
extr = self.extractor
@@ -2150,6 +2163,30 @@ class TwitterAPI():
self.log.debug("Skipping %s ('%s')", tweet_id, text)
def _update_variables_search(self, variables, cursor, tweet):
try:
tweet_id = tweet.get("id_str") or tweet["legacy"]["id_str"]
max_id = f"max_id:{int(tweet_id)-1}"
query, n = text.re(r"\bmax_id:\d+").subn(
max_id, variables["rawQuery"])
if n:
variables["rawQuery"] = query
else:
variables["rawQuery"] = f"{query} {max_id}"
if prefix := self.extractor._cursor_prefix:
self.extractor._cursor_prefix = \
f"{prefix.partition('_')[0]}_{tweet_id}/"
variables["cursor"] = None
except Exception as exc:
self.extractor.log.debug(
"Failed to update 'max_id' search query (%s: %s). Falling "
"back to 'cursor' pagination", exc.__class__.__name__, exc)
variables["cursor"] = self.extractor._update_cursor(cursor)
return variables
@cache(maxage=365*86400, keyarg=1)
def _login_impl(extr, username, password):