[idolcomplex] improve and fix pagination (#1601)

Always rely on the 'next-page-url' value and its query parameters when requesting the next page.
This commit is contained in:
Mike Fährmann
2021-06-04 20:31:08 +02:00
parent 3cbbefd4ed
commit a3bf878329

View File

@@ -132,11 +132,16 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor):
archive_fmt = "t_{search_tags}_{id}" archive_fmt = "t_{search_tags}_{id}"
pattern = r"(?:https?://)?idol\.sankakucomplex\.com/\?([^#]*)" pattern = r"(?:https?://)?idol\.sankakucomplex\.com/\?([^#]*)"
test = ( test = (
("https://idol.sankakucomplex.com/?tags=lyumos+wreath", { ("https://idol.sankakucomplex.com/?tags=lyumos", {
"count": ">= 6", "count": 5,
"range": "18-22",
"pattern": r"https://is\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}" "pattern": r"https://is\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+", r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+",
}), }),
("https://idol.sankakucomplex.com/?tags=order:favcount", {
"count": 5,
"range": "18-22",
}),
("https://idol.sankakucomplex.com" ("https://idol.sankakucomplex.com"
"/?tags=lyumos+wreath&page=3&next=694215"), "/?tags=lyumos+wreath&page=3&next=694215"),
) )
@@ -184,21 +189,21 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor):
while True: while True:
page = self.request(self.root, params=params, retries=10).text page = self.request(self.root, params=params, retries=10).text
pos = page.find("<div id=more-popular-posts-link>") + 1 pos = page.find("<div id=more-popular-posts-link>") + 1
yield from text.extract_iter(page, '" id=p', '>', pos)
ids = list(text.extract_iter(page, '" id=p', '>', pos)) next_url = text.extract(page, 'next-page-url="', '"', pos)[0]
if not ids: if not next_url:
return
yield from ids
next_qs = text.extract(page, 'next-page-url="?', '"', pos)[0]
next_id = text.parse_query(next_qs).get("next")
# stop if the same "next" parameter occurs twice in a row (#265)
if "next" in params and params["next"] == next_id:
return return
params["next"] = next_id or (text.parse_int(ids[-1]) - 1) next_params = text.parse_query(text.unescape(
params["page"] = "2" next_url).lstrip("?/"))
if "next" in next_params:
# stop if the same "next" value occurs twice in a row (#265)
if "next" in params and params["next"] == next_params["next"]:
return
next_params["page"] = "2"
params = next_params
class IdolcomplexPoolExtractor(IdolcomplexExtractor): class IdolcomplexPoolExtractor(IdolcomplexExtractor):