[aryion] update/improve pagination (#1849)
Manually increment the 'p' query parameter instead of relying on the "Next" link, which only works up to page 200.
This commit is contained in:
@@ -29,7 +29,6 @@ class AryionExtractor(Extractor):
|
|||||||
Extractor.__init__(self, match)
|
Extractor.__init__(self, match)
|
||||||
self.user = match.group(1)
|
self.user = match.group(1)
|
||||||
self.recursive = True
|
self.recursive = True
|
||||||
self._needle = "class='gallery-item' id='"
|
|
||||||
|
|
||||||
def login(self):
|
def login(self):
|
||||||
if self._check_cookies(self.cookienames):
|
if self._check_cookies(self.cookienames):
|
||||||
@@ -68,7 +67,7 @@ class AryionExtractor(Extractor):
|
|||||||
elif post is False and self.recursive:
|
elif post is False and self.recursive:
|
||||||
base = self.root + "/g4/view/"
|
base = self.root + "/g4/view/"
|
||||||
data = {"_extractor": AryionPostExtractor}
|
data = {"_extractor": AryionPostExtractor}
|
||||||
for post_id in self._pagination(base + post_id):
|
for post_id in self._pagination_params(base + post_id):
|
||||||
yield Message.Queue, base + post_id, data
|
yield Message.Queue, base + post_id, data
|
||||||
|
|
||||||
def posts(self):
|
def posts(self):
|
||||||
@@ -77,10 +76,29 @@ class AryionExtractor(Extractor):
|
|||||||
def metadata(self):
|
def metadata(self):
|
||||||
"""Return general metadata"""
|
"""Return general metadata"""
|
||||||
|
|
||||||
def _pagination(self, url):
|
def _pagination_params(self, url, params=None):
|
||||||
|
if params is None:
|
||||||
|
params = {"p": 1}
|
||||||
|
else:
|
||||||
|
params["p"] = text.parse_int(params.get("p"), 1)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
page = self.request(url, params=params).text
|
||||||
|
|
||||||
|
cnt = 0
|
||||||
|
for post_id in text.extract_iter(
|
||||||
|
page, "class='gallery-item' id='", "'"):
|
||||||
|
cnt += 1
|
||||||
|
yield post_id
|
||||||
|
|
||||||
|
if cnt < 40:
|
||||||
|
return
|
||||||
|
params["p"] += 1
|
||||||
|
|
||||||
|
def _pagination_next(self, url):
|
||||||
while True:
|
while True:
|
||||||
page = self.request(url).text
|
page = self.request(url).text
|
||||||
yield from text.extract_iter(page, self._needle, "'")
|
yield from text.extract_iter(page, "thumb' href='/g4/view/", "'")
|
||||||
|
|
||||||
pos = page.find("Next >>")
|
pos = page.find("Next >>")
|
||||||
if pos < 0:
|
if pos < 0:
|
||||||
@@ -186,11 +204,10 @@ class AryionGalleryExtractor(AryionExtractor):
|
|||||||
def posts(self):
|
def posts(self):
|
||||||
if self.recursive:
|
if self.recursive:
|
||||||
url = "{}/g4/gallery/{}".format(self.root, self.user)
|
url = "{}/g4/gallery/{}".format(self.root, self.user)
|
||||||
return self._pagination(url)
|
return self._pagination_params(url)
|
||||||
else:
|
else:
|
||||||
self._needle = "thumb' href='/g4/view/"
|
|
||||||
url = "{}/g4/latest.php?name={}".format(self.root, self.user)
|
url = "{}/g4/latest.php?name={}".format(self.root, self.user)
|
||||||
return util.advance(self._pagination(url), self.offset)
|
return util.advance(self._pagination_next(url), self.offset)
|
||||||
|
|
||||||
|
|
||||||
class AryionTagExtractor(AryionExtractor):
|
class AryionTagExtractor(AryionExtractor):
|
||||||
@@ -199,17 +216,18 @@ class AryionTagExtractor(AryionExtractor):
|
|||||||
directory_fmt = ("{category}", "tags", "{search_tags}")
|
directory_fmt = ("{category}", "tags", "{search_tags}")
|
||||||
archive_fmt = "t_{search_tags}_{id}"
|
archive_fmt = "t_{search_tags}_{id}"
|
||||||
pattern = BASE_PATTERN + r"/tags\.php\?([^#]+)"
|
pattern = BASE_PATTERN + r"/tags\.php\?([^#]+)"
|
||||||
test = ("https://aryion.com/g4/tags.php?tag=star+wars&p=18", {
|
test = ("https://aryion.com/g4/tags.php?tag=star+wars&p=19", {
|
||||||
"count": ">= 5",
|
"count": ">= 5",
|
||||||
})
|
})
|
||||||
|
|
||||||
def metadata(self):
|
def metadata(self):
|
||||||
return {"search_tags": text.parse_query(self.user).get("tag")}
|
self.params = text.parse_query(self.user)
|
||||||
|
self.user = None
|
||||||
|
return {"search_tags": self.params.get("tag")}
|
||||||
|
|
||||||
def posts(self):
|
def posts(self):
|
||||||
url = "{}/g4/tags.php?{}".format(self.root, self.user)
|
url = self.root + "/g4/tags.php"
|
||||||
self.user = None
|
return self._pagination_params(url, self.params)
|
||||||
return self._pagination(url)
|
|
||||||
|
|
||||||
|
|
||||||
class AryionPostExtractor(AryionExtractor):
|
class AryionPostExtractor(AryionExtractor):
|
||||||
|
|||||||
Reference in New Issue
Block a user