[exhentai] fix search and favorite extraction
This removes basically all metadata, but that can be compensated for with the right search query. Writing "parsers" for all 4 possible views that were introduced in the latest changes is too much of a hassle ...
This commit is contained in:
@@ -339,53 +339,30 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
|
|||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
ExhentaiExtractor.__init__(self, match)
|
ExhentaiExtractor.__init__(self, match)
|
||||||
self.params = text.parse_query(match.group(2) or "")
|
self.params = text.parse_query(match.group(2))
|
||||||
self.params["page"] = text.parse_int(self.params.get("page"))
|
self.params["page"] = text.parse_int(self.params.get("page"))
|
||||||
self.search_url = self.root
|
self.search_url = self.root
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
self.login()
|
self.login()
|
||||||
self.init()
|
|
||||||
yield Message.Version, 1
|
yield Message.Version, 1
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
last = None
|
||||||
page = self.request(self.search_url, params=self.params).text
|
page = self.request(self.search_url, params=self.params).text
|
||||||
|
|
||||||
for row in text.extract_iter(page, '<tr class="gtr', '</tr>'):
|
for gallery in ExhentaiGalleryExtractor.pattern.finditer(page):
|
||||||
yield self._parse_row(row)
|
url = gallery.group(0)
|
||||||
|
if url == last:
|
||||||
|
continue
|
||||||
|
last = url
|
||||||
|
yield Message.Queue, url, {}
|
||||||
|
|
||||||
if 'class="ptdd">><' in page or ">No hits found</p>" in page:
|
if 'class="ptdd">><' in page or ">No hits found</p>" in page:
|
||||||
return
|
return
|
||||||
self.params["page"] += 1
|
self.params["page"] += 1
|
||||||
self.wait()
|
self.wait()
|
||||||
|
|
||||||
def init(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def _parse_row(self, row, extr=text.extract):
|
|
||||||
"""Parse information of a single result row"""
|
|
||||||
gtype, pos = extr(row, ' alt="', '"')
|
|
||||||
date , pos = extr(row, 'nowrap">', '<', pos)
|
|
||||||
url , pos = extr(row, ' class="it5"><a href="', '"', pos)
|
|
||||||
title, pos = extr(row, '>', '<', pos)
|
|
||||||
key , last = self._parse_last(row, pos)
|
|
||||||
parts = url.rsplit("/", 3)
|
|
||||||
|
|
||||||
return Message.Queue, url, {
|
|
||||||
"type": gtype,
|
|
||||||
"date": date,
|
|
||||||
"gallery_id": text.parse_int(parts[1]),
|
|
||||||
"gallery_token": parts[2],
|
|
||||||
"title": text.unescape(title),
|
|
||||||
"_extractor": ExhentaiGalleryExtractor,
|
|
||||||
key: last,
|
|
||||||
}
|
|
||||||
|
|
||||||
def _parse_last(self, row, pos):
|
|
||||||
"""Parse the last column of a result row"""
|
|
||||||
return "uploader", text.remove_html(
|
|
||||||
text.extract(row, '<td class="itu">', '</td>', pos)[0])
|
|
||||||
|
|
||||||
|
|
||||||
class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
|
class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
|
||||||
"""Extractor for favorited exhentai galleries"""
|
"""Extractor for favorited exhentai galleries"""
|
||||||
@@ -400,15 +377,3 @@ class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
|
|||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
ExhentaiSearchExtractor.__init__(self, match)
|
ExhentaiSearchExtractor.__init__(self, match)
|
||||||
self.search_url = self.root + "/favorites.php"
|
self.search_url = self.root + "/favorites.php"
|
||||||
|
|
||||||
def init(self):
|
|
||||||
# The first request to '/favorites.php' will return an empty list
|
|
||||||
# if the 's' cookie isn't set (maybe on some other conditions as well),
|
|
||||||
# so we make a "noop" request to get all the correct cookie values
|
|
||||||
# and to get a filled favorite list on the next one.
|
|
||||||
# TODO: proper cookie storage
|
|
||||||
self.request(self.url)
|
|
||||||
self.wait(1.5)
|
|
||||||
|
|
||||||
def _parse_last(self, row, pos):
|
|
||||||
return "date_favorited", text.extract(row, 'nowrap">', '<', pos)[0]
|
|
||||||
|
|||||||
Reference in New Issue
Block a user