[seiga] support more than 200 images
Due to API restrictions and/or missing knowledge about and documentation of API usage, it was only possible to retrieve the latest 200 images of a niconico seiga user with said API. The new approach manually visits each HTML page and gets its information from there.
This commit is contained in:
@@ -11,7 +11,6 @@
|
||||
from .common import Extractor, Message
|
||||
from .. import text, exception
|
||||
from ..cache import cache
|
||||
from xml.etree import ElementTree
|
||||
|
||||
|
||||
class SeigaExtractor(Extractor):
|
||||
@@ -74,11 +73,12 @@ class SeigaUserExtractor(SeigaExtractor):
|
||||
r"user/illust/(\d+)")]
|
||||
test = [
|
||||
("http://seiga.nicovideo.jp/user/illust/39537793", {
|
||||
"keyword": "a716bf534b4191dc58ddbff51494b72a9cf58285",
|
||||
"pattern": r"https://lohas\.nicoseiga\.jp/priv/[0-9a-f]+/\d+/\d+",
|
||||
"count": 2,
|
||||
}),
|
||||
("http://seiga.nicovideo.jp/user/illust/79433", {
|
||||
"url": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
|
||||
"keyword": "187b77728381d072466af7f7ebcc479a0830ce25",
|
||||
"count": 0,
|
||||
}),
|
||||
]
|
||||
|
||||
@@ -90,25 +90,27 @@ class SeigaUserExtractor(SeigaExtractor):
|
||||
return {"user_id": self.user_id}
|
||||
|
||||
def get_images(self):
|
||||
keymap = {0: "image_id", 2: "title", 3: "description",
|
||||
7: "summary", 8: "genre", 18: "date"}
|
||||
url = "http://seiga.nicovideo.jp/api/user/data?id=" + self.user_id
|
||||
response = self.request(url)
|
||||
try:
|
||||
root = ElementTree.fromstring(response.text)
|
||||
except ElementTree.ParseError:
|
||||
self.log.debug("xml parsing error; removing control characters")
|
||||
xmldata = text.clean_xml(response.text)
|
||||
root = ElementTree.fromstring(xmldata)
|
||||
if root[0].text == "0":
|
||||
return []
|
||||
return [
|
||||
{
|
||||
key: image[index].text
|
||||
for index, key in keymap.items()
|
||||
}
|
||||
for image in root[1]
|
||||
]
|
||||
url = "http://seiga.nicovideo.jp/user/illust/" + self.user_id
|
||||
params = {"target": "illust_all", "page": 1}
|
||||
|
||||
while True:
|
||||
cnt = 0
|
||||
page = self.request(url, params=params).text
|
||||
|
||||
for info in text.extract_iter(
|
||||
page, '<li class="list_item', '</a></li> '):
|
||||
yield text.extract_all(info, (
|
||||
("image_id", '/seiga/im', '"'),
|
||||
("title" , '<li class="title">', '</li>'),
|
||||
("views" , '</span>', '</li>'),
|
||||
("comments", '</span>', '</li>'),
|
||||
("clips" , '</span>', '</li>'),
|
||||
))[0]
|
||||
cnt += 1
|
||||
|
||||
if cnt < 40:
|
||||
return
|
||||
params["page"] += 1
|
||||
|
||||
|
||||
class SeigaImageExtractor(SeigaExtractor):
|
||||
|
||||
@@ -83,6 +83,7 @@ skip = [
|
||||
"archivedmoe", "archiveofsins", "thebarchive",
|
||||
# temporary issues
|
||||
"mangazuki",
|
||||
"hentaifoundry", # invalid SSL cert
|
||||
]
|
||||
# enable selective testing for direct calls
|
||||
if __name__ == '__main__' and len(sys.argv) > 1:
|
||||
|
||||
Reference in New Issue
Block a user