[seiga] support more than 200 images
Due to API restrictions, or possibly a lack of knowledge about and documentation of the API's usage, only the latest 200 images of a niconico seiga user could be retrieved through that API. The new approach instead requests each HTML page in turn and extracts the image information from there.
This commit is contained in:
@@ -11,7 +11,6 @@
|
|||||||
from .common import Extractor, Message
|
from .common import Extractor, Message
|
||||||
from .. import text, exception
|
from .. import text, exception
|
||||||
from ..cache import cache
|
from ..cache import cache
|
||||||
from xml.etree import ElementTree
|
|
||||||
|
|
||||||
|
|
||||||
class SeigaExtractor(Extractor):
|
class SeigaExtractor(Extractor):
|
||||||
@@ -74,11 +73,12 @@ class SeigaUserExtractor(SeigaExtractor):
|
|||||||
r"user/illust/(\d+)")]
|
r"user/illust/(\d+)")]
|
||||||
test = [
|
test = [
|
||||||
("http://seiga.nicovideo.jp/user/illust/39537793", {
|
("http://seiga.nicovideo.jp/user/illust/39537793", {
|
||||||
"keyword": "a716bf534b4191dc58ddbff51494b72a9cf58285",
|
"pattern": r"https://lohas\.nicoseiga\.jp/priv/[0-9a-f]+/\d+/\d+",
|
||||||
|
"count": 2,
|
||||||
}),
|
}),
|
||||||
("http://seiga.nicovideo.jp/user/illust/79433", {
|
("http://seiga.nicovideo.jp/user/illust/79433", {
|
||||||
"url": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
|
"url": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
|
||||||
"keyword": "187b77728381d072466af7f7ebcc479a0830ce25",
|
"count": 0,
|
||||||
}),
|
}),
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -90,25 +90,27 @@ class SeigaUserExtractor(SeigaExtractor):
|
|||||||
return {"user_id": self.user_id}
|
return {"user_id": self.user_id}
|
||||||
|
|
||||||
def get_images(self):
|
def get_images(self):
|
||||||
keymap = {0: "image_id", 2: "title", 3: "description",
|
url = "http://seiga.nicovideo.jp/user/illust/" + self.user_id
|
||||||
7: "summary", 8: "genre", 18: "date"}
|
params = {"target": "illust_all", "page": 1}
|
||||||
url = "http://seiga.nicovideo.jp/api/user/data?id=" + self.user_id
|
|
||||||
response = self.request(url)
|
while True:
|
||||||
try:
|
cnt = 0
|
||||||
root = ElementTree.fromstring(response.text)
|
page = self.request(url, params=params).text
|
||||||
except ElementTree.ParseError:
|
|
||||||
self.log.debug("xml parsing error; removing control characters")
|
for info in text.extract_iter(
|
||||||
xmldata = text.clean_xml(response.text)
|
page, '<li class="list_item', '</a></li> '):
|
||||||
root = ElementTree.fromstring(xmldata)
|
yield text.extract_all(info, (
|
||||||
if root[0].text == "0":
|
("image_id", '/seiga/im', '"'),
|
||||||
return []
|
("title" , '<li class="title">', '</li>'),
|
||||||
return [
|
("views" , '</span>', '</li>'),
|
||||||
{
|
("comments", '</span>', '</li>'),
|
||||||
key: image[index].text
|
("clips" , '</span>', '</li>'),
|
||||||
for index, key in keymap.items()
|
))[0]
|
||||||
}
|
cnt += 1
|
||||||
for image in root[1]
|
|
||||||
]
|
if cnt < 40:
|
||||||
|
return
|
||||||
|
params["page"] += 1
|
||||||
|
|
||||||
|
|
||||||
class SeigaImageExtractor(SeigaExtractor):
|
class SeigaImageExtractor(SeigaExtractor):
|
||||||
|
|||||||
@@ -83,6 +83,7 @@ skip = [
|
|||||||
"archivedmoe", "archiveofsins", "thebarchive",
|
"archivedmoe", "archiveofsins", "thebarchive",
|
||||||
# temporary issues
|
# temporary issues
|
||||||
"mangazuki",
|
"mangazuki",
|
||||||
|
"hentaifoundry", # invalid SSL cert
|
||||||
]
|
]
|
||||||
# enable selective testing for direct calls
|
# enable selective testing for direct calls
|
||||||
if __name__ == '__main__' and len(sys.argv) > 1:
|
if __name__ == '__main__' and len(sys.argv) > 1:
|
||||||
|
|||||||
Reference in New Issue
Block a user