[seiga] support more than 200 images

Because of API restrictions, or possibly just missing knowledge and
documentation of how to use the API, it was only possible to retrieve
the latest 200 images of a niconico seiga user through that API.

The new approach instead requests each HTML page of the user's
illustration listing in turn and extracts the image information
directly from the page markup.
This commit is contained in:
Mike Fährmann
2017-11-13 20:46:24 +01:00
parent baf8094868
commit f72318e593
2 changed files with 25 additions and 22 deletions

View File

@@ -11,7 +11,6 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, exception from .. import text, exception
from ..cache import cache from ..cache import cache
from xml.etree import ElementTree
class SeigaExtractor(Extractor): class SeigaExtractor(Extractor):
@@ -74,11 +73,12 @@ class SeigaUserExtractor(SeigaExtractor):
r"user/illust/(\d+)")] r"user/illust/(\d+)")]
test = [ test = [
("http://seiga.nicovideo.jp/user/illust/39537793", { ("http://seiga.nicovideo.jp/user/illust/39537793", {
"keyword": "a716bf534b4191dc58ddbff51494b72a9cf58285", "pattern": r"https://lohas\.nicoseiga\.jp/priv/[0-9a-f]+/\d+/\d+",
"count": 2,
}), }),
("http://seiga.nicovideo.jp/user/illust/79433", { ("http://seiga.nicovideo.jp/user/illust/79433", {
"url": "da39a3ee5e6b4b0d3255bfef95601890afd80709", "url": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
"keyword": "187b77728381d072466af7f7ebcc479a0830ce25", "count": 0,
}), }),
] ]
@@ -90,25 +90,27 @@ class SeigaUserExtractor(SeigaExtractor):
return {"user_id": self.user_id} return {"user_id": self.user_id}
def get_images(self): def get_images(self):
keymap = {0: "image_id", 2: "title", 3: "description", url = "http://seiga.nicovideo.jp/user/illust/" + self.user_id
7: "summary", 8: "genre", 18: "date"} params = {"target": "illust_all", "page": 1}
url = "http://seiga.nicovideo.jp/api/user/data?id=" + self.user_id
response = self.request(url) while True:
try: cnt = 0
root = ElementTree.fromstring(response.text) page = self.request(url, params=params).text
except ElementTree.ParseError:
self.log.debug("xml parsing error; removing control characters") for info in text.extract_iter(
xmldata = text.clean_xml(response.text) page, '<li class="list_item', '</a></li> '):
root = ElementTree.fromstring(xmldata) yield text.extract_all(info, (
if root[0].text == "0": ("image_id", '/seiga/im', '"'),
return [] ("title" , '<li class="title">', '</li>'),
return [ ("views" , '</span>', '</li>'),
{ ("comments", '</span>', '</li>'),
key: image[index].text ("clips" , '</span>', '</li>'),
for index, key in keymap.items() ))[0]
} cnt += 1
for image in root[1]
] if cnt < 40:
return
params["page"] += 1
class SeigaImageExtractor(SeigaExtractor): class SeigaImageExtractor(SeigaExtractor):

View File

@@ -83,6 +83,7 @@ skip = [
"archivedmoe", "archiveofsins", "thebarchive", "archivedmoe", "archiveofsins", "thebarchive",
# temporary issues # temporary issues
"mangazuki", "mangazuki",
"hentaifoundry", # invalid SSL cert
] ]
# enable selective testing for direct calls # enable selective testing for direct calls
if __name__ == '__main__' and len(sys.argv) > 1: if __name__ == '__main__' and len(sys.argv) > 1: