From f72318e593c399add259997400d31f4f8d7af258 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Mon, 13 Nov 2017 20:46:24 +0100
Subject: [PATCH] [seiga] support more than 200 images

Due to API restrictions and/or a lack of knowledge about (and
documentation of) the API's usage, it was only possible to retrieve
the latest 200 images of a niconico seiga user through said API.

The new approach requests each HTML listing page in turn and
extracts the image information from there.
---
 gallery_dl/extractor/seiga.py | 46 ++++++++++++++++++-----------------
 test/test_extractors.py       |  1 +
 2 files changed, 25 insertions(+), 22 deletions(-)
diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py
index 4d917b2e..d0de92d9 100644
--- a/gallery_dl/extractor/seiga.py
+++ b/gallery_dl/extractor/seiga.py
@@ -11,7 +11,6 @@
 from .common import Extractor, Message
 from .. import text, exception
 from ..cache import cache
-from xml.etree import ElementTree
 
 
 class SeigaExtractor(Extractor):
@@ -74,11 +73,12 @@ class SeigaUserExtractor(SeigaExtractor):
                 r"user/illust/(\d+)")]
     test = [
         ("http://seiga.nicovideo.jp/user/illust/39537793", {
-            "keyword": "a716bf534b4191dc58ddbff51494b72a9cf58285",
+            "pattern": r"https://lohas\.nicoseiga\.jp/priv/[0-9a-f]+/\d+/\d+",
+            "count": 2,
         }),
         ("http://seiga.nicovideo.jp/user/illust/79433", {
             "url": "da39a3ee5e6b4b0d3255bfef95601890afd80709",
-            "keyword": "187b77728381d072466af7f7ebcc479a0830ce25",
+            "count": 0,
         }),
     ]
 
@@ -90,25 +90,27 @@ class SeigaUserExtractor(SeigaExtractor):
         return {"user_id": self.user_id}
 
     def get_images(self):
-        keymap = {0: "image_id", 2: "title", 3: "description",
-                  7: "summary", 8: "genre", 18: "date"}
-        url = "http://seiga.nicovideo.jp/api/user/data?id=" + self.user_id
-        response = self.request(url)
-        try:
-            root = ElementTree.fromstring(response.text)
-        except ElementTree.ParseError:
-            self.log.debug("xml parsing error; removing control characters")
-            xmldata = text.clean_xml(response.text)
-            root = ElementTree.fromstring(xmldata)
-        if root[0].text == "0":
-            return []
-        return [
-            {
-                key: image[index].text
-                for index, key in keymap.items()
-            }
-            for image in root[1]
-        ]
+        url = "http://seiga.nicovideo.jp/user/illust/" + self.user_id
+        params = {"target": "illust_all", "page": 1}
+
+        while True:
+            cnt = 0
+            page = self.request(url, params=params).text
+
+            for info in text.extract_iter(
+                    page, '<li class="list_item', '</a></li> '):
+                yield text.extract_all(info, (
+                    ("image_id", '/seiga/im', '"'),
+                    ("title"   , '<li class="title">', '</li>'),
+                    ("views"   , '</span>', '</li>'),
+                    ("comments", '</span>', '</li>'),
+                    ("clips"   , '</span>', '</li>'),
+                ))[0]
+                cnt += 1
+
+            if cnt < 40:
+                return
+            params["page"] += 1
 
 
 class SeigaImageExtractor(SeigaExtractor):
diff --git a/test/test_extractors.py b/test/test_extractors.py
index 791d7c27..ad268eeb 100644
--- a/test/test_extractors.py
+++ b/test/test_extractors.py
@@ -83,6 +83,7 @@ skip = [
     "archivedmoe", "archiveofsins", "thebarchive",
     # temporary issues
     "mangazuki",
+    "hentaifoundry",  # invalid SSL cert
 ]
 # enable selective testing for direct calls
 if __name__ == '__main__' and len(sys.argv) > 1: