[tsumino] add extractor for search results (#161)

2019-02-02 14:56:46 +01:00
parent 1c1367ec5b
commit d36ec51e5a
2 changed files with 265 additions and 22 deletions
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@@ -83,7 +83,7 @@ Simply Hentai        https://www.simply-hentai.com/      Galleries, individual I
 SlideShare           https://www.slideshare.net/         Presentations
 SmugMug              https://www.smugmug.com/            |Capabilities-8|                                   Optional (OAuth)
 The /b/ Archive      https://thebarchive.com/            Threads
-Tsumino              https://www.tsumino.com/            Galleries                                          Optional
+Tsumino              https://www.tsumino.com/            Galleries, Search Results                          Optional
 Tumblr               https://www.tumblr.com/             Images from Users, Likes, Posts, Tag-Searches      Optional (OAuth)
 Twitter              https://twitter.com/                Media Timelines, Timelines, Tweets
 Wallhaven            https://alpha.wallhaven.cc/         individual Images, Search Results                  Optional
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -8,35 +8,17 @@
 """Extractors for https://www.tsumino.com/"""
-from .common import ChapterExtractor
+from .common import ChapterExtractor, Extractor, Message
 from .. import text, exception
 from ..cache import cache
-class TsuminoGalleryExtractor(ChapterExtractor):
+class TsuminoBase():
-    """Extractor for image galleries on tsumino.com"""
+    """Base class for tsumino extractors"""
    category = "tsumino"
    subcategory = "gallery"
    filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}"
    directory_fmt = ["{category}", "{gallery_id} {title}"]
    archive_fmt = "{gallery_id}_{page}"
    cookiedomain = "www.tsumino.com"
    pattern = [r"(?i)(?:https?://)?(?:www\.)?tsumino\.com"
               r"/(?:Book/Info|Read/View)/(\d+)"]
    test = [
        ("https://www.tsumino.com/Book/Info/45834", {
            "url": "ed3e39bc21221fbd21b9a2ba711e8decb6fdc6bc",
            "keyword": "5acc43f67c61f5312e0b5d6c9d6b1276cda438fc",
        }),
        ("https://www.tsumino.com/Read/View/45834", None),
    ]
    root = "https://www.tsumino.com"
    def __init__(self, match):
        self.gallery_id = match.group(1)
        url = "{}/Book/Info/{}".format(self.root, self.gallery_id)
        ChapterExtractor.__init__(self, url)
    def login(self):
        username, password = self._get_auth_info()
        if username:
@@ -57,6 +39,48 @@ class TsuminoGalleryExtractor(ChapterExtractor):
            raise exception.AuthenticationError()
        return {".aotsumino": response.history[0].cookies[".aotsumino"]}
 class TsuminoGalleryExtractor(TsuminoBase, ChapterExtractor):
    """Extractor for image galleries on tsumino.com"""
    subcategory = "gallery"
    filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}"
    directory_fmt = ["{category}", "{gallery_id} {title}"]
    archive_fmt = "{gallery_id}_{page}"
    pattern = [r"(?i)(?:https?://)?(?:www\.)?tsumino\.com"
               r"/(?:Book/Info|Read/View)/(\d+)"]
    test = [
        ("https://www.tsumino.com/Book/Info/45834", {
            "url": "ed3e39bc21221fbd21b9a2ba711e8decb6fdc6bc",
            "keyword": {
                "artist": "Itou Life",
                "characters": "Carmilla, Gudako, Gudao, Lancelot, Nightingale",
                "collection": "",
                "count": 42,
                "date": "2019 January 27",
                "gallery_id": 45834,
                "group": "Itou Life",
                "lang": "en",
                "language": "English",
                "page": int,
                "parodies": "Fate/Grand Order",
                "rating": float,
                "tags": str,
                "thumbnail": "http://www.tsumino.com/Image/Thumb/45834",
                "title": r"re:\[Remove\] Shikoshiko Daisuki Nightingale",
                "title_jp": "シコシコ大好きナイチンゲール + 会場限定おまけ本",
                "type": "Doujinshi",
                "uploader": "NHNL1"
            },
        }),
        ("https://www.tsumino.com/Read/View/45834", None),
    ]
    def __init__(self, match):
        self.gallery_id = match.group(1)
        url = "{}/Book/Info/{}".format(self.root, self.gallery_id)
        ChapterExtractor.__init__(self, url)
    def get_metadata(self, page):
        extr = text.extract
        title, pos = extr(page, '"og:title" content="', '"')
@@ -102,3 +126,222 @@ class TsuminoGalleryExtractor(ChapterExtractor):
            (base + text.quote(name), None)
            for name in data["reader_page_urls"]
        ]
 class TsuminoSearchExtractor(TsuminoBase, Extractor):
    """Extractor for search results on tsumino.com"""
    subcategory = "search"
    pattern = [r"(?i)(?:https?://)?(?:www\.)?tsumino\.com"
               r"/(?:Books/?)?#(.+)"]
    test = [
        ("https://www.tsumino.com/Books#?Character=Reimu+Hakurei", {
            "pattern": TsuminoGalleryExtractor.pattern[0],
            "range": "1-40",
            "count": 40,
        }),
        (("http://www.tsumino.com/Books#~(Tags~(~"
          "(Type~7~Text~'Reimu*20Hakurei~Exclude~false)~"
          "(Type~'1~Text~'Pantyhose~Exclude~false)))#"), {
            "pattern": TsuminoGalleryExtractor.pattern[0],
            "count": ">= 3",
        }),
    ]
    def __init__(self, match):
        Extractor.__init__(self)
        self.query = match.group(1)
    def items(self):
        yield Message.Version, 1
        for gallery in self.galleries():
            url = "{}/Book/Info/{}".format(self.root, gallery["Id"])
            yield Message.Queue, url, gallery
    def galleries(self):
        """Return all gallery results matching 'self.query'"""
        url = "{}/Books/Operate".format(self.root)
        headers = {
            "Referer": "{}/".format(self.root),
            "X-Requested-With": "XMLHttpRequest",
        }
        data = {
            "PageNumber": 1,
            "Text": "",
            "Sort": "Newest",
            "List": "0",
            "Length": "0",
            "MinimumRating": "0",
            "ExcludeList": "0",
            "CompletelyExcludeHated": "false",
        }
        data.update(self._parse(self.query))
        while True:
            info = self.request(
                url, method="POST", headers=headers, data=data).json()
            for gallery in info["Data"]:
                yield gallery["Entry"]
            if info["PageNumber"] >= info["PageCount"]:
                return
            data["PageNumber"] += 1
    def _parse(self, query):
        try:
            if query.startswith("?"):
                return self._parse_simple(query)
            return self._parse_jsurl(query)
        except Exception as exc:
            self.log.error("Invalid search query: '%s' (%s)", query, exc)
            raise exception.StopExtraction()
    @staticmethod
    def _parse_simple(query):
        """Parse search query with format '?<key>=value>'"""
        key, _, value = query.partition("=")
        tag_types = {
            "Tag": "1",
            "Category": "2",
            "Collection": "3",
            "Group": "4",
            "Artist": "5",
            "Parody": "6",
            "Character": "7",
            "Uploader": "100",
        }
        return {
            "Tags[0][Type]": tag_types[key[1:].capitalize()],
            "Tags[0][Text]": text.unquote(value).replace("+", " "),
            "Tags[0][Exclude]": "false",
        }
    @staticmethod
    def _parse_jsurl(data):
        """Parse search query in JSURL format
        Nested lists and dicts are handled in a special way to deal
        with the way Tsumino expects its parameters -> expand(...)
        Example: ~(name~'John*20Doe~age~42~children~(~'Mary~'Bill))
        Ref: https://github.com/Sage/jsurl
        """
        if not data:
            return {}
        i = 0
        imax = len(data)
        def eat(expected):
            nonlocal i
            if data[i] != expected:
                error = "bad JSURL syntax: expected '{}', got {}".format(
                    expected, data[i])
                raise ValueError(error)
            i += 1
        def decode():
            nonlocal i
            beg = i
            result = ""
            while i < imax:
                ch = data[i]
                if ch not in "~)*!":
                    i += 1
                elif ch == "*":
                    if beg < i:
                        result += data[beg:i]
                    if data[i + 1] == "*":
                        result += chr(int(data[i+2:i+6], 16))
                        i += 6
                    else:
                        result += chr(int(data[i+1:i+3], 16))
                        i += 3
                    beg = i
                elif ch == "!":
                    if beg < i:
                        result += data[beg:i]
                    result += "$"
                    i += 1
                    beg = i
                else:
                    break
            return result + data[beg:i]
        def parse_one():
            nonlocal i
            eat('~')
            result = ""
            ch = data[i]
            if ch == "(":
                i += 1
                if data[i] == "~":
                    result = []
                    if data[i+1] == ")":
                        i += 1
                    else:
                        result.append(parse_one())
                        while data[i] == "~":
                            result.append(parse_one())
                else:
                    result = {}
                    if data[i] != ")":
                        while True:
                            key = decode()
                            value = parse_one()
                            for ekey, evalue in expand(key, value):
                                result[ekey] = evalue
                            if data[i] != "~":
                                break
                            i += 1
                eat(")")
            elif ch == "'":
                i += 1
                result = decode()
            else:
                beg = i
                i += 1
                while i < imax and data[i] not in "~)":
                    i += 1
                sub = data[beg:i]
                if ch in "0123456789-":
                    fval = float(sub)
                    ival = int(fval)
                    result = ival if ival == fval else fval
                else:
                    if sub not in ("true", "false", "null"):
                        raise ValueError("bad value keyword: " + sub)
                    result = sub
            return result
        def expand(key, value):
            if isinstance(value, list):
                for index, cvalue in enumerate(value):
                    ckey = "{}[{}]".format(key, index)
                    yield from expand(ckey, cvalue)
            elif isinstance(value, dict):
                for ckey, cvalue in value.items():
                    ckey = "{}[{}]".format(key, ckey)
                    yield from expand(ckey, cvalue)
            else:
                yield key, value
        return parse_one()