From 6779512fc7e4e3fdb3484c3f7914e7ed930a2867 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Sun, 13 Oct 2019 22:10:32 +0200
Subject: [PATCH] [nozomi] add post and tag extractors (#388)

---
 docs/supportedsites.rst          |   1 +
 gallery_dl/extractor/__init__.py |   1 +
 gallery_dl/extractor/nozomi.py   | 156 +++++++++++++++++++++++++++++++
 scripts/supportedsites.py        |   1 +
 4 files changed, 159 insertions(+)
 create mode 100644 gallery_dl/extractor/nozomi.py

diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst
index b0d6ebad..2d0b2985 100644
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@@ -78,6 +78,7 @@ Ngomik http://ngomik.in/ Chapters
 nhentai https://nhentai.net/ Galleries, Search Results
 Niconico Seiga https://seiga.nicovideo.jp/ individual Images, User Profiles Required
 nijie https://nijie.info/ |nijie-C| Required
+Nozomi.la https://nozomi.la/ Posts, Tag-Searches
 NSFWalbum.com https://nsfwalbum.com/ Albums
 Nyafuu Archive https://archive.nyafuu.org/ Threads
 Patreon https://www.patreon.com/ Creators, Posts, User Profiles
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 351c5dfd..df3a573d 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -70,6 +70,7 @@ modules = [
     "ngomik",
     "nhentai",
     "nijie",
+    "nozomi",
     "nsfwalbum",
     "paheal",
     "patreon",
diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py
new file mode 100644
index 00000000..a98c1dad
--- /dev/null
+++ b/gallery_dl/extractor/nozomi.py
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://nozomi.la/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class NozomiExtractor(Extractor):
+    """Base class for nozomi extractors"""
+    category = "nozomi"
+    root = "https://nozomi.la"
+    filename_fmt = "{postid}.{extension}"
+    archive_fmt = "{postid}"
+
+    def items(self):
+        """Yield a version message, then Directory/Url messages per post"""
+        yield Message.Version, 1
+
+        data = self.metadata()
+        self.session.headers["Origin"] = self.root
+        self.session.headers["Referer"] = self.root + "/"
+
+        for post_id in self.posts():
+            # the JSON path is built from the trailing digits of the ID
+            url = "https://j.nozomi.la/post/{}/{}/{}.json".format(
+                post_id[-1], post_id[-3:-1], post_id)
+            image = self.request(url).json()
+
+            image["tags"] = self._list(image.get("general"))
+            image["artist"] = self._list(image.get("artist"))
+            image["copyright"] = self._list(image.get("copyright"))
+            image["character"] = self._list(image.get("character"))
+            image["is_video"] = bool(image.get("is_video"))
+            image["date"] = text.parse_datetime(
+                image["date"] + ":00", "%Y-%m-%d %H:%M:%S%z")
+            image["url"] = text.urljoin(self.root, image["imageurl"])
+
+            # remove raw API fields from the final metadata
+            # ('general' and 'imageurl' live on as 'tags' and 'url')
+            for key in ("general", "imageurl", "imageurls"):
+                image.pop(key, None)
+
+            image.update(data)
+            text.nameext_from_url(image["url"], image)
+            yield Message.Directory, image
+            yield Message.Url, image["url"], image
+
+    def metadata(self):
+        """Return a dict of metadata that is added to every post"""
+        return {}
+
+    def posts(self):
+        """Return an iterable of post IDs as strings"""
+        return ()
+
+    @staticmethod
+    def _list(src):
+        """Return the 'tagname_display' value of every entry in 'src'"""
+        if not src:
+            return []
+        return [x["tagname_display"] for x in src]
+
+    @staticmethod
+    def _unpack(b):
+        """Yield the big-endian 32-bit unsigned integers packed in 'b'"""
+        # the upper bound skips an incomplete trailing group (e.g. from
+        # a truncated response) instead of failing with an IndexError
+        for i in range(0, len(b) - 3, 4):
+            yield int.from_bytes(b[i:i+4], "big")
+
+
+class NozomiPostExtractor(NozomiExtractor):
+    """Extractor for individual posts on nozomi.la"""
+    subcategory = "post"
+    pattern = r"(?:https?://)?nozomi\.la/post/(\d+)"
+    test = ("https://nozomi.la/post/3649262.html", {
+        "url": "f4522adfc8159355fd0476de28761b5be0f02068",
+        "content": "cd20d2c5149871a0b80a1b0ce356526278964999",
+        "keyword": {
+            "artist"   : ["hammer (sunset beach)"],
+            "character": ["patchouli knowledge"],
+            "copyright": ["touhou"],
+            "dataid"   : "re:aaa9f7c632cde1e1a5baaff3fb6a6d857ec73df7fdc5cf5a",
+            "date"     : "type:datetime",
+            "extension": "jpg",
+            "favorites": int,
+            "filename" : str,
+            "height"   : 768,
+            "is_video" : False,
+            "postid"   : 3649262,
+            "source"   : "danbooru",
+            "sourceid" : 2434215,
+            "tags"     : list,
+            "type"     : "jpg",
+            "url"      : str,
+            "width"    : 1024,
+        },
+    })
+
+    def __init__(self, match):
+        NozomiExtractor.__init__(self, match)
+        self.post_id = match.group(1)
+
+    def posts(self):
+        """Return the single post ID taken from the URL"""
+        return (self.post_id,)
+
+
+class NozomiTagExtractor(NozomiExtractor):
+    """Extractor for posts from tag searches on nozomi.la"""
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    archive_fmt = "t_{search_tags}_{postid}"
+    pattern = r"(?:https?://)?nozomi\.la/tag/([^/?&#]+)-\d+\."
+    test = ("https://nozomi.la/tag/3:1_aspect_ratio-1.html", {
+        "pattern": r"https://i\.nozomi\.la/\w/\w\w/\w+\.\w+",
+        "count": ">= 75",
+        "range": "1-75",
+    })
+
+    def __init__(self, match):
+        NozomiExtractor.__init__(self, match)
+        # NOTE(review): 'pattern' excludes '&' from this group, so this
+        # is a no-op if text.unescape() decodes HTML entities; URL
+        # percent-decoding was presumably intended -- confirm
+        self.tags = text.unescape(match.group(1))
+
+    def metadata(self):
+        """Return the search tags as metadata for every post"""
+        return {"search_tags": self.tags}
+
+    def posts(self):
+        """Yield the post IDs stored in the '<tags>.nozomi' file"""
+        i = 0
+        url = "https://n.nozomi.la/nozomi/{}.nozomi".format(self.tags)
+
+        while True:
+            # request the file in slices of 256 bytes (64 IDs each)
+            headers = {"Range": "bytes={}-{}".format(i, i+255)}
+            response = self.request(url, headers=headers)
+            yield from map(str, self._unpack(response.content))
+
+            i += 256
+            # the part after '/' in Content-Range is the total size; stop
+            # when the next offset reaches it (parse_int() presumably
+            # returns 'i' for a missing value, which ends the loop too)
+            cr = response.headers.get("Content-Range", "").rpartition("/")[2]
+            if text.parse_int(cr, i) <= i:
+                return
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
index 96c11d66..bf053e8d 100755
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -55,6 +55,7 @@ CATEGORY_MAP = {
     "myportfolio" : "Adobe Portfolio",
     "nhentai" : "nhentai",
     "nijie" : "nijie",
+    "nozomi" : "Nozomi.la",
     "nsfwalbum" : "NSFWalbum.com",
     "nyafuu" : "Nyafuu Archive",
     "paheal" : "rule #34",