From 751e535948ff75579c3e92025676035d7dbc3013 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 14 Jan 2019 07:51:05 +0100 Subject: [PATCH] [nhentai] fix extraction (closes #156) Use JSON embedded in webpage since API endpoints have been disabled --- gallery_dl/extractor/nhentai.py | 41 +++++++++++++++------------------ gallery_dl/extractor/smugmug.py | 2 +- test/test_results.py | 1 + 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py index f4aa5200..0ee3974b 100644 --- a/gallery_dl/extractor/nhentai.py +++ b/gallery_dl/extractor/nhentai.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2018 Mike Fährmann +# Copyright 2015-2019 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,6 +10,7 @@ from .common import Extractor, Message from .. import text +import json class NHentaiExtractor(Extractor): @@ -68,45 +69,41 @@ class NhentaiGalleryExtractor(NHentaiExtractor): def get_gallery_info(self, gallery_id): """Extract and return info about a gallery by ID""" - url = "{}/api/gallery/{}".format(self.root, gallery_id) - return self.request(url).json() + url = "{}/g/{}".format(self.root, gallery_id) + page = self.request(url).text + return json.loads(text.extract(page, "N.gallery(", ");")[0]) class NhentaiSearchExtractor(NHentaiExtractor): """Extractor for nhentai search results""" category = "nhentai" subcategory = "search" - pattern = [r"(?:https?://)?nhentai\.net/search/?\?(.*)"] + pattern = [r"(?:https?://)?nhentai\.net/search/?\?([^#]+)"] + test = [("https://nhentai.net/search/?q=touhou", { + "pattern": NhentaiGalleryExtractor.pattern[0], + "count": 30, + "range": "1-30", + })] def __init__(self, match): NHentaiExtractor.__init__(self) self.params = text.parse_query(match.group(1)) - if "q" in self.params: - self.params["query"] = self.params["q"] - del self.params["q"] - def items(self): yield Message.Version, 1 - for ginfo in self._pagination("galleries/search", self.params): - url = "{}/g/{}/".format(self.root, ginfo["id"]) - yield Message.Queue, url, self.transform_to_metadata(ginfo) + for gid in self._pagination(self.params): + url = "{}/g/{}/".format(self.root, gid) + yield Message.Queue, url, {} - def _pagination(self, endpoint, params): - """Pagination over API responses""" - url = "{}/api/{}".format(self.root, endpoint) + def _pagination(self, params): + url = "{}/search/".format(self.root) params["page"] = text.parse_int(params.get("page"), 1) while True: - data = self.request( - url, params=params, expect=range(400, 500)).json() + page = self.request(url, params=params).text - if "error" in data: - self.log.error("API request failed: \"%s\"", data["error"]) - return + yield from text.extract_iter(page, 'href="/g/', '/') - yield from data["result"] - - if params["page"] >= data["num_pages"]: + if 'class="next"' not in page: return params["page"] += 1 diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py index 53d3760f..fcdab58b 100644 --- a/gallery_dl/extractor/smugmug.py +++ b/gallery_dl/extractor/smugmug.py @@ -104,7 +104,7 @@ class SmugmugImageExtractor(SmugmugExtractor): # no "ImageOwner" ("https://www.smugmug.com/gallery/n-GLCjnD/i-JD62fQk", { "url": "d4047637947b35e4ef49e3c7cb70303cc224a3a0", - "keyword": "96fc43bc3081f6356c929be43ab5971009975063", + "keyword": "0a1b12efd789c42d9b061f01b2a1fcfd6af32003", }), ] diff --git a/test/test_results.py b/test/test_results.py index a5802391..457d3075 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -23,6 +23,7 @@ TRAVIS_SKIP = { # temporary issues, etc. BROKEN = { + "hbrowse", "pinterest", }