From eb37fbf0e84d1dbbe86784802dba1597a3a23823 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 20 Apr 2018 18:22:57 +0200 Subject: [PATCH] [hentaifoundry] improve extractor - use common base class - better pagination - respect '.../page/' - implement skip() / --range support - get YII_CSRF_TOKEN from cookies --- gallery_dl/extractor/hentaifoundry.py | 164 +++++++++++++------------- 1 file changed, 85 insertions(+), 79 deletions(-) diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index 2fe4daa7..63f37274 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -10,70 +10,44 @@ from .common import Extractor, Message from .. import text, util, exception +from urllib.parse import urljoin -class HentaifoundryUserExtractor(Extractor): - """Extractor for all images of a hentai-foundry-user""" +class HentaifoundryExtractor(Extractor): category = "hentaifoundry" - subcategory = "user" directory_fmt = ["{category}", "{artist}"] filename_fmt = "{category}_{index}_{title}.{extension}" archive_fmt = "{index}" - pattern = [r"(?:https?://)?(?:www\.)?hentai-foundry\.com/" - r"(?:pictures/user/([^/]+)/?$|user/([^/]+)/profile)"] - test = [ - ("https://www.hentai-foundry.com/pictures/user/Tenpura", { - "url": "ebbc981a85073745e3ca64a0f2ab31fab967fc28", - "keyword": "f8fecc8aa89978ecf402ec221243978fe791bd54", - }), - ("http://www.hentai-foundry.com/user/asdq/profile", { - "exception": exception.NotFoundError, - }), - ] - base_url = "https://www.hentai-foundry.com/pictures/user/" + root = "https://www.hentai-foundry.com" + per_page = 25 - def __init__(self, match): + def __init__(self, artist): Extractor.__init__(self) - self.artist = match.group(1) or match.group(2) + self.artist = artist + self.artist_url = "{}/pictures/user/{}".format(self.root, self.artist) + self.start_post = 0 def items(self): - data, token = self.get_job_metadata() - self.set_filters(token) + data = self.get_job_metadata() yield Message.Version, 1 yield Message.Directory, data - for url, image in self.get_images(data["count"]): + + for page_url in util.advance(self.get_image_pages(), self.start_post): + url, image = self.get_image_metadata(page_url) image.update(data) + if not image["extension"]: + image["extension"] = "jpg" yield Message.Url, url, image - def get_images(self, count): - """Yield url and keywords for all images of one artist""" - num = 1 - needle = 'thumbTitle">Pictures (', ')', pos) - return {"artist": self.artist, "count": util.safe_int(count)}, token def get_image_metadata(self, url): """Collect metadata for an image""" - page = self.request(url).text + page = self.request(urljoin(self.root, url)).text index = url.rsplit("/", 2)[1] title, pos = text.extract( page, 'Pictures » ', '<') @@ -83,8 +57,65 @@ class HentaifoundryUserExtractor(Extractor): text.nameext_from_url(part, data) return "https://pictures.hentai-foundry.com" + part, data - def set_filters(self, token): + +class HentaifoundryUserExtractor(HentaifoundryExtractor): + """Extractor for all images of a hentai-foundry-user""" + subcategory = "user" + pattern = [r"(?:https?://)?(?:www\.)?hentai-foundry\.com/" + r"(?:pictures/user/([^/]+)(?:/(?:page/(\d+))?)?$" + r"|user/([^/]+)/profile)"] + test = [ + ("https://www.hentai-foundry.com/pictures/user/Tenpura", { + "url": "ebbc981a85073745e3ca64a0f2ab31fab967fc28", + "keyword": "f8fecc8aa89978ecf402ec221243978fe791bd54", + }), + ("http://www.hentai-foundry.com/user/asdq/profile", { + "exception": exception.NotFoundError, + }), + ("https://www.hentai-foundry.com/pictures/user/Tenpura/page/3", None), + ] + + def __init__(self, match): + HentaifoundryExtractor.__init__(self, match.group(1) or match.group(3)) + self.start_page = util.safe_int(match.group(2), 1) + self._skipped = (self.start_page - 1) * self.per_page + + def skip(self, num): + pages, posts = divmod(num, self.per_page) + self.start_page += pages + self.start_post += posts + self._skipped += num + return num + + def get_image_pages(self): + num = self.start_page + + while True: + url = "{}/page/{}".format(self.artist_url, num) + page = self.request(url).text + yield from text.extract_iter(page, 'thumbTitle">Pictures (', ')')[0]) + if self._skipped >= count: + raise exception.StopExtraction() + + self.set_filters() + return {"artist": self.artist, "count": count} + + def set_filters(self): """Set site-internal filters to show all images""" + token = text.extract( + self.session.cookies["YII_CSRF_TOKEN"], "%22", "%22")[0] formdata = { "YII_CSRF_TOKEN": token, "rating_nudity": 3, @@ -114,55 +145,30 @@ class HentaifoundryUserExtractor(Extractor): method="post", data=formdata, allow_empty=True) -class HentaifoundryImageExtractor(Extractor): +class HentaifoundryImageExtractor(HentaifoundryExtractor): """Extractor for a single image from hentaifoundry.com""" - category = "hentaifoundry" subcategory = "image" - directory_fmt = ["{category}", "{artist}"] - filename_fmt = "{category}_{index}_{title}.{extension}" - archive_fmt = "{index}" pattern = [(r"(?:https?://)?(?:www\.|pictures\.)?hentai-foundry\.com/" r"(?:pictures/user/([^/]+)/(\d+)" r"|[^/]/([^/]+)/(\d+))")] test = [ - (("http://www.hentai-foundry.com/" - "pictures/user/Tenpura/407501/shimakaze"), { + (("http://www.hentai-foundry.com" + "/pictures/user/Tenpura/407501/shimakaze"), { "url": "fbf2fd74906738094e2575d2728e8dc3de18a8a3", - "keyword": "85b8e26fa93d00ae1333cb7b418078f1792dc4a8", + "keyword": "2956321893e9187edde4aeac6bed889449692e6a", "content": "91bf01497c39254b6dfb234a18e8f01629c77fd1", }), ("http://www.hentai-foundry.com/pictures/user/Tenpura/340853/", { - "exception": exception.NotFoundError, + "exception": exception.HttpError, }), ] def __init__(self, match): - Extractor.__init__(self) - self.artist = match.group(1) or match.group(3) + HentaifoundryExtractor.__init__(self, match.group(1) or match.group(3)) self.index = match.group(2) or match.group(4) - def items(self): - url, data = self.get_image_metadata() - yield Message.Version, 1 - yield Message.Directory, data - yield Message.Url, url, data + def get_image_pages(self): + return ("{}/{}?enterAgree=1".format(self.artist_url, self.index),) - def get_image_metadata(self): - """Collect metadata for an image""" - url = "https://www.hentai-foundry.com/pictures/user/{}/{}".format( - self.artist, self.index) - response = self.request(url + "?enterAgree=1", fatal=False) - if response.status_code == 404: - raise exception.NotFoundError("image") - extr = text.extract - page = response.text - artist, pos = extr(page, ' » ', '<', pos) - url , pos = extr(page, '//pictures.hentai-foundry.com', '"', pos) - data = { - "artist": artist, - "index": util.safe_int(self.index), - "title": text.unescape(title), - } - text.nameext_from_url(url, data) - return "https://pictures.hentai-foundry.com" + url, data + def get_job_metadata(self): + return {"artist": self.artist, "index": util.safe_int(self.index)}