diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 7ddeae75..53bc7265 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -74,6 +74,7 @@ modules = [ "mangareader", "mangastream", "mangoxo", + "myhentaigallery", "myportfolio", "naver", "newgrounds", diff --git a/gallery_dl/extractor/myhentaigallery.py b/gallery_dl/extractor/myhentaigallery.py new file mode 100644 index 00000000..ccbfb3c1 --- /dev/null +++ b/gallery_dl/extractor/myhentaigallery.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2020 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract hentai-gallery from https://myhentaigallery.com/""" + +from .common import GalleryExtractor +from .. import text, exception + + +class MyhentaigalleryGalleryExtractor(GalleryExtractor): + """Extractor for image galleries from myhentaigallery.com""" + category = "myhentaigallery" + directory_fmt = ("{category}", "{gallery_id} [{artist}] {title}") + pattern = (r"(?:https?://)?(myhentaigallery\.com" + r"/gallery/thumbnails/[0-9]+)") + test = ( + ("https://myhentaigallery.com/gallery/thumbnails/16247"), + ("https://myhentaigallery.com/gallery/thumbnails/15224"), + ) + + def __init__(self, match): + url = "https://" + match.group(1) + GalleryExtractor.__init__(self, match, url) + self.session.headers["Referer"] = url + + def metadata(self, page): + extr = text.extract_from(page) + split = text.split_html + + image = extr('
\n\n

', '

') + if not title: + raise exception.NotFoundError("gallery") + data = { + "title" : text.unescape(title), + "gallery_id": text.parse_int(image.split("/")[-2]), + "tags" : split(extr('
\nCategories:', '
')), + } + artists = split(extr('
\nArtists:', '
')) + data["artist"] = artists[0] if artists else "Unknown" + return data + + def images(self, page): + extr = text.extract_iter + return [ + (text.unescape(url).replace("/thumbnail/", "/original/"), None) + for url in extr(page, 'class="comic-thumb">\n