From 67791e1b36d7dce40477601e2382cfb59c998405 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 26 May 2017 22:30:09 +0200 Subject: [PATCH] [imgur] improve and add image extractor --- docs/supportedsites.rst | 4 +- gallery_dl/extractor/imgur.py | 117 +++++++++++++++++++++++----------- 2 files changed, 83 insertions(+), 38 deletions(-) diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index c988e2ea..c26f10ce 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -49,7 +49,7 @@ Supported Sites +-------------------+---------------------------------------+----------------------------------------------------------+ |imgth |https://imgth.com/ |Galleries | +-------------------+---------------------------------------+----------------------------------------------------------+ -|imgur |https://imgur.com/ |Albums | +|imgur |https://imgur.com/ |Albums, individual Images | +-------------------+---------------------------------------+----------------------------------------------------------+ |Jaimini's Box |https://jaiminisbox.com/ |Chapters, Manga | +-------------------+---------------------------------------+----------------------------------------------------------+ @@ -93,6 +93,8 @@ Supported Sites +-------------------+---------------------------------------+----------------------------------------------------------+ |Read Comic Online |http://readcomiconline.to/ |Comic-Issues, Comics | +-------------------+---------------------------------------+----------------------------------------------------------+ +|Reddit |https://reddit.com/ |Submissions, Subreddits | ++-------------------+---------------------------------------+----------------------------------------------------------+ |Rule 34 |https://rule34.xxx/ |Posts, Tag-Searches | +-------------------+---------------------------------------+----------------------------------------------------------+ |Safebooru |https://safebooru.org/ |Posts, Tag-Searches | diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 7985b53c..5f094748 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -6,61 +6,104 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from albums at https://imgur.com/""" +"""Extract images from https://imgur.com/""" from .common import Extractor, Message from .. import text, exception +import json -class ImgurAlbumExtractor(Extractor): - """Extractor for image albums from imgur.com""" +class ImgurExtractor(Extractor): + """Base class for imgur extractors""" category = "imgur" + + def __init__(self, match): + Extractor.__init__(self) + self.item_id = match.group(1) + + def _get_data(self, urlpart): + response = self.session.get("https://imgur.com/" + urlpart) + if response.status_code == 404: + raise exception.NotFoundError(self.subcategory) + data = text.extract(response.text, "image : ", ",\n")[0] + return self._clean(json.loads(data)) + + @staticmethod + def _prepare(image): + url = "https://i.imgur.com/" + image["hash"] + image["ext"] + image["extension"] = image["ext"][1:] + return url + + @staticmethod + def _clean(data): + try: + del data["views"] + del data["adConfig"] + del data["isAd"] + except KeyError: + pass + return data + + +class ImgurImageExtractor(ImgurExtractor): + """Extractor for individual images from imgur.com""" + category = "imgur" + subcategory = "image" + directory_fmt = ["{category}"] + filename_fmt = "{category}_{hash}.{extension}" + pattern = [(r"(?:https?://)?(?:m\.|www\.)?imgur\.com/" + r"(?:gallery/)?((?!gallery)[^/?&#]{7})/?"), + (r"(?:https?://)?i\.imgur\.com/([^/?&#.]{7})\.")] + test = [ + ("https://imgur.com/21yMxCS", { + "url": "6f2dcfb86815bdd72808c313e5f715610bc7b9b2", + "keyword": "2270c7a1365c43012231359d2d74d506be6b1a19", + "content": "0c8768055e4e20e7c7259608b67799171b691140", + }), + ("https://i.imgur.com/21yMxCS.png", { + "url": "6f2dcfb86815bdd72808c313e5f715610bc7b9b2", + "keyword": "2270c7a1365c43012231359d2d74d506be6b1a19", + }), + ("https://imgur.com/zzzzzzz", { + "exception": exception.NotFoundError, + }), + ] + + def items(self): + image = self._get_data(self.item_id) + url = self._prepare(image) + + yield Message.Version, 1 + yield Message.Directory, image + yield Message.Url, url, image + + +class ImgurAlbumExtractor(ImgurExtractor): + """Extractor for image albums from imgur.com""" subcategory = "album" - directory_fmt = ["{category}", "{album-key} - {title}"] - filename_fmt = "{category}_{album-key}_{num:>03}_{hash}{ext}" + directory_fmt = ["{category}", "{album[hash]} - {album[title]}"] + filename_fmt = "{category}_{album[hash]}_{num:>03}_{hash}.{extension}" pattern = [r"(?:https?://)?(?:m\.|www\.)?imgur\.com/" - r"(?:a|gallery)/([^/?&#]+)"] + r"(?:a|gallery)/([^/?&#]{5})/?$"] test = [ ("https://imgur.com/a/TcBmP", { "url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563", - "keyword": "21723f47bf4a42599d39fbf29c5f79323d420898", + "keyword": "e2eaae0e62d3c5d76df9c870140d1ef466bbec59", }), ("https://imgur.com/a/TcBmQ", { "exception": exception.NotFoundError, }), ] - def __init__(self, match): - Extractor.__init__(self) - self.album = match.group(1) - def items(self): - imgs = self.get_images() - data = self.get_job_metadata() + album = self._get_data("a/" + self.item_id) + images = album["album_images"]["images"] + del album["album_images"] + yield Message.Version, 1 - yield Message.Directory, data - for num, image in enumerate(imgs, 1): + yield Message.Directory, {"album": album, "count": len(images)} + for num, image in enumerate(images, 1): + url = self._prepare(image) image["num"] = num - image["extension"] = image["ext"][1:] - image.update(data) - url = "https://i.imgur.com/" + image["hash"] + image["ext"] + image["album"] = album yield Message.Url, url, image - - def get_job_metadata(self): - """Collect metadata for extractor-job""" - page = self.request("https://imgur.com/a/" + self.album).text - data = text.extract_all(page, ( - ('title', '