[imgur] improve album extractor and add image extractor

2017-05-26 22:30:09 +02:00
parent 99b72130ee
commit 67791e1b36
2 changed files with 83 additions and 38 deletions
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@@ -49,7 +49,7 @@ Supported Sites
 +-------------------+---------------------------------------+----------------------------------------------------------+
 |imgth              |https://imgth.com/                     |Galleries                                                 |
 +-------------------+---------------------------------------+----------------------------------------------------------+
-|imgur              |https://imgur.com/                     |Albums                                                    |
+|imgur              |https://imgur.com/                     |Albums, individual Images                                 |
 +-------------------+---------------------------------------+----------------------------------------------------------+
 |Jaimini's Box      |https://jaiminisbox.com/               |Chapters, Manga                                           |
 +-------------------+---------------------------------------+----------------------------------------------------------+
@@ -93,6 +93,8 @@ Supported Sites
 +-------------------+---------------------------------------+----------------------------------------------------------+
 |Read Comic Online  |http://readcomiconline.to/             |Comic-Issues, Comics                                      |
 +-------------------+---------------------------------------+----------------------------------------------------------+
+|Reddit             |https://reddit.com/                    |Submissions, Subreddits                                   |
+-------------------+---------------------------------------+----------------------------------------------------------+
 |Rule 34            |https://rule34.xxx/                    |Posts, Tag-Searches                                       |
 +-------------------+---------------------------------------+----------------------------------------------------------+
 |Safebooru          |https://safebooru.org/                 |Posts, Tag-Searches                                       |
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -6,61 +6,104 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-"""Extract images from albums at https://imgur.com/"""
+"""Extract images from https://imgur.com/"""

 from .common import Extractor, Message
 from .. import text, exception
+import json


-class ImgurAlbumExtractor(Extractor):
-    """Extractor for image albums from imgur.com"""
+class ImgurExtractor(Extractor):
+    """Base class for imgur extractors"""
    category = "imgur"
+
+    def __init__(self, match):
+        Extractor.__init__(self)
+        self.item_id = match.group(1)
+
+    def _get_data(self, urlpart):
+        response = self.session.get("https://imgur.com/" + urlpart)
+        if response.status_code == 404:
+            raise exception.NotFoundError(self.subcategory)
+        data = text.extract(response.text, "image               : ", ",\n")[0]
+        return self._clean(json.loads(data))
+
+    @staticmethod
+    def _prepare(image):
+        url = "https://i.imgur.com/" + image["hash"] + image["ext"]
+        image["extension"] = image["ext"][1:]
+        return url
+
+    @staticmethod
+    def _clean(data):
+        try:
+            del data["views"]
+            del data["adConfig"]
+            del data["isAd"]
+        except KeyError:
+            pass
+        return data
+
+
+class ImgurImageExtractor(ImgurExtractor):
+    """Extractor for individual images from imgur.com"""
+    category = "imgur"
+    subcategory = "image"
+    directory_fmt = ["{category}"]
+    filename_fmt = "{category}_{hash}.{extension}"
+    pattern = [(r"(?:https?://)?(?:m\.|www\.)?imgur\.com/"
+                r"(?:gallery/)?((?!gallery)[^/?&#]{7})/?"),
+               (r"(?:https?://)?i\.imgur\.com/([^/?&#.]{7})\.")]
+    test = [
+        ("https://imgur.com/21yMxCS", {
+            "url": "6f2dcfb86815bdd72808c313e5f715610bc7b9b2",
+            "keyword": "2270c7a1365c43012231359d2d74d506be6b1a19",
+            "content": "0c8768055e4e20e7c7259608b67799171b691140",
+        }),
+        ("https://i.imgur.com/21yMxCS.png", {
+            "url": "6f2dcfb86815bdd72808c313e5f715610bc7b9b2",
+            "keyword": "2270c7a1365c43012231359d2d74d506be6b1a19",
+        }),
+        ("https://imgur.com/zzzzzzz", {
+            "exception": exception.NotFoundError,
+        }),
+    ]
+
+    def items(self):
+        image = self._get_data(self.item_id)
+        url = self._prepare(image)
+
+        yield Message.Version, 1
+        yield Message.Directory, image
+        yield Message.Url, url, image
+
+
+class ImgurAlbumExtractor(ImgurExtractor):
+    """Extractor for image albums from imgur.com"""
    subcategory = "album"
-    directory_fmt = ["{category}", "{album-key} - {title}"]
-    filename_fmt = "{category}_{album-key}_{num:>03}_{hash}{ext}"
+    directory_fmt = ["{category}", "{album[hash]} - {album[title]}"]
+    filename_fmt = "{category}_{album[hash]}_{num:>03}_{hash}.{extension}"
    pattern = [r"(?:https?://)?(?:m\.|www\.)?imgur\.com/"
-               r"(?:a|gallery)/([^/?&#]+)"]
+               r"(?:a|gallery)/([^/?&#]{5})/?$"]
    test = [
        ("https://imgur.com/a/TcBmP", {
            "url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563",
-            "keyword": "21723f47bf4a42599d39fbf29c5f79323d420898",
+            "keyword": "e2eaae0e62d3c5d76df9c870140d1ef466bbec59",
        }),
        ("https://imgur.com/a/TcBmQ", {
            "exception": exception.NotFoundError,
        }),
    ]

-    def __init__(self, match):
-        Extractor.__init__(self)
-        self.album = match.group(1)
-
    def items(self):
-        imgs = self.get_images()
-        data = self.get_job_metadata()
+        album = self._get_data("a/" + self.item_id)
+        images = album["album_images"]["images"]
+        del album["album_images"]
+
        yield Message.Version, 1
-        yield Message.Directory, data
-        for num, image in enumerate(imgs, 1):
+        yield Message.Directory, {"album": album, "count": len(images)}
+        for num, image in enumerate(images, 1):
+            url = self._prepare(image)
            image["num"] = num
-            image["extension"] = image["ext"][1:]
-            image.update(data)
-            url = "https://i.imgur.com/" + image["hash"] + image["ext"]
+            image["album"] = album
            yield Message.Url, url, image
-
-    def get_job_metadata(self):
-        """Collect metadata for extractor-job"""
-        page = self.request("https://imgur.com/a/" + self.album).text
-        data = text.extract_all(page, (
-            ('title', '<meta property="og:title" content="', '"'),
-            ('count', '"num_images":"', '"'),
-        ), values={"album-key": self.album})[0]
-        data["title"] = text.unescape(data["title"])
-        return data
-
-    def get_images(self):
-        """Return a list of all images in this album"""
-        url = ("https://imgur.com/ajaxalbums/getimages/" +
-               self.album + "/hit.json")
-        data = self.request(url).json()["data"]
-        if not data:
-            raise exception.NotFoundError("album")
-        return data["images"]