[myhentaigallery] update and fix extraction (#1001)

- extract more metadata - match "/show/" URLs - complete test results - fix missing images for lines starting with " <img" - fix missing comma in supportedsites.py
2020-09-17 18:06:12 +02:00
parent 05b9ac8d37
commit c43b3894be
3 changed files with 31 additions and 18 deletions
--- a/gallery_dl/extractor/myhentaigallery.py
+++ b/gallery_dl/extractor/myhentaigallery.py
@@ -1,7 +1,5 @@
 # -*- coding: utf-8 -*-

-# Copyright 2018-2020 Mike Fährmann
-#
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
@@ -15,16 +13,30 @@ from .. import text, exception
 class MyhentaigalleryGalleryExtractor(GalleryExtractor):
    """Extractor for image galleries from myhentaigallery.com"""
    category = "myhentaigallery"
-    directory_fmt = ("{category}", "{gallery_id} [{artist}] {title}")
-    pattern = (r"(?:https?://)?(myhentaigallery\.com"
-               r"/gallery/thumbnails/[0-9]+)")
+    directory_fmt = ("{category}", "{gallery_id} {artist:?[/] /J, }{title}")
+    pattern = (r"(?:https?://)?myhentaigallery\.com"
+               r"/gallery/(?:thumbnails|show)/(\d+)")
    test = (
-        ("https://myhentaigallery.com/gallery/thumbnails/16247"),
-        ("https://myhentaigallery.com/gallery/thumbnails/15224"),
+        ("https://myhentaigallery.com/gallery/thumbnails/16247", {
+            "pattern": r"https://images.myhentaigrid.com/imagesgallery/images"
+                       r"/[^/]+/original/\d+\.jpg",
+            "keyword": {
+                "artist"    : list,
+                "count"     : 11,
+                "gallery_id": 16247,
+                "group"     : list,
+                "parodies"  : list,
+                "tags"      : ["Giantess"],
+                "title"     : "Attack Of The 50ft Woman 1",
+            },
+        }),
+        ("https://myhentaigallery.com/gallery/show/16247/1"),
    )
+    root = "https://myhentaigallery.com"

    def __init__(self, match):
-        url = "https://" + match.group(1)
+        self.gallery_id = match.group(1)
+        url = "{}/gallery/thumbnails/{}".format(self.root, self.gallery_id)
        GalleryExtractor.__init__(self, match, url)
        self.session.headers["Referer"] = url

@@ -32,22 +44,22 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor):
        extr = text.extract_from(page)
        split = text.split_html

-        image = extr('<div class="comic-cover">\n<a href="', '"')
        title = extr('<div class="comic-description">\n<h1>', '</h1>')
        if not title:
            raise exception.NotFoundError("gallery")
-        data = {
+
+        return {
            "title"     : text.unescape(title),
-            "gallery_id": text.parse_int(image.split("/")[-2]),
+            "gallery_id": text.parse_int(self.gallery_id),
            "tags"      : split(extr('<div>\nCategories:', '</div>')),
+            "artist"    : split(extr('<div>\nArtists:'   , '</div>')),
+            "group"     : split(extr('<div>\nGroups:'    , '</div>')),
+            "parodies"  : split(extr('<div>\nParodies:'  , '</div>')),
        }
-        artists = split(extr('<div>\nArtists:', '</div>'))
-        data["artist"] = artists[0] if artists else "Unknown"
-        return data

    def images(self, page):
-        extr = text.extract_iter
        return [
-            (text.unescape(url).replace("/thumbnail/", "/original/"), None)
-            for url in extr(page, 'class="comic-thumb">\n<img src="', '"')
+            (text.unescape(text.extract(url, 'src="', '"')[0]).replace(
+                "/thumbnail/", "/original/"), None)
+            for url in text.extract_iter(page, 'class="comic-thumb"', '</div>')
        ]