[myhentaigallery] add 'tag' extractor (#8537)

2025-11-10 18:20:17 +01:00
parent 8876272e3c
commit d75a135e03
3 changed files with 61 additions and 6 deletions
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -658,7 +658,7 @@ Consider all listed sites to potentially be NSFW.
 <tr id="myhentaigallery" title="myhentaigallery">
    <td>My Hentai Gallery</td>
    <td>https://myhentaigallery.com/</td>
-    <td>Galleries</td>
+    <td>Galleries, Tag Searches</td>
    <td></td>
 </tr>
 <tr id="naver-blog" title="naver-blog">
--- a/gallery_dl/extractor/myhentaigallery.py
+++ b/gallery_dl/extractor/myhentaigallery.py
@@ -6,17 +6,21 @@

 """Extractors for https://myhentaigallery.com/"""

-from .common import GalleryExtractor
+from .common import Extractor, GalleryExtractor, Message
 from .. import text, exception

+BASE_PATTERN = r"(?:https?://)?myhentaigallery\.com"

-class MyhentaigalleryGalleryExtractor(GalleryExtractor):
-    """Extractor for image galleries from myhentaigallery.com"""
+
+class MyhentaigalleryBase():
    category = "myhentaigallery"
    root = "https://myhentaigallery.com"
+
+
+class MyhentaigalleryGalleryExtractor(MyhentaigalleryBase, GalleryExtractor):
+    """Extractor for image galleries from myhentaigallery.com"""
    directory_fmt = ("{category}", "{gallery_id} {artist:?[/] /J, }{title}")
-    pattern = (r"(?:https?://)?myhentaigallery\.com"
-               r"/g(?:allery/(?:thumbnails|show))?/(\d+)")
+    pattern = rf"{BASE_PATTERN}/g(?:allery/(?:thumbnails|show))?/(\d+)"
    example = "https://myhentaigallery.com/g/12345"

    def __init__(self, match):
@@ -53,3 +57,32 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor):
                "/thumbnail/", "/original/"), None)
            for url in text.extract_iter(page, 'class="comic-thumb"', '</div>')
        ]
+
+
+class MyhentaigalleryTagExtractor(MyhentaigalleryBase, Extractor):
+    """Extractor for myhentaigallery tag searches"""
+    subcategory = "tag"
+    pattern = rf"{BASE_PATTERN}(/g/(artist|category|group|parody)/(\d+).*)"
+    example = "https://myhentaigallery.com/g/category/123"
+
+    def items(self):
+        data = {"_extractor": MyhentaigalleryGalleryExtractor}
+        for url in self.galleries():  # one Queue message per gallery URL
+            yield Message.Queue, url, data
+
+    def galleries(self):
+        root = self.root
+        url = root + self.groups[0]  # presumably pattern's outer group
+
+        while True:  # one iteration per listing page
+            page = self.request(url).text
+
+            for inner in text.extract_iter(  # each "comic-inner" block
+                    page, '<div class="comic-inner">', "<div"):
+                yield root + text.extr(inner, 'href="', '"')
+
+            try:
+                pos = page.index(">Next<")  # ValueError if not found
+            except ValueError:
+                return  # no ">Next<" marker: stop paginating
+            url = root + text.rextr(page, 'href="', '"', pos)
--- a/test/results/myhentaigallery.py
+++ b/test/results/myhentaigallery.py
@@ -35,4 +35,26 @@ __tests__ = (
    "#class"   : myhentaigallery.MyhentaigalleryGalleryExtractor,
 },

+{
+    "#url"     : "https://myhentaigallery.com/g/artist/8084?sorting=favorite",
+    "#class"   : myhentaigallery.MyhentaigalleryTagExtractor,
+    "#pattern" : myhentaigallery.MyhentaigalleryGalleryExtractor.pattern,
+    "#count"   : 18,
+},
+
+{
+    "#url"     : "https://myhentaigallery.com/g/group/2",
+    "#class"   : myhentaigallery.MyhentaigalleryTagExtractor,
+},
+
+{
+    "#url"     : "https://myhentaigallery.com/g/parody/8239",
+    "#class"   : myhentaigallery.MyhentaigalleryTagExtractor,
+},
+
+{
+    "#url"     : "https://myhentaigallery.com/g/category/59",
+    "#class"   : myhentaigallery.MyhentaigalleryTagExtractor,
+},
+
 )