From c4a201ed42e6679d7edc3ce98d75054f574c00fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Tue, 21 Nov 2023 20:24:07 +0100
Subject: [PATCH] [tmohentai] simplify + tests

---
 docs/supportedsites.md            |  2 +-
 gallery_dl/extractor/tmohentai.py | 66 ++++++++++++-------------------
 scripts/supportedsites.py         |  1 +
 test/results/tmohentai.py         | 54 +++++++++++++++++++++++++
 4 files changed, 81 insertions(+), 42 deletions(-)
 create mode 100644 test/results/tmohentai.py
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 94cef0f7..8aadcde5 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -830,7 +830,7 @@ Consider all sites to be NSFW unless otherwise known.
     <td></td>
 </tr>
 <tr>
-    <td>Tmohentai</td>
+    <td>TMOHentai</td>
     <td>https://tmohentai.com/</td>
     <td>Galleries</td>
     <td></td>
diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py
index d4e16086..be45702a 100644
--- a/gallery_dl/extractor/tmohentai.py
+++ b/gallery_dl/extractor/tmohentai.py
@@ -6,59 +6,43 @@
 
 """Extractors for https://tmohentai.com/"""
 
-from .common import GalleryExtractor, Message
+from .common import GalleryExtractor
 from .. import text
 
-BASE_PATTERN = r'(?:https?://)?tmohentai\.com'
+BASE_PATTERN = r"(?:https?://)?tmohentai\.com"
 
 
 class TmohentaiGalleryExtractor(GalleryExtractor):
-    category = 'tmohentai'
-    subcategory = 'gallery'
-    root = 'http://tmohentai.com'
-    directory_fmt = ('{category}', '{title}')
-    filename_fmt = '{title}_{filename}.{extension}'
-    archive_fmt = '{id_string}_{filename}'
-    pattern = BASE_PATTERN + r'/(contents)|(reader)/(\w+)'
-    example = 'https://tmohentai.com/contents/12345a67b89c0'
+    category = "tmohentai"
+    root = "http://tmohentai.com"
+    directory_fmt = ("{category}", "{title} ({gallery_id})")
+    pattern = BASE_PATTERN + r"/(?:contents|reader)/(\w+)"
+    example = "https://tmohentai.com/contents/12345a67b89c0"
 
     def __init__(self, match):
-        self.id_string = match.group(2)
-        url = '{}/contents/{}'.format(self.root, self.id_string)
+        self.gallery_id = match.group(1)
+        url = "{}/contents/{}".format(self.root, self.gallery_id)
         GalleryExtractor.__init__(self, match, url)
 
-    def items(self):
-        page = self.request(
-            text.ensure_http_scheme(self.url)).text
-        data = self.metadata(page)
-
-        yield Message.Directory, data
-        imgs = self.images(page)
-
-        cdn = 'https://imgrojo.tmohentai.com/contents'
-        for num, _ in enumerate(imgs, start=0):
-            url = ('{}/{}/{:>03}.webp'.format(cdn, self.id_string, num))
-            img = text.nameext_from_url(url, {
-                'num'      : num + 1,
-                'title'    : data['title'],
-                'id_string': self.id_string,
-            })
-            yield Message.Url, url, img
-
     def images(self, page):
-        pages = text.extract_iter(
-            page, 'class="lanzador', '>')
-        return pages
+        fmt = "https://imgrojo.tmohentai.com/contents/{}/{{:>03}}.webp".format(
+            self.gallery_id).format
+        cnt = page.count('class="lanzador')
+        return [(fmt(i), None) for i in range(0, cnt)]
 
     def metadata(self, page):
-        extr = text.extract_from(page, page.index('tag tag-accepted">'))
+        extr = text.extract_from(page)
 
         return {
-            'title'    : text.extr(page, '<h3>', '</h3>').strip(),
-            'id_string': self.id_string,
-            'artists'  : text.remove_html(extr('">', '</a>')),
-            'genders'  : text.split_html(extr('Genders</label>', '<div')),
-            'tags'     : text.split_html(extr('Tags</label>', '</ul>')),
-            'uploader' : text.remove_html(extr('Uploaded By</label>', '</a>')),
-            'language' : extr('&nbsp;', '\n</a>'),
+            "gallery_id": self.gallery_id,
+            "title"     : text.unescape(extr("<h3>", "<").strip()),
+            "artists"   : text.split_html(extr(
+                "<label>Artists and Artists Groups</label>", "</ul>")),
+            "categories": text.split_html(extr(
+                "<label>Genders</label>", "</ul>")),
+            "tags"      : text.split_html(extr(
+                "<label>Tags</label>", "</ul>")),
+            "uploader"  : text.remove_html(extr(
+                "<label>Uploaded By</label>", "</ul>")),
+            "language"  : extr("&nbsp;", "\n"),
         }
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
index 470b629d..695108e0 100755
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -121,6 +121,7 @@ CATEGORY_MAP = {
     "tbib"           : "The Big ImageBoard",
     "tcbscans"       : "TCB Scans",
     "tco"            : "Twitter t.co",
+    "tmohentai"      : "TMOHentai",
     "thatpervert"    : "ThatPervert",
     "thebarchive"    : "The /b/ Archive",
     "thecollection"  : "The /co/llection",
diff --git a/test/results/tmohentai.py b/test/results/tmohentai.py
new file mode 100644
index 00000000..2bae050a
--- /dev/null
+++ b/test/results/tmohentai.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import tmohentai
+
+
+__tests__ = (
+{
+    "#url"     : "https://tmohentai.com/contents/653c2aeaa693c",
+    "#category": ("", "tmohentai", "gallery"),
+    "#class"   : tmohentai.TmohentaiGalleryExtractor,
+    "#pattern" : r"https://imgrojo\.tmohentai\.com/contents/653c2aeaa693c/\d\d\d\.webp",
+    "#count"   : 46,
+
+    "artists"   : ["Andoryu"],
+    "categories": [
+        "Big Breasts",
+        "BlowJob",
+        "Cheating",
+        "Mature",
+        "Milf",
+        "Student",
+    ],
+    "count"     : 46,
+    "extension" : "webp",
+    "gallery_id": "653c2aeaa693c",
+    "language"  : "Español",
+    "num"       : int,
+    "tags"      : [
+        "milf",
+        "Madre",
+        "enormes pechos",
+        "Peluda",
+        "nakadashi",
+        "cheating",
+        "madura",
+        "sexo a escondidas",
+        "Ama de casa",
+        "mamada",
+    ],
+    "title"     : "La Mama de mi Novia es tan Pervertida que no Pude Soportarlo mas",
+    "uploader"  : "NekoCreme Fansub",
+},
+
+{
+    "#url"     : "https://tmohentai.com/reader/653c2aeaa693c/paginated/1",
+    "#category": ("", "tmohentai", "gallery"),
+    "#class"   : tmohentai.TmohentaiGalleryExtractor,
+},
+
+)