From c4a201ed42e6679d7edc3ce98d75054f574c00fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 21 Nov 2023 20:24:07 +0100 Subject: [PATCH] [tmohentai] simplify + tests --- docs/supportedsites.md | 2 +- gallery_dl/extractor/tmohentai.py | 66 ++++++++++++------------------- scripts/supportedsites.py | 1 + test/results/tmohentai.py | 54 +++++++++++++++++++++++++ 4 files changed, 81 insertions(+), 42 deletions(-) create mode 100644 test/results/tmohentai.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 94cef0f7..8aadcde5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -830,7 +830,7 @@ Consider all sites to be NSFW unless otherwise known. - Tmohentai + TMOHentai https://tmohentai.com/ Galleries diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py index d4e16086..be45702a 100644 --- a/gallery_dl/extractor/tmohentai.py +++ b/gallery_dl/extractor/tmohentai.py @@ -6,59 +6,43 @@ """Extractors for https://tmohentai.com/""" -from .common import GalleryExtractor, Message +from .common import GalleryExtractor from .. 
import text -BASE_PATTERN = r'(?:https?://)?tmohentai\.com' +BASE_PATTERN = r"(?:https?://)?tmohentai\.com" class TmohentaiGalleryExtractor(GalleryExtractor): - category = 'tmohentai' - subcategory = 'gallery' - root = 'http://tmohentai.com' - directory_fmt = ('{category}', '{title}') - filename_fmt = '{title}_{filename}.{extension}' - archive_fmt = '{id_string}_{filename}' - pattern = BASE_PATTERN + r'/(contents)|(reader)/(\w+)' - example = 'https://tmohentai.com/contents/12345a67b89c0' + category = "tmohentai" + root = "http://tmohentai.com" + directory_fmt = ("{category}", "{title} ({gallery_id})") + pattern = BASE_PATTERN + r"/(?:contents|reader)/(\w+)" + example = "https://tmohentai.com/contents/12345a67b89c0" def __init__(self, match): - self.id_string = match.group(2) - url = '{}/contents/{}'.format(self.root, self.id_string) + self.gallery_id = match.group(1) + url = "{}/contents/{}".format(self.root, self.gallery_id) GalleryExtractor.__init__(self, match, url) - def items(self): - page = self.request( - text.ensure_http_scheme(self.url)).text - data = self.metadata(page) - - yield Message.Directory, data - imgs = self.images(page) - - cdn = 'https://imgrojo.tmohentai.com/contents' - for num, _ in enumerate(imgs, start=0): - url = ('{}/{}/{:>03}.webp'.format(cdn, self.id_string, num)) - img = text.nameext_from_url(url, { - 'num' : num + 1, - 'title' : data['title'], - 'id_string': self.id_string, - }) - yield Message.Url, url, img - def images(self, page): - pages = text.extract_iter( - page, 'class="lanzador', '>') - return pages + fmt = "https://imgrojo.tmohentai.com/contents/{}/{{:>03}}.webp".format( + self.gallery_id).format + cnt = page.count('class="lanzador') + return [(fmt(i), None) for i in range(0, cnt)] def metadata(self, page): - extr = text.extract_from(page, page.index('tag tag-accepted">')) + extr = text.extract_from(page) return { - 'title' : text.extr(page, '

', '

').strip(), - 'id_string': self.id_string, - 'artists' : text.remove_html(extr('">', '')), - 'genders' : text.split_html(extr('Genders', '', '')), - 'uploader' : text.remove_html(extr('Uploaded By', '')), - 'language' : extr(' ', '\n'), + "gallery_id": self.gallery_id, + "title" : text.unescape(extr("

", "<").strip()), + "artists" : text.split_html(extr( + "", "")), + "categories": text.split_html(extr( + "", "")), + "tags" : text.split_html(extr( + "", "")), + "uploader" : text.remove_html(extr( + "", "")), + "language" : extr(" ", "\n"), } diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 470b629d..695108e0 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -121,6 +121,7 @@ CATEGORY_MAP = { "tbib" : "The Big ImageBoard", "tcbscans" : "TCB Scans", "tco" : "Twitter t.co", + "tmohentai" : "TMOHentai", "thatpervert" : "ThatPervert", "thebarchive" : "The /b/ Archive", "thecollection" : "The /co/llection", diff --git a/test/results/tmohentai.py b/test/results/tmohentai.py new file mode 100644 index 00000000..2bae050a --- /dev/null +++ b/test/results/tmohentai.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +from gallery_dl.extractor import tmohentai + + +__tests__ = ( +{ + "#url" : "https://tmohentai.com/contents/653c2aeaa693c", + "#category": ("", "tmohentai", "gallery"), + "#class" : tmohentai.TmohentaiGalleryExtractor, + "#pattern" : r"https://imgrojo\.tmohentai\.com/contents/653c2aeaa693c/\d\d\d\.webp", + "#count" : 46, + + "artists" : ["Andoryu"], + "categories": [ + "Big Breasts", + "BlowJob", + "Cheating", + "Mature", + "Milf", + "Student", + ], + "count" : 46, + "extension" : "webp", + "gallery_id": "653c2aeaa693c", + "language" : "Español", + "num" : int, + "tags" : [ + "milf", + "Madre", + "enormes pechos", + "Peluda", + "nakadashi", + "cheating", + "madura", + "sexo a escondidas", + "Ama de casa", + "mamada", + ], + "title" : "La Mama de mi Novia es tan Pervertida que no Pude Soportarlo mas", + "uploader" : "NekoCreme Fansub", +}, + +{ + "#url" : "https://tmohentai.com/reader/653c2aeaa693c/paginated/1", + "#category": ("", "tmohentai", "gallery"), + "#class" : tmohentai.TmohentaiGalleryExtractor, +}, + +)