From 286d0cb098a24916ecccf5a24961bf4847073dca Mon Sep 17 00:00:00 2001 From: jsouthgb Date: Fri, 17 Nov 2023 19:34:34 -0500 Subject: [PATCH] [tmohentai] add support --- docs/supportedsites.md | 6 +++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/tmohentai.py | 78 +++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 gallery_dl/extractor/tmohentai.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a15566df..94cef0f7 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -829,6 +829,12 @@ Consider all sites to be NSFW unless otherwise known. Galleries + + Tmohentai + https://tmohentai.com/ + Galleries + + Toyhouse https://toyhou.se/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 22e4fe34..efdcde78 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -147,6 +147,7 @@ modules = [ "tapas", "tcbscans", "telegraph", + "tmohentai", "toyhouse", "tsumino", "tumblr", diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py new file mode 100644 index 00000000..462e51dd --- /dev/null +++ b/gallery_dl/extractor/tmohentai.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://tmohentai.com/""" + +from .common import Extractor, Message +from .. 
import text  # tail of the "from .. import text" statement begun on the previous line

BASE_PATTERN = r'(?:https?://)?tmohentai\.com'


class TmohentaiExtractor(Extractor):
    """Extractor for tmohentai.com galleries.

    Accepts both ``/contents/<id>`` and ``/reader/<id>`` URLs; the ID is
    the ``\\w+`` segment that follows either path component.
    """
    category = 'tmohentai'
    root = 'http://tmohentai.com'
    directory_fmt = ('{category}', '{title}')
    filename_fmt = '{filename}.{extension}'
    archive_fmt = '{title}_{filename}'
    pattern = BASE_PATTERN + r'/((contents)|(reader))/(\w+)'
    example = 'https://tmohentai.com/contents/12345a67b89c0'

    def __init__(self, match):
        """Store which URL flavour matched and the gallery ID."""
        Extractor.__init__(self, match)
        # Exactly one of these two groups is set, depending on whether the
        # input URL was a /contents/ page or a /reader/ page.
        self.contents = match.group(2)
        self.reader = match.group(3)
        self.id_string = match.group(4)

    def parse_location(self):
        """Return the URL of the reader page that is scraped for images.

        A ``/contents/`` URL is mapped to the gallery's paginated reader;
        a ``/reader/`` URL is used as given.
        """
        if self.contents:
            return f'{self.root}/reader/{self.id_string}/paginated'
        return self.url

    def items(self):
        """Yield the directory message, then one URL message per page.

        Every per-file metadata dict carries the gallery metadata in
        addition to ``num`` and the filename parts, so that the
        ``{title}`` field used by ``archive_fmt`` is available for
        each file.
        """
        page_src = self.request(
            text.ensure_http_scheme(self.parse_location())).text

        data = self.metadata()
        yield Message.Directory, data

        # The first image URL supplies the directory that all pages of the
        # gallery are assumed to share; individual page URLs are built
        # from it and a zero-padded page number.
        first_image = text.extr(page_src, 'data-original="', '"')
        if not first_image:
            # Nothing matched: without a base path the loop below would
            # only emit bogus URLs such as '/001.webp'.
            return
        base_page = first_image.rpartition('/')[0]

        # Only the number of <option> entries matters here, not their
        # values, hence the discarded loop variable.
        page_nums = text.extract_iter(page_src, 'option value="', '"')
        for num, _ in enumerate(page_nums, start=1):
            file = f'{base_page}/{num:>03}.webp'
            # Fresh dict per file: gallery metadata plus the page number.
            img = text.nameext_from_url(file, {**data, 'num': num})
            yield Message.Url, file, img

    def metadata(self):
        """Fetch the gallery's ``/contents/`` page and return its metadata.

        Returns a dict with the keys ``title``, ``id_string``,
        ``artists``, ``genders``, ``tags``, ``uploader`` and ``language``.

        NOTE(review): several delimiter literals below (empty end markers,
        bare label text, whitespace-only strings) look as if HTML markup
        was stripped from them before this copy of the file was made.
        They are reproduced exactly as found here -- verify them against
        the upstream file before relying on the extracted values.
        """
        contents = f'{self.root}/contents/{self.id_string}'
        contents_src = self.request(text.ensure_http_scheme(contents)).text

        genders_src = text.extr(contents_src, 'Genders', '')
        genders_list = text.extract_iter(genders_src, '">', '')

        tags_src = text.extr(contents_src, 'Tags', '')
        tags_list = text.extract_iter(tags_src, '">', '')

        upload_src = text.extr(contents_src, 'Uploaded By', '/a>')

        return {
            'title'    : text.extr(contents_src, '\n\n', '\n\n'),
            'id_string': self.id_string,
            'artists'  : text.remove_html(
                text.extr(contents_src, 'tag tag-accepted">', '')),
            'genders'  : list(genders_list),
            'tags'     : list(tags_list),
            'uploader' : text.extr(upload_src, '">', '<'),
            'language' : text.extr(
                contents_src, ' ', ''),
        }