diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py index a02b8e8a..ef05f989 100644 --- a/gallery_dl/extractor/tmohentai.py +++ b/gallery_dl/extractor/tmohentai.py @@ -6,14 +6,15 @@ """Extractors for https://tmohentai.com/""" -from .common import Extractor, Message +from .common import GalleryExtractor, Message from .. import text BASE_PATTERN = r'(?:https?://)?tmohentai\.com' -class TmohentaiExtractor(Extractor): +class TmohentaiGalleryExtractor(GalleryExtractor): category = 'tmohentai' + subcategory = 'gallery' root = 'http://tmohentai.com' directory_fmt = ('{category}', '{title}') filename_fmt = '{title}_{filename}.{extension}' @@ -22,71 +23,51 @@ class TmohentaiExtractor(Extractor): example = 'https://tmohentai.com/contents/12345a67b89c0' def __init__(self, match): - Extractor.__init__(self, match) + GalleryExtractor.__init__(self, match) self.contents = match.group(2) self.reader = match.group(3) self.id_string = match.group(4) def parse_location(self): - if self.contents: - url = '{}/reader/{}/paginated'.format(self.root, self.id_string) - else: - url_str = self.url.rpartition('/') - if url_str[-1].isdigit(): - url = url_str[0] - else: - url = self.url + url = self.url + if self.reader: + url = '{}/contents/{}'.format(self.root, self.id_string) return url - @staticmethod - def get_file_info(page_src): - file = text.extr(page_src, 'data-original="', '"') - file_loc, _, file_name = file.rpartition('/') - start_num, ext = file_name.split('.') - return file_loc, start_num, ext - def items(self): url = self.parse_location() - page_src = self.request( + page = self.request( text.ensure_http_scheme(url)).text + data = self.metadata(page) - data = self.metadata() yield Message.Directory, data + imgs = self.images(page) - file_loc, start_num, ext = self.get_file_info(page_src) - page_nums = text.extract_iter( - page_src, 'option value="', '"') - - for num, page in enumerate(page_nums, start=int(start_num)): - file = '{}/{:>03}.{}'.format(file_loc, num, ext) - img = text.nameext_from_url(file, { - 'num' : num, + cdn = 'https://imgrojo.tmohentai.com/contents' + for num, _ in enumerate(imgs, start=0): + url = ('{}/{}/{:>03}.webp'.format(cdn, self.id_string, num)) + img = text.nameext_from_url(url, { + 'num' : num + 1, 'title' : data['title'], 'id_string': self.id_string, }) - yield Message.Url, file, img + yield Message.Url, url, img - def metadata(self): - contents = '{}/contents/{}'.format(self.root, self.id_string) - contents_src = self.request(text.ensure_http_scheme(contents)).text + def images(self, page): + pages = text.extract_iter( + page, 'class="lanzador', '>') + return pages - genders_src = text.extr(contents_src, 'Genders', '') - genders_list = text.extract_iter(genders_src, '">', '') + def metadata(self, page): + extr = text.extract_from(page, page.index('tag tag-accepted">')) - tags_src = text.extr(contents_src, 'Tags', '') - tags_list = text.extract_iter(tags_src, '">', '') - - upload_src = text.extr(contents_src, 'Uploaded By', '/a>') data = { - 'title' : text.extr(contents_src, '