diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 97b3aa90..dad3b9bd 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -85,6 +85,12 @@ Consider all listed sites to potentially be NSFW.
     <td>Posts, Tag Searches</td>
     <td></td>
 </tr>
+<tr id="ahottie" title="ahottie">
+    <td>AHottie</td>
+    <td>https://ahottie.top/</td>
+    <td>Galleries, Search Results, Tag Searches</td>
+    <td></td>
+</tr>
 <tr id="arcalive" title="arcalive">
     <td>Arcalive</td>
     <td>https://arca.live/</td>
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 64134d01..6f2cb69a 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -23,6 +23,7 @@ modules = [
     "8muses",
     "adultempire",
     "agnph",
+    "ahottie",
     "ao3",
     "arcalive",
     "architizer",
diff --git a/gallery_dl/extractor/ahottie.py b/gallery_dl/extractor/ahottie.py
new file mode 100644
index 00000000..f8db0d4c
--- /dev/null
+++ b/gallery_dl/extractor/ahottie.py
@@ -0,0 +1,123 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://ahottie.top/"""
+
+from .common import Extractor, GalleryExtractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?ahottie\.top"
+
+
+class AhottieExtractor(Extractor):
+    """Base class for ahottie extractors"""
+    category = "ahottie"
+    root = "https://ahottie.top"
+
+    def items(self):
+        """Yield a Queue message for each album from self.albums()"""
+        for album in self.albums():
+            yield Message.Queue, album["url"], album
+
+    def _pagination(self, url, params):
+        """Yield album metadata from every result page of 'url'
+
+        Begins at params["page"] (default: 1) and increments it in-place
+        until a response no longer contains 'rel="next"'.
+        """
+        params["page"] = text.parse_int(params.get("page"), 1)
+
+        while True:
+            page = self.request(url, params=params).text
+
+            # NOTE(review): the two delimiter literals below were lost in
+            # extraction and are reconstructed - confirm against upstream
+            for album in text.extract_iter(
+                    page, '<div class="relative">', '</div>'):
+                yield {
+                    "url"  : text.extr(album, ' href="', '"'),
+                    "title": text.unquote(text.extr(
+                        album, ' alt="', '"')),
+                    "date" : self.parse_datetime_iso(text.extr(
+                        album, ' datetime="', '"')),
+                    "_extractor": AhottieGalleryExtractor,
+                }
+
+            if 'rel="next"' not in page:
+                break
+            params["page"] += 1
+
+
+class AhottieGalleryExtractor(GalleryExtractor, AhottieExtractor):
+    """Extractor for ahottie albums"""
+    directory_fmt = ("{category}", "{date:%Y-%m-%d} {title} ({gallery_id})")
+    filename_fmt = "{num:>03}.{extension}"
+    archive_fmt = "{gallery_id}_{num}_{filename}"
+    pattern = BASE_PATTERN + r"(/albums/(\w+))"
+    example = "https://ahottie.top/albums/1234567890"
+
+    def metadata(self, page):
+        """Return gallery ID, title, date, and tags of an album page"""
+        extr = text.extract_from(page)
+        return {
+            "gallery_id": self.groups[1],
+            # NOTE(review): "<title>" is reconstructed (literal lost in
+            # extraction); rpartition() drops the part after the last " | "
+            "title": text.unescape(extr("<title>", "<").rpartition(" | ")[0]),
+            "date" : self.parse_datetime_iso(extr('datetime="', '"')),
+            "tags" : text.split_html(extr('<i ', '</div>'))[1:],
+        }
+
+    def images(self, page):
+        """Return a list of (url, metadata) tuples for all album images"""
+        # only consider 'src' attributes after the first '<time ' element
+        pos = page.find("<time ") + 1
+        # NOTE(review): Referer=None presumably omits the header - confirm
+        data = {
+            "_http_headers" : {"Referer": None},
+            "_http_validate": self._validate,
+        }
+        return [
+            (url, data)
+            for url in text.extract_iter(page, '" src="', '"', pos)
+        ]
+
+    def _validate(self, response):
+        """Return False for JPEG responses with a content-length of 2421"""
+        # presumably the size of the host's placeholder image - verify
+        hget = response.headers.get
+        return not (
+            hget("content-length") == "2421" and
+            hget("content-type") == "image/jpeg"
+        )
+
+
+class AhottieTagExtractor(AhottieExtractor):
+    """Extractor for ahottie tag searches"""
+    subcategory = "tag"
+    pattern = BASE_PATTERN + r"/tags/([^/?#]+)"
+    example = "https://ahottie.top/tags/TAG"
+
+    def albums(self):
+        """Return all albums listed for the tag in the URL"""
+        tag = self.groups[0]
+        self.kwdict["search_tags"] = text.unquote(tag)
+        return self._pagination(f"{self.root}/tags/{tag}", {})
+
+
+class AhottieSearchExtractor(AhottieExtractor):
+    """Extractor for ahottie search results"""
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"/search/?\?([^#]+)"
+    example = "https://ahottie.top/search?kw=QUERY"
+
+    def albums(self):
+        """Return all albums matching the URL's search query"""
+        params = text.parse_query(self.groups[0])
+        self.kwdict["search_tags"] = params.get("kw")
+        return self._pagination(f"{self.root}/search", params)
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
index 0ee42ad9..3685c071 100755
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -25,6 +25,7 @@ CATEGORY_MAP = {
     "35photo"        : "35PHOTO",
     "adultempire"    : "Adult Empire",
     "agnph"          : "AGNPH",
+    "ahottie"        : "AHottie",
     "aibooru"        : "AIBooru",
     "allgirlbooru"   : "All girl",
     "ao3"            : "Archive of Our Own",
diff --git a/test/results/ahottie.py b/test/results/ahottie.py
new file mode 100644
index 00000000..5aabc9e0
--- /dev/null
+++ b/test/results/ahottie.py
@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from gallery_dl.extractor import ahottie
+
+
+__tests__ = (
+{
+    "#url"     : "https://ahottie.top/albums/5d54b221c19ff9c9126ffd62859c6603",
+    "#class"   : ahottie.AhottieGalleryExtractor,
+    "#pattern" : r"https://images2\.imgbox\.com/../../\w+_o\.jpg",
+    "#count"   : 10,
+
+    "count"     : 10,
+    "num"       : range(1, 10),
+    "date"      : "dt:2024-12-30 00:00:00",
+    "extension" : "jpg",
+    "filename"  : str,
+    "gallery_id": "5d54b221c19ff9c9126ffd62859c6603",
+    "title"     : "大熊杏優・かれしちゃん, Young Champion 2025 No.02 (ヤングチャンピオン 2025年2号)",
+    "tags"      : [
+        "Ayu Okuma 大熊杏優",
+        "Kareshichan かれしちゃん",
+        "Young Champion ヤングチャンピオン",
+    ],
+},
+
+{
+    "#url"     : "https://ahottie.top/tags/Ayu%20Okuma%20%E5%A4%A7%E7%86%8A%E6%9D%8F%E5%84%AA",
+    "#class"   : ahottie.AhottieTagExtractor,
+    "#pattern" : ahottie.AhottieGalleryExtractor.pattern,
+    "#count"   : 17,
+
+    "date"       : "type:datetime",
+    "search_tags": "Ayu Okuma 大熊杏優",
+    "title"      : str,
+    "url"        : str,
+},
+
+{
+    "#url"     : "https://ahottie.top/search?kw=ayu&page=10",
+    "#class"   : ahottie.AhottieSearchExtractor,
+    "#pattern" : ahottie.AhottieGalleryExtractor.pattern,
+    "#count"   : range(80, 200),
+
+    "date"       : "type:datetime",
+    "search_tags": "ayu",
+    "title"      : str,
+    "url"        : str,
+},
+
+)