diff --git a/docs/supportedsites.md b/docs/supportedsites.md index ba5aed8f..b1a4fee4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -793,6 +793,12 @@ Consider all listed sites to potentially be NSFW. Posts, Tag Searches + + Rule 34 XYZ + https://rule34.xyz/ + Playlists, Posts, Tag Searches + + Saint https://saint2.su/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 067ec013..5c980ad1 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -140,6 +140,7 @@ modules = [ "redgifs", "rule34us", "rule34vault", + "rule34xyz", "saint", "sankaku", "sankakucomplex", diff --git a/gallery_dl/extractor/rule34xyz.py b/gallery_dl/extractor/rule34xyz.py new file mode 100644 index 00000000..b686cf0a --- /dev/null +++ b/gallery_dl/extractor/rule34xyz.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://rule34.xyz/""" + +from .booru import BooruExtractor +from .. 
import text  # completes the "from .. " that ends the previous source line
import collections

BASE_PATTERN = r"(?:https?://)?rule34\.xyz"


class Rule34xyzExtractor(BooruExtractor):
    """Base class for rule34.xyz extractors"""
    category = "rule34xyz"
    root = "https://rule34.xyz"
    root_cdn = "https://rule34xyz.b-cdn.net"
    filename_fmt = "{category}_{id}.{extension}"
    per_page = 60

    # numeric 'type' of a 'tagsWithType' entry -> suffix of the
    # 'tags_<name>' metadata key written by _tags()
    TAG_TYPES = {
        0: "general",
        1: "copyright",
        2: "character",
        3: "artist",
    }

    def _file_url(self, post):
        """Select the download URL of 'post' and store it as 'file_url'

        Replaces the post's 'imageLinks' list with a 'files' dict
        mapping each link's 'type' to its 'url'.
        Link types are tried in the order 10, 40, 41, 2;
        a KeyError is raised when none of them is usable.
        """
        # NOTE(review): the meaning of the numeric link types is not
        # visible here; presumably different renditions of one file
        files = {
            link["type"]: link["url"]
            for link in post.pop("imageLinks")
        }
        post["files"] = files

        url = files.get(10) or files.get(40) or files.get(41) or files[2]
        post["file_url"] = url
        return url

    def _prepare(self, post):
        """Drop bulky fields from 'post' and add a parsed 'date'"""
        post.pop("filesPreview", None)
        post.pop("tagsWithType", None)
        post["date"] = text.parse_datetime(
            post["created"], "%Y-%m-%dT%H:%M:%S.%f")

    def _tags(self, post, _):
        """Add 'tags_<category>' lists to 'post'

        When 'post' carries no 'tagsWithType' data, the full post
        object is fetched and merged into 'post' first.
        Tags whose type ID is not listed in TAG_TYPES are ignored
        instead of aborting with a KeyError.
        """
        if post.get("tagsWithType") is None:
            post.update(self._fetch_post(post["id"]))

        grouped = collections.defaultdict(list)
        for tag in post["tagsWithType"]:
            grouped[tag["type"]].append(tag["value"])

        names = self.TAG_TYPES
        for type_id, values in grouped.items():
            name = names.get(type_id)
            if name is None:
                # unknown tag type: there is no category name for it
                continue
            post["tags_" + name] = values

    def _fetch_post(self, post_id):
        """Return the decoded JSON object of a single post"""
        url = "{}/api/post/{}".format(self.root, post_id)
        return self.request(url).json()

    def _pagination(self, endpoint, params=None):
        """Yield all items of a paginated API 'endpoint'

        'params' holds additional query parameters. It is copied,
        so the caller's dict is left unmodified.
        Iteration ends with the first page that contains
        fewer than 'per_page' items.
        """
        url = "{}/api{}".format(self.root, endpoint)

        params = dict(params) if params else {}
        params["IncludeLinks"] = "true"
        params["IncludeTags"] = "true"
        params["OrderBy"] = "0"
        params["Skip"] = self.page_start * self.per_page
        params["Take"] = self.per_page
        params["DisableTotal"] = "true"
        threshold = self.per_page

        while True:
            items = self.request(url, params=params).json()["items"]

            yield from items

            if len(items) < threshold:
                return
            params["Skip"] += params["Take"]


class Rule34xyzPostExtractor(Rule34xyzExtractor):
    """Extractor for a single rule34.xyz post"""
    subcategory = "post"
    archive_fmt = "{id}"
    pattern = BASE_PATTERN + r"/post/(\d+)"
    example = "https://rule34.xyz/post/12345"

    def posts(self):
        # groups[0] is the post ID captured by 'pattern'
        return (self._fetch_post(self.groups[0]),)


class Rule34xyzPlaylistExtractor(Rule34xyzExtractor):
    """Extractor for the posts of a rule34.xyz playlist"""
    subcategory = "playlist"
    directory_fmt = ("{category}", "{playlist_id}")
    archive_fmt = "p_{playlist_id}_{id}"
    pattern = BASE_PATTERN + r"/playlists/view/(\d+)"
    example = "https://rule34.xyz/playlists/view/12345"

    def metadata(self):
        return {"playlist_id": self.groups[0]}

    def posts(self):
        endpoint = "/playlist-item"
        params = {"PlaylistId": self.groups[0]}
        return self._pagination(endpoint, params)


class Rule34xyzTagExtractor(Rule34xyzExtractor):
    """Extractor for rule34.xyz tag searches"""
    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    archive_fmt = "t_{search_tags}_{id}"
    pattern = BASE_PATTERN + r"/([^/?#]+)$"
    example = "https://rule34.xyz/TAG"

    def metadata(self):
        # URL-decode the path segment and turn underscores into spaces;
        # the result is stored on the instance because posts() reads it
        self.tags = text.unquote(self.groups[0]).replace("_", " ")
        return {"search_tags": self.tags}

    def posts(self):
        # uses self.tags, which is only set by metadata()
        endpoint = "/post/search"
        params = {"Tag": self.tags}
        return self._pagination(endpoint, params)


# ----------------------------------------------------------------------
# The remaining text of these source lines belongs to other files of the
# same patch; it is preserved here as comments.
#
# diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
# index c7eed634..6cf1e66f 100755
# --- a/scripts/supportedsites.py
# +++ b/scripts/supportedsites.py
# @@ -120,6 +120,7 @@ CATEGORY_MAP = {
#      "rule34hentai" : "Rule34Hentai",
#      "rule34us" : "Rule 34",
#      "rule34vault" : "R34 Vault",
# +    "rule34xyz" : "Rule 34 XYZ",
#      "sankaku" : "Sankaku Channel",
#      "sankakucomplex" : "Sankaku Complex",
#      "seiga" : "Niconico Seiga",
#
# diff --git a/test/results/rule34xyz.py b/test/results/rule34xyz.py
# new file mode 100644
# index 00000000..8400e651
# --- /dev/null
# +++ b/test/results/rule34xyz.py
# @@ -0,0 +1,126 @@
# ----------------------------------------------------------------------

# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import rule34xyz


# File URL regex shared by the tag-search and playlist entries below.
_FILE_PATTERN = r"https://rule34(\.xyz|xyz\.b-cdn\.net)/posts/\d+/\d+/\d+\.(pic|mov\d*)\.(jpg|mp4)"

# File URL of post 3613851; used for both its "#urls" and "file_url" values.
_IMAGE_URL = "https://rule34xyz.b-cdn.net/posts/3613/3613851/3613851.pic.jpg"


__tests__ = (
# tag search
{
    "#url"    : "https://rule34.xyz/sfw",
    "#class"  : rule34xyz.Rule34xyzTagExtractor,
    "#pattern": _FILE_PATTERN,
    "#range"  : "1-150",
    "#count"  : 150,

    "search_tags": "sfw",
},

# playlist
{
    "#url"    : "https://rule34.xyz/playlists/view/119",
    "#class"  : rule34xyz.Rule34xyzPlaylistExtractor,
    "#pattern": _FILE_PATTERN,
    "#count"  : 64,

    "playlist_id": "119",
},

# single post (image), with categorized tags enabled
{
    "#url"         : "https://rule34.xyz/post/3613851",
    "#comment"     : "image",
    "#class"       : rule34xyz.Rule34xyzPostExtractor,
    "#options"     : {"tags": True},
    "#urls"        : _IMAGE_URL,
    "#sha1_content": "4d7146db258fd5b1645a1a5fc01550d102f495e1",

    "attributes": 1,
    "comments"  : 0,
    "created"   : "2023-03-29T06:00:59.136819",
    "date"      : "dt:2023-03-29 06:00:59",
    "duration"  : None,
    "error"     : None,
    "extension" : "jpg",
    "file_url"  : _IMAGE_URL,
    "filename"  : "3613851.pic",
    "id"        : 3613851,
    "likes"     : range(3, 100),
    "posted"    : "2023-03-29T06:01:07.900161",
    "type"      : 0,
    "uploaderId": 9741,
    "views"     : range(200, 2000),
    "status"    : 2,
    "files"     : dict,
    "sources"   : [
        "https://twitter.com/DesireDelta13/status/1636502494292373505?t=OrmlnC85cELyY5BPmBy9Hw&s=19",
    ],
    "tags": [
        "doki doki literature club",
        "doki doki takeover",
        "friday night funkin",
        "friday night funkin mod",
        "yuri (doki doki literature club)",
        "desiredelta",
        "1girls",
        "big breasts",
        "clothed",
        "clothed female",
        "female",
        "female focus",
        "female only",
        "holding microphone",
        "holding object",
        "long hair",
        "long purple hair",
        "looking at viewer",
        "microphone",
        "open hand",
        "open mouth",
        "purple background",
        "purple hair",
        "solo",
        "solo female",
        "solo focus",
        "sweater",
        "white outline",
        "jpeg",
        "safe for work",
        "sfw",
    ],
    "tags_artist": [
        "desiredelta",
    ],
    "tags_character": [
        "yuri (doki doki literature club)",
    ],
    "tags_copyright": [
        "doki doki literature club",
        "friday night funkin",
        "friday night funkin mod",
    ],
    "tags_general": list,
    "uploader": {
        "avatarUrl"      : None,
        "bookmarks"      : 0,
        "certified"      : True,
        "created"        : "2021-04-03T08:29:51.373823",
        "email"          : "agent.rulexxx-uploader@z.com",
        "id"             : 9741,
        "isSystemAccount": True,
        "name"           : "agent.rulexxx-uploader",
        "role"           : 2,
        "uploadedPosts"  : range(100000, 999999),
        "webId"          : None,
    },
},

# single post (video)
{
    "#url"         : "https://rule34.xyz/post/3571567",
    "#comment"     : "video",
    "#class"       : rule34xyz.Rule34xyzPostExtractor,
    "#urls"        : "https://rule34xyz.b-cdn.net/posts/3571/3571567/3571567.mov720.mp4",
    "#sha1_content": "c0a5e7e887774f91527f00e6142c435a3c482c1f",
},

)