From 299bd2f1f51ca7758ec6060f46e4a856a41e64f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sun, 12 Dec 2021 23:36:16 +0100 Subject: [PATCH] [rule34us] add 'tag' and 'post' extractors (#1527) --- docs/supportedsites.md | 6 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/rule34us.py | 114 +++++++++++++++++++++++++++++++ scripts/supportedsites.py | 1 + 4 files changed, 122 insertions(+) create mode 100644 gallery_dl/extractor/rule34us.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 3b3dbb69..8abb4478 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -649,6 +649,12 @@ Consider all sites to be NSFW unless otherwise known. Posts, Tag Searches + + Rule 34 + https://rule34.us/ + Posts, Tag Searches + + Sankaku Channel https://sankaku.app/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index dd9da010..a8ab39b5 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -108,6 +108,7 @@ modules = [ "readcomiconline", "reddit", "redgifs", + "rule34us", "sankaku", "sankakucomplex", "seiga", diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py new file mode 100644 index 00000000..a65e9ff4 --- /dev/null +++ b/gallery_dl/extractor/rule34us.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://rule34.us/""" + +from . import booru +from .. import text + + +class Rule34usExtractor(booru.BooruExtractor): + category = "rule34us" + root = "https://rule34.us" + per_page = 42 + + def _parse_post(self, post_id): + url = "{}/index.php?r=posts/view&id={}".format(self.root, post_id) + extr = text.extract_from(self.request(url).text) + + post = { + "id" : post_id, + "tags" : text.unescape(extr( + 'name="keywords" content="', '"').rstrip(", ")), + "uploader": text.extract(extr('Added by: ', ''), ">", "<")[0], + "score" : text.extract(extr('Score: ', '> - <'), ">", "<")[0], + "width" : extr('Size: ', 'w'), + "height" : extr(' x ', 'h'), + "file_url": extr(' src="', '"'), + } + post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0] + + return post + + +class Rule34usTagExtractor(Rule34usExtractor): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{id}" + pattern = r"(?:https?://)?rule34\.us/index\.php\?r=posts/index&q=([^&#]+)" + test = ("https://rule34.us/index.php?r=posts/index&q=[terios]_elysion", { + "pattern": r"https://img\d*\.rule34\.us" + r"/images/../../[0-9a-f]{32}\.\w+", + "count": 10, + }) + + def __init__(self, match): + Rule34usExtractor.__init__(self, match) + self.tags = text.unquote(match.group(1).replace("+", " ")) + + def metadata(self): + return {"search_tags": self.tags} + + def posts(self): + url = self.root + "/index.php" + params = { + "r" : "posts/index", + "q" : self.tags, + "page": self.page_start, + } + + while True: + page = self.request(url, params=params).text + + cnt = 0 + for post_id in text.extract_iter(page, '>