From 299bd2f1f51ca7758ec6060f46e4a856a41e64f9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de>
Date: Sun, 12 Dec 2021 23:36:16 +0100
Subject: [PATCH] [rule34us] add 'tag' and 'post' extractors (#1527)

---
 docs/supportedsites.md           |   6 ++
 gallery_dl/extractor/__init__.py |   1 +
 gallery_dl/extractor/rule34us.py | 114 +++++++++++++++++++++++++++++++
 scripts/supportedsites.py        |   1 +
 4 files changed, 122 insertions(+)
 create mode 100644 gallery_dl/extractor/rule34us.py
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 3b3dbb69..8abb4478 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -649,6 +649,12 @@ Consider all sites to be NSFW unless otherwise known.
     <td>Posts, Tag Searches</td>
     <td></td>
 </tr>
+<tr>
+    <td>Rule 34</td>
+    <td>https://rule34.us/</td>
+    <td>Posts, Tag Searches</td>
+    <td></td>
+</tr>
 <tr>
     <td>Sankaku Channel</td>
     <td>https://sankaku.app/</td>
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index dd9da010..a8ab39b5 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -108,6 +108,7 @@ modules = [
     "readcomiconline",
     "reddit",
     "redgifs",
+    "rule34us",
     "sankaku",
     "sankakucomplex",
     "seiga",
diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py
new file mode 100644
index 00000000..a65e9ff4
--- /dev/null
+++ b/gallery_dl/extractor/rule34us.py
@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://rule34.us/"""
+
+from . import booru
+from .. import text
+
+
+class Rule34usExtractor(booru.BooruExtractor):  # rule34.us base extractor
+    category = "rule34us"  # same key as in scripts' CATEGORY_MAP
+    root = "https://rule34.us"  # every request URL below starts with this
+    per_page = 42  # page size; a shorter result page ends the tag search
+
+    def _parse_post(self, post_id):  # fetch a post page, scrape its fields
+        url = "{}/index.php?r=posts/view&id={}".format(self.root, post_id)
+        extr = text.extract_from(self.request(url).text)  # response body text
+
+        post = {  # NOTE: extr() calls presumably consume the page in order
+            "id"      : post_id,
+            "tags"    : text.unescape(extr(  # trailing commas/spaces removed
+                'name="keywords" content="', '"').rstrip(", ")),
+            "uploader": text.extract(extr('Added by: ', '</li>'), ">", "<")[0],
+            "score"   : text.extract(extr('Score: ', '> - <'), ">", "<")[0],
+            "width"   : extr('Size: ', 'w'),  # kept as text, e.g. "3184"
+            "height"  : extr(' x ', 'h'),  # kept as text, e.g. "3982"
+            "file_url": extr(' src="', '"'),
+        }  # "md5" below is file_url's base name before the first dot
+        post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
+
+        return post
+
+
+class Rule34usTagExtractor(Rule34usExtractor):  # posts of a tag search
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")  # key from metadata()
+    archive_fmt = "t_{search_tags}_{id}"
+    pattern = r"(?:https?://)?rule34\.us/index\.php\?r=posts/index&q=([^&#]+)"
+    test = ("https://rule34.us/index.php?r=posts/index&q=[terios]_elysion", {
+        "pattern": r"https://img\d*\.rule34\.us"
+                   r"/images/../../[0-9a-f]{32}\.\w+",
+        "count": 10,
+    })
+
+    def __init__(self, match):  # match.group(1): presumably the q= value
+        Rule34usExtractor.__init__(self, match)
+        self.tags = text.unquote(match.group(1).replace("+", " "))
+
+    def metadata(self):  # exposes the decoded query as "search_tags"
+        return {"search_tags": self.tags}
+
+    def posts(self):  # generator: yields one parsed post dict per search hit
+        url = self.root + "/index.php"
+        params = {
+            "r"   : "posts/index",
+            "q"   : self.tags,
+            "page": self.page_start,  # dropped after the first full page
+        }
+
+        while True:
+            page = self.request(url, params=params).text
+
+            cnt = 0  # posts found on this page
+            for post_id in text.extract_iter(page, '><a id="', '"'):
+                yield self._parse_post(post_id)  # one extra request per post
+                cnt += 1
+
+            if cnt < self.per_page:  # short page => last page
+                return
+
+            if "page" in params:  # later requests carry no page number
+                del params["page"]  # presumably "id:<N" means ids below N
+            params["q"] = self.tags + " id:<" + post_id  # post_id = last seen
+
+
+class Rule34usPostExtractor(Rule34usExtractor):  # a single post by id
+    subcategory = "post"
+    archive_fmt = "{id}"
+    pattern = r"(?:https?://)?rule34\.us/index\.php\?r=posts/view&id=(\d+)"
+    test = (  # (URL, expected results) pairs: one PNG post, one MP4 post
+        ("https://rule34.us/index.php?r=posts/view&id=3709005", {
+            "pattern": r"https://img\d*\.rule34\.us/images/14/7b"
+                       r"/147bee6fc2e13f73f5f9bac9d4930b13\.png",
+            "content": "d714342ea84050f82dda5f0c194d677337abafc5",
+        }),
+        ("https://rule34.us/index.php?r=posts/view&id=4576310", {
+            "pattern": r"https://video\.rule34\.us/images/a2/94"
+                       r"/a294ff8e1f8e0efa041e5dc9d1480011\.mp4",
+            "keyword": {  # note: width/height/id are strings here
+                "extension": "mp4",
+                "file_url": str,
+                "filename": "a294ff8e1f8e0efa041e5dc9d1480011",
+                "height": "3982",
+                "id": "4576310",
+                "md5": "a294ff8e1f8e0efa041e5dc9d1480011",
+                "score": r"re:\d+",
+                "tags": "tagme, video",
+                "uploader": "Anonymous",
+                "width": "3184",
+            },
+        }),
+    )
+
+    def __init__(self, match):
+        Rule34usExtractor.__init__(self, match)
+        self.post_id = match.group(1)  # presumably pattern's (\d+) group
+
+    def posts(self):  # iterable with exactly one parsed post
+        return (self._parse_post(self.post_id),)  # one-element tuple
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
index cb9f6451..cd6d3b3f 100755
--- a/scripts/supportedsites.py
+++ b/scripts/supportedsites.py
@@ -85,6 +85,7 @@ CATEGORY_MAP = {
     "rbt"            : "RebeccaBlackTech",
     "redgifs"        : "RedGIFs",
     "rule34"         : "Rule 34",
+    "rule34us"       : "Rule 34",
     "sankaku"        : "Sankaku Channel",
     "sankakucomplex" : "Sankaku Complex",
     "seiga"          : "Niconico Seiga",