# -*- coding: utf-8 -*-

# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://realbooru.com/"""

from . import booru
from .. import text, util
import collections

BASE_PATTERN = r"(?:https?://)?realbooru\.com"


class RealbooruExtractor(booru.BooruExtractor):
    """Base class for realbooru extractors"""
    basecategory = "booru"
    category = "realbooru"
    root = "https://realbooru.com"

    def _parse_post(self, post_id):
        """Fetch the view page of 'post_id' and return its scraped metadata

        The returned dict also carries the raw page text as '_html',
        which _tags() reads later on.
        """
        url = f"{self.root}/index.php?page=post&s=view&id={post_id}"
        page = self.request(url).text
        extr = text.extract_from(page)
        rating = extr('name="rating" content="', '"')
        # result intentionally discarded
        # NOTE(review): presumably this only moves the extraction position
        # past the container's opening tag - confirm in text.extract_from()
        extr('class="container"', '>')

        # The extr() calls below run in dict-literal order.
        # NOTE(review): presumably each call continues where the previous
        # match ended, so the entries must not be reordered - confirm
        post = {
            "_html"     : page,
            "id"        : post_id,
            # "adult" maps to "e"; any other value is reduced to its
            # first character, with "?" standing in for an empty rating
            "rating"    : "e" if rating == "adult" else (rating or "?")[0],
            "tags"      : text.unescape(extr(' alt="', '"')),
            "file_url"  : extr('src="', '"'),
            "created_at": extr(">Posted at ", " by "),
            "uploader"  : extr(">", "<"),
            "score"     : extr('">', "<"),
            "title"     : extr('id="title" style="width: 100%;" value="', '"'),
            "source"    : extr('d="source" style="width: 100%;" value="', '"'),
        }

        # text between the last "/" of the file URL and the first "."
        # following it
        post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
        return post

    def skip(self, num):
        """Advance the pagination start offset by 'num' and return 'num'"""
        self.page_start += num
        return num

    def _prepare(self, post):
        """Add a 'date' entry parsed from the post's 'created_at' string"""
        post["date"] = text.parse_datetime(post["created_at"], "%b, %d %Y")

    def _pagination(self, params, begin, end):
        """Yield parsed posts for all IDs found on consecutive list pages

        'params' is modified in place: its 'pid' entry starts at
        self.page_start and grows by self.per_page for each request.
        'begin' and 'end' are the markers enclosing a post ID in the page.
        """
        url = self.root + "/index.php"
        params["pid"] = self.page_start

        while True:
            page = self.request(url, params=params).text

            cnt = 0
            for post_id in text.extract_iter(page, begin, end):
                cnt += 1
                yield self._parse_post(post_id)

            # fewer IDs than 'per_page' ends the iteration
            # ('per_page' is not defined in this class; it has to be
            #  provided by a base class or subclass)
            if cnt < self.per_page:
                return
            params["pid"] += self.per_page

    def _tags(self, post, _):
        # FIXME(review): this method is truncated in SOURCE.
        # Only the three statements below survived; they are reproduced
        # unchanged. Restore the remainder from version control.
        page = post["_html"]
        # NOTE(review): the end marker is an empty string in SOURCE, which
        # looks like stripped markup rather than the intended value - confirm
        tag_container = text.extr(page, 'id="tagLink"', '')
        tags = collections.defaultdict(list)
        # Missing from here on. Surviving text of the next statement:
        #     pattern = util.re(r'


# FIXME(review): damaged span
#
# Everything between the 'util.re(' call in _tags() and the fragment below
# is missing from SOURCE: the rest of _tags() and every definition that
# followed it, up to and including the header of the class and the start
# of the method this fragment belongs to. The names 'pool_id', 'page',
# 'name' and 'pos' were defined in that missing part.
#
# The surviving text is kept verbatim so that nothing is silently dropped.
# It must be restored from version control, not re-typed from this comment.
#
#         ... Pool: ", "")
#         self.post_ids = text.extract_iter(
#             page, 'class="thumb" id="p', '"', pos)
#
#         return {
#             "pool": text.parse_int(pool_id),
#             "pool_name": text.unescape(name),
#         }
#
#     def posts(self):
#         return map(
#             self._parse_post,
#             util.advance(self.post_ids, self.page_start)
#         )


class RealbooruPostExtractor(RealbooruExtractor):
    """Extractor for a single realbooru post"""
    subcategory = "post"
    archive_fmt = "{id}"
    pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)"
    example = "https://realbooru.com/index.php?page=post&s=view&id=12345"

    def posts(self):
        """Return a 1-tuple with the parsed post for the ID from the URL"""
        return (self._parse_post(self.groups[0]),)