diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index 89b8cd13..45116a94 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -62,6 +62,7 @@ Pure Mashiro http://reader.puremashiro.moe/ Chapters, Manga Read Comic Online http://readcomiconline.to/ Comic-Issues, Comics RebeccaBlackTech https://rbt.asia/ Threads Reddit https://reddit.com/ individual Images, Submissions, Subreddits Optional (OAuth) +rule #34 http://rule34.paheal.net/ Posts, Tag-Searches Rule 34 https://rule34.xxx/ Posts, Tag-Searches Safebooru https://safebooru.org/ Posts, Tag-Searches Sankaku Channel https://chan.sankakucomplex.com/ Pools, Posts, Tag-Searches Optional diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 13ed78c9..de086e50 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -60,6 +60,7 @@ modules = [ "nhentai", "nijie", "nyafuu", + "paheal", "pawoo", "pinterest", "pixiv", diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py new file mode 100644 index 00000000..1d0d03a5 --- /dev/null +++ b/gallery_dl/extractor/paheal.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from http://rule34.paheal.net/""" + +from .common import SharedConfigExtractor, Message +from .. 
import text, util + + +class PahealExtractor(SharedConfigExtractor): + """Base class for paheal extractors; subclasses are expected to override get_posts()""" + basecategory = "booru" + category = "paheal" + filename_fmt = "{category}_{id}_{md5}.{extension}" + root = "http://rule34.paheal.net" + + def items(self): + """Yield the Version and Directory messages, then one Url message per post returned by get_posts()""" + yield Message.Version, 1 + yield Message.Directory, self.get_metadata() + + for data in self.get_posts(): + url = data["file_url"] + for key in ("id", "width", "height"): + data[key] = util.safe_int(data[key]) + data["tags"] = text.unquote(data["tags"]) + yield Message.Url, url, text.nameext_from_url(url, data) + + def get_metadata(self): + """Return the metadata dict that items() sends with the Directory message (empty here)""" + return {} + + def get_posts(self): + """Return an iterable of post dicts; items() reads the keys 'file_url', 'id', 'width', 'height' and 'tags' from each one""" + + +class PahealTagExtractor(PahealExtractor): + """Extractor for images from rule34.paheal.net by search-tags""" + subcategory = "tag" + directory_fmt = ["{category}", "{tags}"] + pattern = [r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net" + r"/post/list/([^/?&#]+)"] + test = [("https://rule34.paheal.net/post/list/k-on/1", { + "url": "c33de1a1470ab29e24ba7a39f53dbf77984be383", + "keyword": "dcba36cfeedf53387aa9656675307fb9141901f1", + })] + per_page = 70 + + def __init__(self, match): + """Store the unquoted tag string captured by group 1 of the URL pattern""" + PahealExtractor.__init__(self) + self.tags = text.unquote(match.group(1)) + + def get_metadata(self): + """Return the search tags under the 'tags' key referenced by directory_fmt""" + return {"tags": self.tags} + + def get_posts(self): + """Walk the numbered result pages ({root}/post/list/{tags}/{pnum}) starting at page 1. NOTE(review): the extract_iter arguments and the stop condition below are garbled in this copy - restore them before use""" + pnum = 1 + while True: + url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) + page = self.request(url).text + + for post in text.extract_iter( + page, 'Next<" not in page: + return + pnum += 1 + + @staticmethod + def _extract_data(post): + pid , pos = text.extract(post, '', '"') + data, pos = text.extract(post, 'title="', '"', pos) + md5 , pos = text.extract(post, '/_thumbs/', '/', pos) + url , pos = text.extract(post, '