diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index f0478f76..2e497042 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -70,6 +70,7 @@ Spectrum Nexus |http://www.thes-0| Chapters, Manga The /b/ Archive https://thebarchive.com/ Threads Tumblr https://www.tumblr.com/ Images from Users, Posts, Tag-Searches Twitter https://twitter.com/ Tweets +Warosu https://warosu.org/ Threads World Three http://www.slide.world-three.org/ Chapters, Manga Yandere https://yande.re/ Pools, Posts, Tag-Searches Chronos http://chronos.to/ individual Images diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 99667ab1..75573a05 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -77,6 +77,7 @@ modules = [ "thebarchive", "tumblr", "twitter", + "warosu", "worldthree", "yandere", "imagehosts", diff --git a/gallery_dl/extractor/mangazuki.py b/gallery_dl/extractor/mangazuki.py index 359bb91b..a536bab2 100644 --- a/gallery_dl/extractor/mangazuki.py +++ b/gallery_dl/extractor/mangazuki.py @@ -80,7 +80,7 @@ class MangazukiMangaExtractor(MangaExtractor): "url": "aab747414191b14e768f4a1eb148448d83ef2e14", }), ("https://raws.mangazuki.co/series/Rakujitsu-no-Pathos", { - "url": "4c5fcee4ad306faa3cfe952f7474293a99d11787", + "url": "57ac10ce4f4a93a313c80542bbc5bd6fd922b055", }), ] diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py new file mode 100644 index 00000000..dbb3d0b0 --- /dev/null +++ b/gallery_dl/extractor/warosu.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://warosu.org/""" + +from .common import Extractor, Message +from .. 
def items(self):
    """Yield the extractor messages for all image posts of the thread.

    Downloads the thread page built from ``self.board`` and
    ``self.thread``, collects the thread metadata and the list of posts,
    and then yields, in order:

    * ``(Message.Version, 1)``
    * ``(Message.Directory, data)`` with the thread metadata
    * ``(Message.Url, post["image"], post)`` for every post that has an
      ``"image"`` entry; the thread metadata is merged into each post.

    If the metadata has no title, the first post's comment (HTML removed,
    entities unescaped, cut to 50 characters) is used instead.
    """
    url = "https://warosu.org/" + self.board + "/thread/" + self.thread
    page = self.request(url).text
    data = self.get_metadata(page)

    # Parse the page exactly once and keep the result as a list: it is
    # indexed for the title fallback and iterated again further down.
    posts = list(self.posts(page))

    # Only fall back to the first comment when there is a post to take it
    # from; a thread page without posts must not raise IndexError.
    if not data["title"] and posts:
        title = text.remove_html(posts[0]["com"])
        data["title"] = text.unescape(title)[:50]

    yield Message.Version, 1
    yield Message.Directory, data
    for post in posts:
        # Posts without an attached file have nothing to download.
        if "image" not in post:
            continue
        post.update(data)
        yield Message.Url, post["image"], post