diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index f0478f76..2e497042 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -70,6 +70,7 @@ Spectrum Nexus |http://www.thes-0| Chapters, Manga The /b/ Archive https://thebarchive.com/ Threads Tumblr https://www.tumblr.com/ Images from Users, Posts, Tag-Searches Twitter https://twitter.com/ Tweets +Warosu https://warosu.org/ Threads World Three http://www.slide.world-three.org/ Chapters, Manga Yandere https://yande.re/ Pools, Posts, Tag-Searches Chronos http://chronos.to/ individual Images diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 99667ab1..75573a05 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -77,6 +77,7 @@ modules = [ "thebarchive", "tumblr", "twitter", + "warosu", "worldthree", "yandere", "imagehosts", diff --git a/gallery_dl/extractor/mangazuki.py b/gallery_dl/extractor/mangazuki.py index 359bb91b..a536bab2 100644 --- a/gallery_dl/extractor/mangazuki.py +++ b/gallery_dl/extractor/mangazuki.py @@ -80,7 +80,7 @@ class MangazukiMangaExtractor(MangaExtractor): "url": "aab747414191b14e768f4a1eb148448d83ef2e14", }), ("https://raws.mangazuki.co/series/Rakujitsu-no-Pathos", { - "url": "4c5fcee4ad306faa3cfe952f7474293a99d11787", + "url": "57ac10ce4f4a93a313c80542bbc5bd6fd922b055", }), ] diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py new file mode 100644 index 00000000..dbb3d0b0 --- /dev/null +++ b/gallery_dl/extractor/warosu.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://warosu.org/""" + +from .common import Extractor, Message +from .. import text + + +class WarosuThreadExtractor(Extractor): + """Extractor for images from threads on warosu.org""" + category = "warosu" + subcategory = "thread" + directory_fmt = ["{category}", "{board}", "{thread} - {title}"] + filename_fmt = "{tim}-{filename}{ext}" + pattern = [r"(?:https?://)?(?:www\.)?warosu\.org/([^/]+)/thread/(\d+)"] + test = [ + ("https://warosu.org/jp/thread/16656025", { + "url": "889d57246ed67e491e5b8f7f124e50ea7991e770", + "keyword": "dab56209e31634b44eb99a2cdd85fa922c726b4f", + }), + ("https://warosu.org/jp/thread/16658073", { + "url": "4500cf3184b067424fd9883249bd543c905fbecd", + "keyword": "084369b27b8cfc08a2276e00a4be6ffd7b1e5088", + "content": "d48df0a701e6599312bfff8674f4aa5d4fb8db1c", + }), + ] + + def __init__(self, match): + Extractor.__init__(self) + self.board, self.thread = match.groups() + + def items(self): + url = "https://warosu.org/" + self.board + "/thread/" + self.thread + page = self.request(url).text + data = self.get_metadata(page) + posts = self.posts(page) + + if not data["title"]: + title = text.remove_html(posts[0]["com"]) + data["title"] = text.unescape(title)[:50] + + yield Message.Version, 1 + yield Message.Directory, data + for post in self.posts(page): + if "image" not in post: + continue + post.update(data) + yield Message.Url, post["image"], post + + def get_metadata(self, page): + """Collect metadata for extractor-job""" + boardname = text.extract(page, "", "")[0] + title = text.extract(page, 'filetitle" itemprop="name">', '<')[0] + return { + "board": self.board, + "board-name": boardname.rpartition(" - ")[2], + "thread": self.thread, + "title": title, + } + + def posts(self, page): + """Build a list of all post-objects""" + page = text.extract(page, '
', '')[0] + needle = '
' + return [self.parse(post) for post in page.split(needle)] + + def parse(self, post): + """Build post-object by extracting data from an HTML post""" + data = self._extract_post(post) + if "File:" in post: + self._extract_image(post, data) + part = data["image"].rpartition("/")[2] + data["tim"], _, data["extension"] = part.partition(".") + data["ext"] = "." + data["extension"] + return data + + @staticmethod + def _extract_post(post): + data = text.extract_all(post, ( + ("no" , 'id="p', '"'), + ("name", '', ''), + ("time", ''), + ("now" , '', '<'), + ("com" , '

', '

'), + ))[0] + data["com"] = text.unescape(text.remove_html(data["com"].strip())) + return data + + @staticmethod + def _extract_image(post, data): + text.extract_all(post, ( + ("fsize" , 'File: ', ', '), + ("w" , '', 'x'), + ("h" , '', ', '), + ("filename", '', '<'), + ("image" , '
\n