diff --git a/docs/supportedsites.md b/docs/supportedsites.md index e9a1a518..d288b4c0 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -13,6 +13,12 @@ Consider all sites to be NSFW unless otherwise known. + + 2chen + https://2chen.moe/ + Boards, Threads + + 35PHOTO https://35photo.pro/ diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py new file mode 100644 index 00000000..3e65fe6d --- /dev/null +++ b/gallery_dl/extractor/2chen.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://2chen.moe/""" + +from .common import Extractor, Message +from .. import text + + +class _2chenThreadExtractor(Extractor): + """Extractor for 2chen threads""" + category = "2chen" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread} {title}") + filename_fmt = "{time} {filename}.{extension}" + archive_fmt = "{hash}" + root = "https://2chen.moe" + pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)/(\d+)" + test = ( + ("https://2chen.moe/jp/303786", { + "count": ">= 10", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = "{}/{}/{}".format(self.root, self.board, self.thread) + page = self.request(url, encoding="utf-8").text + data = self.metadata(page) + yield Message.Directory, data + for post in self.posts(page): + if not post["url"]: + continue + post.update(data) + post["url"] = self.root + post["url"] + post["time"] = text.parse_int(post["date"].timestamp()) + yield Message.Url, post["url"], text.nameext_from_url( + post["filename"], post) + + def metadata(self, page): + board, pos = text.extract(page, 'class="board">/', '/<') + title = text.extract(page, "

", "

", pos)[0] + return { + "board" : board, + "thread": self.thread, + "title" : text.unescape(title), + } + + def posts(self, page): + """Return iterable with relevant posts""" + return map(self.parse, text.extract_iter( + page, 'class="glass media', '')) + + def parse(self, post): + extr = text.extract_from(post) + return { + "name" : text.unescape(extr("", "")), + "date" : text.parse_datetime( + extr("")[2], + "%d %b %Y (%a) %H:%M:%S" + ), + "no" : extr('href="#p', '"'), + "url" : extr('