diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py new file mode 100644 index 00000000..c1a1be8d --- /dev/null +++ b/gallery_dl/extractor/2chan.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.2chan.net/""" + +from .common import Extractor, Message +from .. import text + + +class FutabaThreadExtractor(Extractor): + """Extractor for images from threads on www.2chan.net""" + category = "2chan" + subcategory = "thread" + directory_fmt = ["{category}", "{board-name}", "{thread}"] + pattern = [r"(?:https?://)?(([^.]+)\.2chan\.net/([^/]+)/res/(\d+))"] + urlfmt = "https://{server}.2chan.net/{board}/src/{filename}" + test = [("http://dec.2chan.net/70/res/947.htm", { + "url": "c5c12b80b290e224b6758507b3bb952044f4595b", + "keyword": "e1295c0a96f733898e92742bcc1a4c4b320e3748", + })] + + def __init__(self, match): + Extractor.__init__(self) + url, self.server, self.board, self.thread = match.groups() + self.url = "https://" + url + ".htm" + + def items(self): + page = self.request(self.url).text + data = self.get_metadata(page) + yield Message.Version, 1 + yield Message.Directory, data + for post in self.posts(page): + if "filename" not in post: + continue + post.update(data) + url = self.urlfmt.format_map(post) + yield Message.Url, url, post + + def get_metadata(self, page): + """Collect metadata for extractor-job""" + title = text.extract(page, "", "")[0] + title, _, boardname = title.rpartition(" - ") + return { + "server": self.server, + "title": title, + "board": self.board, + "board-name": boardname[:-4], + "thread": self.thread, + } + + def posts(self, page): + """Build a list of all post-objects""" + page = text.extract( + page, '
', '
')[0] + return [ + self.parse(post) + for post in page.split('') + ] + + def parse(self, post): + """Build post-object by extracting data from an HTML post""" + data = self._extract_post(post) + if '', ''), + ("name" , '', ' '), + ("now" , ' ', ' '), + (None , '', ''), + ))[0] + + @staticmethod + def _extract_image(post, data): + text.extract_all(post, ( + ("filename", '_blank">', '<'), + ("fsize" , '(', ' '), + ), 0, data) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index f9817a72..e17c3501 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -11,6 +11,7 @@ import importlib modules = [ "pixiv", + "2chan", "3dbooru", "4chan", "4plebs", diff --git a/gallery_dl/extractor/chan.py b/gallery_dl/extractor/chan.py index 067ba0a5..f2307404 100644 --- a/gallery_dl/extractor/chan.py +++ b/gallery_dl/extractor/chan.py @@ -6,14 +6,14 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Base classes for extractors for different Futaba Channel boards""" +"""Base classes for extractors for different Futaba Channel-like boards""" from .common import Extractor, Message from .. import text class ChanThreadExtractor(Extractor): - """Base class for extractors for Futaba Channel boards""" + """Base class for extractors for Futaba Channel-like boards""" category = "chan" subcategory = "thread" directory_fmt = ["{category}", "{board}-{thread}"]