diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index e59087b3..3be862b3 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -52,6 +52,7 @@ Kirei Cake https://reader.kireicake.com/ Chapters, Manga KissManga https://kissmanga.com/ Chapters, Manga Komikcast https://komikcast.com/ Chapters, Manga Konachan https://konachan.com/ Pools, Popular Images, Posts, Tag-Searches +livedoor Blog http://blog.livedoor.jp/ Blogs, Posts Luscious https://luscious.net/ Albums, Search Results Optional Manga Fox https://fanfox.net/ Chapters Manga Here https://www.mangahere.cc/ Chapters, Manga diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 5a08ac7d..21a0ef18 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -47,6 +47,7 @@ modules = [ "kissmanga", "komikcast", "konachan", + "livedoor", "luscious", "mangadex", "mangafox", diff --git a/gallery_dl/extractor/livedoor.py b/gallery_dl/extractor/livedoor.py new file mode 100644 index 00000000..f0ecfd47 --- /dev/null +++ b/gallery_dl/extractor/livedoor.py @@ -0,0 +1,160 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for http://blog.livedoor.jp/""" + +from .common import Extractor, Message +from .. import text + + +class LivedoorExtractor(Extractor): + """Base class for livedoor extractors""" + category = "livedoor" + root = "http://blog.livedoor.jp" + img_root = "http://livedoor.blogimg.jp" + filename_fmt = "{post[id]}_{post[title]}_{num:>02}.{extension}" + directory_fmt = ("{category}", "{post[user]}") + archive_fmt = "{post[id]}_{hash}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) + + def items(self): + yield Message.Version, 1 + for post in self.posts(): + images = self._images(post) + if images: + yield Message.Directory, {"post": post} + for image in images: + yield Message.Url, image["url"], image + + def posts(self): + """Return an iterable with post objects""" + + def _load(self, data, body): + pid , pos = text.extract(data, "id : '" , "'") + title, pos = text.extract(data, "title : '", "'", pos) + cat1 , pos = text.extract(data, "name:'" , "'", pos) + cat2 , pos = text.extract(data, "name:'" , "'", pos) + date , pos = text.extract(data, "date : '" , "'", pos) + tags , pos = text.extract(body, '
', '') + + return { + "id" : text.parse_int(pid), + "title" : title, + "date" : date, + "categories": [cat1, cat2], + "tags" : text.split_html(tags), + "user" : self.user, + "body" : body, + } + + def _images(self, post): + imgs = [] + body = post.pop("body") + + for num, img in enumerate(text.extract_iter(body, ""), 1): + src = text.extract(img, 'src="', '"')[0] + alt = text.extract(img, 'alt="', '"')[0] + + if src.startswith(self.img_root): + url = src.replace("-s.", ".") + else: + url = text.urljoin(self.root, src) + name, _, ext = url.rpartition("/")[2].rpartition(".") + + imgs.append({ + "url" : url, + "num" : num, + "hash" : name, + "filename" : alt or name, + "extension": ext, + "post" : post, + }) + + return imgs + + +class LivedoorBlogExtractor(LivedoorExtractor): + """Extractor for a user's blog on blog.livedoor.jp""" + subcategory = "blog" + pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/?(?:$|[?&#])" + test = ("http://blog.livedoor.jp/zatsu_ke/", { + "range": "1-50", + "count": 50, + "pattern": r"http://livedoor.blogimg.jp/zatsu_ke/imgs/\w/\w/\w+\.\w+", + "keyword": { + "post": { + "categories": list, + "date": str, + "id": int, + "tags": list, + "title": str, + "user": "zatsu_ke" + }, + "filename": str, + "hash": r"re:\w{4,}", + "num": int, + }, + }) + + def posts(self): + url = "{}/{}".format(self.root, self.user) + + while url: + page = self.request(url).text + pos = 0 + + while True: + data, pos = text.extract(page, '.articles.push(', ');', pos) + if not data: + break + body, pos = text.extract( + page, + '
', + '', + pos, + ) + yield self._load(data, body) + + url = text.extract(page, '