# -*- coding: utf-8 -*-

# Copyright 2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for http://blog.livedoor.jp/"""

from .common import Extractor, Message
from .. import text


class LivedoorExtractor(Extractor):
    """Base class for livedoor extractors"""
    category = "livedoor"
    root = "http://blog.livedoor.jp"
    img_root = "http://livedoor.blogimg.jp"
    filename_fmt = "{post[id]}_{post[title]}_{num:>02}.{extension}"
    directory_fmt = ("{category}", "{post[user]}")
    archive_fmt = "{post[id]}_{hash}"

    def __init__(self, match):
        """Store the blog user name captured by the first pattern group"""
        Extractor.__init__(self, match)
        self.user = match.group(1)

    def items(self):
        """Yield a directory message per post followed by its image URLs

        Posts for which _images() returns nothing are skipped entirely,
        so no directory message is emitted for them.
        """
        yield Message.Version, 1
        for post in self.posts():
            images = self._images(post)
            if images:
                yield Message.Directory, {"post": post}
                for image in images:
                    yield Message.Url, image["url"], image

    def posts(self):
        """Return an iterable with post objects"""

    def _load(self, data, body):
        """Build a post object from its metadata and its markup

        'data' is the text the metadata fields are cut out of, 'body' is
        the post markup. The returned dict still carries 'body'; it is
        removed again by _images().
        """
        # The fields are read in order, each search resuming at 'pos',
        # which is why the same "name:'" marker yields two categories.
        pid, pos = text.extract(data, "id : '", "'")
        title, pos = text.extract(data, "title : '", "'", pos)
        cat1, pos = text.extract(data, "name:'", "'", pos)
        cat2, pos = text.extract(data, "name:'", "'", pos)
        date, pos = text.extract(data, "date : '", "'", pos)

        # NOTE(review): both markers were stripped from the reviewed copy
        # of this file; these values are reconstructed -- confirm them.
        tags = text.extract(body, '<dl class="article-tags">', '</dl>')[0]

        return {
            "id": text.parse_int(pid),
            "title": title,
            "date": date,
            "categories": [cat1, cat2],
            "tags": text.split_html(tags),
            "user": self.user,
            "body": body,
        }

    def _images(self, post):
        """Return a list of image objects for every <img> in a post

        Removes the 'body' entry from 'post' as a side effect, so the
        markup does not end up in the metadata of each image.
        """
        imgs = []
        body = post.pop("body")

        # NOTE(review): the begin/end markers were stripped from the
        # reviewed copy; '<img ' / '>' are reconstructed -- confirm them.
        for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1):
            src = text.extract(img, 'src="', '"')[0]
            alt = text.extract(img, 'alt="', '"')[0]

            # an <img> without a src attribute would otherwise crash on
            # src.startswith(); 'num' still counts the skipped tag
            if not src:
                continue

            if src.startswith(self.img_root):
                # presumably "-s." marks a downsized variant -- verify
                url = src.replace("-s.", ".")
            else:
                url = text.urljoin(self.root, src)
            name, _, ext = url.rpartition("/")[2].rpartition(".")

            imgs.append({
                "url": url,
                "num": num,
                "hash": name,
                "filename": alt or name,
                "extension": ext,
                "post": post,
            })

        return imgs


class LivedoorBlogExtractor(LivedoorExtractor):
    """Extractor for a user's blog on blog.livedoor.jp"""
    subcategory = "blog"
    pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/?(?:$|[?&#])"
    test = ("http://blog.livedoor.jp/zatsu_ke/", {
        "range": "1-50",
        "count": 50,
        "pattern": r"http://livedoor.blogimg.jp/zatsu_ke/imgs/\w/\w/\w+\.\w+",
        "keyword": {
            "post": {
                "categories": list,
                "date": str,
                "id": int,
                "tags": list,
                "title": str,
                "user": "zatsu_ke"
            },
            "filename": str,
            "hash": r"re:\w{4,}",
            "num": int,
        },
    })

    def posts(self):
        """Yield the post objects of a blog, following its pagination"""
        url = "{}/{}".format(self.root, self.user)

        while url:
            page = self.request(url).text
            pos = 0

            while True:
                data, pos = text.extract(page, '.articles.push(', ');', pos)
                if not data:
                    break
                # NOTE(review): both markers were stripped from the
                # reviewed copy; reconstructed values -- confirm them.
                body, pos = text.extract(
                    page,
                    '<div class="article-body-inner">',
                    '<!-- articleBody End -->',
                    pos,
                )
                yield self._load(data, body)

            # A missing link gives None and ends the outer loop.
            # NOTE(review): marker stripped from the reviewed copy;
            # reconstructed value -- confirm it.
            url = text.extract(page, '<a rel="next" href="', '"', pos)[0]


class LivedoorPostExtractor(LivedoorExtractor):
    """Extractor for images from a blog post on blog.livedoor.jp"""
    # NOTE(review): only the last two statements of this class's posts()
    # survived in the reviewed copy. The class name, subcategory, pattern,
    # __init__ and the first statements of posts() are reconstructed and
    # must be confirmed. The original 'test' definition could not be
    # recovered and has to be restored separately.
    subcategory = "post"
    pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/archives/(\d+)"

    def __init__(self, match):
        """Store the post ID captured by the second pattern group"""
        LivedoorExtractor.__init__(self, match)
        self.post_id = match.group(2)

    def posts(self):
        """Return a one-element tuple holding the requested post"""
        url = "{}/{}/archives/{}.html".format(
            self.root, self.user, self.post_id)
        page = self.request(url).text
        data, pos = text.extract(page, 'articles :', '</script>')
        body, pos = text.extract(
            page,
            '<div class="article-body-inner">',
            '<!-- articleBody End -->',
            pos,
        )
        return (self._load(data, body),)