From 40c7eb3424ff5a28a8dd8804d14a3219aa6acaed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 6 Jun 2019 15:22:27 +0200 Subject: [PATCH] [livedoor] improve extraction (fixes #301) --- gallery_dl/extractor/livedoor.py | 54 +++++++++++++++++++------------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/gallery_dl/extractor/livedoor.py b/gallery_dl/extractor/livedoor.py index ae296139..ed72f4c2 100644 --- a/gallery_dl/extractor/livedoor.py +++ b/gallery_dl/extractor/livedoor.py @@ -59,6 +59,8 @@ class LivedoorExtractor(Extractor): src = text.extract(img, 'src="', '"')[0] alt = text.extract(img, 'alt="', '"')[0] + if not src: + continue if "://livedoor.blogimg.jp/" in src: url = src.replace("-s.", ".") else: @@ -81,24 +83,30 @@ class LivedoorBlogExtractor(LivedoorExtractor): """Extractor for a user's blog on blog.livedoor.jp""" subcategory = "blog" pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/?(?:$|[?&#])" - test = ("http://blog.livedoor.jp/zatsu_ke/", { - "range": "1-50", - "count": 50, - "pattern": r"https?://livedoor.blogimg.jp/\w+/imgs/\w/\w/\w+\.\w+", - "keyword": { - "post": { - "categories": list, - "date": "type:datetime", - "id": int, - "tags": list, - "title": str, - "user": "zatsu_ke" + test = ( + ("http://blog.livedoor.jp/zatsu_ke/", { + "range": "1-50", + "count": 50, + "pattern": r"https?://livedoor.blogimg.jp/\w+/imgs/\w/\w/\w+\.\w+", + "keyword": { + "post": { + "categories": list, + "date": "type:datetime", + "id": int, + "tags": list, + "title": str, + "user": "zatsu_ke" + }, + "filename": str, + "hash": r"re:\w{4,}", + "num": int, }, - "filename": str, - "hash": r"re:\w{4,}", - "num": int, - }, - }) + }), + ("http://blog.livedoor.jp/uotapo/", { + "range": "1-5", + "count": 5, + }), + ) def posts(self): url = "{}/{}".format(self.root, self.user) @@ -109,8 +117,8 @@ class LivedoorBlogExtractor(LivedoorExtractor): data = extr('.articles.push(', ');') if not data: break - body = extr('
', - '') + body = extr('class="article-body-inner">', + 'class="article-footer">') yield self._load(data, body) url = extr('