[livedoor] improve extraction (fixes #301)

This commit is contained in:
Mike Fährmann
2019-06-06 15:22:27 +02:00
parent 62335b9015
commit 40c7eb3424

View File

@@ -59,6 +59,8 @@ class LivedoorExtractor(Extractor):
src = text.extract(img, 'src="', '"')[0] src = text.extract(img, 'src="', '"')[0]
alt = text.extract(img, 'alt="', '"')[0] alt = text.extract(img, 'alt="', '"')[0]
if not src:
continue
if "://livedoor.blogimg.jp/" in src: if "://livedoor.blogimg.jp/" in src:
url = src.replace("-s.", ".") url = src.replace("-s.", ".")
else: else:
@@ -81,24 +83,30 @@ class LivedoorBlogExtractor(LivedoorExtractor):
"""Extractor for a user's blog on blog.livedoor.jp""" """Extractor for a user's blog on blog.livedoor.jp"""
subcategory = "blog" subcategory = "blog"
pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/?(?:$|[?&#])" pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/?(?:$|[?&#])"
test = ("http://blog.livedoor.jp/zatsu_ke/", { test = (
"range": "1-50", ("http://blog.livedoor.jp/zatsu_ke/", {
"count": 50, "range": "1-50",
"pattern": r"https?://livedoor.blogimg.jp/\w+/imgs/\w/\w/\w+\.\w+", "count": 50,
"keyword": { "pattern": r"https?://livedoor.blogimg.jp/\w+/imgs/\w/\w/\w+\.\w+",
"post": { "keyword": {
"categories": list, "post": {
"date": "type:datetime", "categories": list,
"id": int, "date": "type:datetime",
"tags": list, "id": int,
"title": str, "tags": list,
"user": "zatsu_ke" "title": str,
"user": "zatsu_ke"
},
"filename": str,
"hash": r"re:\w{4,}",
"num": int,
}, },
"filename": str, }),
"hash": r"re:\w{4,}", ("http://blog.livedoor.jp/uotapo/", {
"num": int, "range": "1-5",
}, "count": 5,
}) }),
)
def posts(self): def posts(self):
url = "{}/{}".format(self.root, self.user) url = "{}/{}".format(self.root, self.user)
@@ -109,8 +117,8 @@ class LivedoorBlogExtractor(LivedoorExtractor):
data = extr('.articles.push(', ');') data = extr('.articles.push(', ');')
if not data: if not data:
break break
body = extr('<div class="article-body-inner">', body = extr('class="article-body-inner">',
'<!-- articleBody End -->') 'class="article-footer">')
yield self._load(data, body) yield self._load(data, body)
url = extr('<a rel="next" href="', '"') url = extr('<a rel="next" href="', '"')
@@ -128,6 +136,10 @@ class LivedoorPostExtractor(LivedoorExtractor):
"url": "fc1d6a9557245b5a27d3a10bf0fa9922ef377215", "url": "fc1d6a9557245b5a27d3a10bf0fa9922ef377215",
"keyword": "0229072abb5cd8a221df72e0ffdfc13336c0e9ce", "keyword": "0229072abb5cd8a221df72e0ffdfc13336c0e9ce",
}), }),
("http://blog.livedoor.jp/uotapo/archives/1050616939.html", {
"url": "3f3581807ec4776e6a67ed7985a22494d4bc4904",
"keyword": "2eb3e383c68e909c4dd3d563c16d0b6e2fe6627b",
}),
) )
def __init__(self, match): def __init__(self, match):
@@ -139,6 +151,6 @@ class LivedoorPostExtractor(LivedoorExtractor):
self.root, self.user, self.post_id) self.root, self.user, self.post_id)
extr = text.extract_from(self.request(url).text) extr = text.extract_from(self.request(url).text)
data = extr('articles :', '</script>') data = extr('articles :', '</script>')
body = extr('<div class="article-body-inner">', body = extr('class="article-body-inner">',
'<!-- articleBody End -->') 'class="article-footer">')
return (self._load(data, body),) return (self._load(data, body),)