[yaplog] improve metadata extraction (#443)

- provide a fallback if there is no numerical image ID
- add a 'filename' field
- convert 'date' to an actual datetime object
This commit is contained in:
Mike Fährmann
2019-10-11 18:29:58 +02:00
parent 15af2f8464
commit d4ffd6c952

View File

@@ -36,11 +36,13 @@ class YaplogExtractor(AsynchronousMixin, Extractor):
iurl = text.extract(page, '<img src="', '"')[0] iurl = text.extract(page, '<img src="', '"')[0]
if iurl[0] == "/": if iurl[0] == "/":
iurl = text.urljoin(self.root, iurl) iurl = text.urljoin(self.root, iurl)
iid, _, ext = iurl.rpartition("/")[2].rpartition(".") name, _, ext = iurl.rpartition("/")[2].rpartition(".")
iid = name.rpartition("_")[0] or name
image = { image = {
"url" : iurl, "url" : iurl,
"num" : num, "num" : num,
"id" : text.parse_int(iid.partition("_")[0]), "id" : text.parse_int(iid, iid),
"filename" : name,
"extension": ext, "extension": ext,
"post" : post, "post" : post,
} }
@@ -75,7 +77,7 @@ class YaplogExtractor(AsynchronousMixin, Extractor):
"id" : text.parse_int(pid), "id" : text.parse_int(pid),
"title": text.unescape(title[:-3]), "title": text.unescape(title[:-3]),
"user" : self.user, "user" : self.user,
"date" : date, "date" : text.parse_datetime(date, "%B %d [%a], %Y, %H:%M"),
} }
@@ -102,7 +104,7 @@ class YaplogPostExtractor(YaplogExtractor):
test = ( test = (
("https://yaplog.jp/imamiami0726/image/1299", { ("https://yaplog.jp/imamiami0726/image/1299", {
"url": "896cae20fa718735a57e723c48544e830ff31345", "url": "896cae20fa718735a57e723c48544e830ff31345",
"keyword": "f8d8781e61c4c38238a7622d6df6c905f864e5d3", "keyword": "22df8ad6cb534514c6bb2ff000381d156769a620",
}), }),
# complete image URLs (#443) # complete image URLs (#443)
("https://yaplog.jp/msjane/archive/246", { ("https://yaplog.jp/msjane/archive/246", {