[2chen] fix extraction (#3356)
update 'archive_fmt'; update tests; update 'board' regex
This commit is contained in:
@@ -16,13 +16,15 @@ class _2chenThreadExtractor(Extractor):
|
|||||||
subcategory = "thread"
|
subcategory = "thread"
|
||||||
directory_fmt = ("{category}", "{board}", "{thread} {title}")
|
directory_fmt = ("{category}", "{board}", "{thread} {title}")
|
||||||
filename_fmt = "{time} {filename}.{extension}"
|
filename_fmt = "{time} {filename}.{extension}"
|
||||||
archive_fmt = "{board}_{thread}_{hash}"
|
archive_fmt = "{board}_{thread}_{hash}_{time}"
|
||||||
root = "https://2chen.moe"
|
root = "https://2chen.moe"
|
||||||
pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)/(\d+)"
|
pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)/(\d+)"
|
||||||
test = (
|
test = (
|
||||||
("https://2chen.moe/jp/303786", {
|
("https://2chen.moe/tv/496715", {
|
||||||
"count": ">= 10",
|
"count": ">= 179",
|
||||||
}),
|
}),
|
||||||
|
# 404
|
||||||
|
("https://2chen.moe/jp/303786"),
|
||||||
)
|
)
|
||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
@@ -31,7 +33,7 @@ class _2chenThreadExtractor(Extractor):
|
|||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
url = "{}/{}/{}".format(self.root, self.board, self.thread)
|
url = "{}/{}/{}".format(self.root, self.board, self.thread)
|
||||||
page = self.request(url, encoding="utf-8").text
|
page = self.request(url, encoding="utf-8", notfound="thread").text
|
||||||
data = self.metadata(page)
|
data = self.metadata(page)
|
||||||
yield Message.Directory, data
|
yield Message.Directory, data
|
||||||
for post in self.posts(page):
|
for post in self.posts(page):
|
||||||
@@ -66,7 +68,7 @@ class _2chenThreadExtractor(Extractor):
|
|||||||
"%d %b %Y (%a) %H:%M:%S"
|
"%d %b %Y (%a) %H:%M:%S"
|
||||||
),
|
),
|
||||||
"no" : extr('href="#p', '"'),
|
"no" : extr('href="#p', '"'),
|
||||||
"url" : extr('</span><a href="', '"'),
|
"url" : extr('</a><a href="', '"'),
|
||||||
"filename": text.unescape(extr('download="', '"')),
|
"filename": text.unescape(extr('download="', '"')),
|
||||||
"hash" : extr('data-hash="', '"'),
|
"hash" : extr('data-hash="', '"'),
|
||||||
}
|
}
|
||||||
@@ -77,7 +79,7 @@ class _2chenBoardExtractor(Extractor):
|
|||||||
category = "2chen"
|
category = "2chen"
|
||||||
subcategory = "board"
|
subcategory = "board"
|
||||||
root = "https://2chen.moe"
|
root = "https://2chen.moe"
|
||||||
pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)(?:/catalog)?/?$"
|
pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)(?:/catalog|/?$)"
|
||||||
test = (
|
test = (
|
||||||
("https://2chen.moe/co/", {
|
("https://2chen.moe/co/", {
|
||||||
"pattern": _2chenThreadExtractor.pattern
|
"pattern": _2chenThreadExtractor.pattern
|
||||||
@@ -92,7 +94,7 @@ class _2chenBoardExtractor(Extractor):
|
|||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
url = "{}/{}/catalog".format(self.root, self.board)
|
url = "{}/{}/catalog".format(self.root, self.board)
|
||||||
page = self.request(url).text
|
page = self.request(url, notfound="board").text
|
||||||
data = {"_extractor": _2chenThreadExtractor}
|
data = {"_extractor": _2chenThreadExtractor}
|
||||||
for thread in text.extract_iter(
|
for thread in text.extract_iter(
|
||||||
page, '<figure><a href="', '"'):
|
page, '<figure><a href="', '"'):
|
||||||
|
|||||||
Reference in New Issue
Block a user