[8chan] use api

This commit is contained in:
Mike Fährmann
2015-09-07 16:32:20 +02:00
parent d8ef128e74
commit d7e0d81bdd
2 changed files with 18 additions and 57 deletions

View File

@@ -8,65 +8,25 @@
"""Extract image- and video-urls from threads on https://8ch.net/""" """Extract image- and video-urls from threads on https://8ch.net/"""
from .common import SequentialExtractor, Message from .chan import ChanExtractor
from urllib.parse import unquote
import re
info = { info = {
"category": "8chan", "category": "8chan",
"extractor": "InfinityChanExtractor", "extractor": "InfinityChanExtractor",
"directory": ["{category}", "{board}-{thread-id}"], "directory": ["{category}", "{board}-{thread}"],
"filename": "{timestamp}-{name}", "filename": "{tim}-{filename}{ext}",
"pattern": [ "pattern": [
r"(?:https?://)?(?:www\.)?(?:8chan\.co|8ch\.net)/([^/]+/res/\d+).*", r"(?:https?://)?(?:www\.)?8ch\.net/([^/]+)/res/(\d+).*",
], ],
} }
class InfinityChanExtractor(SequentialExtractor): class InfinityChanExtractor(ChanExtractor):
url_base = "https://8ch.net" api_url = "https://8ch.net/{board}/res/{thread}.json"
url_fmt = url_base + "/{board}/res/{thread-id}.html" file_url = "https://media.8ch.net/{board}/src/{tim}{ext}"
regex = (
r'>File: <a href="([^"]+)">([^<]+)\.[^<]+<.*?'
r'<span class="postfilename"( title="([^"]+)")?>([^<]+)<'
)
def __init__(self, match, config): def __init__(self, match, config):
SequentialExtractor.__init__(self, config) ChanExtractor.__init__(
self.match = match self, config, info["category"],
match.group(1), match.group(2)
def items(self): )
yield Message.Version, 1
metadata = self.get_job_metadata()
yield Message.Directory, metadata
url = self.url_fmt.format(**metadata)
text = self.request(url).text
for match in re.finditer(self.regex, text):
yield Message.Url, self.get_file_url(match), self.get_file_metadata(match)
def get_job_metadata(self):
"""Collect metadata for extractor-job"""
board, _, thread_id = self.match.group(1).split("/")
return {
"category": info["category"],
"board": board,
"thread-id": thread_id,
}
@staticmethod
def get_file_metadata(match):
"""Collect metadata for a downloadable file"""
return {
"timestamp": match.group(2),
"name": unquote(match.group(4) or match.group(5)),
}
def get_file_url(self, match):
"""Extract download-url from 'match'"""
url = match.group(1)
if url.startswith("/"):
url = self.url_base + url
return url

View File

@@ -1,4 +1,3 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2015 Mike Fährmann # Copyright 2015 Mike Fährmann
@@ -10,6 +9,7 @@
"""Base classes for extractors for different Futaba Channel boards""" """Base classes for extractors for different Futaba Channel boards"""
from .common import SequentialExtractor, Message from .common import SequentialExtractor, Message
import re
class ChanExtractor(SequentialExtractor): class ChanExtractor(SequentialExtractor):
@@ -34,14 +34,15 @@ class ChanExtractor(SequentialExtractor):
continue continue
post.update(self.metadata) post.update(self.metadata)
yield Message.Url, self.file_url.format(**post), post yield Message.Url, self.file_url.format(**post), post
if "extra_files" in post:
for file in post["extra_files"]:
post.update(file)
yield Message.Url, self.file_url.format(**post), post
@staticmethod @staticmethod
def get_thread_title(post): def get_thread_title(post):
"""Return thread title from first post""" """Return thread title from first post"""
if "sub" in post: if "sub" in post:
return post["sub"] return post["sub"]
com = post["com"] com = re.sub("<[^>]+?>", "", post["com"])
pos = com.find("<br>") return " ".join(com.split())[:50]
if pos == -1:
return com
return com[:min(pos, 50)]