[8chan] add 'thread' and 'board' extractors (#2938)
This commit is contained in:
@@ -49,6 +49,12 @@ Consider all sites to be NSFW unless otherwise known.
|
||||
<td>Favorites, Galleries, individual Images, User Profiles</td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>8chan</td>
|
||||
<td>https://8chan.moe/</td>
|
||||
<td>Boards, Threads</td>
|
||||
<td></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>8kun</td>
|
||||
<td>https://8kun.top/</td>
|
||||
|
||||
172
gallery_dl/extractor/8chan.py
Normal file
172
gallery_dl/extractor/8chan.py
Normal file
@@ -0,0 +1,172 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2022 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extractors for https://8chan.moe/"""
|
||||
|
||||
from .common import Extractor, Message
|
||||
from .. import text
|
||||
from ..cache import memcache
|
||||
from datetime import datetime, timedelta
|
||||
import itertools
|
||||
|
||||
# Shared URL prefix for all 8chan extractor patterns:
# an optional http(s) scheme followed by an 8chan domain.
# Capture group 1 is the top-level domain ("moe", "se", or "cc").
BASE_PATTERN = r"(?:https?://)?8chan\.(moe|se|cc)"
|
||||
|
||||
|
||||
class _8chanExtractor(Extractor):
    """Base class for 8chan extractors"""
    category = "8chan"
    root = "https://8chan.moe"

    def __init__(self, match):
        """Initialize the extractor and derive 'root' from the matched URL"""
        # group 1 of BASE_PATTERN is the top-level domain (moe, se, or cc),
        # so requests go to the same domain the input URL used
        self.root = "https://8chan." + match.group(1)
        Extractor.__init__(self, match)

    @memcache()
    def _prepare_cookies(self):
        """Fetch captcha cookies and return the session's cookie jar

        Requests '/captcha.js' from the current root, then rewrites the
        cookies of that domain so that they no longer carry an 'expires'
        timestamp and 'captchaexpiration' points roughly one month ahead.
        """
        # function-scope import: the module header only imports
        # 'datetime' and 'timedelta' from the datetime module
        from datetime import timezone

        # fetch captcha cookies
        # (necessary to download without getting interrupted)
        #
        # timezone-aware replacement for the deprecated datetime.utcnow();
        # the strftime() results below are identical
        now = datetime.now(timezone.utc)
        url = self.root + "/captcha.js"
        params = {"d": now.strftime("%a %b %d %Y %H:%M:%S GMT+0000 (UTC)")}
        self.request(url, params=params).content

        # adjust cookies
        # - remove 'expires' timestamp
        # - move 'captchaexpiration' value forward by 1 month
        domain = self.root.rpartition("/")[2]
        for cookie in self.session.cookies:
            if cookie.domain.endswith(domain):
                cookie.expires = None
                if cookie.name == "captchaexpiration":
                    # 30 days plus 300 seconds from now
                    cookie.value = (now + timedelta(30, 300)).strftime(
                        "%a, %d %b %Y %H:%M:%S GMT")

        return self.session.cookies
|
||||
|
||||
|
||||
class _8chanThreadExtractor(_8chanExtractor):
    """Extractor for 8chan threads"""
    subcategory = "thread"
    directory_fmt = ("{category}", "{boardUri}",
                     "{threadId} {subject[:50]}")
    filename_fmt = "{postId}{num:?-//} {filename[:200]}.{extension}"
    archive_fmt = "{boardUri}_{postId}_{num}"
    pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
    test = (
        ("https://8chan.moe/vhs/res/4.html", {
            "pattern": r"https://8chan\.moe/\.media/[0-9a-f]{64}\.\w+$",
            "count": 14,
            "keyword": {
                "archived": False,
                "autoSage": False,
                "boardDescription": "Film and Cinema",
                "boardMarkdown": None,
                "boardName": "Movies",
                "boardUri": "vhs",
                "creation": r"re:\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}.\d{3}Z",
                "cyclic": False,
                "email": None,
                "id": "re:^[0-9a-f]{6}$",
                "locked": False,
                "markdown": str,
                "maxFileCount": 5,
                "maxFileSize": "32.00 MB",
                "maxMessageLength": 8001,
                "message": str,
                "mime": str,
                "name": "Anonymous",
                "num": int,
                "originalName": str,
                "path": r"re:/.media/[0-9a-f]{64}\.\w+$",
                "pinned": False,
                "postId": int,
                "signedRole": None,
                "size": int,
                "threadId": 4,
                "thumb": r"re:/.media/t_[0-9a-f]{64}$",
                "uniquePosters": 9,
                "usesCustomCss": True,
                "usesCustomJs": False,
                "wsPort": 8880,
                "wssPort": 2087,
            },
        }),
        ("https://8chan.se/vhs/res/4.html"),
        ("https://8chan.cc/vhs/res/4.html"),
    )

    def __init__(self, match):
        """Store the board name and thread ID from the matched URL"""
        _8chanExtractor.__init__(self, match)
        # group 1 (the top-level domain) is consumed by the base class
        self.board = match.group(2)
        self.thread = match.group(3)

    def items(self):
        """Yield the thread's directory metadata and one URL per file"""
        # the thread's HTML page and its JSON data share this URL stem
        stem = "{}/{}/res/{}.".format(self.root, self.board, self.thread)
        html_url = stem + "html"

        # fetch thread data, using the HTML page as referrer
        self.session.headers["Referer"] = html_url
        thread = self.request(stem + "json").json()
        thread["postId"] = thread["threadId"]
        thread["_http_headers"] = {"Referer": html_url}

        # captcha cookies are best-effort: a failure is only logged
        try:
            self.session.cookies = self._prepare_cookies()
        except Exception as exc:
            self.log.debug("Failed to fetch captcha cookies: %s: %s",
                           type(exc).__name__, exc, exc_info=True)

        # the thread object itself acts as the opening post;
        # its 'posts' list holds the remaining posts
        replies = thread.pop("posts", ())
        yield Message.Directory, thread

        for post in itertools.chain((thread,), replies):
            attachments = post.pop("files", ())
            if attachments:
                # merge the current post's fields into the thread metadata
                thread.update(post)
                for index, info in enumerate(attachments):
                    info.update(thread)
                    info["num"] = index
                    text.nameext_from_url(info["originalName"], info)
                    yield Message.Url, self.root + info["path"], info
|
||||
|
||||
|
||||
class _8chanBoardExtractor(_8chanExtractor):
    """Extractor for 8chan boards"""
    subcategory = "board"
    pattern = BASE_PATTERN + r"/([^/?#]+)/(?:(\d+)\.html)?$"
    test = (
        ("https://8chan.moe/vhs/"),
        ("https://8chan.moe/vhs/2.html", {
            "pattern": _8chanThreadExtractor.pattern,
            "count": 23,
        }),
        ("https://8chan.se/vhs/"),
        ("https://8chan.cc/vhs/"),
    )

    def __init__(self, match):
        """Store the board name and optional page number from the URL"""
        _8chanExtractor.__init__(self, match)
        # group 1 (the top-level domain) is consumed by the base class
        self.board = match.group(2)
        self.page = match.group(3)
        self.session.headers["Referer"] = self.root + "/"

    def items(self):
        """Queue every thread of the board, starting at the given page"""
        page_fmt = "{}/{}/{}.json"
        thread_fmt = "{}/{}/res/{}.html"

        # start at the page from the URL, or at page 1 if none was given
        number = text.parse_int(self.page, 1)
        first_url = page_fmt.format(self.root, self.board, number)
        board = self.request(first_url).json()
        threads = board["threads"]

        while True:
            for thread in threads:
                thread["_extractor"] = _8chanThreadExtractor
                thread_url = thread_fmt.format(
                    self.root, self.board, thread["threadId"])
                yield Message.Queue, thread_url, thread

            # 'pageCount' is taken from the first page's response
            number += 1
            if number > board["pageCount"]:
                break
            next_url = page_fmt.format(self.root, self.board, number)
            threads = self.request(next_url).json()["threads"]
|
||||
@@ -16,6 +16,7 @@ modules = [
|
||||
"420chan",
|
||||
"4chan",
|
||||
"500px",
|
||||
"8chan",
|
||||
"8kun",
|
||||
"8muses",
|
||||
"adultempire",
|
||||
|
||||
@@ -224,6 +224,9 @@ SUBCATEGORY_MAP = {
|
||||
"replies": "",
|
||||
"list-members": "List Members",
|
||||
},
|
||||
"vk": {
|
||||
"tagged": "Tagged Photos",
|
||||
},
|
||||
"wallhaven": {
|
||||
"collections": "",
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user