[foolfuuka] add support for more sites (#18)

- https://arch.b4k.co
- https://archive.whatisthisimnotgoodwithcomputers.com
- https://archive.yeet.net

Notes:
- The name "whatisthisimnotgoodwithcomputers" is way too long ...
- archive.yeet.net is out of date and also blocked by 4chan servers
  - newest threads are 2 weeks old
  - using "https://archive.yeet.net" as Referer header results in
    "403 Forbidden" when accessing 4chan
This commit is contained in:
Mike Fährmann
2017-09-16 21:11:44 +02:00
parent 84d4450410
commit cebf800a7f
7 changed files with 89 additions and 5 deletions

View File

@@ -18,6 +18,7 @@ modules = [
"8chan",
"archivedmoe",
"archiveofsins",
"b4k",
"batoto",
"danbooru",
"desuarchive",
@@ -78,8 +79,10 @@ modules = [
"tumblr",
"twitter",
"warosu",
"whatisthisimnotgoodwithcomputers",
"worldthree",
"yandere",
"yeet",
"imagehosts",
"directlink",
"recursive",

View File

@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://arch.b4k.co/"""
from . import chan
class BfourkThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Extractor for images from threads on arch.b4k.co"""
    category = "b4k"
    root = "https://arch.b4k.co"
    # Capture groups: (1) board name, (2) thread number.
    pattern = [
        r"(?:https?://)?arch\.b4k\.co"
        r"/([^/]+)/thread/(\d+)",
    ]
    test = [
        (
            "http://arch.b4k.co/meta/thread/196/",
            {"url": "cdd4931ac1cd00264b0b54e2e3b0d8f6ae48957e"},
        ),
    ]

    def remote(self, media):
        """Return the 'remote_media_link' value of 'media' as-is"""
        # No extra request is made here: the link stored in the media
        # object is handed back directly.
        link = media["remote_media_link"]
        return link

View File

@@ -70,12 +70,14 @@ class FoolfuukaThreadExtractor(SharedConfigExtractor):
"{thread_num} - {title}"]
filename_fmt = "{media[media]}"
root = ""
referer = True
def __init__(self, match):
    """Initialize the extractor from a matched thread URL

    'match' is a match object with two groups, which are stored as
    'self.board' and 'self.thread'.
    """
    SharedConfigExtractor.__init__(self)
    self.board, self.thread = match.groups()
    self.session.headers["User-Agent"] = "Mozilla 5.0"
    # Send the archive's root as Referer only when the class allows it.
    # Setting the header unconditionally beforehand would make
    # 'referer = False' in a subclass ineffective.
    if self.referer:
        self.session.headers["Referer"] = self.root
def items(self):
op = True
@@ -91,9 +93,9 @@ class FoolfuukaThreadExtractor(SharedConfigExtractor):
url = media["media_link"]
if not url and "remote_media_link" in media:
needle = '<meta http-equiv="Refresh" content="0; url='
page = self.request(media["remote_media_link"]).text
url = text.extract(page, needle, '"')[0]
url = self.remote(media)
if url.startswith("/"):
url = self.root + url
post["extension"] = url.rpartition(".")[2]
yield Message.Url, url, post
@@ -104,7 +106,12 @@ class FoolfuukaThreadExtractor(SharedConfigExtractor):
data = self.request(url, params=params).json()[self.thread]
# sort post-objects by their key
posts = sorted(data["posts"].items(), key=operator.itemgetter(0))
posts = sorted(data.get("posts", {}).items())
posts = map(operator.itemgetter(1), posts)
return itertools.chain((data["op"],), posts)
def remote(self, media):
    """Look up the media URL behind a 'remote_media_link'

    Requests the page at media["remote_media_link"] and returns the
    first element of the text.extract() result for the meta-refresh
    marker and the following double quote.
    """
    response = self.request(media["remote_media_link"])
    return text.extract(
        response.text,
        '<meta http-equiv="Refresh" content="0; url=',
        '"',
    )[0]

View File

@@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://archive.whatisthisimnotgoodwithcomputers.com"""
from . import chan
class WitingwcThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Extractor for archive.whatisthisimnotgoodwithcomputers.com"""
    category = "whatisthisimnotgoodwithcomputers"
    root = "https://archive.whatisthisimnotgoodwithcomputers.com"
    # Capture groups: (1) board name, (2) thread number.
    pattern = [
        r"(?:https?://)?"
        r"archive\.whatisthisimnotgoodwithcomputers\.com"
        r"/([^/]+)/thread/(\d+)",
    ]
    test = [
        (
            "https://archive.whatisthisimnotgoodwithcomputers.com"
            "/ref/thread/1094/",
            {"url": "cf8f6d4b4950767d2131de308ebc96eec05b04f6"},
        ),
    ]

View File

@@ -0,0 +1,22 @@
# -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://archive.yeet.net/"""
from . import chan
class YeetThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Extractor for images from threads on archive.yeet.net"""
    category = "yeet"
    root = "https://archive.yeet.net"
    # Do not send this site's root as Referer header: doing so results
    # in "403 Forbidden" when accessing 4chan.
    referer = False
    # Capture groups: (1) board name, (2) thread number.
    pattern = [
        r"(?:https?://)?archive\.yeet\.net"
        r"/([^/]+)/thread/(\d+)",
    ]
    test = [
        (
            "https://archive.yeet.net/yeet/thread/359/",
            {"url": "ced64a1aadaafc4f359ab89d9f801050731803f1"},
        ),
    ]