initial commit

This commit is contained in:
Mike Fährmann
2014-10-12 21:56:44 +02:00
parent f738f3ae58
commit deef91eddc
18 changed files with 903 additions and 395 deletions

View File

@@ -0,0 +1,21 @@
from .common import BasicExtractor
from urllib.parse import unquote
import re
class Extractor(BasicExtractor):
    """Extractor for image threads on 8chan.co."""

    thread_url_fmt = "https://www.8chan.co/{0}/res/{1}.html"
    regex = r'>File: <a href="([^"]+)">([^<]+)\.[^<]+<.*?<span class="postfilename"( title="([^"]+)")?>([^<]+)<'

    def __init__(self, match, config):
        BasicExtractor.__init__(self, config)
        self.board, _, self.thread_id = match.group(1).split("/")
        self.category = "8chan"
        self.directory = "-".join((self.board, self.thread_id))

    def images(self):
        """Yield (url, filename) pairs for every file posted in the thread."""
        page = self.request(
            self.thread_url_fmt.format(self.board, self.thread_id)
        ).text
        for post in re.finditer(self.regex, page):
            path, prefix, fullname, name = post.group(1, 2, 4, 5)
            # prefer the full original filename ('title' attribute) when
            # the visible one was truncated by the site
            filename = prefix + "-" + unquote(fullname or name)
            yield "https://www.8chan.co" + path, filename

View File

@@ -0,0 +1,47 @@
import os
import sys
import re
import sqlite3
import importlib
class ExtractorFinder():
    """Map URLs to extractor modules via configured regular expressions."""

    def __init__(self, config):
        self.config = config
        self.match_list = list()
        # optional sqlite database providing additional (regex, category) pairs
        if "database" in config["general"]:
            path = os.path.expanduser(config["general"]["database"])
            conn = sqlite3.connect(path)
            self.load_from_database(conn)
        self.load_from_config(config)

    def match(self, url):
        """Return an Extractor instance for 'url', or None if nothing matches."""
        for category, regex in self.match_list:
            match = regex.match(url)
            if match:
                # lazily import the extractor module named after the category
                module = importlib.import_module("."+category, __package__)
                return module.Extractor(match, self.config)
        return None

    def load_from_database(self, db):
        """Register every (regex, category) pair stored in the database."""
        query = (
            "SELECT regex.re, category.name "
            "FROM regex JOIN category "
            "ON regex.category_id = category.id"
        )
        for row in db.execute(query):
            self.add_match(row[1], row[0])

    def load_from_config(self, conf):
        """Register every 'regex*' option found in any config section."""
        for category in conf:
            for key, value in conf[category].items():
                if key.startswith("regex"):
                    self.add_match(category, value)

    def add_match(self, category, regex):
        """Compile 'regex' and register it for 'category'; warn on bad patterns.

        BUGFIX: the original bare 'except:' swallowed every exception,
        including KeyboardInterrupt; only compilation errors are caught now.
        """
        try:
            self.match_list.append( (category, re.compile(regex)) )
        except (re.error, TypeError):
            # re.error: invalid pattern; TypeError: non-string config value
            print("[Warning] [{0}] failed to compile regular expression '{1}'"
                  .format(category, regex))

View File

@@ -0,0 +1,22 @@
from .common import AsyncExtractor
from ..util import filename_from_url
from urllib.parse import unquote
class Extractor(AsyncExtractor):
    """Extractor for manga chapters on bato.to."""

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.url = "https://bato.to/read/_/" + match.group(1) + "/_/1"
        self.category = "batoto"
        self.directory = match.group(1)

    def images(self):
        """Yield (url, filename) for every page, following the 'next' links."""
        page_url = self.url
        while page_url:
            html = self.request(page_url).text
            # scraping starts at the full-size image container
            anchor = html.find('<div id="full_image"')
            page_url, anchor = self.extract(html, '<a href="', '"', anchor)
            image_url, anchor = self.extract(html, 'src="', '"', anchor)
            yield image_url, unquote(filename_from_url(image_url))

View File

@@ -0,0 +1,67 @@
import queue
import threading
import requests
from ..util import safe_request
class BasicExtractor():
    """Base class for extractors: an HTTP session plus text-scraping helpers."""

    def __init__(self, config):
        self.session = requests.Session()
        self.category = ""
        self.directory = ""

    def __iter__(self):
        # iterating an extractor yields its (url, filename) pairs
        return self.images()

    def request(self, url, *args, **kwargs):
        """Perform an HTTP request through the retrying helper."""
        return safe_request(self.session, url, *args, **kwargs)

    def enable_useragent(self):
        """Set a browser-like User-Agent for sites that block default clients."""
        self.session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0"

    @staticmethod
    def extract(txt, begin, end, pos=0):
        """Return (text between 'begin' and 'end', position after 'end').

        Searching starts at 'pos'. Returns (None, pos) when either
        delimiter is missing.

        BUGFIX: catch only ValueError (what str.index raises on a missing
        substring); the original bare 'except:' hid real programming errors.
        """
        try:
            first = txt.index(begin, pos) + len(begin)
            last = txt.index(end, first)
            return txt[first:last], last+len(end)
        except ValueError:
            return None, pos

    @staticmethod
    def extract_all(txt, begin, end, pos=0):
        """Like extract(), but include both delimiters in the result."""
        try:
            first = txt.index(begin, pos)
            last = txt.index(end, first + len(begin)) + len(end)
            return txt[first:last], last
        except ValueError:
            return None, pos
class AsyncExtractor(BasicExtractor):
    """Extractor that produces its images on a background thread.

    A bounded queue decouples the producer thread (async_images) from the
    consumer iterating over this object.
    """

    def __init__(self, config):
        super().__init__(config)
        # maxsize=5 bounds memory use and throttles the producer thread
        self.__queue = queue.Queue(maxsize=5)
        self.__thread = threading.Thread(target=self.async_images, daemon=True)

    def __iter__(self):
        get = self.__queue.get
        done = self.__queue.task_done
        self.__thread.start()
        while True:
            task = get()
            if task is None:
                # sentinel: producer finished (or failed)
                return
            yield task
            done()

    def async_images(self):
        """Worker: feed images() results into the queue; None terminates.

        BUGFIX: catch Exception instead of a bare 'except:' so SystemExit
        and KeyboardInterrupt are not swallowed inside the worker thread.
        """
        put = self.__queue.put
        try:
            for task in self.images():
                put(task)
        except Exception:
            # report the failure instead of dying silently, then let the
            # consumer terminate via the sentinel below
            import traceback
            print(traceback.format_exc())
        put(None)

View File

@@ -0,0 +1,68 @@
from .common import BasicExtractor
from ..util import unescape
import time
import random
import json
class Extractor(BasicExtractor):
    """Extractor for image galleries on exhentai.org."""

    api_url = "http://exhentai.org/api.php"
    name_fmt = "{}_{:>04}_{}_{}"

    def __init__(self, match, config):
        BasicExtractor.__init__(self, config)
        self.url = match.group(0)
        self.gid, self.token = match.group(1).split("/")
        self.category = "exhentai"
        self.directory = self.gid
        # exhentai requires member cookies to serve gallery content
        self.session.cookies.update(config["exhentai-cookies"])

    def images(self):
        """Yield (url, filename) for every image of the gallery.

        The first page is scraped from HTML; subsequent pages use the
        JSON 'showpage' API.
        """
        e = self.extract
        # get gallery page
        text = self.request(self.url).text
        # get first image page
        url, pos = self.extract_all(text, "http://exhentai.org/s/", "-1")
        text = self.request(url).text
        # extract information
        _     , pos = e(text, '<div id="i3"><a onclick="return load_image(', '')
        imgkey, pos = e(text, "'", "'", pos)
        url   , pos = e(text, '<img id="img" src="', '"', pos)
        name  , pos = e(text, '<div id="i4"><div>', ' :: ', pos)
        orgurl, pos = e(text, 'http://exhentai.org/fullimg.php', '"', pos)
        if orgurl:
            # prefer the full-size original when one is offered
            url = "http://exhentai.org/fullimg.php" + unescape(orgurl)
        # BUGFIX: name_fmt is a class attribute and must be accessed through
        # self; the bare name raised NameError in the original
        yield url, self.name_fmt.format(self.gid, 1, imgkey, name)
        gid     , pos = e(text, 'var gid=' , ';', pos)
        startkey, pos = e(text, 'var startkey="', '";', pos)
        showkey , pos = e(text, 'var showkey="' , '";', pos)
        # use json-api for further pages
        request = {
            "method" : "showpage",
            "gid"    : int(gid),
            "page"   : 2,
            "imgkey" : imgkey,
            "showkey": showkey,
        }
        while True:
            # rate-limit API calls to avoid getting blocked
            time.sleep( random.uniform(2, 5) )
            info = json.loads(
                self.session.post(self.api_url, data=json.dumps(request)).text
            )
            imgkey, pos = e(info["i3"], "'", "'")
            url   , pos = e(info["i3"], '<img id="img" src="', '"', pos)
            # NOTE(review): the key "i" looks suspicious -- possibly another
            # "iN" field of the API response was intended; verify against
            # the actual JSON payload before changing it
            name  , pos = e(info["i" ], '<div>', ' :: ')
            orgurl, pos = e(info["i7"], '<a href="', '"')
            if orgurl:
                url = unescape(orgurl)
            yield url, self.name_fmt.format(gid, request["page"], imgkey, name)
            if request["imgkey"] == imgkey:
                # the API returned the same image key again -> end of gallery
                return
            request["imgkey"] = imgkey
            request["page"] += 1

View File

@@ -0,0 +1,50 @@
from .common import AsyncExtractor
from ..util import filename_from_url
import xml.etree.ElementTree as ET
class BooruExtractor(AsyncExtractor):
    """Common base for *booru sites exposing an XML post-listing API."""

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.tags = match.group(1)
        self.category = "booru"
        self.params = {"tags": self.tags}
        self.page = "page"
        self.directory = self.tags.replace("/", "_")

    def images(self):
        """Yield (url, filename) for every post matching the tag search."""
        self.update_page(reset=True)
        while True:
            response = self.request(self.api_url, verify=True,
                                    params=self.params)
            root = ET.fromstring(response.text)
            if len(root) == 0:
                # an empty page means there are no more results
                return
            for post in root:
                file_url = post.attrib["file_url"]
                yield file_url, "{}_{}".format(
                    self.category, filename_from_url(file_url))
            self.update_page()

    def update_page(self, reset=False):
        # Override this method in derived classes if necessary.
        # It is usually enough to adjust the 'page' attribute
        if reset:
            self.params[self.page] = 1
        else:
            self.params[self.page] += 1
class Extractor(BooruExtractor):
    """Extractor for gelbooru.com using its 'dapi' index API."""

    def __init__(self, match, config):
        BooruExtractor.__init__(self, match, config)
        self.category = "gelbooru"
        self.api_url = "http://gelbooru.com/"
        self.params = {"page":"dapi", "s":"post", "q":"index", "tags":self.tags}

    def update_page(self, reset=False):
        # gelbooru pages via a zero-based 'pid' parameter
        if reset:
            self.params["pid"] = 0
        else:
            self.params["pid"] += 1

View File

@@ -0,0 +1,150 @@
from .common import AsyncExtractor
from ..util import safe_request
import re
import csv
import requests
class Extractor(AsyncExtractor):
    """Extractor for all illustrations of a pixiv member."""

    member_url = "http://www.pixiv.net/member_illust.php"
    illust_url = "http://www.pixiv.net/member_illust.php?mode=medium"

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.member_id = match.group(1)
        self.category = "pixiv"
        self.directory = self.member_id
        # pixiv requires a logged-in session cookie and a Referer header
        self.session.cookies.update(config["pixiv-cookies"])
        self.session.headers.update({"Referer": "http://www.pixiv.net/"})
        self.api = PixivAPI(config["pixiv-cookies"]["PHPSESSID"])

    def images(self):
        """Yield (url, filename) for every work of the member.

        Handles single images, multi-page "manga" posts, and "ugoira"
        animations (yielded as a zip archive plus a frame-delay list).
        Two URL layouts exist; which one applies is decided by illust-id
        thresholds below.
        """
        sname_fmt = "pixiv_{1}_{0}.{2}"
        mname_fmt = "pixiv_{1}_{0}_p{num:02}.{2}"
        singl_v1_fmt = "http://i1.pixiv.net/img{4:>02}/img/{24}/{0}.{2}"
        manga_v1_fmt = "http://i1.pixiv.net/img{4:>02}/img/{24}/{0}{big}_p{num}.{2}"
        singl_v2_fmt = "http://i1.pixiv.net/img-original/img/{date}/{0}_p0.{2}"
        manga_v2_fmt = "http://i1.pixiv.net/img-original/img/{date}/{0}_p{num}.{2}"
        date = ""
        big = ""
        for img in self.image_ids():
            # 'data' is one CSV row from the old mobile API; fields are
            # addressed positionally (data[2] extension, data[13] tags,
            # data[19] page count -- presumably; verify against the API)
            data = self.api.request(img)
            # debug
            # for i, value in enumerate(data):
            #     print("{:02}: {}".format(i, value))
            # return
            # debug end
            if "うごイラ" in data[13]:
                # ugoira / animations ("ugoira" appears in the tag field)
                try:
                    url, framelist = self.parse_ugoira(img)
                    # override the extension field for the two yielded names
                    data[2] = "zip"
                    yield (url, sname_fmt.format(*data))
                    data[2] = "txt"
                    yield (framelist, sname_fmt.format(*data))
                    continue
                except:
                    # best-effort: fall through and treat it as plain images
                    print("[Warning] failed to get ugoira url; trying fallback")
            # images
            if img > 46270949:
                # NOTE(review): ids above this threshold appear to live on the
                # 'img-original' server addressed by a date path -- confirm
                date = data[6][45:64]
                url_s_fmt = singl_v2_fmt
                url_m_fmt = manga_v2_fmt
            else:
                # older manga posts use a '_big' suffix past this id -- confirm
                big = "_big" if img > 11319935 else ""
                url_s_fmt = singl_v1_fmt
                url_m_fmt = manga_v1_fmt
            if not data[19]:
                # empty page count -> single image
                yield (url_s_fmt.format(*data, date=date), sname_fmt.format(*data))
            else:
                # multi-page post: one URL per page
                for i in range(0, int(data[19])):
                    yield (url_m_fmt.format(*data, num=i, date=date, big=big),
                           mname_fmt.format(*data, num=i))

    def image_ids(self):
        """generator -- yield all image ids"""
        needle = '<li class="image-item"><a href="/member_illust.php?mode=medium&amp;illust_id='
        params = {"id": self.member_id, "p": 1}
        while True:
            text = self.request(self.member_url, params=params).text
            end = 0
            found = 0
            while True:
                pos = text.find(needle, end)
                if pos == -1:
                    break
                pos += len(needle)
                end = text.find('"', pos)
                found += 1
                yield int(text[pos:end])
            if found != 20:
                # a full listing page holds 20 works; fewer means last page
                return
            params["p"] += 1

    def parse_ugoira(self, illust_id):
        """Return (zip-url, frame list) for an ugoira animation."""
        # get illust page
        text = self.request(
            self.illust_url,
            params={"illust_id": illust_id},
        ).text
        # parse page
        url   , pos = self.extract(text, 'ugokuIllustFullscreenData = {"src":"', '"')
        frames, pos = self.extract(text, '"frames":[', ']', pos)
        # fix url (undo JSON-escaped slashes)
        url = url.replace("\\/", "/")
        # build framelist: one "<filename> <delay>" line per frame
        framelist = "text://" + re.sub(
            r'\{"file":"([^"]+)","delay":(\d+)\},?',
            r'\1 \2\n',
            frames
        )
        return url, framelist
class PixivAPI():
    """Minimal client for pixiv's old mobile ('spapi') illust endpoint."""

    api_url = "http://spapi.pixiv.net/iphone/illust.php"

    def __init__(self, session_id):
        self.session = requests.Session()
        self.session.params["PHPSESSID"] = session_id

    def request(self, illust_id):
        """Fetch metadata for one illustration as a list of CSV fields."""
        while True:
            body = safe_request(
                self.session,
                self.api_url,
                params={"illust_id": illust_id}
            ).text
            # very short responses are error pages -> retry until valid
            if len(body) > 31:
                return next(csv.reader([body]))
# class FileDict(dict):
#
# def __init__(self, *args):
# super().__init__()
# self.re = re.compile(r"pixiv_\d+_(?P<id>\d+)(?P<extra>_p\d+)?\.[a-z]{3}")
# for arg in args:
# self.load_from(arg)
#
# def load_from(self, directory):
# match = self.re.match
# for file in os.listdir(directory):
# m = match(file)
# if m is None:
# continue
# val = True if m.group("extra") else False
# dict.__setitem__(self, m.group("id"), val)
#
# def __getitem__(self, key):
# return dict.get(self, key)

View File

@@ -0,0 +1,35 @@
from .common import AsyncExtractor
from ..util import filename_from_url
class Extractor(AsyncExtractor):
    """Extractor for tag searches on chan.sankakucomplex.com."""

    url = "https://chan.sankakucomplex.com/"

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.tags = match.group(1)
        self.category = "sankaku"
        self.directory = self.tags.replace("/", "_")
        # sankaku blocks the default client user-agent
        self.enable_useragent()

    def images(self):
        """Yield (url, filename) for every post matching the tag search.

        BUGFIX: removed leftover debug statements ('print(text); return')
        that made this generator print one page and yield nothing, and a
        per-image debug print. Also replaced the bare 'except' that relied
        on a TypeError from concatenating None to terminate the inner loop
        with an explicit None check; 'found' is now only counted for real
        hits, so an empty result page terminates instead of looping forever.
        """
        needle = ' src="//c.sankakucomplex.com/data/preview/'
        params = {"tags": self.tags, "page": 1}
        while True:
            text = self.request(self.url, params=params).text
            pos = 0
            found = 0
            while True:
                url, pos = self.extract(text, needle, '"', pos)
                if url is None:
                    # no further previews on this page
                    break
                found += 1
                yield ("https://cs.sankakucomplex.com/data/" + url,
                       "%s_%s" % (self.category, filename_from_url(url)))
            if found == 0:
                # empty page -> end of results
                return
            params["page"] += 1