initial commit

This commit is contained in:
Mike Fährmann
2014-10-12 21:56:44 +02:00
parent f738f3ae58
commit deef91eddc
18 changed files with 903 additions and 395 deletions

44
gallery_dl/__init__.py Normal file
View File

@@ -0,0 +1,44 @@
# Package metadata for gallery-dl.
__author__     = "Mike Fährmann"
__copyright__  = "Copyright 2014, Mike Fährmann"
__license__    = "GPLv3"
__version__    = "0.4"
__maintainer__ = "Mike Fährmann"
__email__      = "mike_faehrmann@web.de"
import os
import sys
import argparse
import configparser
from . import extractor
from . import downloader
def parse_cmdline_options(argv=None):
    """Parse command line options.

    Args:
        argv: optional list of argument strings; defaults to
            sys.argv[1:] (passing a list makes the function testable
            without touching the real command line).

    Returns:
        argparse.Namespace with 'config', 'dest' and 'urls'.
    """
    p = argparse.ArgumentParser(
        description='Download images from various sources')
    p.add_argument("-c", "--config",
        default="~/.config/gallery/config", metavar="CFG",
        help="alternate configuration file")
    p.add_argument("-d", "--dest",
        metavar="DEST", help="destination directory")
    p.add_argument("urls", nargs="+",
        metavar="URL", help="url to download images from")
    return p.parse_args(argv)
def parse_config_file(path):
    """Read and return the configuration file at 'path'.

    '~' in the path is expanded. Interpolation is disabled so '%'
    characters need no escaping, and option names keep their case.
    """
    parser = configparser.ConfigParser(interpolation=None)
    # default optionxform lower-cases option names; keep them verbatim
    parser.optionxform = lambda name: name
    parser.read(os.path.expanduser(path))
    return parser
def main():
    """Program entry point: parse options, then download every URL."""
    options = parse_cmdline_options()
    config = parse_config_file(options.config)
    finder = extractor.ExtractorFinder(config)
    manager = downloader.DownloadManager(options, config)
    for url in options.urls:
        manager.add(finder.match(url))

View File

@@ -0,0 +1,54 @@
import os
import sys
import importlib
class DownloadManager():
    """Dispatches extractor results to protocol-specific downloaders."""

    def __init__(self, opts, conf):
        self.opts = opts
        self.conf = conf
        # cache of Downloader classes, keyed by url scheme
        self.downloaders = {}

    def add(self, extr):
        """Download every (url, filename) pair produced by 'extr'."""
        # destination priority: command line > category section > general
        if self.opts.dest:
            base = self.opts.dest
        elif extr.category in self.conf:
            base = self.conf[extr.category].get("destination", "/tmp/")
        else:
            base = self.conf["general"].get("destination", "/tmp/")
        directory = os.path.join(base, extr.category, extr.directory)
        os.makedirs(directory, exist_ok=True)
        for url, filename in extr:
            path = os.path.join(directory, filename)
            if os.path.exists(path):
                self.print_skip(path)
            else:
                dl = self.get_downloader(extr, url)
                self.print_start(path)
                tries = dl.download(url, path)
                self.print_success(path, tries)

    def get_downloader(self, extr, url):
        """Return a downloader instance matching the url's scheme.

        Unknown schemes (no '://' present) fall back to http.
        """
        sep = url.find("://")
        scheme = url[:sep] if sep != -1 else "http"
        try:
            cls = self.downloaders[scheme]
        except KeyError:
            # lazily import the sibling module named after the scheme
            cls = importlib.import_module("." + scheme, __package__).Downloader
            self.downloaders[scheme] = cls
        return cls(extr)

    @staticmethod
    def print_start(path):
        # print the path without newline; completed later by print_success
        print(path, end="")
        sys.stdout.flush()

    @staticmethod
    def print_skip(path):
        # dimmed output for already-existing files
        print("\033[2m", path, "\033[0m", sep="")

    @staticmethod
    def print_success(path, tries):
        # green output; rewrite the line printed by print_start
        if tries == 0:
            print("\r", end="")
        print("\r\033[1;32m", path, "\033[0m", sep="")

View File

@@ -0,0 +1,21 @@
import os
class BasicDownloader():
    """Base class for downloaders.

    Subclasses must implement download_impl(url, file) and return the
    number of retries that were needed.
    """

    # maximum number of retry attempts available to subclasses
    max_tries = 5

    def download(self, url, path):
        """Download 'url' into a new file at 'path'.

        Returns the retry count reported by download_impl. If the
        download raises, the partially written file is removed before
        the exception propagates. (The original had an unreachable
        'file.close()' after the return; the 'with' statement already
        closes the file.)
        """
        with open(path, "wb") as file:
            try:
                return self.download_impl(url, file)
            except BaseException:
                # make sure to remove the file if the download failed
                os.unlink(path)
                raise

    @staticmethod
    def print_error(file, error, tries, max_tries=5):
        """Print a colored error message for a failed attempt."""
        if tries == 1 and hasattr(file, "name"):
            print("\r\033[1;31m", file.name, sep="")
        print("\033[0;31m[Error]\033[0m ", error, " (", tries, "/", max_tries, ")", sep="")

View File

@@ -0,0 +1,42 @@
from .common import BasicDownloader
import time
import requests
class Downloader(BasicDownloader):
    """Downloader for http:// and https:// urls."""

    def __init__(self, extr):
        BasicDownloader.__init__(self)
        # reuse the extractor's session (cookies, headers, ...)
        self.session = extr.session

    def download_impl(self, url, file):
        """Stream 'url' into 'file', retrying up to self.max_tries times.

        Returns the number of retries used. A 404 aborts immediately
        (returning max_tries); other failures sleep one second and
        retry, raising after the final attempt.
        """
        tries = 0
        while True:
            # try to connect to remote source
            try:
                response = self.session.get(url, stream=True, verify=True)
            except requests.exceptions.ConnectionError as e:
                tries += 1
                self.print_error(file, e, tries, self.max_tries)
                time.sleep(1)
                if tries == self.max_tries:
                    raise
                continue
            # reject error-status-codes
            if response.status_code != requests.codes.ok:
                tries += 1
                self.print_error(file, 'HTTP status "{} {}"'.format(
                    response.status_code, response.reason), tries, self.max_tries)
                if response.status_code == 404:
                    # not found -- retrying is pointless
                    return self.max_tries
                time.sleep(1)
                # bug fix: was a hard-coded 5, inconsistent with the
                # self.max_tries comparison used for connection errors
                if tries == self.max_tries:
                    response.raise_for_status()
                continue
            # everything ok -- proceed to download
            break
        for data in response.iter_content(16384):
            file.write(data)
        return tries

View File

@@ -0,0 +1 @@
from .http import Downloader

View File

@@ -0,0 +1,10 @@
from .common import BasicDownloader
class Downloader(BasicDownloader):
    """Pseudo-downloader that stores "text://..." payloads on disk."""

    def __init__(self, extr):
        BasicDownloader.__init__(self)

    def download_impl(self, url, file):
        # drop the 7-character "text://" scheme and write the rest
        payload = url[7:]
        file.write(bytes(payload, "utf-8"))
        return 0

View File

@@ -0,0 +1,21 @@
from .common import BasicExtractor
from urllib.parse import unquote
import re
class Extractor(BasicExtractor):
    """Extractor for image threads on 8chan.co."""

    thread_url_fmt = "https://www.8chan.co/{0}/res/{1}.html"
    regex = r'>File: <a href="([^"]+)">([^<]+)\.[^<]+<.*?<span class="postfilename"( title="([^"]+)")?>([^<]+)<'

    def __init__(self, match, config):
        BasicExtractor.__init__(self, config)
        self.board, _, self.thread_id = match.group(1).split("/")
        self.category = "8chan"
        self.directory = self.board + "-" + self.thread_id

    def images(self):
        """Yield (url, filename) for every file posted in the thread."""
        page_url = self.thread_url_fmt.format(self.board, self.thread_id)
        page = self.request(page_url).text
        for post in re.finditer(self.regex, page):
            path, prefix, title, shown_name = post.group(1, 2, 4, 5)
            # the title attribute holds the full name when it was truncated
            filename = prefix + "-" + unquote(title or shown_name)
            yield ("https://www.8chan.co" + path, filename)

View File

@@ -0,0 +1,47 @@
import os
import sys
import re
import sqlite3
import importlib
class ExtractorFinder():
    """Maps urls to extractor modules via configured regexes."""

    def __init__(self, config):
        self.config = config
        self.match_list = []
        general = config["general"]
        if "database" in general:
            # optional sqlite database with additional (regex, category) rows
            connection = sqlite3.connect(os.path.expanduser(general["database"]))
            self.load_from_database(connection)
        self.load_from_config(config)

    def match(self, url):
        """Return an Extractor instance for 'url', or None if no regex matches."""
        for category, pattern in self.match_list:
            result = pattern.match(url)
            if result:
                module = importlib.import_module("." + category, __package__)
                return module.Extractor(result, self.config)
        return None

    def load_from_database(self, db):
        """Register every (regex, category) pair stored in the database."""
        query = (
            "SELECT regex.re, category.name "
            "FROM regex JOIN category "
            "ON regex.category_id = category.id"
        )
        for regex, category in db.execute(query):
            self.add_match(category, regex)

    def load_from_config(self, conf):
        """Register regexes from options whose names start with 'regex'."""
        for category in conf:
            for key, value in conf[category].items():
                if key.startswith("regex"):
                    self.add_match(category, value)

    def add_match(self, category, regex):
        """Compile 'regex' and register it; warn (don't raise) on bad patterns."""
        try:
            self.match_list.append((category, re.compile(regex)))
        except:
            print("[Warning] [{0}] failed to compile regular expression '{1}'"
                  .format(category, regex))

View File

@@ -0,0 +1,22 @@
from .common import AsyncExtractor
from ..util import filename_from_url
from urllib.parse import unquote
class Extractor(AsyncExtractor):
    """Extractor for manga chapters on bato.to."""

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        chapter = match.group(1)
        self.url = "https://bato.to/read/_/" + chapter + "/_/1"
        self.category = "batoto"
        self.directory = chapter

    def images(self):
        """Yield (url, filename) for every page of the chapter."""
        page_url = self.url
        while page_url:
            page = self.request(page_url).text
            # the reader wraps the image in <div id="full_image">;
            # the surrounding <a href> points at the next page
            pos = page.find('<div id="full_image"')
            page_url, pos = self.extract(page, '<a href="', '"', pos)
            image_url, pos = self.extract(page, 'src="', '"', pos)
            yield image_url, unquote(filename_from_url(image_url))

View File

@@ -0,0 +1,67 @@
import queue
import threading
import requests
from ..util import safe_request
class BasicExtractor():
    """Common functionality shared by all extractors."""

    def __init__(self, config):
        self.session = requests.Session()
        self.category = ""
        self.directory = ""

    def __iter__(self):
        # iterating an extractor yields its (url, filename) pairs
        return self.images()

    def request(self, url, *args, **kwargs):
        """Perform a GET request with automatic retries."""
        return safe_request(self.session, url, *args, **kwargs)

    def enable_useragent(self):
        """Set a browser-like User-Agent header on the session."""
        self.session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0"

    @staticmethod
    def extract(txt, begin, end, pos=0):
        """Return the text between 'begin' and 'end', searching from 'pos'.

        Returns (substring, position-after-end-marker) on success and
        (None, pos) when either marker is missing.
        """
        try:
            start = txt.index(begin, pos) + len(begin)
            stop = txt.index(end, start)
            return txt[start:stop], stop + len(end)
        except:
            return None, pos

    @staticmethod
    def extract_all(txt, begin, end, pos=0):
        """Like extract(), but the result includes both markers."""
        try:
            start = txt.index(begin, pos)
            stop = txt.index(end, start + len(begin)) + len(end)
            return txt[start:stop], stop
        except:
            return None, pos
class AsyncExtractor(BasicExtractor):
    """Extractor that produces its images in a background thread."""

    def __init__(self, config):
        super().__init__(config)
        # bounded queue keeps the worker at most 5 items ahead
        self.__queue = queue.Queue(maxsize=5)
        self.__thread = threading.Thread(target=self.async_images, daemon=True)

    def __iter__(self):
        # consume items produced by the worker; None marks the end
        self.__thread.start()
        while True:
            item = self.__queue.get()
            if item is None:
                return
            yield item
            self.__queue.task_done()

    def async_images(self):
        """Worker: run images() and push its results into the queue."""
        try:
            for item in self.images():
                self.__queue.put(item)
        except:
            import traceback
            print(traceback.format_exc())
        # always signal completion, even after an error
        self.__queue.put(None)

View File

@@ -0,0 +1,68 @@
from .common import BasicExtractor
from ..util import unescape
import time
import random
import json
class Extractor(BasicExtractor):
    """Extractor for image galleries on exhentai.org."""

    api_url = "http://exhentai.org/api.php"
    name_fmt = "{}_{:>04}_{}_{}"

    def __init__(self, match, config):
        BasicExtractor.__init__(self, config)
        self.url = match.group(0)
        self.gid, self.token = match.group(1).split("/")
        self.category = "exhentai"
        self.directory = self.gid
        # exhentai requires login cookies taken from the configuration
        self.session.cookies.update(config["exhentai-cookies"])

    def images(self):
        """Yield (url, filename) for every image in the gallery.

        The first image is scraped from its html page; subsequent
        pages are fetched through the json api.
        """
        e = self.extract
        # get gallery page
        text = self.request(self.url).text
        # get first image page
        url, pos = self.extract_all(text, "http://exhentai.org/s/", "-1")
        text = self.request(url).text
        # extract information
        _     , pos = e(text, '<div id="i3"><a onclick="return load_image(', '')
        imgkey, pos = e(text, "'", "'", pos)
        url   , pos = e(text, '<img id="img" src="', '"', pos)
        name  , pos = e(text, '<div id="i4"><div>', ' :: ', pos)
        orgurl, pos = e(text, 'http://exhentai.org/fullimg.php', '"', pos)
        if orgurl:
            # prefer the full-size original when available
            url = "http://exhentai.org/fullimg.php" + unescape(orgurl)
        # bug fix: 'name_fmt' is a class attribute and must be accessed
        # through 'self'; the bare name raised a NameError here
        yield url, self.name_fmt.format(self.gid, 1, imgkey, name)
        gid     , pos = e(text, 'var gid=' , ';', pos)
        startkey, pos = e(text, 'var startkey="', '";', pos)
        showkey , pos = e(text, 'var showkey="' , '";', pos)
        # use json-api for further pages
        request = {
            "method" : "showpage",
            "gid"    : int(gid),
            "page"   : 2,
            "imgkey" : imgkey,
            "showkey": showkey,
        }
        while True:
            # random delay to avoid hammering the api
            time.sleep(random.uniform(2, 5))
            info = json.loads(
                self.session.post(self.api_url, data=json.dumps(request)).text
            )
            imgkey, pos = e(info["i3"], "'", "'")
            url   , pos = e(info["i3"], '<img id="img" src="', '"', pos)
            # NOTE(review): 'info["i"]' looks suspicious -- the other
            # fields use keys like "i3"/"i7"; confirm against the api
            name  , pos = e(info["i" ], '<div>', ' :: ')
            orgurl, pos = e(info["i7"], '<a href="', '"')
            if orgurl:
                url = unescape(orgurl)
            # bug fix: same 'self.name_fmt' NameError as above
            yield url, self.name_fmt.format(gid, request["page"], imgkey, name)
            # the api reports the same imgkey again on the last page
            if request["imgkey"] == imgkey:
                return
            request["imgkey"] = imgkey
            request["page"] += 1

View File

@@ -0,0 +1,50 @@
from .common import AsyncExtractor
from ..util import filename_from_url
import xml.etree.ElementTree as ET
class BooruExtractor(AsyncExtractor):
    """Base extractor for *booru-style XML apis."""

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.tags = match.group(1)
        self.category = "booru"
        self.params = {"tags": self.tags}
        self.page = "page"
        self.directory = self.tags.replace("/", "_")

    def images(self):
        """Yield (url, filename) for every post matching the tags."""
        self.update_page(reset=True)
        while True:
            response = self.request(self.api_url, verify=True, params=self.params)
            root = ET.fromstring(response.text)
            if len(root) == 0:
                # an empty page means there are no more results
                return
            for post in root:
                file_url = post.attrib["file_url"]
                yield file_url, "{}_{}".format(self.category, filename_from_url(file_url))
            self.update_page()

    def update_page(self, reset=False):
        # Override this method in derived classes if necessary.
        # It is usually enough to adjust the 'page' attribute
        if reset is False:
            self.params[self.page] += 1
        else:
            self.params[self.page] = 1
class Extractor(BooruExtractor):
    """Extractor for images from gelbooru.com."""

    def __init__(self, match, config):
        BooruExtractor.__init__(self, match, config)
        self.category = "gelbooru"
        self.api_url = "http://gelbooru.com/"
        self.params = {"page": "dapi", "s": "post", "q": "index", "tags": self.tags}

    def update_page(self, reset=False):
        # gelbooru counts pages with a zero-based 'pid' parameter
        self.params["pid"] = 0 if reset else self.params["pid"] + 1

View File

@@ -0,0 +1,150 @@
from .common import AsyncExtractor
from ..util import safe_request
import re
import csv
import requests
class Extractor(AsyncExtractor):
    """Extractor for all illustrations of a pixiv member."""

    member_url = "http://www.pixiv.net/member_illust.php"
    illust_url = "http://www.pixiv.net/member_illust.php?mode=medium"

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.member_id = match.group(1)
        self.category = "pixiv"
        self.directory = self.member_id
        # a logged-in session cookie is required for most content
        self.session.cookies.update(config["pixiv-cookies"])
        self.session.headers.update({"Referer": "http://www.pixiv.net/"})
        self.api = PixivAPI(config["pixiv-cookies"]["PHPSESSID"])

    def images(self):
        """Yield (url, filename) pairs for every illustration.

        'data' is the raw csv record returned by PixivAPI.request();
        the numeric indices below are positions in that record
        (presumably: 0=illust id, 2=file extension, 13=tags,
        19=page count -- TODO confirm against the api).
        """
        sname_fmt = "pixiv_{1}_{0}.{2}"
        mname_fmt = "pixiv_{1}_{0}_p{num:02}.{2}"
        # v1 urls for old illust-ids, v2 ("img-original") for new ones;
        # see the id thresholds below
        singl_v1_fmt = "http://i1.pixiv.net/img{4:>02}/img/{24}/{0}.{2}"
        manga_v1_fmt = "http://i1.pixiv.net/img{4:>02}/img/{24}/{0}{big}_p{num}.{2}"
        singl_v2_fmt = "http://i1.pixiv.net/img-original/img/{date}/{0}_p0.{2}"
        manga_v2_fmt = "http://i1.pixiv.net/img-original/img/{date}/{0}_p{num}.{2}"
        date = ""
        big = ""
        for img in self.image_ids():
            data = self.api.request(img)
            # debug
            # for i, value in enumerate(data):
            #     print("{:02}: {}".format(i, value))
            # return
            # debug end
            if "うごイラ" in data[13]:
                # ugoira / animations: download the zip archive plus a
                # text file listing the frame delays
                try:
                    url, framelist = self.parse_ugoira(img)
                    data[2] = "zip"
                    yield (url, sname_fmt.format(*data))
                    data[2] = "txt"
                    yield (framelist, sname_fmt.format(*data))
                    continue
                except:
                    print("[Warning] failed to get ugoira url; trying fallback")
            # images
            if img > 46270949:
                # newer illust-ids use the v2 url scheme; the date path
                # is sliced from a url in field 6 (threshold and slice
                # offsets look empirical -- TODO confirm)
                date = data[6][45:64]
                url_s_fmt = singl_v2_fmt
                url_m_fmt = manga_v2_fmt
            else:
                # presumably ids above this threshold use "_big" in
                # manga filenames -- TODO confirm
                big = "_big" if img > 11319935 else ""
                url_s_fmt = singl_v1_fmt
                url_m_fmt = manga_v1_fmt
            if not data[19]:
                # single image
                yield (url_s_fmt.format(*data, date=date), sname_fmt.format(*data))
            else:
                # manga: one image per page
                for i in range(0, int(data[19])):
                    yield (url_m_fmt.format(*data, num=i, date=date, big=big),
                           mname_fmt.format(*data, num=i))

    def image_ids(self):
        """generator -- yield all image ids"""
        needle = '<li class="image-item"><a href="/member_illust.php?mode=medium&amp;illust_id='
        params = {"id": self.member_id, "p": 1}
        while True:
            text = self.request(self.member_url, params=params).text
            end = 0
            found = 0
            while True:
                pos = text.find(needle, end)
                if pos == -1:
                    break
                pos += len(needle)
                end = text.find('"', pos)
                found += 1
                yield int(text[pos:end])
            # fewer than 20 hits means this was the last page
            if found != 20:
                return
            params["p"] += 1

    def parse_ugoira(self, illust_id):
        """Return (zip-url, "text://"-framelist) for an ugoira animation."""
        # get illust page
        text = self.request(
            self.illust_url,
            params={"illust_id": illust_id},
        ).text
        # parse page
        url   , pos = self.extract(text, 'ugokuIllustFullscreenData = {"src":"', '"')
        frames, pos = self.extract(text, '"frames":[', ']', pos)
        # fix url (json-escaped slashes)
        url = url.replace("\\/", "/")
        # build framelist; the "text://" scheme is handled by the text downloader
        framelist = "text://" + re.sub(
            r'\{"file":"([^"]+)","delay":(\d+)\},?',
            r'\1 \2\n',
            frames
        )
        return url, framelist
class PixivAPI():
    """Minimal client for pixiv's csv-based iphone api."""

    api_url = "http://spapi.pixiv.net/iphone/illust.php"

    def __init__(self, session_id):
        self.session = requests.Session()
        # authenticate by reusing the PHPSESSID cookie value
        self.session.params["PHPSESSID"] = session_id

    def request(self, illust_id):
        """Return the csv record for 'illust_id' as a list of fields."""
        while True:
            response = safe_request(
                self.session,
                self.api_url,
                params={"illust_id": illust_id}
            )
            # very short responses indicate a temporary failure -- retry
            if len(response.text) > 31:
                return next(csv.reader([response.text]))
# class FileDict(dict):
#
# def __init__(self, *args):
# super().__init__()
# self.re = re.compile(r"pixiv_\d+_(?P<id>\d+)(?P<extra>_p\d+)?\.[a-z]{3}")
# for arg in args:
# self.load_from(arg)
#
# def load_from(self, directory):
# match = self.re.match
# for file in os.listdir(directory):
# m = match(file)
# if m is None:
# continue
# val = True if m.group("extra") else False
# dict.__setitem__(self, m.group("id"), val)
#
# def __getitem__(self, key):
# return dict.get(self, key)

View File

@@ -0,0 +1,35 @@
from .common import AsyncExtractor
from ..util import filename_from_url
class Extractor(AsyncExtractor):
    """Extractor for images from chan.sankakucomplex.com."""

    url = "https://chan.sankakucomplex.com/"

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.tags = match.group(1)
        self.category = "sankaku"
        self.directory = self.tags.replace("/", "_")
        # presumably sankaku rejects the default user-agent -- the
        # original enabled a browser-like one here
        self.enable_useragent()

    def images(self):
        """Yield (url, filename) for every image matching the tags.

        Bug fix: removed leftover debug code ('print(text); return'
        and a debug print in the loop) that made this generator
        terminate before yielding anything. The loop now also checks
        extract()'s None result explicitly instead of relying on the
        exception it previously caused further down, which could both
        miscount 'found' and loop forever on an empty result page.
        """
        needle = ' src="//c.sankakucomplex.com/data/preview/'
        params = {"tags": self.tags, "page": 1}
        while True:
            text = self.request(self.url, params=params).text
            pos = 0
            found = 0
            while True:
                url, pos = self.extract(text, needle, '"', pos)
                if url is None:
                    # no more preview images on this page
                    break
                found += 1
                yield ("https://cs.sankakucomplex.com/data/" + url,
                       "%s_%s" % (self.category, filename_from_url(url)))
            if found == 0:
                return
            params["page"] += 1

33
gallery_dl/util.py Normal file
View File

@@ -0,0 +1,33 @@
import time
import requests
import html.parser
def safe_request(session, url, *args, max_tries=5, **kwargs):
    """GET 'url' via 'session', retrying on connection and http errors.

    Args:
        session: a requests.Session (or compatible object with .get()).
        url: the url to fetch.
        max_tries: number of attempts before giving up (keyword-only;
            previously a hard-coded 5).
        *args, **kwargs: forwarded to session.get().

    Returns:
        The successful response object.

    Raises:
        requests.exceptions.ConnectionError or requests.HTTPError
        after 'max_tries' failed attempts.
    """
    tries = 0
    while True:
        # try to connect to remote source
        try:
            r = session.get(url, *args, **kwargs)
        except requests.exceptions.ConnectionError:
            tries += 1
            time.sleep(1)
            if tries == max_tries:
                raise
            continue
        # reject error-status-codes
        if r.status_code != requests.codes.ok:
            tries += 1
            time.sleep(1)
            if tries == max_tries:
                r.raise_for_status()
            continue
        # everything ok -- proceed to download
        return r
def filename_from_url(url):
    """Return the filename component of 'url' (the text after the last '/')."""
    return url.rpartition("/")[2]
# html.parser.HTMLParser.unescape was an undocumented method and was
# removed in Python 3.9; html.unescape (available since 3.4) is the
# supported replacement. 'html' is already bound by 'import html.parser'.
unescape = html.unescape