initial commit

This commit is contained in:
Mike Fährmann
2014-10-12 21:56:44 +02:00
parent f738f3ae58
commit deef91eddc
18 changed files with 903 additions and 395 deletions

44
gallery_dl/__init__.py Normal file
View File

@@ -0,0 +1,44 @@
# Package metadata for gallery-dl.
__author__     = "Mike Fährmann"
__copyright__  = "Copyright 2014, Mike Fährmann"
__license__    = "GPLv3"
__version__    = "0.4"
__maintainer__ = "Mike Fährmann"
__email__      = "mike_faehrmann@web.de"
import os
import sys
import argparse
import configparser
from . import extractor
from . import downloader
def parse_cmdline_options(argv=None):
    """Parse command line options.

    Args:
        argv: optional list of argument strings; defaults to
            sys.argv[1:] (passing a list makes the function testable
            without touching the real command line).

    Returns:
        argparse.Namespace with 'config', 'dest' and 'urls'.
    """
    p = argparse.ArgumentParser(
        description='Download images from various sources')
    p.add_argument("-c", "--config",
        default="~/.config/gallery/config", metavar="CFG",
        help="alternate configuration file")
    p.add_argument("-d", "--dest",
        metavar="DEST", help="destination directory")
    p.add_argument("urls", nargs="+",
        metavar="URL", help="url to download images from")
    return p.parse_args(argv)
def parse_config_file(path):
    """Read and return the configuration file at 'path'.

    '~' in the path is expanded. Interpolation is disabled so '%'
    characters need no escaping, and option names keep their case.
    """
    parser = configparser.ConfigParser(interpolation=None)
    # default optionxform lower-cases option names; keep them verbatim
    parser.optionxform = lambda name: name
    parser.read(os.path.expanduser(path))
    return parser
def main():
    """Program entry point: parse options, then download every URL."""
    options = parse_cmdline_options()
    config = parse_config_file(options.config)
    finder = extractor.ExtractorFinder(config)
    manager = downloader.DownloadManager(options, config)
    for url in options.urls:
        manager.add(finder.match(url))

View File

@@ -0,0 +1,54 @@
import os
import sys
import importlib
class DownloadManager():
    """Dispatches extractor results to protocol-specific downloaders."""

    def __init__(self, opts, conf):
        self.opts = opts
        self.conf = conf
        # cache of Downloader classes, keyed by url scheme
        self.downloaders = {}

    def add(self, extr):
        """Download every (url, filename) pair produced by 'extr'."""
        # destination priority: command line > category section > general
        if self.opts.dest:
            base = self.opts.dest
        elif extr.category in self.conf:
            base = self.conf[extr.category].get("destination", "/tmp/")
        else:
            base = self.conf["general"].get("destination", "/tmp/")
        directory = os.path.join(base, extr.category, extr.directory)
        os.makedirs(directory, exist_ok=True)
        for url, filename in extr:
            path = os.path.join(directory, filename)
            if os.path.exists(path):
                self.print_skip(path)
            else:
                dl = self.get_downloader(extr, url)
                self.print_start(path)
                tries = dl.download(url, path)
                self.print_success(path, tries)

    def get_downloader(self, extr, url):
        """Return a downloader instance matching the url's scheme.

        Unknown schemes (no '://' present) fall back to http.
        """
        sep = url.find("://")
        scheme = url[:sep] if sep != -1 else "http"
        try:
            cls = self.downloaders[scheme]
        except KeyError:
            # lazily import the sibling module named after the scheme
            cls = importlib.import_module("." + scheme, __package__).Downloader
            self.downloaders[scheme] = cls
        return cls(extr)

    @staticmethod
    def print_start(path):
        # print the path without newline; completed later by print_success
        print(path, end="")
        sys.stdout.flush()

    @staticmethod
    def print_skip(path):
        # dimmed output for already-existing files
        print("\033[2m", path, "\033[0m", sep="")

    @staticmethod
    def print_success(path, tries):
        # green output; rewrite the line printed by print_start
        if tries == 0:
            print("\r", end="")
        print("\r\033[1;32m", path, "\033[0m", sep="")

View File

@@ -0,0 +1,21 @@
import os
class BasicDownloader():
    """Base class for downloaders.

    Subclasses must implement download_impl(url, file) and return the
    number of retries that were needed.
    """

    # maximum number of retry attempts available to subclasses
    max_tries = 5

    def download(self, url, path):
        """Download 'url' into a new file at 'path'.

        Returns the retry count reported by download_impl. If the
        download raises, the partially written file is removed before
        the exception propagates. (The original had an unreachable
        'file.close()' after the return; the 'with' statement already
        closes the file.)
        """
        with open(path, "wb") as file:
            try:
                return self.download_impl(url, file)
            except BaseException:
                # make sure to remove the file if the download failed
                os.unlink(path)
                raise

    @staticmethod
    def print_error(file, error, tries, max_tries=5):
        """Print a colored error message for a failed attempt."""
        if tries == 1 and hasattr(file, "name"):
            print("\r\033[1;31m", file.name, sep="")
        print("\033[0;31m[Error]\033[0m ", error, " (", tries, "/", max_tries, ")", sep="")

View File

@@ -0,0 +1,42 @@
from .common import BasicDownloader
import time
import requests
class Downloader(BasicDownloader):
    """Downloader for http:// and https:// urls."""

    def __init__(self, extr):
        BasicDownloader.__init__(self)
        # reuse the extractor's session (cookies, headers, ...)
        self.session = extr.session

    def download_impl(self, url, file):
        """Stream 'url' into 'file', retrying up to self.max_tries times.

        Returns the number of retries used. A 404 aborts immediately
        (returning max_tries); other failures sleep one second and
        retry, raising after the final attempt.
        """
        tries = 0
        while True:
            # try to connect to remote source
            try:
                response = self.session.get(url, stream=True, verify=True)
            except requests.exceptions.ConnectionError as e:
                tries += 1
                self.print_error(file, e, tries, self.max_tries)
                time.sleep(1)
                if tries == self.max_tries:
                    raise
                continue
            # reject error-status-codes
            if response.status_code != requests.codes.ok:
                tries += 1
                self.print_error(file, 'HTTP status "{} {}"'.format(
                    response.status_code, response.reason), tries, self.max_tries)
                if response.status_code == 404:
                    # not found -- retrying is pointless
                    return self.max_tries
                time.sleep(1)
                # bug fix: was a hard-coded 5, inconsistent with the
                # self.max_tries comparison used for connection errors
                if tries == self.max_tries:
                    response.raise_for_status()
                continue
            # everything ok -- proceed to download
            break
        for data in response.iter_content(16384):
            file.write(data)
        return tries

View File

@@ -0,0 +1 @@
from .http import Downloader

View File

@@ -0,0 +1,10 @@
from .common import BasicDownloader
class Downloader(BasicDownloader):
    """Pseudo-downloader that stores "text://..." payloads on disk."""

    def __init__(self, extr):
        BasicDownloader.__init__(self)

    def download_impl(self, url, file):
        # drop the 7-character "text://" scheme and write the rest
        payload = url[7:]
        file.write(bytes(payload, "utf-8"))
        return 0

View File

@@ -0,0 +1,21 @@
from .common import BasicExtractor
from urllib.parse import unquote
import re
class Extractor(BasicExtractor):
    """Extractor for image threads on 8chan.co."""

    thread_url_fmt = "https://www.8chan.co/{0}/res/{1}.html"
    regex = r'>File: <a href="([^"]+)">([^<]+)\.[^<]+<.*?<span class="postfilename"( title="([^"]+)")?>([^<]+)<'

    def __init__(self, match, config):
        BasicExtractor.__init__(self, config)
        self.board, _, self.thread_id = match.group(1).split("/")
        self.category = "8chan"
        self.directory = self.board + "-" + self.thread_id

    def images(self):
        """Yield (url, filename) for every file posted in the thread."""
        page_url = self.thread_url_fmt.format(self.board, self.thread_id)
        page = self.request(page_url).text
        for post in re.finditer(self.regex, page):
            path, prefix, title, shown_name = post.group(1, 2, 4, 5)
            # the title attribute holds the full name when it was truncated
            filename = prefix + "-" + unquote(title or shown_name)
            yield ("https://www.8chan.co" + path, filename)

View File

@@ -0,0 +1,47 @@
import os
import sys
import re
import sqlite3
import importlib
class ExtractorFinder():
    """Maps urls to extractor modules via configured regexes."""

    def __init__(self, config):
        self.config = config
        self.match_list = []
        general = config["general"]
        if "database" in general:
            # optional sqlite database with additional (regex, category) rows
            connection = sqlite3.connect(os.path.expanduser(general["database"]))
            self.load_from_database(connection)
        self.load_from_config(config)

    def match(self, url):
        """Return an Extractor instance for 'url', or None if no regex matches."""
        for category, pattern in self.match_list:
            result = pattern.match(url)
            if result:
                module = importlib.import_module("." + category, __package__)
                return module.Extractor(result, self.config)
        return None

    def load_from_database(self, db):
        """Register every (regex, category) pair stored in the database."""
        query = (
            "SELECT regex.re, category.name "
            "FROM regex JOIN category "
            "ON regex.category_id = category.id"
        )
        for regex, category in db.execute(query):
            self.add_match(category, regex)

    def load_from_config(self, conf):
        """Register regexes from options whose names start with 'regex'."""
        for category in conf:
            for key, value in conf[category].items():
                if key.startswith("regex"):
                    self.add_match(category, value)

    def add_match(self, category, regex):
        """Compile 'regex' and register it; warn (don't raise) on bad patterns."""
        try:
            self.match_list.append((category, re.compile(regex)))
        except:
            print("[Warning] [{0}] failed to compile regular expression '{1}'"
                  .format(category, regex))

View File

@@ -0,0 +1,22 @@
from .common import AsyncExtractor
from ..util import filename_from_url
from urllib.parse import unquote
class Extractor(AsyncExtractor):
    """Extractor for manga chapters on bato.to."""

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        chapter = match.group(1)
        self.url = "https://bato.to/read/_/" + chapter + "/_/1"
        self.category = "batoto"
        self.directory = chapter

    def images(self):
        """Yield (url, filename) for every page of the chapter."""
        page_url = self.url
        while page_url:
            page = self.request(page_url).text
            # the reader wraps the image in <div id="full_image">;
            # the surrounding <a href> points at the next page
            pos = page.find('<div id="full_image"')
            page_url, pos = self.extract(page, '<a href="', '"', pos)
            image_url, pos = self.extract(page, 'src="', '"', pos)
            yield image_url, unquote(filename_from_url(image_url))

View File

@@ -0,0 +1,67 @@
import queue
import threading
import requests
from ..util import safe_request
class BasicExtractor():
    """Common functionality shared by all extractors."""

    def __init__(self, config):
        self.session = requests.Session()
        self.category = ""
        self.directory = ""

    def __iter__(self):
        # iterating an extractor yields its (url, filename) pairs
        return self.images()

    def request(self, url, *args, **kwargs):
        """Perform a GET request with automatic retries."""
        return safe_request(self.session, url, *args, **kwargs)

    def enable_useragent(self):
        """Set a browser-like User-Agent header on the session."""
        self.session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0"

    @staticmethod
    def extract(txt, begin, end, pos=0):
        """Return the text between 'begin' and 'end', searching from 'pos'.

        Returns (substring, position-after-end-marker) on success and
        (None, pos) when either marker is missing.
        """
        try:
            start = txt.index(begin, pos) + len(begin)
            stop = txt.index(end, start)
            return txt[start:stop], stop + len(end)
        except:
            return None, pos

    @staticmethod
    def extract_all(txt, begin, end, pos=0):
        """Like extract(), but the result includes both markers."""
        try:
            start = txt.index(begin, pos)
            stop = txt.index(end, start + len(begin)) + len(end)
            return txt[start:stop], stop
        except:
            return None, pos
class AsyncExtractor(BasicExtractor):
    """Extractor that produces its images in a background thread."""

    def __init__(self, config):
        super().__init__(config)
        # bounded queue keeps the worker at most 5 items ahead
        self.__queue = queue.Queue(maxsize=5)
        self.__thread = threading.Thread(target=self.async_images, daemon=True)

    def __iter__(self):
        # consume items produced by the worker; None marks the end
        self.__thread.start()
        while True:
            item = self.__queue.get()
            if item is None:
                return
            yield item
            self.__queue.task_done()

    def async_images(self):
        """Worker: run images() and push its results into the queue."""
        try:
            for item in self.images():
                self.__queue.put(item)
        except:
            import traceback
            print(traceback.format_exc())
        # always signal completion, even after an error
        self.__queue.put(None)

View File

@@ -0,0 +1,68 @@
from .common import BasicExtractor
from ..util import unescape
import time
import random
import json
class Extractor(BasicExtractor):
    """Extractor for image galleries on exhentai.org."""

    api_url = "http://exhentai.org/api.php"
    name_fmt = "{}_{:>04}_{}_{}"

    def __init__(self, match, config):
        BasicExtractor.__init__(self, config)
        self.url = match.group(0)
        self.gid, self.token = match.group(1).split("/")
        self.category = "exhentai"
        self.directory = self.gid
        # exhentai requires login cookies taken from the configuration
        self.session.cookies.update(config["exhentai-cookies"])

    def images(self):
        """Yield (url, filename) for every image in the gallery.

        The first image is scraped from its html page; subsequent
        pages are fetched through the json api.
        """
        e = self.extract
        # get gallery page
        text = self.request(self.url).text
        # get first image page
        url, pos = self.extract_all(text, "http://exhentai.org/s/", "-1")
        text = self.request(url).text
        # extract information
        _     , pos = e(text, '<div id="i3"><a onclick="return load_image(', '')
        imgkey, pos = e(text, "'", "'", pos)
        url   , pos = e(text, '<img id="img" src="', '"', pos)
        name  , pos = e(text, '<div id="i4"><div>', ' :: ', pos)
        orgurl, pos = e(text, 'http://exhentai.org/fullimg.php', '"', pos)
        if orgurl:
            # prefer the full-size original when available
            url = "http://exhentai.org/fullimg.php" + unescape(orgurl)
        # bug fix: 'name_fmt' is a class attribute and must be accessed
        # through 'self'; the bare name raised a NameError here
        yield url, self.name_fmt.format(self.gid, 1, imgkey, name)
        gid     , pos = e(text, 'var gid=' , ';', pos)
        startkey, pos = e(text, 'var startkey="', '";', pos)
        showkey , pos = e(text, 'var showkey="' , '";', pos)
        # use json-api for further pages
        request = {
            "method" : "showpage",
            "gid"    : int(gid),
            "page"   : 2,
            "imgkey" : imgkey,
            "showkey": showkey,
        }
        while True:
            # random delay to avoid hammering the api
            time.sleep(random.uniform(2, 5))
            info = json.loads(
                self.session.post(self.api_url, data=json.dumps(request)).text
            )
            imgkey, pos = e(info["i3"], "'", "'")
            url   , pos = e(info["i3"], '<img id="img" src="', '"', pos)
            # NOTE(review): 'info["i"]' looks suspicious -- the other
            # fields use keys like "i3"/"i7"; confirm against the api
            name  , pos = e(info["i" ], '<div>', ' :: ')
            orgurl, pos = e(info["i7"], '<a href="', '"')
            if orgurl:
                url = unescape(orgurl)
            # bug fix: same 'self.name_fmt' NameError as above
            yield url, self.name_fmt.format(gid, request["page"], imgkey, name)
            # the api reports the same imgkey again on the last page
            if request["imgkey"] == imgkey:
                return
            request["imgkey"] = imgkey
            request["page"] += 1

View File

@@ -0,0 +1,50 @@
from .common import AsyncExtractor
from ..util import filename_from_url
import xml.etree.ElementTree as ET
class BooruExtractor(AsyncExtractor):
    """Base extractor for *booru-style XML apis."""

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.tags = match.group(1)
        self.category = "booru"
        self.params = {"tags": self.tags}
        self.page = "page"
        self.directory = self.tags.replace("/", "_")

    def images(self):
        """Yield (url, filename) for every post matching the tags."""
        self.update_page(reset=True)
        while True:
            response = self.request(self.api_url, verify=True, params=self.params)
            root = ET.fromstring(response.text)
            if len(root) == 0:
                # an empty page means there are no more results
                return
            for post in root:
                file_url = post.attrib["file_url"]
                yield file_url, "{}_{}".format(self.category, filename_from_url(file_url))
            self.update_page()

    def update_page(self, reset=False):
        # Override this method in derived classes if necessary.
        # It is usually enough to adjust the 'page' attribute
        if reset is False:
            self.params[self.page] += 1
        else:
            self.params[self.page] = 1
class Extractor(BooruExtractor):
    """Extractor for images from gelbooru.com."""

    def __init__(self, match, config):
        BooruExtractor.__init__(self, match, config)
        self.category = "gelbooru"
        self.api_url = "http://gelbooru.com/"
        self.params = {"page": "dapi", "s": "post", "q": "index", "tags": self.tags}

    def update_page(self, reset=False):
        # gelbooru counts pages with a zero-based 'pid' parameter
        self.params["pid"] = 0 if reset else self.params["pid"] + 1

View File

@@ -0,0 +1,150 @@
from .common import AsyncExtractor
from ..util import safe_request
import re
import csv
import requests
class Extractor(AsyncExtractor):
    """Extractor for all illustrations of a pixiv member."""

    member_url = "http://www.pixiv.net/member_illust.php"
    illust_url = "http://www.pixiv.net/member_illust.php?mode=medium"

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.member_id = match.group(1)
        self.category = "pixiv"
        self.directory = self.member_id
        # a logged-in session cookie is required for most content
        self.session.cookies.update(config["pixiv-cookies"])
        self.session.headers.update({"Referer": "http://www.pixiv.net/"})
        self.api = PixivAPI(config["pixiv-cookies"]["PHPSESSID"])

    def images(self):
        """Yield (url, filename) pairs for every illustration.

        'data' is the raw csv record returned by PixivAPI.request();
        the numeric indices below are positions in that record
        (presumably: 0=illust id, 2=file extension, 13=tags,
        19=page count -- TODO confirm against the api).
        """
        sname_fmt = "pixiv_{1}_{0}.{2}"
        mname_fmt = "pixiv_{1}_{0}_p{num:02}.{2}"
        # v1 urls for old illust-ids, v2 ("img-original") for new ones;
        # see the id thresholds below
        singl_v1_fmt = "http://i1.pixiv.net/img{4:>02}/img/{24}/{0}.{2}"
        manga_v1_fmt = "http://i1.pixiv.net/img{4:>02}/img/{24}/{0}{big}_p{num}.{2}"
        singl_v2_fmt = "http://i1.pixiv.net/img-original/img/{date}/{0}_p0.{2}"
        manga_v2_fmt = "http://i1.pixiv.net/img-original/img/{date}/{0}_p{num}.{2}"
        date = ""
        big = ""
        for img in self.image_ids():
            data = self.api.request(img)
            # debug
            # for i, value in enumerate(data):
            #     print("{:02}: {}".format(i, value))
            # return
            # debug end
            if "うごイラ" in data[13]:
                # ugoira / animations: download the zip archive plus a
                # text file listing the frame delays
                try:
                    url, framelist = self.parse_ugoira(img)
                    data[2] = "zip"
                    yield (url, sname_fmt.format(*data))
                    data[2] = "txt"
                    yield (framelist, sname_fmt.format(*data))
                    continue
                except:
                    print("[Warning] failed to get ugoira url; trying fallback")
            # images
            if img > 46270949:
                # newer illust-ids use the v2 url scheme; the date path
                # is sliced from a url in field 6 (threshold and slice
                # offsets look empirical -- TODO confirm)
                date = data[6][45:64]
                url_s_fmt = singl_v2_fmt
                url_m_fmt = manga_v2_fmt
            else:
                # presumably ids above this threshold use "_big" in
                # manga filenames -- TODO confirm
                big = "_big" if img > 11319935 else ""
                url_s_fmt = singl_v1_fmt
                url_m_fmt = manga_v1_fmt
            if not data[19]:
                # single image
                yield (url_s_fmt.format(*data, date=date), sname_fmt.format(*data))
            else:
                # manga: one image per page
                for i in range(0, int(data[19])):
                    yield (url_m_fmt.format(*data, num=i, date=date, big=big),
                           mname_fmt.format(*data, num=i))

    def image_ids(self):
        """generator -- yield all image ids"""
        needle = '<li class="image-item"><a href="/member_illust.php?mode=medium&amp;illust_id='
        params = {"id": self.member_id, "p": 1}
        while True:
            text = self.request(self.member_url, params=params).text
            end = 0
            found = 0
            while True:
                pos = text.find(needle, end)
                if pos == -1:
                    break
                pos += len(needle)
                end = text.find('"', pos)
                found += 1
                yield int(text[pos:end])
            # fewer than 20 hits means this was the last page
            if found != 20:
                return
            params["p"] += 1

    def parse_ugoira(self, illust_id):
        """Return (zip-url, "text://"-framelist) for an ugoira animation."""
        # get illust page
        text = self.request(
            self.illust_url,
            params={"illust_id": illust_id},
        ).text
        # parse page
        url   , pos = self.extract(text, 'ugokuIllustFullscreenData = {"src":"', '"')
        frames, pos = self.extract(text, '"frames":[', ']', pos)
        # fix url (json-escaped slashes)
        url = url.replace("\\/", "/")
        # build framelist; the "text://" scheme is handled by the text downloader
        framelist = "text://" + re.sub(
            r'\{"file":"([^"]+)","delay":(\d+)\},?',
            r'\1 \2\n',
            frames
        )
        return url, framelist
class PixivAPI():
    """Minimal client for pixiv's csv-based iphone api."""

    api_url = "http://spapi.pixiv.net/iphone/illust.php"

    def __init__(self, session_id):
        self.session = requests.Session()
        # authenticate by reusing the PHPSESSID cookie value
        self.session.params["PHPSESSID"] = session_id

    def request(self, illust_id):
        """Return the csv record for 'illust_id' as a list of fields."""
        while True:
            response = safe_request(
                self.session,
                self.api_url,
                params={"illust_id": illust_id}
            )
            # very short responses indicate a temporary failure -- retry
            if len(response.text) > 31:
                return next(csv.reader([response.text]))
# class FileDict(dict):
#
# def __init__(self, *args):
# super().__init__()
# self.re = re.compile(r"pixiv_\d+_(?P<id>\d+)(?P<extra>_p\d+)?\.[a-z]{3}")
# for arg in args:
# self.load_from(arg)
#
# def load_from(self, directory):
# match = self.re.match
# for file in os.listdir(directory):
# m = match(file)
# if m is None:
# continue
# val = True if m.group("extra") else False
# dict.__setitem__(self, m.group("id"), val)
#
# def __getitem__(self, key):
# return dict.get(self, key)

View File

@@ -0,0 +1,35 @@
from .common import AsyncExtractor
from ..util import filename_from_url
class Extractor(AsyncExtractor):
    """Extractor for images from chan.sankakucomplex.com."""

    url = "https://chan.sankakucomplex.com/"

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.tags = match.group(1)
        self.category = "sankaku"
        self.directory = self.tags.replace("/", "_")
        # presumably sankaku rejects the default user-agent -- the
        # original enabled a browser-like one here
        self.enable_useragent()

    def images(self):
        """Yield (url, filename) for every image matching the tags.

        Bug fix: removed leftover debug code ('print(text); return'
        and a debug print in the loop) that made this generator
        terminate before yielding anything. The loop now also checks
        extract()'s None result explicitly instead of relying on the
        exception it previously caused further down, which could both
        miscount 'found' and loop forever on an empty result page.
        """
        needle = ' src="//c.sankakucomplex.com/data/preview/'
        params = {"tags": self.tags, "page": 1}
        while True:
            text = self.request(self.url, params=params).text
            pos = 0
            found = 0
            while True:
                url, pos = self.extract(text, needle, '"', pos)
                if url is None:
                    # no more preview images on this page
                    break
                found += 1
                yield ("https://cs.sankakucomplex.com/data/" + url,
                       "%s_%s" % (self.category, filename_from_url(url)))
            if found == 0:
                return
            params["page"] += 1

33
gallery_dl/util.py Normal file
View File

@@ -0,0 +1,33 @@
import time
import requests
import html.parser
def safe_request(session, url, *args, max_tries=5, **kwargs):
    """GET 'url' via 'session', retrying on connection and http errors.

    Args:
        session: a requests.Session (or compatible object with .get()).
        url: the url to fetch.
        max_tries: number of attempts before giving up (keyword-only;
            previously a hard-coded 5).
        *args, **kwargs: forwarded to session.get().

    Returns:
        The successful response object.

    Raises:
        requests.exceptions.ConnectionError or requests.HTTPError
        after 'max_tries' failed attempts.
    """
    tries = 0
    while True:
        # try to connect to remote source
        try:
            r = session.get(url, *args, **kwargs)
        except requests.exceptions.ConnectionError:
            tries += 1
            time.sleep(1)
            if tries == max_tries:
                raise
            continue
        # reject error-status-codes
        if r.status_code != requests.codes.ok:
            tries += 1
            time.sleep(1)
            if tries == max_tries:
                r.raise_for_status()
            continue
        # everything ok -- proceed to download
        return r
def filename_from_url(url):
    """Return the filename component of 'url' (the text after the last '/')."""
    return url.rpartition("/")[2]
# html.parser.HTMLParser.unescape was an undocumented method and was
# removed in Python 3.9; html.unescape (available since 3.4) is the
# supported replacement. 'html' is already bound by 'import html.parser'.
unescape = html.unescape