initial commit

This commit is contained in:
Mike Fährmann
2014-10-12 21:56:44 +02:00
parent f738f3ae58
commit deef91eddc
18 changed files with 903 additions and 395 deletions

View File

@@ -0,0 +1,21 @@
from .common import BasicExtractor
from urllib.parse import unquote
import re
class Extractor(BasicExtractor):
    """Extractor for image threads on 8chan.co."""

    thread_url_fmt = "https://www.8chan.co/{0}/res/{1}.html"
    regex = r'>File: <a href="([^"]+)">([^<]+)\.[^<]+<.*?<span class="postfilename"( title="([^"]+)")?>([^<]+)<'

    def __init__(self, match, config):
        BasicExtractor.__init__(self, config)
        self.board, _, self.thread_id = match.group(1).split("/")
        self.category = "8chan"
        self.directory = "-".join((self.board, self.thread_id))

    def images(self):
        """Yield (url, filename) pairs for every file posted in the thread."""
        page = self.request(
            self.thread_url_fmt.format(self.board, self.thread_id)
        ).text
        for post in re.finditer(self.regex, page):
            path, prefix, fullname, name = post.group(1, 2, 4, 5)
            # prefer the full original filename ('title' attribute) when
            # the visible one was truncated by the site
            filename = prefix + "-" + unquote(fullname or name)
            yield "https://www.8chan.co" + path, filename

View File

@@ -0,0 +1,47 @@
import os
import sys
import re
import sqlite3
import importlib
class ExtractorFinder():
    """Map URLs to extractor modules via configured regular expressions."""

    def __init__(self, config):
        self.config = config
        self.match_list = list()
        # optional sqlite database providing additional (regex, category) pairs
        if "database" in config["general"]:
            path = os.path.expanduser(config["general"]["database"])
            conn = sqlite3.connect(path)
            self.load_from_database(conn)
        self.load_from_config(config)

    def match(self, url):
        """Return an Extractor instance for 'url', or None if nothing matches."""
        for category, regex in self.match_list:
            match = regex.match(url)
            if match:
                # lazily import the extractor module named after the category
                module = importlib.import_module("."+category, __package__)
                return module.Extractor(match, self.config)
        return None

    def load_from_database(self, db):
        """Register every (regex, category) pair stored in the database."""
        query = (
            "SELECT regex.re, category.name "
            "FROM regex JOIN category "
            "ON regex.category_id = category.id"
        )
        for row in db.execute(query):
            self.add_match(row[1], row[0])

    def load_from_config(self, conf):
        """Register every 'regex*' option found in any config section."""
        for category in conf:
            for key, value in conf[category].items():
                if key.startswith("regex"):
                    self.add_match(category, value)

    def add_match(self, category, regex):
        """Compile 'regex' and register it for 'category'; warn on bad patterns.

        BUGFIX: the original bare 'except:' swallowed every exception,
        including KeyboardInterrupt; only compilation errors are caught now.
        """
        try:
            self.match_list.append( (category, re.compile(regex)) )
        except (re.error, TypeError):
            # re.error: invalid pattern; TypeError: non-string config value
            print("[Warning] [{0}] failed to compile regular expression '{1}'"
                  .format(category, regex))

View File

@@ -0,0 +1,22 @@
from .common import AsyncExtractor
from ..util import filename_from_url
from urllib.parse import unquote
class Extractor(AsyncExtractor):
    """Extractor for manga chapters on bato.to."""

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.url = "https://bato.to/read/_/" + match.group(1) + "/_/1"
        self.category = "batoto"
        self.directory = match.group(1)

    def images(self):
        """Yield (url, filename) for every page, following the 'next' links."""
        page_url = self.url
        while page_url:
            html = self.request(page_url).text
            # scraping starts at the full-size image container
            anchor = html.find('<div id="full_image"')
            page_url, anchor = self.extract(html, '<a href="', '"', anchor)
            image_url, anchor = self.extract(html, 'src="', '"', anchor)
            yield image_url, unquote(filename_from_url(image_url))

View File

@@ -0,0 +1,67 @@
import queue
import threading
import requests
from ..util import safe_request
class BasicExtractor():
    """Base class for extractors: an HTTP session plus text-scraping helpers."""

    def __init__(self, config):
        self.session = requests.Session()
        self.category = ""
        self.directory = ""

    def __iter__(self):
        # iterating an extractor yields its (url, filename) pairs
        return self.images()

    def request(self, url, *args, **kwargs):
        """Perform an HTTP request through the retrying helper."""
        return safe_request(self.session, url, *args, **kwargs)

    def enable_useragent(self):
        """Set a browser-like User-Agent for sites that block default clients."""
        self.session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0"

    @staticmethod
    def extract(txt, begin, end, pos=0):
        """Return (text between 'begin' and 'end', position after 'end').

        Searching starts at 'pos'. Returns (None, pos) when either
        delimiter is missing.

        BUGFIX: catch only ValueError (what str.index raises on a missing
        substring); the original bare 'except:' hid real programming errors.
        """
        try:
            first = txt.index(begin, pos) + len(begin)
            last = txt.index(end, first)
            return txt[first:last], last+len(end)
        except ValueError:
            return None, pos

    @staticmethod
    def extract_all(txt, begin, end, pos=0):
        """Like extract(), but include both delimiters in the result."""
        try:
            first = txt.index(begin, pos)
            last = txt.index(end, first + len(begin)) + len(end)
            return txt[first:last], last
        except ValueError:
            return None, pos
class AsyncExtractor(BasicExtractor):
    """Extractor that produces its images on a background thread.

    A bounded queue decouples the producer thread (async_images) from the
    consumer iterating over this object.
    """

    def __init__(self, config):
        super().__init__(config)
        # maxsize=5 bounds memory use and throttles the producer thread
        self.__queue = queue.Queue(maxsize=5)
        self.__thread = threading.Thread(target=self.async_images, daemon=True)

    def __iter__(self):
        get = self.__queue.get
        done = self.__queue.task_done
        self.__thread.start()
        while True:
            task = get()
            if task is None:
                # sentinel: producer finished (or failed)
                return
            yield task
            done()

    def async_images(self):
        """Worker: feed images() results into the queue; None terminates.

        BUGFIX: catch Exception instead of a bare 'except:' so SystemExit
        and KeyboardInterrupt are not swallowed inside the worker thread.
        """
        put = self.__queue.put
        try:
            for task in self.images():
                put(task)
        except Exception:
            # report the failure instead of dying silently, then let the
            # consumer terminate via the sentinel below
            import traceback
            print(traceback.format_exc())
        put(None)

View File

@@ -0,0 +1,68 @@
from .common import BasicExtractor
from ..util import unescape
import time
import random
import json
class Extractor(BasicExtractor):
    """Extractor for image galleries on exhentai.org."""

    api_url = "http://exhentai.org/api.php"
    name_fmt = "{}_{:>04}_{}_{}"

    def __init__(self, match, config):
        BasicExtractor.__init__(self, config)
        self.url = match.group(0)
        self.gid, self.token = match.group(1).split("/")
        self.category = "exhentai"
        self.directory = self.gid
        # exhentai requires member cookies to serve gallery content
        self.session.cookies.update(config["exhentai-cookies"])

    def images(self):
        """Yield (url, filename) for every image of the gallery.

        The first page is scraped from HTML; subsequent pages use the
        JSON 'showpage' API.
        """
        e = self.extract
        # get gallery page
        text = self.request(self.url).text
        # get first image page
        url, pos = self.extract_all(text, "http://exhentai.org/s/", "-1")
        text = self.request(url).text
        # extract information
        _     , pos = e(text, '<div id="i3"><a onclick="return load_image(', '')
        imgkey, pos = e(text, "'", "'", pos)
        url   , pos = e(text, '<img id="img" src="', '"', pos)
        name  , pos = e(text, '<div id="i4"><div>', ' :: ', pos)
        orgurl, pos = e(text, 'http://exhentai.org/fullimg.php', '"', pos)
        if orgurl:
            # prefer the full-size original when one is offered
            url = "http://exhentai.org/fullimg.php" + unescape(orgurl)
        # BUGFIX: name_fmt is a class attribute and must be accessed through
        # self; the bare name raised NameError in the original
        yield url, self.name_fmt.format(self.gid, 1, imgkey, name)
        gid     , pos = e(text, 'var gid=' , ';', pos)
        startkey, pos = e(text, 'var startkey="', '";', pos)
        showkey , pos = e(text, 'var showkey="' , '";', pos)
        # use json-api for further pages
        request = {
            "method" : "showpage",
            "gid"    : int(gid),
            "page"   : 2,
            "imgkey" : imgkey,
            "showkey": showkey,
        }
        while True:
            # rate-limit API calls to avoid getting blocked
            time.sleep( random.uniform(2, 5) )
            info = json.loads(
                self.session.post(self.api_url, data=json.dumps(request)).text
            )
            imgkey, pos = e(info["i3"], "'", "'")
            url   , pos = e(info["i3"], '<img id="img" src="', '"', pos)
            # NOTE(review): the key "i" looks suspicious -- possibly another
            # "iN" field of the API response was intended; verify against
            # the actual JSON payload before changing it
            name  , pos = e(info["i" ], '<div>', ' :: ')
            orgurl, pos = e(info["i7"], '<a href="', '"')
            if orgurl:
                url = unescape(orgurl)
            yield url, self.name_fmt.format(gid, request["page"], imgkey, name)
            if request["imgkey"] == imgkey:
                # the API returned the same image key again -> end of gallery
                return
            request["imgkey"] = imgkey
            request["page"] += 1

View File

@@ -0,0 +1,50 @@
from .common import AsyncExtractor
from ..util import filename_from_url
import xml.etree.ElementTree as ET
class BooruExtractor(AsyncExtractor):
    """Common base for *booru sites exposing an XML post-listing API."""

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.tags = match.group(1)
        self.category = "booru"
        self.params = {"tags": self.tags}
        self.page = "page"
        self.directory = self.tags.replace("/", "_")

    def images(self):
        """Yield (url, filename) for every post matching the tag search."""
        self.update_page(reset=True)
        while True:
            response = self.request(self.api_url, verify=True,
                                    params=self.params)
            root = ET.fromstring(response.text)
            if len(root) == 0:
                # an empty page means there are no more results
                return
            for post in root:
                file_url = post.attrib["file_url"]
                yield file_url, "{}_{}".format(
                    self.category, filename_from_url(file_url))
            self.update_page()

    def update_page(self, reset=False):
        # Override this method in derived classes if necessary.
        # It is usually enough to adjust the 'page' attribute
        if reset:
            self.params[self.page] = 1
        else:
            self.params[self.page] += 1
class Extractor(BooruExtractor):
    """Extractor for gelbooru.com using its 'dapi' index API."""

    def __init__(self, match, config):
        BooruExtractor.__init__(self, match, config)
        self.category = "gelbooru"
        self.api_url = "http://gelbooru.com/"
        self.params = {"page":"dapi", "s":"post", "q":"index", "tags":self.tags}

    def update_page(self, reset=False):
        # gelbooru pages via a zero-based 'pid' parameter
        if reset:
            self.params["pid"] = 0
        else:
            self.params["pid"] += 1

View File

@@ -0,0 +1,150 @@
from .common import AsyncExtractor
from ..util import safe_request
import re
import csv
import requests
class Extractor(AsyncExtractor):
    """Extractor for all illustrations of a pixiv member."""

    member_url = "http://www.pixiv.net/member_illust.php"
    illust_url = "http://www.pixiv.net/member_illust.php?mode=medium"

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.member_id = match.group(1)
        self.category = "pixiv"
        self.directory = self.member_id
        # pixiv requires a logged-in session cookie and a Referer header
        self.session.cookies.update(config["pixiv-cookies"])
        self.session.headers.update({"Referer": "http://www.pixiv.net/"})
        self.api = PixivAPI(config["pixiv-cookies"]["PHPSESSID"])

    def images(self):
        """Yield (url, filename) for every work of the member.

        Handles single images, multi-page "manga" posts, and "ugoira"
        animations (yielded as a zip archive plus a frame-delay list).
        Two URL layouts exist; which one applies is decided by illust-id
        thresholds below.
        """
        sname_fmt = "pixiv_{1}_{0}.{2}"
        mname_fmt = "pixiv_{1}_{0}_p{num:02}.{2}"
        singl_v1_fmt = "http://i1.pixiv.net/img{4:>02}/img/{24}/{0}.{2}"
        manga_v1_fmt = "http://i1.pixiv.net/img{4:>02}/img/{24}/{0}{big}_p{num}.{2}"
        singl_v2_fmt = "http://i1.pixiv.net/img-original/img/{date}/{0}_p0.{2}"
        manga_v2_fmt = "http://i1.pixiv.net/img-original/img/{date}/{0}_p{num}.{2}"
        date = ""
        big = ""
        for img in self.image_ids():
            # 'data' is one CSV row from the old mobile API; fields are
            # addressed positionally (data[2] extension, data[13] tags,
            # data[19] page count -- presumably; verify against the API)
            data = self.api.request(img)
            # debug
            # for i, value in enumerate(data):
            #     print("{:02}: {}".format(i, value))
            # return
            # debug end
            if "うごイラ" in data[13]:
                # ugoira / animations ("ugoira" appears in the tag field)
                try:
                    url, framelist = self.parse_ugoira(img)
                    # override the extension field for the two yielded names
                    data[2] = "zip"
                    yield (url, sname_fmt.format(*data))
                    data[2] = "txt"
                    yield (framelist, sname_fmt.format(*data))
                    continue
                except:
                    # best-effort: fall through and treat it as plain images
                    print("[Warning] failed to get ugoira url; trying fallback")
            # images
            if img > 46270949:
                # NOTE(review): ids above this threshold appear to live on the
                # 'img-original' server addressed by a date path -- confirm
                date = data[6][45:64]
                url_s_fmt = singl_v2_fmt
                url_m_fmt = manga_v2_fmt
            else:
                # older manga posts use a '_big' suffix past this id -- confirm
                big = "_big" if img > 11319935 else ""
                url_s_fmt = singl_v1_fmt
                url_m_fmt = manga_v1_fmt
            if not data[19]:
                # empty page count -> single image
                yield (url_s_fmt.format(*data, date=date), sname_fmt.format(*data))
            else:
                # multi-page post: one URL per page
                for i in range(0, int(data[19])):
                    yield (url_m_fmt.format(*data, num=i, date=date, big=big),
                           mname_fmt.format(*data, num=i))

    def image_ids(self):
        """generator -- yield all image ids"""
        needle = '<li class="image-item"><a href="/member_illust.php?mode=medium&amp;illust_id='
        params = {"id": self.member_id, "p": 1}
        while True:
            text = self.request(self.member_url, params=params).text
            end = 0
            found = 0
            while True:
                pos = text.find(needle, end)
                if pos == -1:
                    break
                pos += len(needle)
                end = text.find('"', pos)
                found += 1
                yield int(text[pos:end])
            if found != 20:
                # a full listing page holds 20 works; fewer means last page
                return
            params["p"] += 1

    def parse_ugoira(self, illust_id):
        """Return (zip-url, frame list) for an ugoira animation."""
        # get illust page
        text = self.request(
            self.illust_url,
            params={"illust_id": illust_id},
        ).text
        # parse page
        url   , pos = self.extract(text, 'ugokuIllustFullscreenData = {"src":"', '"')
        frames, pos = self.extract(text, '"frames":[', ']', pos)
        # fix url (undo JSON-escaped slashes)
        url = url.replace("\\/", "/")
        # build framelist: one "<filename> <delay>" line per frame
        framelist = "text://" + re.sub(
            r'\{"file":"([^"]+)","delay":(\d+)\},?',
            r'\1 \2\n',
            frames
        )
        return url, framelist
class PixivAPI():
    """Minimal client for pixiv's old mobile ('spapi') illust endpoint."""

    api_url = "http://spapi.pixiv.net/iphone/illust.php"

    def __init__(self, session_id):
        self.session = requests.Session()
        self.session.params["PHPSESSID"] = session_id

    def request(self, illust_id):
        """Fetch metadata for one illustration as a list of CSV fields."""
        while True:
            body = safe_request(
                self.session,
                self.api_url,
                params={"illust_id": illust_id}
            ).text
            # very short responses are error pages -> retry until valid
            if len(body) > 31:
                return next(csv.reader([body]))
# class FileDict(dict):
#
# def __init__(self, *args):
# super().__init__()
# self.re = re.compile(r"pixiv_\d+_(?P<id>\d+)(?P<extra>_p\d+)?\.[a-z]{3}")
# for arg in args:
# self.load_from(arg)
#
# def load_from(self, directory):
# match = self.re.match
# for file in os.listdir(directory):
# m = match(file)
# if m is None:
# continue
# val = True if m.group("extra") else False
# dict.__setitem__(self, m.group("id"), val)
#
# def __getitem__(self, key):
# return dict.get(self, key)

View File

@@ -0,0 +1,35 @@
from .common import AsyncExtractor
from ..util import filename_from_url
class Extractor(AsyncExtractor):
    """Extractor for tag searches on chan.sankakucomplex.com."""

    url = "https://chan.sankakucomplex.com/"

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.tags = match.group(1)
        self.category = "sankaku"
        self.directory = self.tags.replace("/", "_")
        # sankaku blocks the default client user-agent
        self.enable_useragent()

    def images(self):
        """Yield (url, filename) for every post matching the tag search.

        BUGFIX: removed leftover debug statements ('print(text); return')
        that made this generator print one page and yield nothing, and a
        per-image debug print. Also replaced the bare 'except' that relied
        on a TypeError from concatenating None to terminate the inner loop
        with an explicit None check; 'found' is now only counted for real
        hits, so an empty result page terminates instead of looping forever.
        """
        needle = ' src="//c.sankakucomplex.com/data/preview/'
        params = {"tags": self.tags, "page": 1}
        while True:
            text = self.request(self.url, params=params).text
            pos = 0
            found = 0
            while True:
                url, pos = self.extract(text, needle, '"', pos)
                if url is None:
                    # no further previews on this page
                    break
                found += 1
                yield ("https://cs.sankakucomplex.com/data/" + url,
                       "%s_%s" % (self.category, filename_from_url(url)))
            if found == 0:
                # empty page -> end of results
                return
            params["page"] += 1