initial commit
This commit is contained in:
21
gallery_dl/extractor/8chan.py
Normal file
21
gallery_dl/extractor/8chan.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from .common import BasicExtractor
|
||||
from urllib.parse import unquote
|
||||
import re
|
||||
|
||||
class Extractor(BasicExtractor):
    """Extractor for all files of an image thread on 8chan."""

    thread_url_fmt = "https://www.8chan.co/{0}/res/{1}.html"
    regex = r'>File: <a href="([^"]+)">([^<]+)\.[^<]+<.*?<span class="postfilename"( title="([^"]+)")?>([^<]+)<'

    def __init__(self, match, config):
        super().__init__(config)
        # url path looks like "<board>/res/<thread-id>"
        self.board, _, self.thread_id = match.group(1).split("/")
        self.category = "8chan"
        self.directory = "{}-{}".format(self.board, self.thread_id)

    def images(self):
        """Yield (file-url, filename) for every file posted in the thread."""
        page = self.request(
            self.thread_url_fmt.format(self.board, self.thread_id)
        ).text
        for post in re.finditer(self.regex, page):
            path, prefix, fullname, name = post.group(1, 2, 4, 5)
            # the 'title' attribute holds the untruncated filename, if any
            filename = prefix + "-" + unquote(fullname or name)
            yield ("https://www.8chan.co" + path, filename)
|
||||
47
gallery_dl/extractor/__init__.py
Normal file
47
gallery_dl/extractor/__init__.py
Normal file
@@ -0,0 +1,47 @@
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
import sqlite3
|
||||
import importlib
|
||||
|
||||
class ExtractorFinder():
    """Maps URLs to extractor modules via regular expressions.

    (category, pattern) pairs are collected from an optional sqlite
    database and from the configuration; the first pattern that matches
    a URL decides which extractor module gets imported.
    """

    def __init__(self, config):
        self.config = config
        self.match_list = []
        if "database" in config["general"]:
            path = os.path.expanduser(config["general"]["database"])
            conn = sqlite3.connect(path)
            try:
                self.load_from_database(conn)
            finally:
                # the connection was previously leaked; always release it
                conn.close()
        self.load_from_config(config)

    def match(self, url):
        """Return an Extractor instance for 'url', or None if no pattern matches."""
        for category, regex in self.match_list:
            match = regex.match(url)
            if match:
                module = importlib.import_module("." + category, __package__)
                return module.Extractor(match, self.config)
        return None

    def load_from_database(self, db):
        """Read (pattern, category) pairs from the 'regex'/'category' tables."""
        query = (
            "SELECT regex.re, category.name "
            "FROM regex JOIN category "
            "ON regex.category_id = category.id"
        )
        for row in db.execute(query):
            self.add_match(row[1], row[0])

    def load_from_config(self, conf):
        """Collect every 'regex*' entry from all configuration sections."""
        for category in conf:
            for key, value in conf[category].items():
                if key.startswith("regex"):
                    self.add_match(category, value)

    def add_match(self, category, regex):
        """Compile 'regex' and register it for 'category'.

        Invalid patterns are reported and skipped instead of aborting.
        """
        try:
            self.match_list.append((category, re.compile(regex)))
        except re.error:
            # was a bare 'except:'; only a failed compile should be swallowed
            print("[Warning] [{0}] failed to compile regular expression '{1}'"
                  .format(category, regex))
|
||||
22
gallery_dl/extractor/batoto.py
Normal file
22
gallery_dl/extractor/batoto.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from .common import AsyncExtractor
|
||||
from ..util import filename_from_url
|
||||
from urllib.parse import unquote
|
||||
|
||||
class Extractor(AsyncExtractor):
    """Extractor for manga chapters on bato.to."""

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        chapter_id = match.group(1)
        self.url = "https://bato.to/read/_/" + chapter_id + "/_/1"
        self.category = "batoto"
        self.directory = chapter_id

    def images(self):
        """Yield (image-url, filename) pairs while following the reader's
        next-page links; stops when no next page is found."""
        page_url = self.url
        while page_url:
            page = self.request(page_url).text
            pos = page.find('<div id="full_image"')
            # the next-page anchor precedes the image tag inside this div
            page_url, pos = self.extract(page, '<a href="', '"', pos)
            image_url, pos = self.extract(page, 'src="', '"', pos)
            yield image_url, unquote(filename_from_url(image_url))
|
||||
67
gallery_dl/extractor/common.py
Normal file
67
gallery_dl/extractor/common.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import queue
|
||||
import threading
|
||||
import requests
|
||||
from ..util import safe_request
|
||||
|
||||
class BasicExtractor():
    """Base class for all extractors: an HTTP session plus text-scanning helpers.

    Subclasses set 'category' and 'directory' and implement images(),
    a generator of (url, filename) tuples; iterating the extractor
    iterates its images.
    """

    def __init__(self, config):
        self.session = requests.Session()
        self.category = ""
        self.directory = ""

    def __iter__(self):
        return self.images()

    def request(self, url, *args, **kwargs):
        """Perform an HTTP request through the shared session via safe_request."""
        return safe_request(self.session, url, *args, **kwargs)

    def enable_useragent(self):
        """Set a browser-like User-Agent for sites that reject the default one."""
        self.session.headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0"

    @staticmethod
    def extract(txt, begin, end, pos=0):
        """Return the text between 'begin' and 'end', searching from 'pos'.

        Returns (substring, position-after-end-marker) on success and
        (None, pos) unchanged when either marker is missing.
        """
        try:
            first = txt.index(begin, pos) + len(begin)
            last = txt.index(end, first)
            return txt[first:last], last+len(end)
        except ValueError:
            # was a bare 'except:'; str.index only raises ValueError
            return None, pos

    @staticmethod
    def extract_all(txt, begin, end, pos=0):
        """Like extract(), but the returned substring includes the
        'begin' and 'end' markers themselves."""
        try:
            first = txt.index(begin, pos)
            last = txt.index(end, first + len(begin)) + len(end)
            return txt[first:last], last
        except ValueError:
            return None, pos
|
||||
|
||||
class AsyncExtractor(BasicExtractor):
    """Extractor that produces its images on a background thread.

    images() runs in a daemon worker thread; results are handed to the
    consuming iterator through a bounded queue. 'None' is the
    end-of-stream sentinel.
    """

    def __init__(self, config):
        super().__init__(config)
        # small maxsize keeps the producer from racing far ahead of the consumer
        self.__queue = queue.Queue(maxsize=5)
        self.__thread = threading.Thread(target=self.async_images, daemon=True)

    def __iter__(self):
        get = self.__queue.get
        done = self.__queue.task_done

        self.__thread.start()
        while True:
            task = get()
            if task is None:
                return
            yield task
            done()

    def async_images(self):
        """Worker: forward every item from images() into the queue,
        then always deliver the sentinel."""
        put = self.__queue.put
        try:
            for task in self.images():
                put(task)
        except Exception:
            # was a bare 'except:', which would also swallow SystemExit /
            # KeyboardInterrupt in the worker; report and fall through
            import traceback
            print(traceback.format_exc())
        put(None)
|
||||
68
gallery_dl/extractor/exhentai.py
Normal file
68
gallery_dl/extractor/exhentai.py
Normal file
@@ -0,0 +1,68 @@
|
||||
from .common import BasicExtractor
|
||||
from ..util import unescape
|
||||
import time
|
||||
import random
|
||||
import json
|
||||
|
||||
class Extractor(BasicExtractor):
    """Extractor for image galleries on exhentai.org."""

    api_url = "http://exhentai.org/api.php"
    # {gallery-id}_{page-number:4}_{image-key}_{image-name}
    name_fmt = "{}_{:>04}_{}_{}"

    def __init__(self, match, config):
        BasicExtractor.__init__(self, config)
        self.url = match.group(0)
        self.gid, self.token = match.group(1).split("/")
        self.category = "exhentai"
        self.directory = self.gid
        # the site serves content only to a logged-in session
        self.session.cookies.update(config["exhentai-cookies"])

    def images(self):
        """Yield (url, filename); first page is scraped from HTML, the
        rest is fetched through the JSON 'showpage' API."""
        e = self.extract

        # get gallery page
        text = self.request(self.url).text

        # get first image page
        url, pos = self.extract_all(text, "http://exhentai.org/s/", "-1")
        text = self.request(url).text

        # extract information
        _     , pos = e(text, '<div id="i3"><a onclick="return load_image(', '')
        imgkey, pos = e(text, "'", "'", pos)
        url   , pos = e(text, '<img id="img" src="', '"', pos)
        name  , pos = e(text, '<div id="i4"><div>', ' :: ', pos)
        orgurl, pos = e(text, 'http://exhentai.org/fullimg.php', '"', pos)
        if orgurl:
            # prefer the full-size original when one is offered
            url = "http://exhentai.org/fullimg.php" + unescape(orgurl)
        # BUG FIX: 'name_fmt' is a class attribute and was referenced as a
        # bare name (NameError at runtime) -- must be self.name_fmt
        yield url, self.name_fmt.format(self.gid, 1, imgkey, name)

        gid     , pos = e(text, 'var gid='      , ';' , pos)
        startkey, pos = e(text, 'var startkey="', '";', pos)
        showkey , pos = e(text, 'var showkey="' , '";', pos)

        # use json-api for further pages
        request = {
            "method" : "showpage",
            "gid"    : int(gid),
            "page"   : 2,
            "imgkey" : imgkey,
            "showkey": showkey,
        }

        while True:
            # randomized delay so we do not hammer the API
            time.sleep(random.uniform(2, 5))
            info = json.loads(
                self.session.post(self.api_url, data=json.dumps(request)).text
            )

            imgkey, pos = e(info["i3"], "'", "'")
            url   , pos = e(info["i3"], '<img id="img" src="', '"', pos)
            # NOTE(review): key "i" (not "i4") looks odd -- confirm against
            # an actual API response before changing it
            name  , pos = e(info["i" ], '<div>', ' :: ')
            orgurl, pos = e(info["i7"], '<a href="', '"')
            if orgurl:
                url = unescape(orgurl)
            # BUG FIX: same bare-name 'name_fmt' reference as above
            yield url, self.name_fmt.format(gid, request["page"], imgkey, name)

            # the API returns the same imgkey again on the last page
            if request["imgkey"] == imgkey:
                return
            request["imgkey"] = imgkey
            request["page"] += 1
|
||||
50
gallery_dl/extractor/gelbooru.py
Normal file
50
gallery_dl/extractor/gelbooru.py
Normal file
@@ -0,0 +1,50 @@
|
||||
from .common import AsyncExtractor
|
||||
from ..util import filename_from_url
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
class BooruExtractor(AsyncExtractor):
    """Shared base class for *booru-style XML APIs.

    Subclasses provide 'api_url' and may override update_page() when the
    site uses a different paging parameter.
    """

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.tags = match.group(1)
        self.category = "booru"
        self.params = {"tags": self.tags}
        self.page = "page"
        self.directory = self.tags.replace("/", "_")

    def images(self):
        """Yield (url, filename) for every post, advancing page by page
        until the API returns an empty result set."""
        self.update_page(reset=True)
        while True:
            response = self.request(self.api_url, verify=True, params=self.params)
            root = ET.fromstring(response.text)
            if not len(root):
                return
            for post in root:
                file_url = post.attrib["file_url"]
                yield file_url, "{}_{}".format(self.category,
                                               filename_from_url(file_url))
            self.update_page()

    def update_page(self, reset=False):
        # Override this method in derived classes if necessary.
        # It is usually enough to adjust the 'page' attribute
        if reset is not False:
            self.params[self.page] = 1
        else:
            self.params[self.page] += 1
|
||||
|
||||
class Extractor(BooruExtractor):
    """Extractor for images from gelbooru.com."""

    def __init__(self, match, config):
        BooruExtractor.__init__(self, match, config)
        self.category = "gelbooru"
        self.api_url = "http://gelbooru.com/"
        self.params = {
            "page": "dapi",
            "s": "post",
            "q": "index",
            "tags": self.tags,
        }

    def update_page(self, reset=False):
        # gelbooru pages with a zero-based 'pid' parameter instead of 'page'
        if reset is not False:
            self.params["pid"] = 0
        else:
            self.params["pid"] += 1
|
||||
150
gallery_dl/extractor/pixiv.py
Normal file
150
gallery_dl/extractor/pixiv.py
Normal file
@@ -0,0 +1,150 @@
|
||||
from .common import AsyncExtractor
|
||||
from ..util import safe_request
|
||||
import re
|
||||
import csv
|
||||
import requests
|
||||
|
||||
class Extractor(AsyncExtractor):
    """Extractor for all illustrations of one pixiv member.

    Metadata comes from pixiv's legacy CSV API (see PixivAPI below); image
    URLs are rebuilt from hard-coded templates whose layout changed over
    time, so the template is selected by illustration-id thresholds.
    """

    member_url = "http://www.pixiv.net/member_illust.php"
    illust_url = "http://www.pixiv.net/member_illust.php?mode=medium"

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.member_id = match.group(1)
        self.category = "pixiv"
        self.directory = self.member_id
        # member pages require a logged-in session (cookies from config)
        self.session.cookies.update(config["pixiv-cookies"])
        self.session.headers.update({"Referer": "http://www.pixiv.net/"})
        self.api = PixivAPI(config["pixiv-cookies"]["PHPSESSID"])

    def images(self):
        """Yield (url, filename) for every illustration of the member.

        'data' is one CSV row from the API; the numeric indices below are
        positional fields of that row (assumed: 0=illust-id, 2=extension,
        4=server number, 6=a url/date field, 13=tags, 19=page count,
        24=member name -- TODO confirm against an actual API response).
        """
        # filename templates for single images and manga pages
        sname_fmt = "pixiv_{1}_{0}.{2}"
        mname_fmt = "pixiv_{1}_{0}_p{num:02}.{2}"

        # url templates for the old per-server layout ...
        singl_v1_fmt = "http://i1.pixiv.net/img{4:>02}/img/{24}/{0}.{2}"
        manga_v1_fmt = "http://i1.pixiv.net/img{4:>02}/img/{24}/{0}{big}_p{num}.{2}"

        # ... and for the newer date-based "img-original" layout
        singl_v2_fmt = "http://i1.pixiv.net/img-original/img/{date}/{0}_p0.{2}"
        manga_v2_fmt = "http://i1.pixiv.net/img-original/img/{date}/{0}_p{num}.{2}"

        date = ""
        big = ""

        for img in self.image_ids():
            data = self.api.request(img)
            # debug
            # for i, value in enumerate(data):
            #     print("{:02}: {}".format(i, value))
            # return
            # debug end

            # "うごイラ" ("ugoira") in the tag field marks an animation:
            # download the frame zip plus a text file listing frame delays
            if "うごイラ" in data[13]:
                # ugoira / animations
                try:
                    url, framelist = self.parse_ugoira(img)
                    data[2] = "zip"
                    yield (url, sname_fmt.format(*data))
                    data[2] = "txt"
                    yield (framelist, sname_fmt.format(*data))
                    continue
                except:
                    print("[Warning] failed to get ugoira url; trying fallback")

            # images -- the id thresholds select the url scheme that was in
            # use when the illustration was uploaded
            if img > 46270949:
                # NOTE(review): slice [45:64] presumably cuts the
                # "YYYY/MM/DD/HH/MM/SS" part out of data[6] -- confirm
                date = data[6][45:64]
                url_s_fmt = singl_v2_fmt
                url_m_fmt = manga_v2_fmt
            else:
                big = "_big" if img > 11319935 else ""
                url_s_fmt = singl_v1_fmt
                url_m_fmt = manga_v1_fmt

            if not data[19]:
                # single image (no page count)
                yield (url_s_fmt.format(*data, date=date), sname_fmt.format(*data))
            else:
                # manga: data[19] holds the number of pages
                for i in range(0, int(data[19])):
                    yield (url_m_fmt.format(*data, num=i, date=date, big=big),
                           mname_fmt.format(*data, num=i))

    def image_ids(self):
        """generator -- yield all image ids"""
        needle = '<li class="image-item"><a href="/member_illust.php?mode=medium&illust_id='
        params = {"id": self.member_id, "p": 1}
        while True:
            text = self.request(self.member_url, params=params).text
            end = 0
            found = 0
            while True:
                pos = text.find(needle, end)
                if pos == -1:
                    break
                pos += len(needle)
                end = text.find('"', pos)
                found += 1
                yield int(text[pos:end])
            # a full listing page holds 20 thumbnails; fewer means this
            # was the last page
            if found != 20:
                return
            params["p"] += 1

    def parse_ugoira(self, illust_id):
        """Return (zip-url, framelist) for an ugoira animation.

        'framelist' is a pseudo-url ("text://...") whose payload lists one
        "<filename> <delay>" pair per line.
        """
        # get illust page
        text = self.request(
            self.illust_url,
            params={"illust_id": illust_id},
        ).text

        # parse page
        url   , pos = self.extract(text, 'ugokuIllustFullscreenData = {"src":"', '"')
        frames, pos = self.extract(text, '"frames":[', ']', pos)

        # fix url (undo json-escaped slashes)
        url = url.replace("\\/", "/")

        # build framelist
        framelist = "text://" + re.sub(
            r'\{"file":"([^"]+)","delay":(\d+)\},?',
            r'\1 \2\n',
            frames
        )

        return url, framelist
|
||||
|
||||
|
||||
class PixivAPI():
    """Minimal client for pixiv's legacy iPhone CSV metadata API."""

    api_url = "http://spapi.pixiv.net/iphone/illust.php"

    def __init__(self, session_id):
        self.session = requests.Session()
        # the session id authenticates every request via query parameter
        self.session.params["PHPSESSID"] = session_id

    def request(self, illust_id):
        """Fetch metadata for one illustration and return it as a list of
        CSV fields; retries until a non-trivial response arrives."""
        query = {"illust_id": illust_id}
        while True:
            response = safe_request(self.session, self.api_url, params=query)
            body = response.text
            if len(body) > 31:
                return next(csv.reader([body]))
|
||||
|
||||
# class FileDict(dict):
|
||||
#
|
||||
# def __init__(self, *args):
|
||||
# super().__init__()
|
||||
# self.re = re.compile(r"pixiv_\d+_(?P<id>\d+)(?P<extra>_p\d+)?\.[a-z]{3}")
|
||||
# for arg in args:
|
||||
# self.load_from(arg)
|
||||
#
|
||||
# def load_from(self, directory):
|
||||
# match = self.re.match
|
||||
# for file in os.listdir(directory):
|
||||
# m = match(file)
|
||||
# if m is None:
|
||||
# continue
|
||||
# val = True if m.group("extra") else False
|
||||
# dict.__setitem__(self, m.group("id"), val)
|
||||
#
|
||||
# def __getitem__(self, key):
|
||||
# return dict.get(self, key)
|
||||
35
gallery_dl/extractor/sankaku.py
Normal file
35
gallery_dl/extractor/sankaku.py
Normal file
@@ -0,0 +1,35 @@
|
||||
from .common import AsyncExtractor
|
||||
from ..util import filename_from_url
|
||||
|
||||
class Extractor(AsyncExtractor):
    """Extractor for images from tag searches on chan.sankakucomplex.com."""

    url = "https://chan.sankakucomplex.com/"

    def __init__(self, match, config):
        AsyncExtractor.__init__(self, config)
        self.tags = match.group(1)
        self.category = "sankaku"
        self.directory = self.tags.replace("/", "_")
        # the site rejects the default requests user-agent
        self.enable_useragent()

    def images(self):
        """Yield (image-url, filename) for every preview found, advancing
        page by page until a page contains no previews."""
        needle = ' src="//c.sankakucomplex.com/data/preview/'
        params = {"tags": self.tags, "page":1}
        while True:
            text = self.request(self.url, params=params).text
            # BUG FIX: removed leftover debug code ('print(text)' followed by
            # 'return') that made this generator yield nothing at all
            pos = 0
            found = 0
            while True:
                url, pos = self.extract(text, needle, '"', pos)
                if url is None:
                    # extract() signals a missing marker with None; the old
                    # bare 'except:' only caught the TypeError from
                    # concatenating None -- test explicitly instead
                    break
                found += 1
                yield ("https://cs.sankakucomplex.com/data/" + url,
                       "%s_%s" % (self.category, filename_from_url(url)))
            if found == 0:
                break
            params["page"] += 1
|
||||
Reference in New Issue
Block a user