Merge branch 'multi-extractor'

This commit is contained in:
Mike Fährmann
2015-11-21 04:29:38 +01:00
34 changed files with 348 additions and 419 deletions

View File

@@ -8,24 +8,25 @@
"""Extract image-urls from http://behoimi.org/""" """Extract image-urls from http://behoimi.org/"""
from .booru import JSONBooruExtractor from . import booru
info = { class ThreeDeeBooruExtractor(booru.JSONBooruExtractor):
"category": "3dbooru", """Base class for 3dbooru extractors"""
"extractor": "ThreeDeeBooruExtractor", category = "3dbooru"
"directory": ["{category}", "{tags}"], api_url = "http://behoimi.org/post/index.json"
"filename": "{category}_{id}_{md5}.{extension}", headers = {
"pattern": [ "Referer": "http://behoimi.org/post/show/",
r"(?:https?://)?(?:www\.)?behoimi\.org/post(?:/(?:index)?)?\?tags=([^&]+).*", "User-Agent": "Mozilla/5.0",
], }
}
class ThreeDeeBooruExtractor(JSONBooruExtractor): class ThreeDeeBooruTagExtractor(ThreeDeeBooruExtractor, booru.BooruTagExtractor):
"""Extract images from 3dbooru based on search-tags"""
pattern = [r"(?:https?://)?(?:www\.)?behoimi\.org/post(?:/(?:index)?)?\?tags=([^&]+)"]
def __init__(self, match): class ThreeDeeBooruPoolExtractor(ThreeDeeBooruExtractor, booru.BooruPoolExtractor):
JSONBooruExtractor.__init__(self, match, info) """Extract image-pools from 3dbooru"""
self.api_url = "http://behoimi.org/post/index.json" pattern = [r"(?:https?://)?(?:www\.)?behoimi\.org/pool/show/(\d+)"]
self.headers = {
"Referer": "http://behoimi.org/post/show/", class ThreeDeeBooruPostExtractor(ThreeDeeBooruExtractor, booru.BooruPostExtractor):
"User-Agent": "Mozilla/5.0" """Extract single images from 3dbooru"""
} pattern = [r"(?:https?://)?(?:www\.)?behoimi\.org/post/show/(\d+)"]

View File

@@ -10,23 +10,14 @@
from .chan import ChanExtractor from .chan import ChanExtractor
info = {
"category": "4chan",
"extractor": "FourChanExtractor",
"directory": ["{category}", "{board}-{thread}"],
"filename": "{tim}-{filename}{ext}",
"pattern": [
r"(?:https?://)?boards\.4chan\.org/([^/]+)/thread/(\d+).*",
],
}
class FourChanExtractor(ChanExtractor): class FourChanExtractor(ChanExtractor):
category = "4chan"
pattern = [r"(?:https?://)?boards\.4chan\.org/([^/]+)/thread/(\d+)"]
api_url = "https://a.4cdn.org/{board}/thread/{thread}.json" api_url = "https://a.4cdn.org/{board}/thread/{thread}.json"
file_url = "https://i.4cdn.org/{board}/{tim}{ext}" file_url = "https://i.4cdn.org/{board}/{tim}{ext}"
def __init__(self, match): def __init__(self, match):
ChanExtractor.__init__( ChanExtractor.__init__(
self, info["category"], self, match.group(1), match.group(2)
match.group(1), match.group(2)
) )

View File

@@ -10,23 +10,14 @@
from .chan import ChanExtractor from .chan import ChanExtractor
info = {
"category": "8chan",
"extractor": "InfinityChanExtractor",
"directory": ["{category}", "{board}-{thread}"],
"filename": "{tim}-{filename}{ext}",
"pattern": [
r"(?:https?://)?(?:www\.)?8ch\.net/([^/]+)/res/(\d+).*",
],
}
class InfinityChanExtractor(ChanExtractor): class InfinityChanExtractor(ChanExtractor):
category = "8chan"
pattern = [r"(?:https?://)?(?:www\.)?8ch\.net/([^/]+)/res/(\d+)"]
api_url = "https://8ch.net/{board}/res/{thread}.json" api_url = "https://8ch.net/{board}/res/{thread}.json"
file_url = "https://8ch.net/{board}/src/{tim}{ext}" file_url = "https://8ch.net/{board}/src/{tim}{ext}"
def __init__(self, match): def __init__(self, match):
ChanExtractor.__init__( ChanExtractor.__init__(
self, info["category"], self, match.group(1), match.group(2)
match.group(1), match.group(2)
) )

View File

@@ -46,11 +46,11 @@ modules = [
def find(url): def find(url):
"""Find extractor suitable for handling the given url""" """Find extractor suitable for handling the given url"""
for pattern, module, klass in _list_patterns(): for pattern, klass in _list_patterns():
match = re.match(pattern, url) match = re.match(pattern, url)
if match: if match:
return klass(match), module.info return klass(match)
return None, None return None
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# internals # internals
@@ -59,15 +59,22 @@ _cache = []
_module_iter = iter(modules) _module_iter = iter(modules)
def _list_patterns(): def _list_patterns():
"""Yield all available (pattern, module, klass) tuples""" """Yield all available (pattern, info, class) tuples"""
for entry in _cache: for entry in _cache:
yield entry yield entry
for module_name in _module_iter: for module_name in _module_iter:
module = importlib.import_module("."+module_name, __package__) module = importlib.import_module("."+module_name, __package__)
klass = getattr(module, module.info["extractor"]) for klass in _get_classes(module):
userpatterns = config.get(("extractor", module_name, "pattern"), default=[]) for pattern in klass.pattern:
for pattern in userpatterns + module.info["pattern"]: etuple = (pattern, klass)
etuple = (pattern, module, klass) _cache.append(etuple)
_cache.append(etuple) yield etuple
yield etuple
def _get_classes(module):
"""Return a list of all extractor classes in a module"""
return [
klass for klass in module.__dict__.values() if (
hasattr(klass, "pattern") and klass.__module__ == module.__name__
)
]

View File

@@ -10,21 +10,14 @@
from .common import AsynchronousExtractor, Message from .common import AsynchronousExtractor, Message
from .. import text, iso639_1 from .. import text, iso639_1
import os.path
import re import re
info = {
"category": "batoto",
"extractor": "BatotoExtractor",
"directory": ["{category}", "{manga}", "c{chapter:>03} - {title}"],
"filename": "{manga}_c{chapter:>03}_{page:>03}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?bato\.to/reader#([0-9a-f]+)",
],
}
class BatotoExtractor(AsynchronousExtractor): class BatotoExtractor(AsynchronousExtractor):
category = "batoto"
directory_fmt = ["{category}", "{manga}", "c{chapter:>03} - {title}"]
filename_fmt = "{manga}_c{chapter:>03}_{page:>03}.{extension}"
pattern = [r"(?:https?://)?(?:www\.)?bato\.to/reader#([0-9a-f]+)"]
url = "https://bato.to/areader" url = "https://bato.to/areader"
def __init__(self, match): def __init__(self, match):
@@ -68,7 +61,7 @@ class BatotoExtractor(AsynchronousExtractor):
manga, pos = extr(page, "document.title = '", " - ", pos) manga, pos = extr(page, "document.title = '", " - ", pos)
match = re.match(r"(Vol.(\d+) )?Ch.(\d+)([^:]*)(: (.+))?", cinfo) match = re.match(r"(Vol.(\d+) )?Ch.(\d+)([^:]*)(: (.+))?", cinfo)
return { return {
"category": info["category"], "category": self.category,
"token": self.token, "token": self.token,
"manga": manga, "manga": manga,
"volume": match.group(2) or "", "volume": match.group(2) or "",

View File

@@ -12,20 +12,20 @@ from .common import Extractor, Message
from .. import text from .. import text
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
import json import json
import os.path
import urllib.parse import urllib.parse
class BooruExtractor(Extractor): class BooruExtractor(Extractor):
info = {}
headers = {}
page = "page"
api_url = "" api_url = ""
category = ""
def __init__(self, match, info): def __init__(self):
Extractor.__init__(self) Extractor.__init__(self)
self.info = info self.params = {"limit": 50}
self.tags = text.unquote(match.group(1)) self.setup()
self.page = "page"
self.params = {"tags": self.tags}
self.headers = {}
def items(self): def items(self):
yield Message.Version, 1 yield Message.Version, 1
@@ -40,6 +40,9 @@ class BooruExtractor(Extractor):
def items_impl(self): def items_impl(self):
pass pass
def setup(self):
pass
def update_page(self, reset=False): def update_page(self, reset=False):
"""Update the value of the 'page' parameter""" """Update the value of the 'page' parameter"""
# Override this method in derived classes if necessary. # Override this method in derived classes if necessary.
@@ -51,14 +54,14 @@ class BooruExtractor(Extractor):
def get_job_metadata(self): def get_job_metadata(self):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
# Override this method in derived classes
return { return {
"category": self.info["category"], "category": self.category,
"tags": self.tags
} }
def get_file_metadata(self, data): def get_file_metadata(self, data):
"""Collect metadata for a downloadable file""" """Collect metadata for a downloadable file"""
data["category"] = self.info["category"] data["category"] = self.category
return text.nameext_from_url(self.get_file_url(data), data) return text.nameext_from_url(self.get_file_url(data), data)
def get_file_url(self, data): def get_file_url(self, data):
@@ -78,10 +81,10 @@ class JSONBooruExtractor(BooruExtractor):
self.request(self.api_url, verify=True, params=self.params, self.request(self.api_url, verify=True, params=self.params,
headers=self.headers).text headers=self.headers).text
) )
if len(images) == 0:
return
for data in images: for data in images:
yield data yield data
if len(images) < self.params["limit"]:
return
self.update_page() self.update_page()
@@ -93,8 +96,56 @@ class XMLBooruExtractor(BooruExtractor):
root = ET.fromstring( root = ET.fromstring(
self.request(self.api_url, verify=True, params=self.params).text self.request(self.api_url, verify=True, params=self.params).text
) )
if len(root) == 0:
return
for item in root: for item in root:
yield item.attrib yield item.attrib
if len(root) < self.params["limit"]:
return
self.update_page() self.update_page()
class BooruTagExtractor(BooruExtractor):
    """Extract images based on search-tags"""
    # directory/filename format strings; the "{tags}" field matches the
    # "tags" key returned by get_job_metadata() below
    directory_fmt = ["{category}", "{tags}"]
    filename_fmt = "{category}_{id}_{md5}.{extension}"

    def __init__(self, match):
        """Set up a tag search from the first capture group of 'match'"""
        BooruExtractor.__init__(self)
        # presumably URL-decodes the tag string -- confirm in text.unquote
        self.tags = text.unquote(match.group(1))
        self.params["tags"] = self.tags

    def get_job_metadata(self):
        """Return the job metadata: the category and the searched tags"""
        return {
            "category": self.category,
            "tags": self.tags,
        }
class BooruPoolExtractor(BooruExtractor):
    """Extract image-pools"""
    # directory/filename format strings; the "{pool}" field matches the
    # "pool" key returned by get_job_metadata() below
    directory_fmt = ["{category}", "pool", "{pool}"]
    filename_fmt = "{category}_{id}_{md5}.{extension}"

    def __init__(self, match):
        """Set up a pool search from the first capture group of 'match'"""
        BooruExtractor.__init__(self)
        self.pool = match.group(1)
        # the pool is requested through the "tags" parameter as "pool:<id>"
        self.params["tags"] = "pool:" + self.pool

    def get_job_metadata(self):
        """Return the job metadata: the category and the pool id"""
        return {
            "category": self.category,
            "pool": self.pool,
        }
class BooruPostExtractor(BooruExtractor):
    """Extract single images"""
    # directory/filename format strings; the directory uses only the
    # "{category}" field, and this class defines no get_job_metadata()
    directory_fmt = ["{category}"]
    filename_fmt = "{category}_{id}_{md5}.{extension}"

    def __init__(self, match):
        """Set up a single-post search from the first capture group of 'match'"""
        BooruExtractor.__init__(self)
        self.post = match.group(1)
        # the post is requested through the "tags" parameter as "id:<post>"
        self.params["tags"] = "id:" + self.post

View File

@@ -13,13 +13,15 @@ from .. import text
class ChanExtractor(Extractor): class ChanExtractor(Extractor):
directory_fmt = ["{category}", "{board}-{thread}"]
filename_fmt = "{tim}-{filename}{ext}"
api_url = "" api_url = ""
file_url = "" file_url = ""
def __init__(self, category, board, thread): def __init__(self, board, thread):
Extractor.__init__(self) Extractor.__init__(self)
self.metadata = { self.metadata = {
"category": category, "category": self.category,
"board": board, "board": board,
"thread": thread, "thread": thread,
} }

View File

@@ -8,20 +8,21 @@
"""Extract image-urls from https://danbooru.donmai.us/""" """Extract image-urls from https://danbooru.donmai.us/"""
from .booru import JSONBooruExtractor from . import booru
info = { class DanbooruExtractor(booru.JSONBooruExtractor):
"category": "danbooru", """Base class for danbooru extractors"""
"extractor": "DanbooruExtractor", category = "danbooru"
"directory": ["{category}", "{tags}"], api_url = "https://danbooru.donmai.us/posts.json"
"filename": "{category}_{id}_{md5}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?danbooru.donmai.us/posts\?(?:utf8=%E2%9C%93&)?tags=([^&]+).*",
],
}
class DanbooruExtractor(JSONBooruExtractor): class DanbooruTagExtractor(DanbooruExtractor, booru.BooruTagExtractor):
"""Extract images from danbooru based on search-tags"""
pattern = [r"(?:https?://)?(?:www\.)?danbooru.donmai.us/posts\?(?:utf8=%E2%9C%93&)?tags=([^&]+)"]
def __init__(self, match): class DanbooruPoolExtractor(DanbooruExtractor, booru.BooruPoolExtractor):
JSONBooruExtractor.__init__(self, match, info) """Extract image-pools from danbooru"""
self.api_url = "https://danbooru.donmai.us/posts.json" pattern = [r"(?:https?://)?(?:www\.)?danbooru.donmai.us/pools/(\d+)"]
class DanbooruPostExtractor(DanbooruExtractor, booru.BooruPostExtractor):
"""Extract single images from danbooru"""
pattern = [r"(?:https?://)?(?:www\.)?danbooru.donmai.us/posts/(\d+)"]

View File

@@ -10,21 +10,15 @@
from .common import AsynchronousExtractor, Message from .common import AsynchronousExtractor, Message
from .. import text from .. import text
import os.path
import re import re
info = {
"category": "deviantart",
"extractor": "DeviantArtExtractor",
"directory": ["{category}", "{artist}"],
"filename": "{category}_{index}_{title}.{extension}",
"pattern": [
r"(?:https?://)?([^\.]+)\.deviantart\.com/gallery/.*",
],
}
class DeviantArtExtractor(AsynchronousExtractor): class DeviantArtExtractor(AsynchronousExtractor):
category = "deviantart"
directory_fmt = ["{category}", "{artist}"]
filename_fmt = "{category}_{index}_{title}.{extension}"
pattern = [r"(?:https?://)?([^\.]+)\.deviantart\.com/gallery/.*"]
def __init__(self, match): def __init__(self, match):
AsynchronousExtractor.__init__(self) AsynchronousExtractor.__init__(self)
self.session.cookies["agegate_state"] = "1" self.session.cookies["agegate_state"] = "1"
@@ -57,14 +51,14 @@ class DeviantArtExtractor(AsynchronousExtractor):
def get_job_metadata(self): def get_job_metadata(self):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
return { return {
"category": info["category"], "category": self.category,
"artist": self.artist, "artist": self.artist,
} }
def get_image_metadata(self, image): def get_image_metadata(self, image):
"""Collect metadata for an image""" """Collect metadata for an image"""
match = self.extract_data(image, 'title', match = self.extract_data(image, 'title',
'(.+) by (.+), ([A-Z][a-z]{2} \d+, \d{4}) in') r'(.+) by (.+), ([A-Z][a-z]{2} \d+, \d{4}) in')
if image.startswith(" ismature"): if image.startswith(" ismature"):
# adult image # adult image
url, _ = text.extract(image, 'href="', '"') url, _ = text.extract(image, 'href="', '"')
@@ -76,7 +70,7 @@ class DeviantArtExtractor(AsynchronousExtractor):
height, pos = text.extract(page, ' height="', '"', pos) height, pos = text.extract(page, ' height="', '"', pos)
else: else:
# normal image # normal image
index = self.extract_data(image, 'href', '[^"]+-(\d+)').group(1) index = self.extract_data(image, 'href', r'[^"]+-(\d+)').group(1)
url, pos = text.extract(image, ' data-super-full-img="', '"', match.end()) url, pos = text.extract(image, ' data-super-full-img="', '"', match.end())
if url: if url:
width , pos = text.extract(image, ' data-super-full-width="', '"', pos) width , pos = text.extract(image, ' data-super-full-width="', '"', pos)

View File

@@ -8,21 +8,24 @@
"""Extract image-urls from https://e621.net/""" """Extract image-urls from https://e621.net/"""
from .booru import JSONBooruExtractor from . import booru
info = { class E621Extractor(booru.JSONBooruExtractor):
"category": "e621", """Base class for e621 extractors"""
"extractor": "E621Extractor", category = "e621"
"directory": ["{category}", "{tags}"], api_url = "https://e621.net/post/index.json"
"filename": "{category}_{id}_{md5}.{extension}",
"pattern": [ class E621TagExtractor(E621Extractor, booru.BooruTagExtractor):
"""Extract images from e621 based on search-tags"""
pattern = [
r"(?:https?://)?(?:www\.)?e621\.net/post/index/\d+/([^?]+)", r"(?:https?://)?(?:www\.)?e621\.net/post/index/\d+/([^?]+)",
r"(?:https?://)?(?:www\.)?e621\.net/post\?tags=([^&]+).*" r"(?:https?://)?(?:www\.)?e621\.net/post\?tags=([^&]+)",
], ]
}
class E621Extractor(JSONBooruExtractor): class E621PoolExtractor(E621Extractor, booru.BooruPoolExtractor):
"""Extract image-pools from e621"""
pattern = [r"(?:https?://)?(?:www\.)?e621\.net/pool/show/(\d+)"]
def __init__(self, match): class E621PostExtractor(E621Extractor, booru.BooruPostExtractor):
JSONBooruExtractor.__init__(self, match, info) """Extract single images from e621"""
self.api_url = "https://e621.net/post/index.json" pattern = [r"(?:https?://)?(?:www\.)?e621\.net/post/show/(\d+)"]

View File

@@ -13,18 +13,12 @@ from .. import config, text, iso639_1
import time import time
import random import random
info = {
"category": "exhentai",
"extractor": "ExhentaiExtractor",
"directory": ["{category}", "{gallery-id}"],
"filename": "{gallery-id}_{num:>04}_{imgkey}_{name}.{extension}",
"pattern": [
r"(?:https?://)?(g\.e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})",
],
}
class ExhentaiExtractor(Extractor): class ExhentaiExtractor(Extractor):
category = "exhentai"
directory_fmt = ["{category}", "{gallery-id}"]
filename_fmt = "{gallery-id}_{num:>04}_{imgkey}_{name}.{extension}"
pattern = [r"(?:https?://)?(g\.e-|ex)hentai\.org/g/(\d+)/([\da-f]{10})"]
api_url = "http://exhentai.org/api.php" api_url = "http://exhentai.org/api.php"
def __init__(self, match): def __init__(self, match):
@@ -70,7 +64,7 @@ class ExhentaiExtractor(Extractor):
def get_job_metadata(self, page): def get_job_metadata(self, page):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
data = { data = {
"category" : info["category"], "category" : self.category,
"gallery-id" : self.gid, "gallery-id" : self.gid,
"gallery-token": self.token, "gallery-token": self.token,
} }

View File

@@ -8,27 +8,19 @@
"""Extract image-urls from http://gelbooru.com/""" """Extract image-urls from http://gelbooru.com/"""
from .booru import XMLBooruExtractor from . import booru
from .. import config from .. import config
info = { class GelbooruExtractor(booru.XMLBooruExtractor):
"category": "gelbooru", """Base class for gelbooru extractors"""
"extractor": "GelbooruExtractor",
"directory": ["{category}", "{tags}"],
"filename": "{category}_{id}_{md5}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?\?page=post&s=list&tags=([^&]+).*",
],
}
class GelbooruExtractor(XMLBooruExtractor): category = "gelbooru"
api_url = "http://gelbooru.com/"
def __init__(self, match): def setup(self):
XMLBooruExtractor.__init__(self, match, info) self.params.update({"page":"dapi", "s":"post", "q":"index"})
self.api_url = "http://gelbooru.com/"
self.params = {"page":"dapi", "s":"post", "q":"index", "tags":self.tags}
self.session.cookies.update( self.session.cookies.update(
config.get(("extractor", info["category"], "cookies")) config.get(("extractor", self.category, "cookies"))
) )
def update_page(self, reset=False): def update_page(self, reset=False):
@@ -36,3 +28,16 @@ class GelbooruExtractor(XMLBooruExtractor):
self.params["pid"] += 1 self.params["pid"] += 1
else: else:
self.params["pid"] = 0 self.params["pid"] = 0
class GelbooruTagExtractor(GelbooruExtractor, booru.BooruTagExtractor):
    """Extract images from gelbooru based on search-tags"""
    # group 1 captures the value of the 'tags' query parameter
    pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?\?page=post&s=list&tags=([^&]+)"]
# TODO: find out how to access pools via gelbooru-api
# class GelbooruPoolExtractor(GelbooruExtractor, booru.BooruPoolExtractor):
# """Extract image-pools from gelbooru"""
# pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?\?page=pool&s=show&id=(\d+)"]
class GelbooruPostExtractor(GelbooruExtractor, booru.BooruPostExtractor):
    """Extract single images from gelbooru"""
    # group 1 captures the numeric value of the 'id' query parameter
    pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?\?page=post&s=view&id=(\d+)"]

View File

@@ -10,20 +10,13 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text from .. import text
import os.path
info = {
"category": "hbrowse",
"extractor": "HbrowseExtractor",
"directory": ["{category}", "{gallery-id} {title}"],
"filename": "{category}_{gallery-id}_{num:>03}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?hbrowse\.com/(\d+)/(c\d+)",
],
}
class HbrowseExtractor(Extractor): class HbrowseExtractor(Extractor):
category = "hbrowse"
directory_fmt = ["{category}", "{gallery-id} {title}"]
filename_fmt = "{category}_{gallery-id}_{num:>03}.{extension}"
pattern = [r"(?:https?://)?(?:www\.)?hbrowse\.com/(\d+)/(c\d+)"]
url_base = "http://www.hbrowse.com/thumbnails/" url_base = "http://www.hbrowse.com/thumbnails/"
def __init__(self, match): def __init__(self, match):
@@ -43,7 +36,7 @@ class HbrowseExtractor(Extractor):
def get_job_metadata(self, page): def get_job_metadata(self, page):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
data = { data = {
"category": info["category"], "category": self.category,
'gallery-id': self.gid, 'gallery-id': self.gid,
'chapter': int(self.chapter[1:]), 'chapter': int(self.chapter[1:]),
} }

View File

@@ -12,19 +12,15 @@ from .common import Extractor, Message
from .. import text from .. import text
import os.path import os.path
info = {
"category": "hentaifoundry",
"extractor": "HentaiFoundryExtractor",
"directory": ["{category}", "{artist}"],
"filename": "{category}_{index}_{title}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?hentai-foundry\.com/pictures/user/([^/]+)",
r"(?:https?://)?(?:www\.)?hentai-foundry\.com/user/([^/]+)/profile",
],
}
class HentaiFoundryExtractor(Extractor): class HentaiFoundryExtractor(Extractor):
category = "hentaifoundry"
directory_fmt = ["{category}", "{artist}"]
filename_fmt = "{category}_{index}_{title}.{extension}"
pattern = [
r"(?:https?://)?(?:www\.)?hentai-foundry\.com/pictures/user/([^/]+)",
r"(?:https?://)?(?:www\.)?hentai-foundry\.com/user/([^/]+)/profile",
]
url_base = "http://www.hentai-foundry.com/pictures/user/" url_base = "http://www.hentai-foundry.com/pictures/user/"
def __init__(self, match): def __init__(self, match):
@@ -60,7 +56,7 @@ class HentaiFoundryExtractor(Extractor):
token, pos = text.extract(page, 'hidden" value="', '"') token, pos = text.extract(page, 'hidden" value="', '"')
count, pos = text.extract(page, 'class="active" >Pictures (', ')', pos) count, pos = text.extract(page, 'class="active" >Pictures (', ')', pos)
return { return {
"category": info["category"], "category": self.category,
"artist": self.artist, "artist": self.artist,
"count": count, "count": count,
}, token }, token

View File

@@ -10,21 +10,15 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, iso639_1 from .. import text, iso639_1
import os.path
import string import string
info = {
"category": "hitomi",
"extractor": "HitomiExtractor",
"directory": ["{category}", "{gallery-id} {title}"],
"filename": "{category}_{gallery-id}_{num:>03}_{name}.{extension}",
"pattern": [
r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)\.html",
],
}
class HitomiExtractor(Extractor): class HitomiExtractor(Extractor):
category = "hitomi"
directory_fmt = ["{category}", "{gallery-id} {title}"]
filename_fmt = "{category}_{gallery-id}_{num:>03}_{name}.{extension}"
pattern = [r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)\.html"]
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) Extractor.__init__(self)
self.gid = match.group(1) self.gid = match.group(1)
@@ -62,7 +56,7 @@ class HitomiExtractor(Extractor):
series, pos = text.extract(page, '.html">', '</a>', pos) series, pos = text.extract(page, '.html">', '</a>', pos)
lang = lang.capitalize() lang = lang.capitalize()
return { return {
"category": info["category"], "category": self.category,
"gallery-id": self.gid, "gallery-id": self.gid,
"title": title, "title": title,
"artist": string.capwords(artist), "artist": string.capwords(artist),

View File

@@ -10,20 +10,13 @@
from .common import AsynchronousExtractor, Message from .common import AsynchronousExtractor, Message
from .. import text from .. import text
import os.path
info = {
"category": "imagebam",
"extractor": "ImagebamExtractor",
"directory": ["{category}", "{title} - {gallery-key}"],
"filename": "{num:>03}-{filename}",
"pattern": [
r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([^/]+).*",
],
}
class ImagebamExtractor(AsynchronousExtractor): class ImagebamExtractor(AsynchronousExtractor):
category = "imagebam"
directory_fmt = ["{category}", "{title} - {gallery-key}"]
filename_fmt = "{num:>03}-{filename}"
pattern = [r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([^/]+).*"]
url_base = "http://www.imagebam.com" url_base = "http://www.imagebam.com"
def __init__(self, match): def __init__(self, match):
@@ -47,7 +40,7 @@ class ImagebamExtractor(AsynchronousExtractor):
response.encoding = "utf-8" response.encoding = "utf-8"
page = response.text page = response.text
data = { data = {
"category": info["category"], "category": self.category,
"gallery-key": self.gkey, "gallery-key": self.gkey,
} }
data, _ = text.extract_all(page, ( data, _ = text.extract_all(page, (

View File

@@ -12,18 +12,12 @@ from .common import AsynchronousExtractor, Message
from .. import text from .. import text
import re import re
info = {
"category": "imgbox",
"extractor": "ImgboxExtractor",
"directory": ["{category}", "{title} - {gallery-key}"],
"filename": "{num:>03}-{name}",
"pattern": [
r"(?:https?://)?(?:www\.)?imgbox\.com/g/(.+)",
],
}
class ImgboxExtractor(AsynchronousExtractor): class ImgboxExtractor(AsynchronousExtractor):
category = "imgbox"
directory_fmt = ["{category}", "{title} - {gallery-key}"]
filename_fmt = "{num:>03}-{name}"
pattern = [r"(?:https?://)?(?:www\.)?imgbox\.com/g/(.+)"]
url_base = "http://imgbox.com" url_base = "http://imgbox.com"
def __init__(self, match): def __init__(self, match):
@@ -44,7 +38,7 @@ class ImgboxExtractor(AsynchronousExtractor):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
match = re.search(r"<h1>(.+) \(([^ ]+) ([^ ]+) \w+\) - (\d+)", page) match = re.search(r"<h1>(.+) \(([^ ]+) ([^ ]+) \w+\) - (\d+)", page)
return { return {
"category": info["category"], "category": self.category,
"gallery-key": self.key, "gallery-key": self.key,
"title": match.group(1), "title": match.group(1),
"date": match.group(2), "date": match.group(2),
@@ -62,7 +56,8 @@ class ImgboxExtractor(AsynchronousExtractor):
), values=data) ), values=data)
return data return data
def get_file_url(self, page): @staticmethod
def get_file_url(page):
"""Extract download-url""" """Extract download-url"""
base = "http://i.imgbox.com/" base = "http://i.imgbox.com/"
path, _ = text.extract(page, base, '"') path, _ = text.extract(page, base, '"')

View File

@@ -12,18 +12,13 @@ from .common import Extractor, Message
from .. import text from .. import text
import re import re
info = {
"category": "imgchili",
"extractor": "ImgchiliExtractor",
"directory": ["{category}", "{title} - {key}"],
"filename": "{num:>03}-{name}",
"pattern": [
r"(?:https?://)?(?:www\.)?imgchili\.net/album/([^/]+)",
],
}
class ImgchiliExtractor(Extractor): class ImgchiliExtractor(Extractor):
category = "imgchili"
directory_fmt = ["{category}", "{title} - {key}"]
filename_fmt = "{num:>03}-{name}"
pattern = [r"(?:https?://)?(?:www\.)?imgchili\.net/album/([^/]+)"]
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) Extractor.__init__(self)
self.match = match self.match = match

View File

@@ -10,20 +10,14 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text from .. import text
import os.path
info = {
"category": "imgth",
"extractor": "ImgthExtractor",
"directory": ["{category}", "{gallery-id} {title}"],
"filename": "{category}_{gallery-id}_{num:>03}.{extension}",
"pattern": [
r"(?:https?://)?imgth\.com/gallery/(\d+)",
],
}
class ImgthExtractor(Extractor): class ImgthExtractor(Extractor):
category = "imgth"
directory_fmt = ["{category}", "{gallery-id} {title}"]
filename_fmt = "{category}_{gallery-id}_{num:>03}.{extension}"
pattern = [r"(?:https?://)?imgth\.com/gallery/(\d+)"]
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) Extractor.__init__(self)
self.gid = match.group(1) self.gid = match.group(1)
@@ -55,7 +49,7 @@ class ImgthExtractor(Extractor):
def get_job_metadata(self, page): def get_job_metadata(self, page):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
data = { data = {
"category": info["category"], "category": self.category,
"gallery-id": self.gid, "gallery-id": self.gid,
} }
data, _ = text.extract_all(page, ( data, _ = text.extract_all(page, (

View File

@@ -12,18 +12,13 @@ from .common import Extractor, Message
from .. import text from .. import text
import os.path import os.path
info = {
"category": "imgur",
"extractor": "ImgurExtractor",
"directory": ["{category}", "{album-key} - {title}"],
"filename": "{category}_{album-key}_{num:>03}_{name}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?imgur\.com/(?:a|gallery)/([^/?&#]+)",
],
}
class ImgurExtractor(Extractor): class ImgurExtractor(Extractor):
category = "imgur"
directory_fmt = ["{category}", "{album-key} - {title}"]
filename_fmt = "{category}_{album-key}_{num:>03}_{name}.{extension}"
pattern = [r"(?:https?://)?(?:www\.)?imgur\.com/(?:a|gallery)/([^/?&#]+)"]
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) Extractor.__init__(self)
self.album = match.group(1) self.album = match.group(1)
@@ -43,7 +38,7 @@ class ImgurExtractor(Extractor):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
page = self.request("https://imgur.com/a/" + self.album).text page = self.request("https://imgur.com/a/" + self.album).text
data = { data = {
"category": info["category"], "category": self.category,
"album-key": self.album, "album-key": self.album,
} }
return text.extract_all(page, ( return text.extract_all(page, (

View File

@@ -10,21 +10,15 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, cloudflare from .. import text, cloudflare
import os.path
import re import re
info = {
"category": "kissmanga",
"extractor": "KissmangaExtractor",
"directory": ["{category}", "{manga}", "c{chapter:>03}{chapter-minor} - {title}"],
"filename": "{manga}_c{chapter:>03}{chapter-minor}_{page:>03}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?kissmanga\.com/Manga/.+/.+\?id=\d+",
],
}
class KissmangaExtractor(Extractor): class KissmangaExtractor(Extractor):
category = "kissmanga"
directory_fmt = ["{category}", "{manga}", "c{chapter:>03}{chapter-minor} - {title}"]
filename_fmt = "{manga}_c{chapter:>03}{chapter-minor}_{page:>03}.{extension}"
pattern = [r"(?:https?://)?(?:www\.)?kissmanga\.com/Manga/.+/.+\?id=\d+"]
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) Extractor.__init__(self)
self.url = match.group(0) self.url = match.group(0)
@@ -41,8 +35,7 @@ class KissmangaExtractor(Extractor):
data["page"] = num data["page"] = num
yield Message.Url, url, text.nameext_from_url(url, data) yield Message.Url, url, text.nameext_from_url(url, data)
@staticmethod def get_job_metadata(self, page):
def get_job_metadata(page):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
manga, pos = text.extract(page, "Read manga\n", "\n") manga, pos = text.extract(page, "Read manga\n", "\n")
cinfo, pos = text.extract(page, "", "\n", pos) cinfo, pos = text.extract(page, "", "\n", pos)
@@ -50,7 +43,7 @@ class KissmangaExtractor(Extractor):
r"(?:Vol.0*(\d+) )?(?:Ch.)?0*(\d+)(?:\.0*(\d+))?(?:: (.+))?", cinfo) r"(?:Vol.0*(\d+) )?(?:Ch.)?0*(\d+)(?:\.0*(\d+))?(?:: (.+))?", cinfo)
chminor = match.group(3) chminor = match.group(3)
return { return {
"category": info["category"], "category": self.category,
"manga": manga, "manga": manga,
"volume": match.group(1) or "", "volume": match.group(1) or "",
"chapter": match.group(2), "chapter": match.group(2),

View File

@@ -8,20 +8,21 @@
"""Extract image-urls from https://konachan.com/""" """Extract image-urls from https://konachan.com/"""
from .booru import JSONBooruExtractor from . import booru
info = { class KonachanExtractor(booru.JSONBooruExtractor):
"category": "konachan", """Base class for konachan extractors"""
"extractor": "KonachanExtractor", category = "konachan"
"directory": ["{category}", "{tags}"], api_url = "https://konachan.com/post.json"
"filename": "{category}_{id}_{md5}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?konachan\.com/post\?tags=([^&]+).*",
],
}
class KonachanExtractor(JSONBooruExtractor): class KonachanTagExtractor(KonachanExtractor, booru.BooruTagExtractor):
"""Extract images from konachan based on search-tags"""
pattern = [r"(?:https?://)?(?:www\.)?konachan\.com/post\?tags=([^&]+)"]
def __init__(self, match): class KonachanPoolExtractor(KonachanExtractor, booru.BooruPoolExtractor):
JSONBooruExtractor.__init__(self, match, info) """Extract image-pools from konachan"""
self.api_url = "http://konachan.com/post.json" pattern = [r"(?:https?://)?(?:www\.)?konachan\.com/pool/show/(\d+)"]
class KonachanPostExtractor(KonachanExtractor, booru.BooruPostExtractor):
"""Extract single images from konachan"""
pattern = [r"(?:https?://)?(?:www\.)?konachan\.com/post/show/(\d+)"]

View File

@@ -10,18 +10,13 @@
from .mangareader import MangaReaderExtractor from .mangareader import MangaReaderExtractor
info = {
"category": "mangapanda",
"extractor": "MangaPandaExtractor",
"directory": ["{category}", "{manga}", "c{chapter:>03} - {title}"],
"filename": "{manga}_c{chapter:>03}_{page:>03}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?mangapanda\.com((/[^/]+)/(\d+))",
r"(?:https?://)?(?:www\.)?mangapanda\.com(/\d+-\d+-\d+(/[^/]+)/chapter-(\d+).html)",
],
}
class MangaPandaExtractor(MangaReaderExtractor): class MangaPandaExtractor(MangaReaderExtractor):
category = info["category"] category = "mangapanda"
directory_fmt = ["{category}", "{manga}", "c{chapter:>03} - {title}"]
filename_fmt = "{manga}_c{chapter:>03}_{page:>03}.{extension}"
pattern = [
r"(?:https?://)?(?:www\.)?mangapanda\.com((/[^/]+)/(\d+))",
r"(?:https?://)?(?:www\.)?mangapanda\.com(/\d+-\d+-\d+(/[^/]+)/chapter-(\d+).html)",
]
url_base = "http://www.mangapanda.com" url_base = "http://www.mangapanda.com"

View File

@@ -10,22 +10,16 @@
from .common import AsynchronousExtractor, Message from .common import AsynchronousExtractor, Message
from .. import text from .. import text
import os.path
info = {
"category": "mangareader",
"extractor": "MangaReaderExtractor",
"directory": ["{category}", "{manga}", "c{chapter:>03} - {title}"],
"filename": "{manga}_c{chapter:>03}_{page:>03}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/]+)/(\d+))",
r"(?:https?://)?(?:www\.)?mangareader\.net(/\d+-\d+-\d+(/[^/]+)/chapter-(\d+).html)",
],
}
class MangaReaderExtractor(AsynchronousExtractor): class MangaReaderExtractor(AsynchronousExtractor):
category = info["category"] category = "mangareader"
directory_fmt = ["{category}", "{manga}", "c{chapter:>03} - {title}"]
filename_fmt = "{manga}_c{chapter:>03}_{page:>03}.{extension}"
pattern = [
r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/]+)/(\d+))",
r"(?:https?://)?(?:www\.)?mangareader\.net(/\d+-\d+-\d+(/[^/]+)/chapter-(\d+).html)",
]
url_base = "http://www.mangareader.net" url_base = "http://www.mangareader.net"
def __init__(self, match): def __init__(self, match):

View File

@@ -10,20 +10,13 @@
from .common import AsynchronousExtractor, Message from .common import AsynchronousExtractor, Message
from .. import text from .. import text
import os
info = {
"category": "mangashare",
"extractor": "MangaShareExtractor",
"directory": ["{category}", "{manga}", "c{chapter:>03} - {title}"],
"filename": "{manga}_c{chapter:>03}_{page:>03}.{extension}",
"pattern": [
r"(?:https?://)?read\.mangashare\.com/([^/]+/chapter-\d+)",
],
}
class MangaShareExtractor(AsynchronousExtractor): class MangaShareExtractor(AsynchronousExtractor):
category = "mangashare"
directory_fmt = ["{category}", "{manga}", "c{chapter:>03} - {title}"]
filename_fmt = "{manga}_c{chapter:>03}_{page:>03}.{extension}"
pattern = [r"(?:https?://)?read\.mangashare\.com/([^/]+/chapter-\d+)"]
url_fmt = "http://read.mangashare.com/{}/page{:>03}.html" url_fmt = "http://read.mangashare.com/{}/page{:>03}.html"
def __init__(self, match): def __init__(self, match):
@@ -40,11 +33,10 @@ class MangaShareExtractor(AsynchronousExtractor):
text.nameext_from_url(url, data) text.nameext_from_url(url, data)
yield Message.Url, url, data.copy() yield Message.Url, url, data.copy()
@staticmethod def get_job_metadata(self, page):
def get_job_metadata(page):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
data = { data = {
"category": info["category"], "category": self.category,
"lang": "en", "lang": "en",
"language": "English", "language": "English",
} }

View File

@@ -10,20 +10,13 @@
from .common import AsynchronousExtractor, Message from .common import AsynchronousExtractor, Message
from .. import text from .. import text
import os.path
info = {
"category": "mangastream",
"extractor": "MangaStreamExtractor",
"directory": ["{category}", "{manga}", "c{chapter:>03}{chapter-minor} - {title}"],
"filename": "{manga}_c{chapter:>03}{chapter-minor}_{page:>03}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?readms\.com/r/([^/]*/(\d+)([^/]*)?/(\d+))",
],
}
class MangaStreamExtractor(AsynchronousExtractor): class MangaStreamExtractor(AsynchronousExtractor):
category = "mangastream"
directory_fmt = ["{category}", "{manga}", "c{chapter:>03}{chapter-minor} - {title}"]
filename_fmt = "{manga}_c{chapter:>03}{chapter-minor}_{page:>03}.{extension}"
pattern = [r"(?:https?://)?(?:www\.)?readms\.com/r/([^/]*/(\d+)([^/]*)?/(\d+))"]
url_base = "https://readms.com/r/" url_base = "https://readms.com/r/"
def __init__(self, match): def __init__(self, match):
@@ -47,7 +40,7 @@ class MangaStreamExtractor(AsynchronousExtractor):
def get_job_metadata(self, page): def get_job_metadata(self, page):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
data = { data = {
"category": info["category"], "category": self.category,
"chapter": self.chapter, "chapter": self.chapter,
"chapter-minor": self.ch_minor, "chapter-minor": self.ch_minor,
"chapter-id": self.ch_id, "chapter-id": self.ch_id,
@@ -61,7 +54,8 @@ class MangaStreamExtractor(AsynchronousExtractor):
), values=data) ), values=data)
return data return data
def get_page_metadata(self, page): @staticmethod
def get_page_metadata(page):
"""Collect next url, image-url and metadata for one manga-page""" """Collect next url, image-url and metadata for one manga-page"""
nurl, pos = text.extract(page, '<div class="page">\n<a href="', '"') nurl, pos = text.extract(page, '<div class="page">\n<a href="', '"')
iurl, pos = text.extract(page, '<img id="manga-page" src="', '"', pos) iurl, pos = text.extract(page, '<img id="manga-page" src="', '"', pos)

View File

@@ -12,18 +12,13 @@ from .common import Extractor, Message
from .. import text from .. import text
import json import json
info = {
"category": "nhentai",
"extractor": "NhentaiExtractor",
"directory": ["{category}", "{gallery-id} {title}"],
"filename": "{category}_{gallery-id}_{num:>03}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?nhentai\.net/g/(\d+)",
],
}
class NhentaiExtractor(Extractor): class NhentaiExtractor(Extractor):
category = "nhentai"
directory_fmt = ["{category}", "{gallery-id} {title}"]
filename_fmt = "{category}_{gallery-id}_{num:>03}.{extension}"
pattern = [r"(?:https?://)?(?:www\.)?nhentai\.net/g/(\d+)"]
def __init__(self, match): def __init__(self, match):
Extractor.__init__(self) Extractor.__init__(self)
self.gid = match.group(1) self.gid = match.group(1)
@@ -57,7 +52,7 @@ class NhentaiExtractor(Extractor):
title_en = ginfo["title"].get("english", "") title_en = ginfo["title"].get("english", "")
title_ja = ginfo["title"].get("japanese", "") title_ja = ginfo["title"].get("japanese", "")
return { return {
"category": info["category"], "category": self.category,
"gallery-id": self.gid, "gallery-id": self.gid,
"upload-date": ginfo["upload_date"], "upload-date": ginfo["upload_date"],
"media-id": ginfo["media_id"], "media-id": ginfo["media_id"],

View File

@@ -12,18 +12,12 @@ from .common import AsynchronousExtractor, Message
from .. import config, text from .. import config, text
import re import re
info = {
"category": "nijie",
"extractor": "NijieExtractor",
"directory": ["{category}", "{artist-id}"],
"filename": "{category}_{artist-id}_{image-id}_p{index:>02}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?nijie\.info/members(?:_illust)?\.php\?id=(\d+)",
],
}
class NijieExtractor(AsynchronousExtractor): class NijieExtractor(AsynchronousExtractor):
category = "nijie"
directory_fmt = ["{category}", "{artist-id}"]
filename_fmt = "{category}_{artist-id}_{image-id}_p{index:>02}.{extension}"
pattern = [r"(?:https?://)?(?:www\.)?nijie\.info/members(?:_illust)?\.php\?id=(\d+)"]
popup_url = "https://nijie.info/view_popup.php?id=" popup_url = "https://nijie.info/view_popup.php?id="
def __init__(self, match): def __init__(self, match):
@@ -37,7 +31,7 @@ class NijieExtractor(AsynchronousExtractor):
self.session.cookies["R18"] = "1" self.session.cookies["R18"] = "1"
self.session.cookies["nijie_referer"] = "nijie.info" self.session.cookies["nijie_referer"] = "nijie.info"
self.session.cookies.update( self.session.cookies.update(
config.get(("extractor", info["category"], "cookies")) config.get(("extractor", self.category, "cookies"))
) )
def items(self): def items(self):
@@ -52,7 +46,7 @@ class NijieExtractor(AsynchronousExtractor):
def get_job_metadata(self): def get_job_metadata(self):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
return { return {
"category": info["category"], "category": self.category,
"artist-id": self.artist_id, "artist-id": self.artist_id,
} }

View File

@@ -14,19 +14,12 @@ import re
import json import json
import time import time
info = {
"category": "pixiv",
"extractor": "PixivExtractor",
"directory": ["{category}", "{artist-id}-{artist-nick}"],
"filename": "{category}_{artist-id}_{id}{num}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?pixiv\.net/member(?:_illust)?\.php\?id=(\d+)",
],
}
class PixivExtractor(Extractor): class PixivExtractor(Extractor):
category = "pixiv"
directory_fmt = ["{category}", "{artist-id}-{artist-nick}"]
filename_fmt = "{category}_{artist-id}_{id}{num}.{extension}"
pattern = [r"(?:https?://)?(?:www\.)?pixiv\.net/member(?:_illust)?\.php\?id=(\d+)"]
member_url = "http://www.pixiv.net/member_illust.php" member_url = "http://www.pixiv.net/member_illust.php"
illust_url = "http://www.pixiv.net/member_illust.php?mode=medium" illust_url = "http://www.pixiv.net/member_illust.php?mode=medium"
@@ -121,7 +114,7 @@ class PixivExtractor(Extractor):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
data = self.api.user(self.artist_id)["response"][0] data = self.api.user(self.artist_id)["response"][0]
return { return {
"category": info["category"], "category": self.category,
"artist-id": self.artist_id, "artist-id": self.artist_id,
"artist-name": data["name"], "artist-name": data["name"],
"artist-nick": data["account"], "artist-nick": data["account"],

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2014, 2015 Mike Fährmann # Copyright 2015 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -8,27 +8,27 @@
"""Extract image-urls from http://safebooru.org/""" """Extract image-urls from http://safebooru.org/"""
from .booru import XMLBooruExtractor from . import booru
info = { class SafebooruExtractor(booru.XMLBooruExtractor):
"category": "safebooru", """Base class for safebooru extractors"""
"extractor": "SafebooruExtractor",
"directory": ["{category}", "{tags}"],
"filename": "{category}_{id}_{md5}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?\?page=post&s=list&tags=([^&]+).*",
],
}
class SafebooruExtractor(XMLBooruExtractor): category = "safebooru"
api_url = "http://safebooru.org/index.php"
def __init__(self, match): def setup(self):
XMLBooruExtractor.__init__(self, match, info) self.params.update({"page":"dapi", "s":"post", "q":"index"})
self.api_url = "http://safebooru.org/index.php"
self.params = {"page":"dapi", "s":"post", "q":"index", "tags":self.tags}
def update_page(self, reset=False): def update_page(self, reset=False):
if reset is False: if reset is False:
self.params["pid"] += 1 self.params["pid"] += 1
else: else:
self.params["pid"] = 0 self.params["pid"] = 0
class SafebooruTagExtractor(SafebooruExtractor, booru.BooruTagExtractor):
"""Extract images from safebooru based on search-tags"""
pattern = [r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?\?page=post&s=list&tags=([^&]+)"]
class SafebooruPostExtractor(SafebooruExtractor, booru.BooruPostExtractor):
"""Extract single images from safebooru"""
pattern = [r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?\?page=post&s=view&id=(\d+)"]

View File

@@ -10,20 +10,13 @@
from .common import AsynchronousExtractor, Message from .common import AsynchronousExtractor, Message
from .. import text from .. import text
import os.path
info = {
"category": "sankaku",
"extractor": "SankakuExtractor",
"directory": ["{category}", "{tags}"],
"filename": "{category}_{id}_{md5}.{extension}",
"pattern": [
r"(?:https?://)?chan\.sankakucomplex\.com/\?tags=([^&]+)",
],
}
class SankakuExtractor(AsynchronousExtractor): class SankakuExtractor(AsynchronousExtractor):
category = "sankaku"
directory_fmt = ["{category}", "{tags}"]
filename_fmt = "{category}_{id}_{md5}.{extension}"
pattern = [r"(?:https?://)?chan\.sankakucomplex\.com/\?tags=([^&]+)"]
url = "https://chan.sankakucomplex.com/" url = "https://chan.sankakucomplex.com/"
def __init__(self, match): def __init__(self, match):
@@ -45,7 +38,7 @@ class SankakuExtractor(AsynchronousExtractor):
def get_job_metadata(self): def get_job_metadata(self):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
return { return {
"category": info["category"], "category": self.category,
"tags": self.tags, "tags": self.tags,
} }

View File

@@ -10,21 +10,17 @@
from .common import AsynchronousExtractor, Message from .common import AsynchronousExtractor, Message
from .. import text from .. import text
import os.path
info = {
"category": "spectrumnexus",
"extractor": "SpectrumNexusExtractor",
"directory": ["{category}", "{manga}", "c{chapter:>03}"],
"filename": "{manga}_c{chapter:>03}_{page:>03}.{extension}",
"pattern": [
r"(?:https?://)?(view\.thespectrum\.net/series/[^\.]+.html)\?ch=Chapter\+(\d+)",
r"(?:https?://)?(view\.thespectrum\.net/series/[^/]+-chapter-(\d+)\.html)",
],
}
class SpectrumNexusExtractor(AsynchronousExtractor): class SpectrumNexusExtractor(AsynchronousExtractor):
category = "spectrumnexus"
directory_fmt = ["{category}", "{manga}", "c{chapter:>03}"]
filename_fmt = "{manga}_c{chapter:>03}_{page:>03}.{extension}"
pattern = [
r"(?:https?://)?(view\.thespectrum\.net/series/[^\.]+.html)\?ch=Chapter\+(\d+)",
r"(?:https?://)?(view\.thespectrum\.net/series/[^/]+-chapter-(\d+)\.html)",
]
def __init__(self, match): def __init__(self, match):
AsynchronousExtractor.__init__(self) AsynchronousExtractor.__init__(self)
self.url = "http://" + match.group(1) self.url = "http://" + match.group(1)
@@ -52,7 +48,7 @@ class SpectrumNexusExtractor(AsynchronousExtractor):
def get_job_metadata(self, page): def get_job_metadata(self, page):
"""Collect metadata for extractor-job""" """Collect metadata for extractor-job"""
data = { data = {
"category": info["category"], "category": self.category,
"chapter": self.chapter, "chapter": self.chapter,
} }
return text.extract_all(page, ( return text.extract_all(page, (

View File

@@ -8,20 +8,21 @@
"""Extract image-urls from https://yande.re/""" """Extract image-urls from https://yande.re/"""
from .booru import JSONBooruExtractor from . import booru
info = { class YandereExtractor(booru.JSONBooruExtractor):
"category": "yandere", """Base class for yandere extractors"""
"extractor": "YandereExtractor", category = "yandere"
"directory": ["{category}", "{tags}"], api_url = "https://yande.re/post.json"
"filename": "{category}_{id}_{md5}.{extension}",
"pattern": [
r"(?:https?://)?(?:www\.)?yande\.re/post\?tags=([^&]+).*",
],
}
class YandereExtractor(JSONBooruExtractor): class YandereTagExtractor(YandereExtractor, booru.BooruTagExtractor):
"""Extract images from yandere based on search-tags"""
pattern = [r"(?:https?://)?(?:www\.)?yande\.re/post\?tags=([^&]+)"]
def __init__(self, match): class YanderePoolExtractor(YandereExtractor, booru.BooruPoolExtractor):
JSONBooruExtractor.__init__(self, match, info) """Extract image-pools from yandere"""
self.api_url = "https://yande.re/post.json" pattern = [r"(?:https?://)?(?:www\.)?yande.re/pool/show/(\d+)"]
class YanderePostExtractor(YandereExtractor, booru.BooruPostExtractor):
"""Extract single images from yandere"""
pattern = [r"(?:https?://)?(?:www\.)?yande.re/post/show/(\d+)"]

View File

@@ -14,19 +14,19 @@ from .extractor.common import Message
class DownloadJob(): class DownloadJob():
def __init__(self, url): def __init__(self, url):
self.extractor, self.info = extractor.find(url) self.extractor = extractor.find(url)
if self.extractor is None: if self.extractor is None:
print(url, ": No extractor found", sep="", file=sys.stderr) print(url, ": No extractor found", sep="", file=sys.stderr)
return return
self.directory = self.get_base_directory() self.directory = self.get_base_directory()
self.downloaders = {} self.downloaders = {}
self.filename_fmt = config.get( self.filename_fmt = config.get(
("extractor", self.info["category"], "filename"), ("extractor", self.extractor.category, "filename"),
default=self.info["filename"] default=self.extractor.filename_fmt
) )
segments = config.get( segments = config.get(
("extractor", self.info["category"], "directory"), ("extractor", self.extractor.category, "directory"),
default=self.info["directory"] default=self.extractor.directory_fmt
) )
self.directory_fmt = os.path.join(*segments) self.directory_fmt = os.path.join(*segments)
@@ -51,7 +51,7 @@ class DownloadJob():
elif msg[0] == Message.Version: elif msg[0] == Message.Version:
if msg[1] != 1: if msg[1] != 1:
raise "unsupported message-version ({}, {})".format( raise "unsupported message-version ({}, {})".format(
self.info.category, msg[1] self.extractor.category, msg[1]
) )
# TODO: support for multiple message versions # TODO: support for multiple message versions
@@ -118,7 +118,7 @@ class DownloadJob():
class KeywordJob(): class KeywordJob():
def __init__(self, url): def __init__(self, url):
self.extractor, self.info = extractor.find(url) self.extractor = extractor.find(url)
if self.extractor is None: if self.extractor is None:
print(url, ": No extractor found", sep="", file=sys.stderr) print(url, ": No extractor found", sep="", file=sys.stderr)
return return