diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 3577c3e6..611603ec 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -92,11 +92,8 @@ modules = [
"pururin",
"reactor",
"readcomiconline",
- "realbooru",
"reddit",
"redgifs",
- "rule34",
- "safebooru",
"sankaku",
"sankakucomplex",
"seiga",
@@ -122,6 +119,7 @@ modules = [
"xhamster",
"xvideos",
"yuki",
+ "booru",
"moebooru",
"foolfuuka",
"foolslide",
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index 378e1440..517df935 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -1,247 +1,248 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2020 Mike Fährmann
+# Copyright 2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Base classes for extractors for danbooru and co"""
+"""Extractors for *booru sites"""
+
+from .common import Extractor, Message, generate_extractors
+from .. import text, util, exception
-from .common import Extractor, Message
-from .. import text, exception
from xml.etree import ElementTree
import collections
-import datetime
-import operator
import re
class BooruExtractor(Extractor):
- """Base class for all booru extractors"""
+ """Base class for *booru extractors"""
basecategory = "booru"
filename_fmt = "{category}_{id}_{md5}.{extension}"
- api_url = ""
- post_url = ""
- per_page = 50
- page_start = 1
- page_limit = None
- sort = False
+ page_start = 0
+ per_page = 100
- def __init__(self, match):
- super().__init__(match)
- self.params = {}
- self.extags = self.post_url and self.config("tags", False)
+ def items(self):
+ self.login()
+ extended_tags = self.config("tags", False)
+ data = self.metadata()
+ for post in self.posts():
+ try:
+ url = self._prepare_post(post, extended_tags)
+ except KeyError:
+ continue
+ post.update(data)
+ text.nameext_from_url(url, post)
+ yield Message.Directory, post
+ yield Message.Url, url, post
def skip(self, num):
pages = num // self.per_page
- if self.page_limit and pages + self.page_start > self.page_limit:
- pages = self.page_limit - self.page_start
self.page_start += pages
return pages * self.per_page
- def items(self):
- yield Message.Version, 1
- data = self.get_metadata()
+ def login(self):
+ """Login and set necessary cookies"""
- self.reset_page()
- while True:
- images = self.parse_response(
- self.request(self.api_url, params=self.params))
+ def metadata(self):
+ """Return a dict with general metadata"""
+ return ()
- for image in images:
- try:
- url = self.get_file_url(image)
- except KeyError:
- continue
- if url.startswith("/"):
- url = text.urljoin(self.api_url, url)
- image.update(data)
- text.nameext_from_url(url, image)
- if self.extags:
- self.extended_tags(image)
- yield Message.Directory, image
- yield Message.Url, url, image
+ def posts(self):
+ """Return an iterable with post objects"""
+ return ()
- if len(images) < self.per_page:
- return
- self.update_page(image)
+ def _prepare_post(self, post, extended_tags=False):
+ url = post["file_url"]
+ if url[0] == "/":
+ url = self.root + url
+ if extended_tags:
+ self._fetch_extended_tags(post)
+ post["date"] = text.parse_datetime(
+ post["created_at"], "%a %b %d %H:%M:%S %z %Y")
+ return url
- def reset_page(self):
- """Initialize params to point to the first page"""
- self.params["page"] = self.page_start
-
- def update_page(self, data):
- """Update params to point to the next page"""
-
- def parse_response(self, response):
- """Parse JSON API response"""
- images = response.json()
- if self.sort:
- images.sort(key=operator.itemgetter("score", "id"),
- reverse=True)
- return images
-
- def get_metadata(self):
- """Collect metadata for extractor-job"""
- return {}
-
- @staticmethod
- def get_file_url(image):
- return image["file_url"]
-
- def extended_tags(self, image, page=None):
- """Retrieve extended tag information"""
+ def _fetch_extended_tags(self, post, page=None):
if not page:
- url = self.post_url.format(image["id"])
+ url = "{}/index.php?page=post&s=view&id={}".format(
+ self.root, post["id"])
page = self.request(url).text
- tags = collections.defaultdict(list)
- tags_html = text.extract(page, '
Now Viewing: ", "")
- if not name:
- name, pos = text.extract(page, "Pool: ", "
")
+ name, pos = text.extract(page, "<h4>Pool: ", "</h4>")
if not name:
raise exception.NotFoundError("pool")
- self.posts = list(text.extract_iter(
- page, 'class="thumb" id="p', '"', pos))
+ self.post_ids = text.extract_iter(
+ page, 'class="thumb" id="p', '"', pos)
return {
- "pool": text.parse_int(self.pool),
+ "pool": text.parse_int(self.pool_id),
"pool_name": text.unescape(name),
- "count": len(self.posts),
}
- def reset_page(self):
- self.index = self.page_start
- self.update_page(None)
-
- def update_page(self, data):
- try:
- post = self.posts[self.index]
- self.index += 1
- except IndexError:
- post = "0"
- self.params["tags"] = "id:" + post
+ def posts(self):
+ params = {}
+ for params["id"] in util.advance(self.post_ids, self.page_start):
+ for post in self._api_request(params):
+ yield post.attrib
-class PostMixin():
- """Extraction of a single image-post"""
- subcategory = "post"
- archive_fmt = "{id}"
+EXTRACTORS = {
+ "rule34": {
+ "root": "https://rule34.xxx",
+ "test-tag": (
+ ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
+ "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
+ "pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
+ "count": 1,
+ }),
+ ),
+ "test-pool": (
+ ("https://rule34.xxx/index.php?page=pool&s=show&id=179", {
+ "count": 3,
+ }),
+ ),
+ "test-post": (
+ ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", {
+ "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "danraku",
+ "tags_character": "kashima_(kantai_collection)",
+ "tags_copyright": "kantai_collection",
+ "tags_general": str,
+ "tags_metadata": str,
+ },
+ }),
+ ),
+ },
+ "safebooru": {
+ "root": "https://safebooru.org",
+ "test-tag": (
+ ("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
+ "url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
+ "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
+ }),
+ ),
+ "test-pool": (
+ ("https://safebooru.org/index.php?page=pool&s=show&id=11", {
+ "count": 5,
+ }),
+ ),
+ "test-post": (
+ ("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
+ "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
+ "content": "93b293b27dabd198afafabbaf87c49863ac82f27",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "kawanakajima",
+ "tags_character": "heath_ledger ronald_mcdonald the_joker",
+ "tags_copyright": "dc_comics mcdonald's the_dark_knight",
+ "tags_general": str,
+ },
+ }),
+ ),
+ },
+ "realbooru": {
+ "root": "https://realbooru.com",
+ "test-tag": (
+ ("https://realbooru.com/index.php?page=post&s=list&tags=wine", {
+ "count": ">= 64",
+ }),
+ ),
+ "test-pool": (
+ ("https://realbooru.com/index.php?page=pool&s=show&id=1", {
+ "count": 3,
+ }),
+ ),
+ "test-post": (
+ ("https://realbooru.com/index.php?page=post&s=view&id=668483", {
+ "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9",
+ "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
+ }),
+ ),
+ },
+}
- def __init__(self, match):
- super().__init__(match)
- self.post = match.group("post")
- self.params["tags"] = "id:" + self.post
-
-
-class MoebooruPopularMixin():
- """Extraction and metadata handling for Moebooru and Danbooru v1"""
- subcategory = "popular"
- directory_fmt = ("{category}", "popular", "{scale}", "{date}")
- archive_fmt = "P_{scale[0]}_{date}_{id}"
- page_start = None
- sort = True
-
- def __init__(self, match):
- super().__init__(match)
- self.params.update(text.parse_query(match.group("query")))
- self.scale = match.group("scale")
-
- def get_metadata(self, fmt="%Y-%m-%d"):
- date = self.get_date() or datetime.date.today().isoformat()
- scale = self.get_scale() or "day"
-
- if scale == "week":
- date = datetime.date.fromisoformat(date)
- date = (date - datetime.timedelta(days=date.weekday())).isoformat()
- elif scale == "month":
- date = date[:-3]
-
- return {"date": date, "scale": scale}
-
- def get_date(self):
- if "year" in self.params:
- return "{:>04}-{:>02}-{:>02}".format(
- self.params["year"],
- self.params.get("month", "01"),
- self.params.get("day", "01"))
- return None
-
- def get_scale(self):
- if self.scale and self.scale.startswith("by_"):
- return self.scale[3:]
- return self.scale
+generate_extractors(EXTRACTORS, globals(), (
+ BooruTagExtractor,
+ BooruPoolExtractor,
+ BooruPostExtractor,
+))
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index c32ba5c0..b0614e20 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -6,98 +6,27 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://gelbooru.com/"""
+"""Extractors for https://gelbooru.com/"""
from . import booru
-from .common import Message
-from .. import text
+from .. import text, exception
-class GelbooruExtractor(booru.XmlParserMixin,
- booru.GelbooruPageMixin,
- booru.BooruExtractor):
+class GelbooruBase():
"""Base class for gelbooru extractors"""
category = "gelbooru"
- api_url = "https://gelbooru.com/index.php"
- post_url = "https://gelbooru.com/index.php?page=post&s=view&id={}"
- pool_url = "https://gelbooru.com/index.php?page=pool&s=show&id={}"
+ root = "https://gelbooru.com"
- def __init__(self, match):
- super().__init__(match)
-
- self.use_api = self.config("api", True)
- if self.use_api:
- self.params.update({"page": "dapi", "s": "post", "q": "index"})
- else:
- self.items = self.items_noapi
- self.session.cookies["fringeBenefits"] = "yup"
- self.per_page = 42
-
- @staticmethod
- def get_file_url(image):
- url = image["file_url"]
+ def _prepare_post(self, post, extended_tags=False):
+ url = booru.BooruExtractor._prepare_post(self, post, extended_tags)
if url.startswith("https://mp4.gelbooru.com/"):
- ihash = image["md5"]
+ md5 = post["md5"]
return "https://img2.gelbooru.com/images/{}/{}/{}.webm".format(
- ihash[0:2], ihash[2:4], ihash)
+ md5[0:2], md5[2:4], md5)
return url
- def items_noapi(self):
- yield Message.Version, 1
- data = self.get_metadata()
- for post in self.get_posts():
- post = self.get_post_data(post)
- url = post["file_url"]
- post.update(data)
- text.nameext_from_url(url, post)
- yield Message.Directory, post
- yield Message.Url, url, post
-
- def get_posts(self):
- """Return an iterable containing all relevant post objects"""
- url = "https://gelbooru.com/index.php?page=post&s=list"
- params = {
- "tags": self.params["tags"],
- "pid" : self.page_start * self.per_page
- }
-
- while True:
- page = self.request(url, params=params).text
- ids = list(text.extract_iter(page, 'Id: ', '<'),
- ("created_at", '- Posted: ', '<'),
- ("width" , '
- Size: ', 'x'),
- ("height" , '', '<'),
- ("source" , '
- Source: Rating: ', '<'),
- (None , '
- Score: ', ''),
- ("score" , '>', '<'),
- ("file_url" , '
- [^]+)")
@@ -112,7 +41,7 @@ class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor):
)
-class GelbooruPoolExtractor(booru.PoolMixin, GelbooruExtractor):
+class GelbooruPoolExtractor(GelbooruBase, booru.BooruPoolExtractor):
"""Extractor for image-pools from gelbooru.com"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=pool&s=show&id=(?P<pool>\d+)")
@@ -126,8 +55,23 @@ class GelbooruPoolExtractor(booru.PoolMixin, GelbooruExtractor):
}),
)
+ def metadata(self):
+ url = "{}/index.php?page=pool&s=show&id={}".format(
+ self.root, self.pool_id)
+ page = self.request(url).text
-class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor):
+ name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>")
+ if not name:
+ raise exception.NotFoundError("pool")
+ self.post_ids = text.extract_iter(page, 'class="" id="p', '"', pos)
+
+ return {
+ "pool": text.parse_int(self.pool_id),
+ "pool_name": text.unescape(name),
+ }
+
+
+class GelbooruPostExtractor(GelbooruBase, booru.BooruPostExtractor):
"""Extractor for single images from gelbooru.com"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=post&s=view&id=(?P<post>\d+)")
@@ -135,6 +79,3 @@ class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor):
"content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
"count": 1,
})
-
- def get_posts(self):
- return (self.post,)
diff --git a/gallery_dl/extractor/realbooru.py b/gallery_dl/extractor/realbooru.py
deleted file mode 100644
index 1d2140ad..00000000
--- a/gallery_dl/extractor/realbooru.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2019 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://realbooru.com/"""
-
-from . import booru
-
-
-class RealbooruExtractor(booru.XmlParserMixin,
- booru.GelbooruPageMixin,
- booru.BooruExtractor):
- """Base class for realbooru extractors"""
- category = "realbooru"
- api_url = "https://realbooru.com/index.php"
- post_url = "https://realbooru.com/index.php?page=post&s=view&id={}"
- pool_url = "https://realbooru.com/index.php?page=pool&s=show&id={}"
-
- def __init__(self, match):
- super().__init__(match)
- self.params.update({"page": "dapi", "s": "post", "q": "index"})
-
-
-class RealbooruTagExtractor(booru.TagMixin, RealbooruExtractor):
- """Extractor for images from realbooru.com based on search-tags"""
- pattern = (r"(?:https?://)?(?:www\.)?realbooru\.com/(?:index\.php)?"
- r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
- test = ("https://realbooru.com/index.php?page=post&s=list&tags=wine", {
- "count": ">= 64",
- })
-
-
-class RealbooruPoolExtractor(booru.GelbooruPoolMixin, RealbooruExtractor):
- """Extractor for image-pools from realbooru.com"""
- pattern = (r"(?:https?://)?(?:www\.)?realbooru\.com/(?:index\.php)?"
- r"\?page=pool&s=show&id=(?P<pool>\d+)")
- test = ("https://realbooru.com/index.php?page=pool&s=show&id=1", {
- "count": 3,
- })
-
-
-class RealbooruPostExtractor(booru.PostMixin, RealbooruExtractor):
- """Extractor for single images from realbooru.com"""
- pattern = (r"(?:https?://)?(?:www\.)?realbooru\.com/(?:index\.php)?"
- r"\?page=post&s=view&id=(?P<post>\d+)")
- test = ("https://realbooru.com/index.php?page=post&s=view&id=668483", {
- "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9",
- "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
- # "options": (("tags", True),),
- # "keyword": {
- # "tags_general" : str,
- # "tags_metadata": str,
- # "tags_model" : "jennifer_lawrence",
- # },
- })
diff --git a/gallery_dl/extractor/rule34.py b/gallery_dl/extractor/rule34.py
deleted file mode 100644
index de7ef451..00000000
--- a/gallery_dl/extractor/rule34.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2016-2019 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://rule34.xxx/"""
-
-from . import booru
-
-
-class Rule34Extractor(booru.XmlParserMixin,
- booru.GelbooruPageMixin,
- booru.BooruExtractor):
- """Base class for rule34 extractors"""
- category = "rule34"
- api_url = "https://rule34.xxx/index.php"
- post_url = "https://rule34.xxx/index.php?page=post&s=view&id={}"
- pool_url = "https://rule34.xxx/index.php?page=pool&s=show&id={}"
- page_limit = 4000
-
- def __init__(self, match):
- super().__init__(match)
- self.params.update({"page": "dapi", "s": "post", "q": "index"})
-
-
-class Rule34TagExtractor(booru.TagMixin, Rule34Extractor):
- """Extractor for images from rule34.xxx based on search-tags"""
- pattern = (r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
- r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
- test = ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
- "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
- "pattern": r"https?://([^.]+\.)?rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
- "count": 1,
- })
-
-
-class Rule34PoolExtractor(booru.GelbooruPoolMixin, Rule34Extractor):
- """Extractor for image-pools from rule34.xxx"""
- pattern = (r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
- r"\?page=pool&s=show&id=(?P<pool>\d+)")
- test = ("https://rule34.xxx/index.php?page=pool&s=show&id=179", {
- "count": 3,
- })
-
-
-class Rule34PostExtractor(booru.PostMixin, Rule34Extractor):
- """Extractor for single images from rule34.xxx"""
- pattern = (r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
- r"\?page=post&s=view&id=(?P<post>\d+)")
- test = ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", {
- "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
- "options": (("tags", True),),
- "keyword": {
- "tags_artist": "danraku",
- "tags_character": "kashima_(kantai_collection)",
- "tags_copyright": "kantai_collection",
- "tags_general": str,
- "tags_metadata": str,
- },
- })
diff --git a/gallery_dl/extractor/safebooru.py b/gallery_dl/extractor/safebooru.py
deleted file mode 100644
index f5f058cd..00000000
--- a/gallery_dl/extractor/safebooru.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2015-2019 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://safebooru.org/"""
-
-from . import booru
-
-
-class SafebooruExtractor(booru.XmlParserMixin,
- booru.GelbooruPageMixin,
- booru.BooruExtractor):
- """Base class for safebooru extractors"""
- category = "safebooru"
- api_url = "https://safebooru.org/index.php"
- post_url = "https://safebooru.org/index.php?page=post&s=view&id={}"
- pool_url = "https://safebooru.org/index.php?page=pool&s=show&id={}"
-
- def __init__(self, match):
- super().__init__(match)
- self.params.update({"page": "dapi", "s": "post", "q": "index"})
-
-
-class SafebooruTagExtractor(booru.TagMixin, SafebooruExtractor):
- """Extractor for images from safebooru.org based on search-tags"""
- pattern = (r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
- r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
- test = ("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
- "url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
- "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
- })
-
-
-class SafebooruPoolExtractor(booru.GelbooruPoolMixin, SafebooruExtractor):
- """Extractor for image-pools from safebooru.org"""
- pattern = (r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
- r"\?page=pool&s=show&id=(?P<pool>\d+)")
- test = ("https://safebooru.org/index.php?page=pool&s=show&id=11", {
- "count": 5,
- })
-
-
-class SafebooruPostExtractor(booru.PostMixin, SafebooruExtractor):
- """Extractor for single images from safebooru.org"""
- pattern = (r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
- r"\?page=post&s=view&id=(?P<post>\d+)")
- test = ("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
- "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
- "content": "93b293b27dabd198afafabbaf87c49863ac82f27",
- "options": (("tags", True),),
- "keyword": {
- "tags_artist": "kawanakajima",
- "tags_character": "heath_ledger ronald_mcdonald the_joker",
- "tags_copyright": "dc_comics mcdonald's the_dark_knight",
- "tags_general": str,
- },
- })