diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index f702b8d7..eba75452 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -129,8 +129,10 @@ class MoebooruPageMixin():
class GelbooruPageMixin():
"""Pagination for Gelbooru-like sites"""
+ page_start = 0
+
def reset_page(self):
- self.params["pid"] = self.page_start - 1
+ self.params["pid"] = self.page_start
def update_page(self, data):
self.params["pid"] += 1
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 110160a6..99ae4162 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -8,55 +8,46 @@
"""Extract images from https://gelbooru.com/"""
-from .common import SharedConfigExtractor, Message
-from .. import text, util, exception
-import xml.etree.ElementTree as ET
+from . import booru
+from .common import Message
+from .. import text, exception
-class GelbooruExtractor(SharedConfigExtractor):
+class GelbooruExtractor(booru.XmlParserMixin,
+ booru.GelbooruPageMixin,
+ booru.BooruExtractor):
"""Base class for gelbooru extractors"""
- basecategory = "booru"
category = "gelbooru"
- filename_fmt = "{category}_{id}_{md5}.{extension}"
- api_url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index"
+ api_url = "https://gelbooru.com/index.php"
+ post_url = "https://gelbooru.com/index.php?page=post&s=view&id={}"
+
+ def __init__(self, match):
+ super().__init__(match)
- def __init__(self):
- SharedConfigExtractor.__init__(self)
- self.start_post = 0
self.use_api = self.config("api", True)
if self.use_api:
- self.get_post_data = self.get_post_data_api
+ self.params.update({"page": "dapi", "s": "post", "q": "index"})
+ else:
+ self.items = self.items_noapi
- def items(self):
+ def items_noapi(self):
data = self.get_metadata()
yield Message.Version, 1
yield Message.Directory, data
- for post in util.advance(self.get_posts(), self.start_post):
- if isinstance(post, str):
- post = self.get_post_data(post)
- for key in ("id", "width", "height", "score", "change"):
- post[key] = text.parse_int(post[key])
+ for post in self.get_posts():
+ post = self.get_post_data(post)
url = post["file_url"]
post.update(data)
yield Message.Url, url, text.nameext_from_url(url, post)
- def skip(self, num):
- self.start_post += num
- return num
-
- def get_metadata(self):
- """Return general metadata"""
- return {}
-
def get_posts(self):
"""Return an iterable containing all relevant post objects"""
def get_post_data(self, post_id):
"""Extract metadata of a single post"""
- page = self.request("https://gelbooru.com/index.php?page=post&s=view"
- "&id=" + post_id).text
+ page = self.request(self.post_url.format(post_id)).text
data = text.extract_all(page, (
(None , '[^]+)"]
test = [
("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {
"count": 5,
@@ -102,42 +85,13 @@ class GelbooruTagExtractor(GelbooruExtractor):
]
def __init__(self, match):
- GelbooruExtractor.__init__(self)
- self.tags = text.unquote(match.group(1).replace("+", " "))
- self.per_page = 100 if self.use_api else 42
- self.start_page = 0
-
- def skip(self, num):
- pages, posts = divmod(num, self.per_page)
- self.start_page += pages
- self.start_post += posts
- return num
-
- def get_metadata(self):
- return {"search_tags": self.tags}
+ super().__init__(match)
+ if not self.use_api:
+ self.per_page = 42
def get_posts(self):
- if self.use_api:
- return self._get_posts_api()
- return self._get_posts_manual()
-
- def _get_posts_api(self):
- params = {
- # 'pid' is page-id; first page has index 0
- "tags": self.tags, "limit": self.per_page, "pid": self.start_page}
- while True:
- root = ET.fromstring(
- self.request(self.api_url, params=params).text)
- for item in root:
- yield item.attrib
- if len(root) < self.per_page:
- return
- params["pid"] += 1
-
- def _get_posts_manual(self):
url = "https://gelbooru.com/index.php?page=post&s=list"
- # 'pid' is post-id; values for 'pid' must be multiples of 42
- params = {"tags": self.tags, "pid": self.start_page * self.per_page}
+ params = {"tags": self.tags, "pid": self.page_start * self.per_page}
while True:
page = self.request(url, params=params).text
@@ -150,20 +104,12 @@ class GelbooruTagExtractor(GelbooruExtractor):
class GelbooruPoolExtractor(GelbooruExtractor):
"""Extractor for image-pools from gelbooru.com"""
- subcategory = "pool"
- directory_fmt = ["{category}", "pool", "{pool}"]
- archive_fmt = "p_{pool}_{id}"
pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=pool&s=show&id=(\d+)"]
test = [("https://gelbooru.com/index.php?page=pool&s=show&id=761", {
"count": 6,
})]
- def __init__(self, match):
- GelbooruExtractor.__init__(self)
- self.pool_id = match.group(1)
- self.posts = None
-
def get_metadata(self):
page = self.request("https://gelbooru.com/index.php?page=pool&s=show"
"&id=" + self.pool_id).text
@@ -183,20 +129,14 @@ class GelbooruPoolExtractor(GelbooruExtractor):
return self.posts
-class GelbooruPostExtractor(GelbooruExtractor):
+class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor):
"""Extractor for single images from gelbooru.com"""
- subcategory = "post"
- archive_fmt = "{id}"
pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
- r"\?page=post&s=view&id=(\d+)"]
+ r"\?page=post&s=view&id=(?P\d+)"]
test = [("https://gelbooru.com/index.php?page=post&s=view&id=313638", {
"content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
"count": 1,
})]
- def __init__(self, match):
- GelbooruExtractor.__init__(self)
- self.post_id = match.group(1)
-
def get_posts(self):
- return (self.post_id,)
+ return (self.post,)