diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 3577c3e6..611603ec 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -92,11 +92,8 @@ modules = [ "pururin", "reactor", "readcomiconline", - "realbooru", "reddit", "redgifs", - "rule34", - "safebooru", "sankaku", "sankakucomplex", "seiga", @@ -122,6 +119,7 @@ modules = [ "xhamster", "xvideos", "yuki", + "booru", "moebooru", "foolfuuka", "foolslide", diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index 378e1440..517df935 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -1,247 +1,248 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2020 Mike Fährmann +# Copyright 2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Base classes for extractors for danbooru and co""" +"""Extractors for *booru sites""" + +from .common import Extractor, Message, generate_extractors +from .. import text, util, exception -from .common import Extractor, Message -from .. import text, exception from xml.etree import ElementTree import collections -import datetime -import operator import re class BooruExtractor(Extractor): - """Base class for all booru extractors""" + """Base class for *booru extractors""" basecategory = "booru" filename_fmt = "{category}_{id}_{md5}.{extension}" - api_url = "" - post_url = "" - per_page = 50 - page_start = 1 - page_limit = None - sort = False + page_start = 0 + per_page = 100 - def __init__(self, match): - super().__init__(match) - self.params = {} - self.extags = self.post_url and self.config("tags", False) + def items(self): + self.login() + extended_tags = self.config("tags", False) + data = self.metadata() + for post in self.posts(): + try: + url = self._prepare_post(post, extended_tags) + except KeyError: + continue + post.update(data) + text.nameext_from_url(url, post) + yield Message.Directory, post + yield Message.Url, url, post def skip(self, num): pages = num // self.per_page - if self.page_limit and pages + self.page_start > self.page_limit: - pages = self.page_limit - self.page_start self.page_start += pages return pages * self.per_page - def items(self): - yield Message.Version, 1 - data = self.get_metadata() + def login(self): + """Login and set necessary cookies""" - self.reset_page() - while True: - images = self.parse_response( - self.request(self.api_url, params=self.params)) + def metadata(self): + """Return a dict with general metadata""" + return () - for image in images: - try: - url = self.get_file_url(image) - except KeyError: - continue - if url.startswith("/"): - url = text.urljoin(self.api_url, url) - image.update(data) - text.nameext_from_url(url, image) - if self.extags: - self.extended_tags(image) - yield Message.Directory, image - yield Message.Url, url, image + def posts(self): + """Return an iterable with post objects""" + return () - if len(images) < self.per_page: - return - self.update_page(image) + def _prepare_post(self, post, extended_tags=False): + url = post["file_url"] + if url[0] == "/": + url = self.root + url + if extended_tags: + self._fetch_extended_tags(post) + post["date"] = text.parse_datetime( + post["created_at"], "%a %b %d %H:%M:%S %z %Y") + return url - def reset_page(self): - """Initialize params to point to the first page""" - self.params["page"] = self.page_start - - def update_page(self, data): - """Update params to point to the next page""" - - def parse_response(self, response): - """Parse JSON API response""" - images = response.json() - if self.sort: - images.sort(key=operator.itemgetter("score", "id"), - reverse=True) - return images - - def get_metadata(self): - """Collect metadata for extractor-job""" - return {} - - @staticmethod - def get_file_url(image): - return image["file_url"] - - def extended_tags(self, image, page=None): - """Retrieve extended tag information""" + def _fetch_extended_tags(self, post, page=None): if not page: - url = self.post_url.format(image["id"]) + url = "{}/index.php?page=post&s=view&id={}".format( + self.root, post["id"]) page = self.request(url).text - tags = collections.defaultdict(list) - tags_html = text.extract(page, '