move extractors from booru.py into their own gelbooru_v02 module

This commit is contained in:
Mike Fährmann
2021-02-17 00:12:51 +01:00
parent 04e7a7c8a9
commit 08d7934c6e
4 changed files with 200 additions and 179 deletions

View File

@@ -35,6 +35,7 @@ modules = [
"furaffinity", "furaffinity",
"fuskator", "fuskator",
"gelbooru", "gelbooru",
"gelbooru_v02",
"gfycat", "gfycat",
"hbrowse", "hbrowse",
"hentai2read", "hentai2read",

View File

@@ -9,12 +9,8 @@
"""Extractors for *booru sites""" """Extractors for *booru sites"""
from .common import BaseExtractor, Message from .common import BaseExtractor, Message
from .. import text, util, exception from .. import text
from xml.etree import ElementTree
import collections
import operator import operator
import re
class BooruExtractor(BaseExtractor): class BooruExtractor(BaseExtractor):
@@ -66,174 +62,8 @@ class BooruExtractor(BaseExtractor):
_file_url = operator.itemgetter("file_url") _file_url = operator.itemgetter("file_url")
@staticmethod def _prepare(self, post):
def _prepare(post): """Prepare the 'post's metadata"""
post["date"] = text.parse_datetime(
post["created_at"], "%a %b %d %H:%M:%S %z %Y")
def _extended_tags(self, post, page=None): def _extended_tags(self, post, page=None):
if not page: """Generate extended tag information"""
url = "{}/index.php?page=post&s=view&id={}".format(
self.root, post["id"])
page = self.request(url).text
html = text.extract(page, '<ul id="tag-', '</ul>')[0]
if html:
tags = collections.defaultdict(list)
pattern = re.compile(
r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
for tag_type, tag_name in pattern.findall(html):
tags[tag_type].append(text.unquote(tag_name))
for key, value in tags.items():
post["tags_" + key] = " ".join(value)
def _api_request(self, params):
url = self.root + "/index.php?page=dapi&s=post&q=index"
return ElementTree.fromstring(self.request(url, params=params).text)
def _pagination(self, params):
params["pid"] = self.page_start
params["limit"] = self.per_page
while True:
root = self._api_request(params)
for post in root:
yield post.attrib
if len(root) < self.per_page:
return
params["pid"] += 1
BASE_PATTERN = BooruExtractor.update({
"rule34": {
"root": "https://rule34.xxx",
},
"safebooru": {
"root": "https://safebooru.org",
},
"realbooru": {
"root": "https://realbooru.com",
},
})
class BooruPostExtractor(BooruExtractor):
subcategory = "post"
archive_fmt = "{id}"
pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)"
test = (
("https://rule34.xxx/index.php?page=post&s=view&id=1995545", {
"content": "97e4bbf86c3860be18de384d02d544251afe1d45",
"options": (("tags", True),),
"keyword": {
"tags_artist": "danraku",
"tags_character": "kashima_(kantai_collection)",
"tags_copyright": "kantai_collection",
"tags_general": str,
"tags_metadata": str,
},
}),
("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
"url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
"content": "93b293b27dabd198afafabbaf87c49863ac82f27",
"options": (("tags", True),),
"keyword": {
"tags_artist": "kawanakajima",
"tags_character": "heath_ledger ronald_mcdonald the_joker",
"tags_copyright": "dc_comics mcdonald's the_dark_knight",
"tags_general": str,
},
}),
("https://realbooru.com/index.php?page=post&s=view&id=668483", {
"url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9",
"content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
}),
)
def __init__(self, match):
BooruExtractor.__init__(self, match)
self.post_id = match.group(match.lastindex)
def posts(self):
return self._pagination({"id": self.post_id})
class BooruTagExtractor(BooruExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)"
test = (
("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
"content": "97e4bbf86c3860be18de384d02d544251afe1d45",
"pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
"count": 1,
}),
("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
"url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
"content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
}),
("https://realbooru.com/index.php?page=post&s=list&tags=wine", {
"count": ">= 64",
}),
)
def __init__(self, match):
BooruExtractor.__init__(self, match)
tags = match.group(match.lastindex)
self.tags = text.unquote(tags.replace("+", " "))
def metadata(self):
return {"search_tags": self.tags}
def posts(self):
return self._pagination({"tags" : self.tags})
class BooruPoolExtractor(BooruExtractor):
subcategory = "pool"
directory_fmt = ("{category}", "pool", "{pool}")
archive_fmt = "p_{pool}_{id}"
pattern = BASE_PATTERN + r"/index\.php\?page=pool&s=show&id=(\d+)"
test = (
("https://rule34.xxx/index.php?page=pool&s=show&id=179", {
"count": 3,
}),
("https://safebooru.org/index.php?page=pool&s=show&id=11", {
"count": 5,
}),
("https://realbooru.com/index.php?page=pool&s=show&id=1", {
"count": 3,
}),
)
def __init__(self, match):
BooruExtractor.__init__(self, match)
self.pool_id = match.group(match.lastindex)
self.post_ids = ()
def skip(self, num):
self.page_start += num
return num
def metadata(self):
url = "{}/index.php?page=pool&s=show&id={}".format(
self.root, self.pool_id)
page = self.request(url).text
name, pos = text.extract(page, "<h4>Pool: ", "</h4>")
if not name:
raise exception.NotFoundError("pool")
self.post_ids = text.extract_iter(
page, 'class="thumb" id="p', '"', pos)
return {
"pool": text.parse_int(self.pool_id),
"pool_name": text.unescape(name),
}
def posts(self):
params = {}
for params["id"] in util.advance(self.post_ids, self.page_start):
for post in self._api_request(params):
yield post.attrib

View File

@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2014-2020 Mike Fährmann # Copyright 2014-2021 Mike Fährmann
# #
# This program is free software; you can redistribute it and/or modify # This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
@@ -8,13 +8,14 @@
"""Extractors for https://gelbooru.com/""" """Extractors for https://gelbooru.com/"""
from . import booru from . import gelbooru_v02
from .. import text, exception from .. import text, exception
class GelbooruBase(): class GelbooruBase():
"""Base class for gelbooru extractors""" """Base class for gelbooru extractors"""
category = "gelbooru" category = "gelbooru"
basecategory = "booru"
root = "https://gelbooru.com" root = "https://gelbooru.com"
@staticmethod @staticmethod
@@ -27,7 +28,8 @@ class GelbooruBase():
return url return url
class GelbooruTagExtractor(GelbooruBase, booru.BooruTagExtractor): class GelbooruTagExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02TagExtractor):
"""Extractor for images from gelbooru.com based on search-tags""" """Extractor for images from gelbooru.com based on search-tags"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=post&s=list&tags=(?P<tags>[^&#]+)") r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
@@ -42,7 +44,8 @@ class GelbooruTagExtractor(GelbooruBase, booru.BooruTagExtractor):
) )
class GelbooruPoolExtractor(GelbooruBase, booru.BooruPoolExtractor): class GelbooruPoolExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02PoolExtractor):
"""Extractor for image-pools from gelbooru.com""" """Extractor for image-pools from gelbooru.com"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=pool&s=show&id=(?P<pool>\d+)") r"\?page=pool&s=show&id=(?P<pool>\d+)")
@@ -72,7 +75,8 @@ class GelbooruPoolExtractor(GelbooruBase, booru.BooruPoolExtractor):
} }
class GelbooruPostExtractor(GelbooruBase, booru.BooruPostExtractor): class GelbooruPostExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02PostExtractor):
"""Extractor for single images from gelbooru.com""" """Extractor for single images from gelbooru.com"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=post&s=view&id=(?P<post>\d+)") r"\?page=post&s=view&id=(?P<post>\d+)")

View File

@@ -0,0 +1,186 @@
# -*- coding: utf-8 -*-
# Copyright 2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for Gelbooru v0.2 sites"""
from . import booru
from .. import text, util, exception
from xml.etree import ElementTree
import collections
import re
class GelbooruV02Extractor(booru.BooruExtractor):
basecategory = "gelbooru_v02"
def _api_request(self, params):
url = self.root + "/index.php?page=dapi&s=post&q=index"
return ElementTree.fromstring(self.request(url, params=params).text)
def _pagination(self, params):
params["pid"] = self.page_start
params["limit"] = self.per_page
while True:
root = self._api_request(params)
for post in root:
yield post.attrib
if len(root) < self.per_page:
return
params["pid"] += 1
@staticmethod
def _prepare(post):
post["date"] = text.parse_datetime(
post["created_at"], "%a %b %d %H:%M:%S %z %Y")
def _extended_tags(self, post, page=None):
if not page:
url = "{}/index.php?page=post&s=view&id={}".format(
self.root, post["id"])
page = self.request(url).text
html = text.extract(page, '<ul id="tag-', '</ul>')[0]
if html:
tags = collections.defaultdict(list)
pattern = re.compile(
r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
for tag_type, tag_name in pattern.findall(html):
tags[tag_type].append(text.unquote(tag_name))
for key, value in tags.items():
post["tags_" + key] = " ".join(value)
BASE_PATTERN = GelbooruV02Extractor.update({
"realbooru": {"root": "https://realbooru.com"},
"rule34" : {"root": "https://rule34.xxx"},
"safebooru": {"root": "https://safebooru.org"},
})
class GelbooruV02TagExtractor(GelbooruV02Extractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)"
test = (
("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
"content": "0a3aa1e302bd4ed0599305cd4c225ca8d28249b2",
"pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
"count": 1,
}),
("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
"url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
"content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
}),
("https://realbooru.com/index.php?page=post&s=list&tags=wine", {
"count": ">= 64",
}),
)
def __init__(self, match):
GelbooruV02Extractor.__init__(self, match)
tags = match.group(match.lastindex)
self.tags = text.unquote(tags.replace("+", " "))
def metadata(self):
return {"search_tags": self.tags}
def posts(self):
return self._pagination({"tags" : self.tags})
class GelbooruV02PoolExtractor(GelbooruV02Extractor):
subcategory = "pool"
directory_fmt = ("{category}", "pool", "{pool}")
archive_fmt = "p_{pool}_{id}"
pattern = BASE_PATTERN + r"/index\.php\?page=pool&s=show&id=(\d+)"
test = (
("https://rule34.xxx/index.php?page=pool&s=show&id=179", {
"count": 3,
}),
("https://safebooru.org/index.php?page=pool&s=show&id=11", {
"count": 5,
}),
("https://realbooru.com/index.php?page=pool&s=show&id=1", {
"count": 3,
}),
)
def __init__(self, match):
GelbooruV02Extractor.__init__(self, match)
self.pool_id = match.group(match.lastindex)
self.post_ids = ()
def skip(self, num):
self.page_start += num
return num
def metadata(self):
url = "{}/index.php?page=pool&s=show&id={}".format(
self.root, self.pool_id)
page = self.request(url).text
name, pos = text.extract(page, "<h4>Pool: ", "</h4>")
if not name:
raise exception.NotFoundError("pool")
self.post_ids = text.extract_iter(
page, 'class="thumb" id="p', '"', pos)
return {
"pool": text.parse_int(self.pool_id),
"pool_name": text.unescape(name),
}
def posts(self):
params = {}
for params["id"] in util.advance(self.post_ids, self.page_start):
for post in self._api_request(params):
yield post.attrib
class GelbooruV02PostExtractor(GelbooruV02Extractor):
subcategory = "post"
archive_fmt = "{id}"
pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)"
test = (
("https://rule34.xxx/index.php?page=post&s=view&id=1995545", {
"content": "0a3aa1e302bd4ed0599305cd4c225ca8d28249b2",
"options": (("tags", True),),
"keyword": {
"tags_artist": "danraku",
"tags_character": "kashima_(kantai_collection)",
"tags_copyright": "kantai_collection",
"tags_general": str,
"tags_metadata": str,
},
}),
("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
"url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
"content": "93b293b27dabd198afafabbaf87c49863ac82f27",
"options": (("tags", True),),
"keyword": {
"tags_artist": "kawanakajima",
"tags_character": "heath_ledger ronald_mcdonald the_joker",
"tags_copyright": "dc_comics mcdonald's the_dark_knight",
"tags_general": str,
},
}),
("https://realbooru.com/index.php?page=post&s=view&id=668483", {
"url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9",
"content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
}),
)
def __init__(self, match):
GelbooruV02Extractor.__init__(self, match)
self.post_id = match.group(match.lastindex)
def posts(self):
return self._pagination({"id": self.post_id})