[gelbooru] tag-splitting for non-api mode
This commit is contained in:
@@ -621,6 +621,8 @@ extractor.3dbooru.tags
|
|||||||
----------------------
|
----------------------
|
||||||
extractor.e621.tags
|
extractor.e621.tags
|
||||||
-------------------
|
-------------------
|
||||||
|
extractor.gelbooru.tags
|
||||||
|
-----------------------
|
||||||
extractor.konachan.tags
|
extractor.konachan.tags
|
||||||
-----------------------
|
-----------------------
|
||||||
extractor.rule34.tags
|
extractor.rule34.tags
|
||||||
|
|||||||
@@ -31,10 +31,7 @@ class BooruExtractor(SharedConfigExtractor):
|
|||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.params = {}
|
self.params = {}
|
||||||
self.prepare = None
|
self.extags = self.post_url and self.config("tags", False)
|
||||||
|
|
||||||
if self.post_url and self.config("tags", False):
|
|
||||||
self.prepare = self._extended_tags
|
|
||||||
|
|
||||||
def skip(self, num):
|
def skip(self, num):
|
||||||
pages = num // self.per_page
|
pages = num // self.per_page
|
||||||
@@ -62,8 +59,8 @@ class BooruExtractor(SharedConfigExtractor):
|
|||||||
if url.startswith("/"):
|
if url.startswith("/"):
|
||||||
url = text.urljoin(self.api_url, url)
|
url = text.urljoin(self.api_url, url)
|
||||||
image.update(data)
|
image.update(data)
|
||||||
if self.prepare:
|
if self.extags:
|
||||||
self.prepare(image)
|
self.extended_tags(image)
|
||||||
yield Message.Url, url, text.nameext_from_url(url, image)
|
yield Message.Url, url, text.nameext_from_url(url, image)
|
||||||
|
|
||||||
if len(images) < self.per_page:
|
if len(images) < self.per_page:
|
||||||
@@ -89,17 +86,16 @@ class BooruExtractor(SharedConfigExtractor):
|
|||||||
"""Collect metadata for extractor-job"""
|
"""Collect metadata for extractor-job"""
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
def _extended_tags(self, image):
|
def extended_tags(self, image, page=None):
|
||||||
"""Rerieve extended tag information"""
|
"""Rerieve extended tag information"""
|
||||||
url = self.post_url.format(image["id"])
|
if not page:
|
||||||
page = self.request(url).text
|
url = self.post_url.format(image["id"])
|
||||||
tag_html = text.extract(page, '<ul id="tag-', '</ul>')[0]
|
page = self.request(url).text
|
||||||
|
|
||||||
tags = collections.defaultdict(list)
|
tags = collections.defaultdict(list)
|
||||||
|
tags_html = text.extract(page, '<ul id="tag-', '</ul>')[0]
|
||||||
pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)", re.S)
|
pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)", re.S)
|
||||||
for tag_type, tag_name in pattern.findall(tag_html):
|
for tag_type, tag_name in pattern.findall(tags_html):
|
||||||
tags[tag_type].append(text.unquote(tag_name))
|
tags[tag_type].append(text.unquote(tag_name))
|
||||||
|
|
||||||
for key, value in tags.items():
|
for key, value in tags.items():
|
||||||
image["tags_" + key] = " ".join(value)
|
image["tags_" + key] = " ".join(value)
|
||||||
|
|
||||||
@@ -185,7 +181,7 @@ class GelbooruPoolMixin(PoolMixin):
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
"pool": text.parse_int(self.pool),
|
"pool": text.parse_int(self.pool),
|
||||||
"pool_name": text.unescape(name or ""),
|
"pool_name": text.unescape(name),
|
||||||
"count": len(self.posts),
|
"count": len(self.posts),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -68,6 +68,8 @@ class GelbooruExtractor(booru.XmlParserMixin,
|
|||||||
data["rating"] = (data["rating"] or "?")[0].lower()
|
data["rating"] = (data["rating"] or "?")[0].lower()
|
||||||
data["tags"] = " ".join(
|
data["tags"] = " ".join(
|
||||||
[tag.replace(" ", "_") for tag in data["tags"].split(", ")])
|
[tag.replace(" ", "_") for tag in data["tags"].split(", ")])
|
||||||
|
if self.extags:
|
||||||
|
self.extended_tags(data, page)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
|
|||||||
pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
|
pattern = [r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
|
||||||
r"/Comic/[^/?&#]+/[^/?&#]+\?id=(\d+)"]
|
r"/Comic/[^/?&#]+/[^/?&#]+\?id=(\d+)"]
|
||||||
test = [("http://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", {
|
test = [("http://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", {
|
||||||
"url": "a45c77f8fbde66091fe2346d6341f9cf3c6b1bc5",
|
"url": "2bbab6ec4fbc05d269cca420a82a9b5acda28682",
|
||||||
"keyword": "c6de1c9c8a307dc4be56783c4ac6f1338ffac6fc",
|
"keyword": "c6de1c9c8a307dc4be56783c4ac6f1338ffac6fc",
|
||||||
})]
|
})]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user