[realbooru] fix 'tags' for video posts (#8455)
* fix realbooru tags for video posts
* fix lines too long
* combine 'tags' and 'tags_<category>' extraction
- extract tag categories independent of 'tags' option
* add 'video' & 'animated GIF' tests
---------
Co-authored-by: Mike Fährmann <mike_faehrmann@web.de>
This commit is contained in:
committed by
GitHub
parent
68909724e7
commit
481bf1e393
@@ -28,18 +28,30 @@ class RealbooruExtractor(booru.BooruExtractor):
|
||||
extr('class="container"', '>')
|
||||
|
||||
post = {
|
||||
"_html" : page,
|
||||
"id" : post_id,
|
||||
"rating" : "e" if rating == "adult" else (rating or "?")[0],
|
||||
"tags" : text.unescape(extr(' alt="', '"')),
|
||||
"file_url" : extr('src="', '"'),
|
||||
"created_at": extr(">Posted at ", " by "),
|
||||
"uploader" : extr(">", "<"),
|
||||
"score" : extr('">', "<"),
|
||||
"tags" : extr('<br />', "</div>"),
|
||||
"title" : extr('id="title" style="width: 100%;" value="', '"'),
|
||||
"source" : extr('d="source" style="width: 100%;" value="', '"'),
|
||||
}
|
||||
|
||||
tags_container = post["tags"]
|
||||
tags = []
|
||||
tags_categories = collections.defaultdict(list)
|
||||
pattern = text.re(r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)')
|
||||
for tag_type, tag_name in pattern.findall(tags_container):
|
||||
tag = text.unescape(text.unquote(tag_name))
|
||||
tags.append(tag)
|
||||
tags_categories[tag_type].append(tag)
|
||||
for key, value in tags_categories.items():
|
||||
post[f"tags_{key}"] = ", ".join(value)
|
||||
tags.sort()
|
||||
|
||||
post["tags"] = ", ".join(tags)
|
||||
post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
|
||||
return post
|
||||
|
||||
@@ -66,16 +78,6 @@ class RealbooruExtractor(booru.BooruExtractor):
|
||||
return
|
||||
params["pid"] += self.per_page
|
||||
|
||||
def _tags(self, post, _):
|
||||
page = post["_html"]
|
||||
tag_container = text.extr(page, 'id="tagLink"', '</div>')
|
||||
tags = collections.defaultdict(list)
|
||||
pattern = text.re(r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)')
|
||||
for tag_type, tag_name in pattern.findall(tag_container):
|
||||
tags[tag_type].append(text.unescape(text.unquote(tag_name)))
|
||||
for key, value in tags.items():
|
||||
post["tags_" + key] = " ".join(value)
|
||||
|
||||
|
||||
class RealbooruTagExtractor(RealbooruExtractor):
|
||||
subcategory = "tag"
|
||||
|
||||
Reference in New Issue
Block a user