[booru] refactor 'tags' and 'notes' extraction
- move the HTML request for post pages into its own function
- move the notes extraction from gelbooru_v02.py to gelbooru.py, since it only works there
- clean up some code
This commit is contained in:
@@ -25,6 +25,7 @@ class BooruExtractor(BaseExtractor):
|
||||
data = self.metadata()
|
||||
tags = self.config("tags", False)
|
||||
notes = self.config("notes", False)
|
||||
fetch_html = tags or notes
|
||||
|
||||
for post in self.posts():
|
||||
try:
|
||||
@@ -36,11 +37,13 @@ class BooruExtractor(BaseExtractor):
|
||||
"(md5: %s)", post.get("id"), post.get("md5"))
|
||||
continue
|
||||
|
||||
page_html = None
|
||||
if tags:
|
||||
page_html = self._extended_tags(post)
|
||||
if notes:
|
||||
self._notes(post, page_html)
|
||||
if fetch_html:
|
||||
html = self._html(post)
|
||||
if tags:
|
||||
self._tags(post, html)
|
||||
if notes:
|
||||
self._notes(post, html)
|
||||
|
||||
text.nameext_from_url(url, post)
|
||||
post.update(data)
|
||||
self._prepare(post)
|
||||
@@ -67,16 +70,13 @@ class BooruExtractor(BaseExtractor):
|
||||
_file_url = operator.itemgetter("file_url")
|
||||
|
||||
def _prepare(self, post):
|
||||
"""Prepare the 'post's metadata"""
|
||||
"""Prepare a 'post's metadata"""
|
||||
|
||||
def _extended_tags(self, post, page=None):
|
||||
"""Generate extended tag information
|
||||
def _html(self, post):
|
||||
"""Return HTML content of a post"""
|
||||
|
||||
The return value of this function will be
|
||||
passed to the _notes function as the page parameter.
|
||||
This makes it possible to reuse the same HTML both for
|
||||
extracting tags and notes.
|
||||
"""
|
||||
def _tags(self, post, page):
|
||||
"""Extract extended tag metadata"""
|
||||
|
||||
def _notes(self, post, page=None):
|
||||
"""Generate information about notes"""
|
||||
def _notes(self, post, page):
|
||||
"""Extract notes metadata"""
|
||||
|
||||
@@ -68,6 +68,22 @@ class GelbooruBase():
|
||||
yield "https://img2.gelbooru.com" + path
|
||||
yield "https://img1.gelbooru.com" + path
|
||||
|
||||
def _notes(self, post, page):
    """Extract notes metadata

    Parses the '<section id="notes"' part of 'page' and stores one dict
    per '<article' element in post["notes"]. Leaves 'post' untouched
    when the page has no notes section.
    """
    section = text.extract(page, '<section id="notes"', '</section>')[0]
    if not section:
        return

    # Attributes whose values get converted to int, in output order.
    int_fields = (
        ("width", 'data-width="'),
        ("height", 'data-height="'),
        ("x", 'data-x="'),
        ("y", 'data-y="'),
    )

    # Attach the list first so 'post' references it while it is filled.
    collected = []
    post["notes"] = collected

    for article in text.extract_iter(section, '<article', '</article>'):
        entry = {}
        for key, marker in int_fields:
            entry[key] = int(text.extract(article, marker, '"')[0])
        entry["body"] = text.extract(article, 'data-body="', '"')[0]
        collected.append(entry)
|
||||
|
||||
|
||||
class GelbooruTagExtractor(GelbooruBase,
|
||||
gelbooru_v02.GelbooruV02TagExtractor):
|
||||
@@ -182,21 +198,21 @@ class GelbooruPostExtractor(GelbooruBase,
|
||||
"keywords": {
|
||||
"notes": [
|
||||
{
|
||||
"height": 553,
|
||||
"body": "Look over this way when you talk~",
|
||||
"height": 553,
|
||||
"width": 246,
|
||||
"x": 35,
|
||||
"y": 72
|
||||
"y": 72,
|
||||
},
|
||||
{
|
||||
"height": 557,
|
||||
"body": "Hey~\nAre you listening~?",
|
||||
"height": 557,
|
||||
"width": 246,
|
||||
"x": 1233,
|
||||
"y": 109
|
||||
}
|
||||
]
|
||||
}
|
||||
"y": 109,
|
||||
},
|
||||
],
|
||||
},
|
||||
}),
|
||||
)
|
||||
|
||||
|
||||
@@ -93,11 +93,11 @@ class GelbooruV02Extractor(booru.BooruExtractor):
|
||||
self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
|
||||
return url
|
||||
|
||||
def _extended_tags(self, post, page=None):
|
||||
if not page:
|
||||
url = "{}/index.php?page=post&s=view&id={}".format(
|
||||
self.root, post["id"])
|
||||
page = self.request(url).text
|
||||
def _html(self, post):
    """Return HTML content of a post

    Requests the post's view page, addressed by self.root and
    post["id"], and returns the response text.
    """
    url = "{}/index.php?page=post&s=view&id={}".format(
        self.root, post["id"])
    response = self.request(url)
    return response.text
|
||||
|
||||
def _tags(self, post, page):
|
||||
html = text.extract(page, '<ul id="tag-', '</ul>')[0]
|
||||
if not html:
|
||||
html = text.extract(page, '<ul class="tag-', '</ul>')[0]
|
||||
@@ -109,31 +109,6 @@ class GelbooruV02Extractor(booru.BooruExtractor):
|
||||
tags[tag_type].append(text.unquote(tag_name))
|
||||
for key, value in tags.items():
|
||||
post["tags_" + key] = " ".join(value)
|
||||
return page
|
||||
|
||||
def _notes(self, post, page=None):
|
||||
if not page:
|
||||
url = "{}/index.php?page=post&s=view&id={}".format(
|
||||
self.root, post["id"])
|
||||
page = self.request(url).text
|
||||
notes = []
|
||||
notes_data = text.extract(page, '<section id="notes"', '</section>')[0]
|
||||
if not notes_data:
|
||||
return
|
||||
|
||||
note_iter = text.extract_iter(notes_data, '<article', '</article>')
|
||||
extr = text.extract
|
||||
for note_data in note_iter:
|
||||
note = {
|
||||
"width": int(extr(note_data, 'data-width="', '"')[0]),
|
||||
"height": int(extr(note_data, 'data-height="', '"')[0]),
|
||||
"x": int(extr(note_data, 'data-x="', '"')[0]),
|
||||
"y": int(extr(note_data, 'data-y="', '"')[0]),
|
||||
"body": extr(note_data, 'data-body="', '"')[0],
|
||||
}
|
||||
notes.append(note)
|
||||
|
||||
post["notes"] = notes
|
||||
|
||||
|
||||
INSTANCES = {
|
||||
|
||||
@@ -26,10 +26,11 @@ class MoebooruExtractor(BooruExtractor):
|
||||
def _prepare(post):
|
||||
post["date"] = text.parse_timestamp(post["created_at"])
|
||||
|
||||
def _extended_tags(self, post, page=None):
|
||||
if not page:
|
||||
url = "{}/post/show/{}".format(self.root, post["id"])
|
||||
page = self.request(url).text
|
||||
def _html(self, post):
    """Return HTML content of a post

    Requests the '/post/show/<id>' page below self.root and returns
    the response text.
    """
    url = "{}/post/show/{}".format(self.root, post["id"])
    response = self.request(url)
    return response.text
|
||||
|
||||
def _tags(self, post, page):
|
||||
html = text.extract(page, '<ul id="tag-', '</ul>')[0]
|
||||
if html:
|
||||
tags = collections.defaultdict(list)
|
||||
@@ -38,30 +39,24 @@ class MoebooruExtractor(BooruExtractor):
|
||||
tags[tag_type].append(text.unquote(tag_name))
|
||||
for key, value in tags.items():
|
||||
post["tags_" + key] = " ".join(value)
|
||||
return page
|
||||
|
||||
def _notes(self, post, page=None):
|
||||
if not page:
|
||||
url = "{}/post/show/{}".format(self.root, post["id"])
|
||||
page = self.request(url).text
|
||||
notes = []
|
||||
def _notes(self, post, page):
|
||||
notes_container = text.extract(page, 'id="note-container"', "<img ")[0]
|
||||
if not notes_container:
|
||||
return
|
||||
|
||||
post["notes"] = notes = []
|
||||
for note in notes_container.split('class="note-box"')[1:]:
|
||||
extr = text.extract_from(note)
|
||||
notes.append({
|
||||
"width" : int(extr("width: ", "p")),
|
||||
"height": int(extr("height: ", "p")),
|
||||
"y" : int(extr("top: ", "p")),
|
||||
"x" : int(extr("left: ", "p")),
|
||||
"width" : int(extr("width:", "p")),
|
||||
"height": int(extr("height:", "p")),
|
||||
"y" : int(extr("top:", "p")),
|
||||
"x" : int(extr("left:", "p")),
|
||||
"id" : int(extr('id="note-body-', '"')),
|
||||
"body" : text.remove_html(extr('>', "</div>")),
|
||||
"body" : text.remove_html(extr(">", "</div>")),
|
||||
})
|
||||
|
||||
post["notes"] = notes
|
||||
|
||||
def _pagination(self, url, params):
|
||||
params["page"] = self.page_start
|
||||
params["limit"] = self.per_page
|
||||
|
||||
@@ -27,10 +27,6 @@ class PhilomenaExtractor(BooruExtractor):
|
||||
def _prepare(post):
|
||||
post["date"] = text.parse_datetime(post["created_at"])
|
||||
|
||||
@staticmethod
|
||||
def _extended_tags(post):
|
||||
pass
|
||||
|
||||
def _pagination(self, url, params):
|
||||
params["page"] = 1
|
||||
params["per_page"] = self.per_page
|
||||
|
||||
@@ -63,7 +63,7 @@ class SankakuExtractor(BooruExtractor):
|
||||
def _check_expired(self, response):
|
||||
return not response.history or '.com/expired.png' not in response.url
|
||||
|
||||
def _extended_tags(self, post):
|
||||
def _tags(self, post, page):
|
||||
tags = collections.defaultdict(list)
|
||||
types = self.TAG_TYPES
|
||||
for tag in post["tags"]:
|
||||
|
||||
@@ -83,7 +83,7 @@ class TwibooruPostExtractor(TwibooruExtractor):
|
||||
"tag_ids": list,
|
||||
"tags": list,
|
||||
"thumbnails_generated": True,
|
||||
"updated_at": "2022-05-13T00:43:19.791Z",
|
||||
"updated_at": "2022-09-21T14:31:50.441Z",
|
||||
"upvotes": int,
|
||||
"view_url": "https://cdn.twibooru.org/img/2020/7/8/1/full.png",
|
||||
"width": 576,
|
||||
|
||||
Reference in New Issue
Block a user