From dff03a6605fa92a566558b60a69c142ae0ce5ea5 Mon Sep 17 00:00:00 2001 From: thatfuckingbird <67429906+thatfuckingbird@users.noreply.github.com> Date: Tue, 13 Apr 2021 23:40:24 +0200 Subject: [PATCH] [booru] add an option to extract notes (only gelbooru for now) (#1457) * [booru] add an option to extract notes (currently implemented only for gelbooru) * appease linter * [gelbooru] rename "text" to "body" in note extraction * add a code comment about reusing return value of _extended_tags --- docs/configuration.rst | 10 ++++++++++ docs/gallery-dl.conf | 3 ++- gallery_dl/extractor/booru.py | 17 +++++++++++++++-- gallery_dl/extractor/gelbooru.py | 22 ++++++++++++++++++++++ gallery_dl/extractor/gelbooru_v02.py | 25 +++++++++++++++++++++++++ 5 files changed, 74 insertions(+), 3 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 60fb5ae6..ef30d56f 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1747,6 +1747,16 @@ Description Note: This requires 1 additional HTTP request for each post. +extractor.[booru].notes +----------------------- +Type + ``bool`` +Default + ``false`` +Description + Extract overlay notes (position and text). + + Note: This requires 1 additional HTTP request for each post. 
extractor.[manga-extractor].chapter-reverse ------------------------------------------- diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 8a3d9e2a..8d3a19cb 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -279,7 +279,8 @@ }, "booru": { - "tags": false + "tags": false, + "notes": false } }, diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index c3cf3f72..a42ec53b 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -24,6 +24,7 @@ class BooruExtractor(BaseExtractor): self.login() data = self.metadata() tags = self.config("tags", False) + notes = self.config("notes", False) for post in self.posts(): try: @@ -35,8 +36,11 @@ class BooruExtractor(BaseExtractor): "(md5: %s)", post.get("id"), post.get("md5")) continue + page_html = None if tags: - self._extended_tags(post) + page_html = self._extended_tags(post) + if notes: + self._notes(post, page_html) self._prepare(post) post.update(data) text.nameext_from_url(url, post) @@ -66,4 +70,13 @@ class BooruExtractor(BaseExtractor): """Prepare the 'post's metadata""" def _extended_tags(self, post, page=None): - """Generate extended tag information""" + """Generate extended tag information + + The return value of this function will be + passed to the _notes function as the page parameter. + This makes it possible to reuse the same HTML both for + extracting tags and notes. 
+ """ + + def _notes(self, post, page=None): + """Generate information about notes""" diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index 0e296017..863ceadb 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -108,4 +108,26 @@ class GelbooruPostExtractor(GelbooruBase, "pattern": r"https://img\d\.gelbooru\.com/images" r"/22/61/226111273615049235b001b381707bd0\.webm", }), + # notes + ("https://gelbooru.com/index.php?page=post&s=view&id=5997331", { + "options": (("notes", True),), + "keywords": { + "notes": [ + { + "height": 553, + "body": "Look over this way when you talk~", + "width": 246, + "x": 35, + "y": 72 + }, + { + "height": 557, + "body": "Hey~\nAre you listening~?", + "width": 246, + "x": 1233, + "y": 109 + } + ] + } + }), ) diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index a81bbca2..1b877b3a 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -57,6 +57,31 @@ class GelbooruV02Extractor(booru.BooruExtractor): tags[tag_type].append(text.unquote(tag_name)) for key, value in tags.items(): post["tags_" + key] = " ".join(value) + return page + + def _notes(self, post, page=None): + if not page: + url = "{}/index.php?page=post&s=view&id={}".format( + self.root, post["id"]) + page = self.request(url).text + notes = [] + notes_data = text.extract(page, '
')[0] + if not notes_data: + return + + note_iter = text.extract_iter(notes_data, '') + extr = text.extract + for note_data in note_iter: + note = { + "width": int(extr(note_data, 'data-width="', '"')[0]), + "height": int(extr(note_data, 'data-height="', '"')[0]), + "x": int(extr(note_data, 'data-x="', '"')[0]), + "y": int(extr(note_data, 'data-y="', '"')[0]), + "body": extr(note_data, 'data-body="', '"')[0], + } + notes.append(note) + + post["notes"] = notes BASE_PATTERN = GelbooruV02Extractor.update({