From dff03a6605fa92a566558b60a69c142ae0ce5ea5 Mon Sep 17 00:00:00 2001
From: thatfuckingbird <67429906+thatfuckingbird@users.noreply.github.com>
Date: Tue, 13 Apr 2021 23:40:24 +0200
Subject: [PATCH] [booru] add an option to extract notes (only gelbooru for
 now) (#1457)

* [booru] add an option to extract notes (currently implemented only for gelbooru)

* appease linter

* [gelbooru] rename "text" to "body" in note extraction

* add a code comment about reusing the return value of _extended_tags
---
 docs/configuration.rst               | 10 ++++++++++
 docs/gallery-dl.conf                 |  3 ++-
 gallery_dl/extractor/booru.py        | 17 +++++++++++++++--
 gallery_dl/extractor/gelbooru.py     | 22 ++++++++++++++++++++++
 gallery_dl/extractor/gelbooru_v02.py | 25 +++++++++++++++++++++++++
 5 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index 60fb5ae6..ef30d56f 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -1747,6 +1747,16 @@ Description
 
     Note: This requires 1 additional HTTP request for each post.
 
+extractor.[booru].notes
+-----------------------
+Type
+    ``bool``
+Default
+    ``false``
+Description
+    Extract overlay notes (position, size, and body text).
+
+    Note: This requires 1 additional HTTP request for each post, unless ``tags`` is also enabled, in which case the already fetched page is reused.
 
 extractor.[manga-extractor].chapter-reverse
 -------------------------------------------
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 8a3d9e2a..8d3a19cb 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -279,7 +279,8 @@
         },
         "booru":
         {
-            "tags": false
+            "tags": false,
+            "notes": false
         }
     },
 
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index c3cf3f72..a42ec53b 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -24,6 +24,7 @@ class BooruExtractor(BaseExtractor):
         self.login()
         data = self.metadata()
         tags = self.config("tags", False)
+        notes = self.config("notes", False)
 
         for post in self.posts():
             try:
@@ -35,8 +36,11 @@ class BooruExtractor(BaseExtractor):
                                "(md5: %s)", post.get("id"), post.get("md5"))
                 continue
 
+            page_html = None
             if tags:
-                self._extended_tags(post)
+                page_html = self._extended_tags(post)
+            if notes:
+                self._notes(post, page_html)
             self._prepare(post)
             post.update(data)
             text.nameext_from_url(url, post)
@@ -66,4 +70,13 @@ class BooruExtractor(BaseExtractor):
         """Prepare the 'post's metadata"""
 
     def _extended_tags(self, post, page=None):
-        """Generate extended tag information"""
+        """Generate extended tag information
+
+        The return value of this function will be
+        passed to the _notes function as the page parameter.
+        This makes it possible to reuse the same HTML both for
+        extracting tags and notes.
+        """
+
+    def _notes(self, post, page=None):
+        """Generate information about notes"""
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 0e296017..863ceadb 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -108,4 +108,26 @@ class GelbooruPostExtractor(GelbooruBase,
             "pattern": r"https://img\d\.gelbooru\.com/images"
                        r"/22/61/226111273615049235b001b381707bd0\.webm",
         }),
+        # notes
+        ("https://gelbooru.com/index.php?page=post&s=view&id=5997331", {
+            "options": (("notes", True),),
+            "keywords": {
+                "notes": [
+                    {
+                        "height": 553,
+                        "body": "Look over this way when you talk~",
+                        "width": 246,
+                        "x": 35,
+                        "y": 72
+                    },
+                    {
+                        "height": 557,
+                        "body": "Hey~\nAre you listening~?",
+                        "width": 246,
+                        "x": 1233,
+                        "y": 109
+                    }
+                ]
+            }
+        }),
     )
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index a81bbca2..1b877b3a 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -57,6 +57,31 @@ class GelbooruV02Extractor(booru.BooruExtractor):
                 tags[tag_type].append(text.unquote(tag_name))
             for key, value in tags.items():
                 post["tags_" + key] = " ".join(value)
+        return page
+
+    def _notes(self, post, page=None):
+        if not page:
+            url = "{}/index.php?page=post&s=view&id={}".format(
+                self.root, post["id"])
+            page = self.request(url).text
+        notes = []
+        notes_data = text.extract(page, '<section id="notes"', '</section>')[0]
+        if not notes_data:
+            return
+
+        note_iter = text.extract_iter(notes_data, '<article', '</article>')
+        extr = text.extract
+        for note_data in note_iter:
+            note = {
+                "width": int(extr(note_data, 'data-width="', '"')[0]),
+                "height": int(extr(note_data, 'data-height="', '"')[0]),
+                "x": int(extr(note_data, 'data-x="', '"')[0]),
+                "y": int(extr(note_data, 'data-y="', '"')[0]),
+                "body": extr(note_data, 'data-body="', '"')[0],
+            }
+            notes.append(note)
+
+        post["notes"] = notes
 
 
 BASE_PATTERN = GelbooruV02Extractor.update({