From 775895f44b9015ea66f5a7b1804590463c976676 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 31 Oct 2022 12:01:19 +0100 Subject: [PATCH] [booru] refactor 'tags' and 'notes' extraction - move HTML request for post pages into its own function - move gelbooru_v02.py notes extraction to gelbooru.py since it only works there - clean up some code --- gallery_dl/extractor/booru.py | 30 ++++++++++++------------ gallery_dl/extractor/gelbooru.py | 30 ++++++++++++++++++------ gallery_dl/extractor/gelbooru_v02.py | 35 ++++------------------------ gallery_dl/extractor/moebooru.py | 29 ++++++++++------------- gallery_dl/extractor/philomena.py | 4 ---- gallery_dl/extractor/sankaku.py | 2 +- gallery_dl/extractor/twibooru.py | 2 +- 7 files changed, 57 insertions(+), 75 deletions(-) diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index 12d98b12..0d7d13d4 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -25,6 +25,7 @@ class BooruExtractor(BaseExtractor): data = self.metadata() tags = self.config("tags", False) notes = self.config("notes", False) + fetch_html = tags or notes for post in self.posts(): try: @@ -36,11 +37,13 @@ class BooruExtractor(BaseExtractor): "(md5: %s)", post.get("id"), post.get("md5")) continue - page_html = None - if tags: - page_html = self._extended_tags(post) - if notes: - self._notes(post, page_html) + if fetch_html: + html = self._html(post) + if tags: + self._tags(post, html) + if notes: + self._notes(post, html) + text.nameext_from_url(url, post) post.update(data) self._prepare(post) @@ -67,16 +70,13 @@ class BooruExtractor(BaseExtractor): _file_url = operator.itemgetter("file_url") def _prepare(self, post): - """Prepare the 'post's metadata""" + """Prepare a 'post's metadata""" - def _extended_tags(self, post, page=None): - """Generate extended tag information + def _html(self, post): + """Return HTML content of a post""" - The return value of this function will be - passed to the _notes function as the page parameter. - This makes it possible to reuse the same HTML both for - extracting tags and notes. - """ + def _tags(self, post, page): + """Extract extended tag metadata""" - def _notes(self, post, page=None): - """Generate information about notes""" + def _notes(self, post, page): + """Extract notes metadata""" diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index a2cf0c03..63450c04 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -68,6 +68,22 @@ class GelbooruBase(): yield "https://img2.gelbooru.com" + path yield "https://img1.gelbooru.com" + path + def _notes(self, post, page): + notes_data = text.extract(page, '
')[0] + if not notes_data: + return + + post["notes"] = notes = [] + extr = text.extract + for note in text.extract_iter(notes_data, ''): + notes.append({ + "width" : int(extr(note, 'data-width="', '"')[0]), + "height": int(extr(note, 'data-height="', '"')[0]), + "x" : int(extr(note, 'data-x="', '"')[0]), + "y" : int(extr(note, 'data-y="', '"')[0]), + "body" : extr(note, 'data-body="', '"')[0], + }) + class GelbooruTagExtractor(GelbooruBase, gelbooru_v02.GelbooruV02TagExtractor): @@ -182,21 +198,21 @@ class GelbooruPostExtractor(GelbooruBase, "keywords": { "notes": [ { - "height": 553, "body": "Look over this way when you talk~", + "height": 553, "width": 246, "x": 35, - "y": 72 + "y": 72, }, { - "height": 557, "body": "Hey~\nAre you listening~?", + "height": 557, "width": 246, "x": 1233, - "y": 109 - } - ] - } + "y": 109, + }, + ], + }, }), ) diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 82146146..c3b04459 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -93,11 +93,11 @@ class GelbooruV02Extractor(booru.BooruExtractor): self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2]) return url - def _extended_tags(self, post, page=None): - if not page: - url = "{}/index.php?page=post&s=view&id={}".format( - self.root, post["id"]) - page = self.request(url).text + def _html(self, post): + return self.request("{}/index.php?page=post&s=view&id={}".format( + self.root, post["id"])).text + + def _tags(self, post, page): html = text.extract(page, '
    ')[0] @@ -109,31 +109,6 @@ class GelbooruV02Extractor(booru.BooruExtractor): tags[tag_type].append(text.unquote(tag_name)) for key, value in tags.items(): post["tags_" + key] = " ".join(value) - return page - - def _notes(self, post, page=None): - if not page: - url = "{}/index.php?page=post&s=view&id={}".format( - self.root, post["id"]) - page = self.request(url).text - notes = [] - notes_data = text.extract(page, '
    ')[0] - if not notes_data: - return - - note_iter = text.extract_iter(notes_data, '') - extr = text.extract - for note_data in note_iter: - note = { - "width": int(extr(note_data, 'data-width="', '"')[0]), - "height": int(extr(note_data, 'data-height="', '"')[0]), - "x": int(extr(note_data, 'data-x="', '"')[0]), - "y": int(extr(note_data, 'data-y="', '"')[0]), - "body": extr(note_data, 'data-body="', '"')[0], - } - notes.append(note) - - post["notes"] = notes INSTANCES = { diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py index 4d63c3ea..e3179c75 100644 --- a/gallery_dl/extractor/moebooru.py +++ b/gallery_dl/extractor/moebooru.py @@ -26,10 +26,11 @@ class MoebooruExtractor(BooruExtractor): def _prepare(post): post["date"] = text.parse_timestamp(post["created_at"]) - def _extended_tags(self, post, page=None): - if not page: - url = "{}/post/show/{}".format(self.root, post["id"]) - page = self.request(url).text + def _html(self, post): + return self.request("{}/post/show/{}".format( + self.root, post["id"])).text + + def _tags(self, post, page): html = text.extract(page, '
      ', "")), + "body" : text.remove_html(extr(">", "")), }) - post["notes"] = notes - def _pagination(self, url, params): params["page"] = self.page_start params["limit"] = self.per_page diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index 225f0ffc..fc85125c 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -27,10 +27,6 @@ class PhilomenaExtractor(BooruExtractor): def _prepare(post): post["date"] = text.parse_datetime(post["created_at"]) - @staticmethod - def _extended_tags(post): - pass - def _pagination(self, url, params): params["page"] = 1 params["per_page"] = self.per_page diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 3396e3ac..32759b8b 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -63,7 +63,7 @@ class SankakuExtractor(BooruExtractor): def _check_expired(self, response): return not response.history or '.com/expired.png' not in response.url - def _extended_tags(self, post): + def _tags(self, post, page): tags = collections.defaultdict(list) types = self.TAG_TYPES for tag in post["tags"]: diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py index 93fa039d..f010f926 100644 --- a/gallery_dl/extractor/twibooru.py +++ b/gallery_dl/extractor/twibooru.py @@ -83,7 +83,7 @@ class TwibooruPostExtractor(TwibooruExtractor): "tag_ids": list, "tags": list, "thumbnails_generated": True, - "updated_at": "2022-05-13T00:43:19.791Z", + "updated_at": "2022-09-21T14:31:50.441Z", "upvotes": int, "view_url": "https://cdn.twibooru.org/img/2020/7/8/1/full.png", "width": 576,