From df4f823f6ed6bbbb67a72b0bb39211a56bcc1b0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 1 Aug 2025 21:21:41 +0200 Subject: [PATCH] [booth] add 'strategy' option (#7920) - extract file URLs from HTML webpages - fix 'adult' cookie value --- docs/configuration.rst | 17 +++++++++++ docs/gallery-dl.conf | 4 ++- gallery_dl/extractor/booth.py | 36 ++++++++++++++--------- test/results/booth.py | 55 ++++++++++++++++++++++------------- 4 files changed, 77 insertions(+), 35 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 0cff476e..6d607122 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -1767,6 +1767,23 @@ Description * ``tiny`` (144p) +extractor.booth.strategy +------------------------ +Type + ``string`` +Default + ``"webpage"`` +Description + Selects how to handle and extract file URLs. + + ``"webpage"`` + Retrieve the full HTML page + and extract file URLs from it + ``"fallback"`` + Use `fallback `__ URLs + to `guess` each file's correct filename extension + + extractor.bunkr.endpoint ------------------------ Type diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 1c4a463f..f3b36a94 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -184,7 +184,9 @@ }, "booth": { - "sleep-request": "0.5-1.5" + "sleep-request": "0.5-1.5", + + "strategy": "webpage" }, "bunkr": { diff --git a/gallery_dl/extractor/booth.py b/gallery_dl/extractor/booth.py index 8a8db507..4a5201d6 100644 --- a/gallery_dl/extractor/booth.py +++ b/gallery_dl/extractor/booth.py @@ -22,7 +22,7 @@ class BoothExtractor(Extractor): request_interval = (0.5, 1.5) def _init(self): - self.cookies.set("adult", "1", domain=".booth.pm") + self.cookies.set("adult", "t", domain=".booth.pm") def items(self): for item in self.shop_items(): @@ -48,8 +48,13 @@ class BoothItemExtractor(BoothExtractor): example = "https://booth.pm/items/12345" def items(self): - url = f"{self.root}/ja/items/{self.groups[0]}.json" - item = self.request_json(url) + url = f"{self.root}/ja/items/{self.groups[0]}" + if self.config("strategy") == "fallback": + page = None + item = self.request_json(url + ".json") + else: + page = self.request(url).text + item = self.request_json(url + ".json", interval=False) item["booth_category"] = item.pop("category", None) item["date"] = text.parse_datetime( @@ -59,7 +64,7 @@ class BoothItemExtractor(BoothExtractor): shop = item["shop"] shop["id"] = text.parse_int(shop["thumbnail_url"].rsplit("/", 3)[1]) - if files := self._extract_files(item): + if files := self._extract_files(item, page): item["count"] = len(files) shop["uuid"] = files[0]["url"].split("/", 4)[3] else: @@ -73,17 +78,20 @@ class BoothItemExtractor(BoothExtractor): text.nameext_from_url(url, file) yield Message.Url, url, {**item, **file} - def _extract_files(self, item): - files = [] + def _extract_files(self, item, page): + if page is None: + files = [] + for image in item.pop("images"): + url = image["original"].replace("_base_resized", "") + files.append({ + "url" : url, + "_fallback": _fallback(url), + }) + return files - for image in item.pop("images"): - url = image["original"].replace("_base_resized", "") - files.append({ - "url" : url, - "_fallback": _fallback(url), - }) - - return files + del item["images"] + return [{"url": url} + for url in text.extract_iter(page, 'data-origin="', '"')] class BoothShopExtractor(BoothExtractor): diff --git a/test/results/booth.py b/test/results/booth.py index b19dd3a7..60dadedc 100644 --- a/test/results/booth.py +++ b/test/results/booth.py @@ -11,18 +11,10 @@ __tests__ = ( { "#url" : "https://booth.pm/ja/items/4693741", "#class" : booth.BoothItemExtractor, - "#pattern" : r"https://booth.pximg.net/792d497b-6e82-4df3-86de-31577e10f476/i/4693741/[\w-]{36}.jpg", + "#pattern" : r"https://booth.pximg.net/792d497b-6e82-4df3-86de-31577e10f476/i/4693741/[\w-]{36}\.(jpg|png)", "#count" : 10, - "booth_category" : { - "id" : 56, - "name" : "漫画・マンガ", - "url" : "https://booth.pm/ja/browse/%E6%BC%AB%E7%94%BB%E3%83%BB%E3%83%9E%E3%83%B3%E3%82%AC", - "parent": { - "name": "漫画", - "url" : "https://booth.pm/ja/browse/%E6%BC%AB%E7%94%BB", - }, - }, + "!_fallback" : ..., "buyee_variations": [], "count" : 10, "num" : range(1, 10), @@ -63,11 +55,28 @@ https://www.melonbooks.co.jp/detail/detail.php?product_id=1872452""", "published_at" : "2023-04-16T23:25:29.000+09:00", "purchase_limit" : None, "report_url" : "https://wanoazayaka.booth.pm/items/4693741/report", + "shipping_info" : "支払いから発送までの日数:4日以内", + "small_stock" : None, + "sound" : None, + "tracks" : None, + "url" : str, + "wish_list_url" : "https://booth.pm/items/4693741/wish_list", + "wish_lists_count": range(80, 120), + "wished" : False, + "tag_banners" : "len:list:5", + "booth_category" : { + "id" : 56, + "name" : "漫画・マンガ", + "url" : "https://booth.pm/ja/browse/%E6%BC%AB%E7%94%BB%E3%83%BB%E3%83%9E%E3%83%B3%E3%82%AC", + "parent": { + "name": "漫画", + "url" : "https://booth.pm/ja/browse/%E6%BC%AB%E7%94%BB", + }, + }, "share" : { "hashtags": ["booth_pm"], "text" : "※英語版※ I can no longer hear the railway crossing.【Bocchi the rock!】 | ふたりぼっちのSolitude", }, - "shipping_info" : "支払いから発送までの日数:4日以内", "shop" : { "id" : 5742915, "uuid" : "792d497b-6e82-4df3-86de-31577e10f476", @@ -77,14 +86,6 @@ https://www.melonbooks.co.jp/detail/detail.php?product_id=1872452""", "url" : "https://wanoazayaka.booth.pm/", "verified" : False, }, - "small_stock" : None, - "sound" : None, - "tracks" : None, - "url" : str, - "wish_list_url" : "https://booth.pm/items/4693741/wish_list", - "wish_lists_count": range(80, 120), - "wished" : False, - "tag_banners" : "len:list:5", "tag_combination" : { "category": "漫画・マンガ", "tag" : "ぼっち・ざ・ろっく!", @@ -124,9 +125,10 @@ https://www.melonbooks.co.jp/detail/detail.php?product_id=1872452""", "#results" : ( "https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/131bf61c-0534-4af3-9408-f19f08cb3622.jpg", "https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/fb65233a-7a93-4219-ba9f-b63e11329fda.jpg", - "https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/e18c16a0-b285-4cd8-aacc-6b3c4f4c6ce3.jpg", + "https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/e18c16a0-b285-4cd8-aacc-6b3c4f4c6ce3.jpeg", ), + "!_fallback" : ..., "count" : 3, "date" : "dt:2025-07-28 07:00:43", "description" : """C106新作おっぱいマウスパッドです @@ -184,6 +186,19 @@ https://www.melonbooks.co.jp/detail/detail.php?product_id=1872452""", ], }, +{ + "#url" : "https://caramel-crunch.booth.pm/items/7236173", + "#class" : booth.BoothItemExtractor, + "#options" : {"strategy": "fallback"}, + "#results" : ( + "https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/131bf61c-0534-4af3-9408-f19f08cb3622.jpg", + "https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/fb65233a-7a93-4219-ba9f-b63e11329fda.jpg", + "https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/e18c16a0-b285-4cd8-aacc-6b3c4f4c6ce3.jpg", + ), + + "_fallback": "len:3", +}, + { "#url" : "https://wanoazayaka.booth.pm/", "#class" : booth.BoothShopExtractor,