[booth] add 'shop' extractor (#7920)

This commit is contained in:
Mike Fährmann
2025-07-29 16:09:46 +02:00
parent 8aa2fe33d5
commit 78f81ee48a
5 changed files with 94 additions and 5 deletions

View File

@@ -386,6 +386,7 @@ Default
* ``"0.5-1.5"``
``ao3``,
``arcalive``,
``booth``,
``civitai``,
``[Danbooru]``,
``[E621]``,

View File

@@ -182,6 +182,10 @@
"metadata": false,
"videos" : true
},
"booth":
{
"sleep-request": "0.5-1.5"
},
"bunkr":
{
"endpoint": "/api/_001_v2",

View File

@@ -160,7 +160,7 @@ Consider all listed sites to potentially be NSFW.
<tr>
<td>BOOTH</td>
<td>https://booth.pm/</td>
<td>Items</td>
<td>Items, Shops</td>
<td></td>
</tr>
<tr>

View File

@@ -9,9 +9,7 @@
"""Extractors for https://booth.pm/"""
from .common import Extractor, Message
from .. import text
BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?booth\.pm(?:/\w\w)?"
from .. import text, util
class BoothExtractor(Extractor):
@@ -21,14 +19,32 @@ class BoothExtractor(Extractor):
directory_fmt = ("{category}", "{shop[name]}", "{id} {name}")
filename_fmt = "{num:>02} {filename}.{extension}"
archive_fmt = "{id}_{filename}"
request_interval = (0.5, 1.5)
def _init(self):
    """Set the 'adult' cookie to "1" for the '.booth.pm' domain."""
    self.cookies.set(
        "adult", "1",
        domain=".booth.pm",
    )
def items(self):
    """Yield one queue message per entry returned by shop_items().

    Every entry gets BoothItemExtractor stored under '_extractor' and is
    emitted together with the value of its 'shop_item_url' key.
    """
    queue = Message.Queue
    for entry in self.shop_items():
        entry["_extractor"] = BoothItemExtractor
        yield queue, entry["shop_item_url"], entry
def _pagination(self, url):
    """Yield the parsed 'data-item' attributes of consecutive pages.

    The page at 'url' is requested, every ' data-item="..."' attribute
    value found in it is unescaped and decoded as JSON, and the href of
    the rel="next" navigation link (appended to 'self.root') becomes the
    next URL.  Iteration ends when a page has no such link.
    """
    next_marker = 'rel="next" class="nav-item" href="'

    while True:
        html = self.request(url).text

        for raw in text.extract_iter(html, ' data-item="', '"'):
            yield util.json_loads(text.unescape(raw))

        next_path = text.extr(html, next_marker, '"')
        if next_path:
            url = self.root + next_path
        else:
            return
class BoothItemExtractor(BoothExtractor):
subcategory = "item"
pattern = BASE_PATTERN + r"/items/(\d+)"
pattern = r"(?:https?://)?(?:[\w-]+\.)?booth\.pm/(?:\w\w/)?items/(\d+)"
example = "https://booth.pm/items/12345"
def items(self):
@@ -63,6 +79,19 @@ class BoothItemExtractor(BoothExtractor):
return files
class BoothShopExtractor(BoothExtractor):
    """Extractor for booth.pm shops"""
    subcategory = "shop"
    pattern = r"(?:https?://)?([\w-]+\.)booth\.pm/(?:\w\w/)?(?:items)?"
    example = "https://SHOP.booth.pm/"

    def __init__(self, match):
        """Derive 'root' from the matched URL text, then run the base init."""
        matched_url = match[0]
        self.root = text.root_from_url(matched_url)
        BoothExtractor.__init__(self, match)

    def shop_items(self):
        """Return the paginated results of this shop's '/items' listing."""
        listing_url = f"{self.root}/items"
        return self._pagination(listing_url)
def _fallback(url):
base = url[:-3]
yield base + "jpeg"

View File

@@ -180,4 +180,59 @@ https://www.melonbooks.co.jp/detail/detail.php?product_id=1872452""",
],
},
{
"#url" : "https://wanoazayaka.booth.pm/",
"#class" : booth.BoothShopExtractor,
"#results" : (
"https://wanoazayaka.booth.pm/items/4972816",
"https://wanoazayaka.booth.pm/items/4855567",
"https://wanoazayaka.booth.pm/items/4693741",
),
"event" : None,
"id" : int,
"is_adult" : False,
"is_end_of_sale": False,
"is_placeholder": False,
"is_sold_out" : False,
"is_vrchat" : False,
"minimum_stock" : None,
"music" : None,
"name" : str,
"price" : "700 JPY",
"url" : r"re:https://booth.pm/en/items/\d+",
"shop_item_url" : r"re:https://wanoazayaka.booth.pm/items/\d+",
"wish_list_url" : r"re:https://wanoazayaka.booth.pm/items/\d+/wish_list",
"thumbnail_image_urls": list,
"shop" : {
"name" : "ふたりぼっちのSolitude",
"thumbnail_url": "https://booth.pximg.net/c/48x48/users/5742915/icon_image/1448e5d8-f93f-445e-8e1e-acb29aa45aa4_base_resized.jpg",
"url" : "https://wanoazayaka.booth.pm/",
"verified" : False,
},
"tracking_data" : {
"product_brand" : "wanoazayaka",
"product_category": 56,
"product_event" : None,
"product_id" : int,
"product_name" : str,
"product_price" : 700,
"tracking" : "impression_item",
},
},
{
"#url" : "https://caramel-crunch.booth.pm/items",
"#class" : booth.BoothShopExtractor,
"#pattern" : booth.BoothItemExtractor.pattern,
"#count" : range(90, 120),
"shop": {
"name" : " ",
"thumbnail_url": "https://booth.pximg.net/c/48x48/users/49832/icon_image/a240e313-6a0f-4155-8310-a0d6abb299e6_base_resized.jpg",
"url" : "https://caramel-crunch.booth.pm/",
"verified" : False,
},
},
)