[booth] add 'strategy' option (#7920)
- extract file URLs from HTML webpages
- fix 'adult' cookie value
This commit is contained in:
@@ -1767,6 +1767,23 @@ Description
|
||||
* ``tiny`` (144p)
|
||||
|
||||
|
||||
extractor.booth.strategy
|
||||
------------------------
|
||||
Type
|
||||
``string``
|
||||
Default
|
||||
``"webpage"``
|
||||
Description
|
||||
Selects how to handle and extract file URLs.
|
||||
|
||||
``"webpage"``
|
||||
Retrieve the full HTML page
|
||||
and extract file URLs from it
|
||||
``"fallback"``
|
||||
Use `fallback <extractor.*.fallback_>`__ URLs
|
||||
to `guess` each file's correct filename extension
|
||||
|
||||
|
||||
extractor.bunkr.endpoint
|
||||
------------------------
|
||||
Type
|
||||
|
||||
@@ -184,7 +184,9 @@
|
||||
},
|
||||
"booth":
|
||||
{
|
||||
"sleep-request": "0.5-1.5"
|
||||
"sleep-request": "0.5-1.5",
|
||||
|
||||
"strategy": "webpage"
|
||||
},
|
||||
"bunkr":
|
||||
{
|
||||
|
||||
@@ -22,7 +22,7 @@ class BoothExtractor(Extractor):
|
||||
request_interval = (0.5, 1.5)
|
||||
|
||||
def _init(self):
|
||||
self.cookies.set("adult", "1", domain=".booth.pm")
|
||||
self.cookies.set("adult", "t", domain=".booth.pm")
|
||||
|
||||
def items(self):
|
||||
for item in self.shop_items():
|
||||
@@ -48,8 +48,13 @@ class BoothItemExtractor(BoothExtractor):
|
||||
example = "https://booth.pm/items/12345"
|
||||
|
||||
def items(self):
|
||||
url = f"{self.root}/ja/items/{self.groups[0]}.json"
|
||||
item = self.request_json(url)
|
||||
url = f"{self.root}/ja/items/{self.groups[0]}"
|
||||
if self.config("strategy") == "fallback":
|
||||
page = None
|
||||
item = self.request_json(url + ".json")
|
||||
else:
|
||||
page = self.request(url).text
|
||||
item = self.request_json(url + ".json", interval=False)
|
||||
|
||||
item["booth_category"] = item.pop("category", None)
|
||||
item["date"] = text.parse_datetime(
|
||||
@@ -59,7 +64,7 @@ class BoothItemExtractor(BoothExtractor):
|
||||
shop = item["shop"]
|
||||
shop["id"] = text.parse_int(shop["thumbnail_url"].rsplit("/", 3)[1])
|
||||
|
||||
if files := self._extract_files(item):
|
||||
if files := self._extract_files(item, page):
|
||||
item["count"] = len(files)
|
||||
shop["uuid"] = files[0]["url"].split("/", 4)[3]
|
||||
else:
|
||||
@@ -73,17 +78,20 @@ class BoothItemExtractor(BoothExtractor):
|
||||
text.nameext_from_url(url, file)
|
||||
yield Message.Url, url, {**item, **file}
|
||||
|
||||
def _extract_files(self, item):
|
||||
files = []
|
||||
def _extract_files(self, item, page):
|
||||
if page is None:
|
||||
files = []
|
||||
for image in item.pop("images"):
|
||||
url = image["original"].replace("_base_resized", "")
|
||||
files.append({
|
||||
"url" : url,
|
||||
"_fallback": _fallback(url),
|
||||
})
|
||||
return files
|
||||
|
||||
for image in item.pop("images"):
|
||||
url = image["original"].replace("_base_resized", "")
|
||||
files.append({
|
||||
"url" : url,
|
||||
"_fallback": _fallback(url),
|
||||
})
|
||||
|
||||
return files
|
||||
del item["images"]
|
||||
return [{"url": url}
|
||||
for url in text.extract_iter(page, 'data-origin="', '"')]
|
||||
|
||||
|
||||
class BoothShopExtractor(BoothExtractor):
|
||||
|
||||
@@ -11,18 +11,10 @@ __tests__ = (
|
||||
{
|
||||
"#url" : "https://booth.pm/ja/items/4693741",
|
||||
"#class" : booth.BoothItemExtractor,
|
||||
"#pattern" : r"https://booth.pximg.net/792d497b-6e82-4df3-86de-31577e10f476/i/4693741/[\w-]{36}.jpg",
|
||||
"#pattern" : r"https://booth.pximg.net/792d497b-6e82-4df3-86de-31577e10f476/i/4693741/[\w-]{36}\.(jpg|png)",
|
||||
"#count" : 10,
|
||||
|
||||
"booth_category" : {
|
||||
"id" : 56,
|
||||
"name" : "漫画・マンガ",
|
||||
"url" : "https://booth.pm/ja/browse/%E6%BC%AB%E7%94%BB%E3%83%BB%E3%83%9E%E3%83%B3%E3%82%AC",
|
||||
"parent": {
|
||||
"name": "漫画",
|
||||
"url" : "https://booth.pm/ja/browse/%E6%BC%AB%E7%94%BB",
|
||||
},
|
||||
},
|
||||
"!_fallback" : ...,
|
||||
"buyee_variations": [],
|
||||
"count" : 10,
|
||||
"num" : range(1, 10),
|
||||
@@ -63,11 +55,28 @@ https://www.melonbooks.co.jp/detail/detail.php?product_id=1872452""",
|
||||
"published_at" : "2023-04-16T23:25:29.000+09:00",
|
||||
"purchase_limit" : None,
|
||||
"report_url" : "https://wanoazayaka.booth.pm/items/4693741/report",
|
||||
"shipping_info" : "支払いから発送までの日数:4日以内",
|
||||
"small_stock" : None,
|
||||
"sound" : None,
|
||||
"tracks" : None,
|
||||
"url" : str,
|
||||
"wish_list_url" : "https://booth.pm/items/4693741/wish_list",
|
||||
"wish_lists_count": range(80, 120),
|
||||
"wished" : False,
|
||||
"tag_banners" : "len:list:5",
|
||||
"booth_category" : {
|
||||
"id" : 56,
|
||||
"name" : "漫画・マンガ",
|
||||
"url" : "https://booth.pm/ja/browse/%E6%BC%AB%E7%94%BB%E3%83%BB%E3%83%9E%E3%83%B3%E3%82%AC",
|
||||
"parent": {
|
||||
"name": "漫画",
|
||||
"url" : "https://booth.pm/ja/browse/%E6%BC%AB%E7%94%BB",
|
||||
},
|
||||
},
|
||||
"share" : {
|
||||
"hashtags": ["booth_pm"],
|
||||
"text" : "※英語版※ I can no longer hear the railway crossing.【Bocchi the rock!】 | ふたりぼっちのSolitude",
|
||||
},
|
||||
"shipping_info" : "支払いから発送までの日数:4日以内",
|
||||
"shop" : {
|
||||
"id" : 5742915,
|
||||
"uuid" : "792d497b-6e82-4df3-86de-31577e10f476",
|
||||
@@ -77,14 +86,6 @@ https://www.melonbooks.co.jp/detail/detail.php?product_id=1872452""",
|
||||
"url" : "https://wanoazayaka.booth.pm/",
|
||||
"verified" : False,
|
||||
},
|
||||
"small_stock" : None,
|
||||
"sound" : None,
|
||||
"tracks" : None,
|
||||
"url" : str,
|
||||
"wish_list_url" : "https://booth.pm/items/4693741/wish_list",
|
||||
"wish_lists_count": range(80, 120),
|
||||
"wished" : False,
|
||||
"tag_banners" : "len:list:5",
|
||||
"tag_combination" : {
|
||||
"category": "漫画・マンガ",
|
||||
"tag" : "ぼっち・ざ・ろっく!",
|
||||
@@ -124,9 +125,10 @@ https://www.melonbooks.co.jp/detail/detail.php?product_id=1872452""",
|
||||
"#results" : (
|
||||
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/131bf61c-0534-4af3-9408-f19f08cb3622.jpg",
|
||||
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/fb65233a-7a93-4219-ba9f-b63e11329fda.jpg",
|
||||
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/e18c16a0-b285-4cd8-aacc-6b3c4f4c6ce3.jpg",
|
||||
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/e18c16a0-b285-4cd8-aacc-6b3c4f4c6ce3.jpeg",
|
||||
),
|
||||
|
||||
"!_fallback" : ...,
|
||||
"count" : 3,
|
||||
"date" : "dt:2025-07-28 07:00:43",
|
||||
"description" : """C106新作おっぱいマウスパッドです
|
||||
@@ -184,6 +186,19 @@ https://www.melonbooks.co.jp/detail/detail.php?product_id=1872452""",
|
||||
],
|
||||
},
|
||||
|
||||
{
|
||||
"#url" : "https://caramel-crunch.booth.pm/items/7236173",
|
||||
"#class" : booth.BoothItemExtractor,
|
||||
"#options" : {"strategy": "fallback"},
|
||||
"#results" : (
|
||||
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/131bf61c-0534-4af3-9408-f19f08cb3622.jpg",
|
||||
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/fb65233a-7a93-4219-ba9f-b63e11329fda.jpg",
|
||||
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/e18c16a0-b285-4cd8-aacc-6b3c4f4c6ce3.jpg",
|
||||
),
|
||||
|
||||
"_fallback": "len:3",
|
||||
},
|
||||
|
||||
{
|
||||
"#url" : "https://wanoazayaka.booth.pm/",
|
||||
"#class" : booth.BoothShopExtractor,
|
||||
|
||||
Reference in New Issue
Block a user