[booth] add 'strategy' option (#7920)
- extract file URLs from HTML webpages
- fix 'adult' cookie value
This commit is contained in:
@@ -1767,6 +1767,23 @@ Description
|
|||||||
* ``tiny`` (144p)
|
* ``tiny`` (144p)
|
||||||
|
|
||||||
|
|
||||||
|
extractor.booth.strategy
|
||||||
|
------------------------
|
||||||
|
Type
|
||||||
|
``string``
|
||||||
|
Default
|
||||||
|
``"webpage"``
|
||||||
|
Description
|
||||||
|
Selects how to handle and extract file URLs.
|
||||||
|
|
||||||
|
``"webpage"``
|
||||||
|
Retrieve the full HTML page
|
||||||
|
and extract file URLs from it
|
||||||
|
``"fallback"``
|
||||||
|
Use `fallback <extractor.*.fallback_>`__ URLs
|
||||||
|
to `guess` each file's correct filename extension
|
||||||
|
|
||||||
|
|
||||||
extractor.bunkr.endpoint
|
extractor.bunkr.endpoint
|
||||||
------------------------
|
------------------------
|
||||||
Type
|
Type
|
||||||
|
|||||||
@@ -184,7 +184,9 @@
|
|||||||
},
|
},
|
||||||
"booth":
|
"booth":
|
||||||
{
|
{
|
||||||
"sleep-request": "0.5-1.5"
|
"sleep-request": "0.5-1.5",
|
||||||
|
|
||||||
|
"strategy": "webpage"
|
||||||
},
|
},
|
||||||
"bunkr":
|
"bunkr":
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ class BoothExtractor(Extractor):
|
|||||||
request_interval = (0.5, 1.5)
|
request_interval = (0.5, 1.5)
|
||||||
|
|
||||||
def _init(self):
|
def _init(self):
|
||||||
self.cookies.set("adult", "1", domain=".booth.pm")
|
self.cookies.set("adult", "t", domain=".booth.pm")
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
for item in self.shop_items():
|
for item in self.shop_items():
|
||||||
@@ -48,8 +48,13 @@ class BoothItemExtractor(BoothExtractor):
|
|||||||
example = "https://booth.pm/items/12345"
|
example = "https://booth.pm/items/12345"
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
url = f"{self.root}/ja/items/{self.groups[0]}.json"
|
url = f"{self.root}/ja/items/{self.groups[0]}"
|
||||||
item = self.request_json(url)
|
if self.config("strategy") == "fallback":
|
||||||
|
page = None
|
||||||
|
item = self.request_json(url + ".json")
|
||||||
|
else:
|
||||||
|
page = self.request(url).text
|
||||||
|
item = self.request_json(url + ".json", interval=False)
|
||||||
|
|
||||||
item["booth_category"] = item.pop("category", None)
|
item["booth_category"] = item.pop("category", None)
|
||||||
item["date"] = text.parse_datetime(
|
item["date"] = text.parse_datetime(
|
||||||
@@ -59,7 +64,7 @@ class BoothItemExtractor(BoothExtractor):
|
|||||||
shop = item["shop"]
|
shop = item["shop"]
|
||||||
shop["id"] = text.parse_int(shop["thumbnail_url"].rsplit("/", 3)[1])
|
shop["id"] = text.parse_int(shop["thumbnail_url"].rsplit("/", 3)[1])
|
||||||
|
|
||||||
if files := self._extract_files(item):
|
if files := self._extract_files(item, page):
|
||||||
item["count"] = len(files)
|
item["count"] = len(files)
|
||||||
shop["uuid"] = files[0]["url"].split("/", 4)[3]
|
shop["uuid"] = files[0]["url"].split("/", 4)[3]
|
||||||
else:
|
else:
|
||||||
@@ -73,17 +78,20 @@ class BoothItemExtractor(BoothExtractor):
|
|||||||
text.nameext_from_url(url, file)
|
text.nameext_from_url(url, file)
|
||||||
yield Message.Url, url, {**item, **file}
|
yield Message.Url, url, {**item, **file}
|
||||||
|
|
||||||
def _extract_files(self, item):
|
def _extract_files(self, item, page):
|
||||||
files = []
|
if page is None:
|
||||||
|
files = []
|
||||||
|
for image in item.pop("images"):
|
||||||
|
url = image["original"].replace("_base_resized", "")
|
||||||
|
files.append({
|
||||||
|
"url" : url,
|
||||||
|
"_fallback": _fallback(url),
|
||||||
|
})
|
||||||
|
return files
|
||||||
|
|
||||||
for image in item.pop("images"):
|
del item["images"]
|
||||||
url = image["original"].replace("_base_resized", "")
|
return [{"url": url}
|
||||||
files.append({
|
for url in text.extract_iter(page, 'data-origin="', '"')]
|
||||||
"url" : url,
|
|
||||||
"_fallback": _fallback(url),
|
|
||||||
})
|
|
||||||
|
|
||||||
return files
|
|
||||||
|
|
||||||
|
|
||||||
class BoothShopExtractor(BoothExtractor):
|
class BoothShopExtractor(BoothExtractor):
|
||||||
|
|||||||
@@ -11,18 +11,10 @@ __tests__ = (
|
|||||||
{
|
{
|
||||||
"#url" : "https://booth.pm/ja/items/4693741",
|
"#url" : "https://booth.pm/ja/items/4693741",
|
||||||
"#class" : booth.BoothItemExtractor,
|
"#class" : booth.BoothItemExtractor,
|
||||||
"#pattern" : r"https://booth.pximg.net/792d497b-6e82-4df3-86de-31577e10f476/i/4693741/[\w-]{36}.jpg",
|
"#pattern" : r"https://booth.pximg.net/792d497b-6e82-4df3-86de-31577e10f476/i/4693741/[\w-]{36}\.(jpg|png)",
|
||||||
"#count" : 10,
|
"#count" : 10,
|
||||||
|
|
||||||
"booth_category" : {
|
"!_fallback" : ...,
|
||||||
"id" : 56,
|
|
||||||
"name" : "漫画・マンガ",
|
|
||||||
"url" : "https://booth.pm/ja/browse/%E6%BC%AB%E7%94%BB%E3%83%BB%E3%83%9E%E3%83%B3%E3%82%AC",
|
|
||||||
"parent": {
|
|
||||||
"name": "漫画",
|
|
||||||
"url" : "https://booth.pm/ja/browse/%E6%BC%AB%E7%94%BB",
|
|
||||||
},
|
|
||||||
},
|
|
||||||
"buyee_variations": [],
|
"buyee_variations": [],
|
||||||
"count" : 10,
|
"count" : 10,
|
||||||
"num" : range(1, 10),
|
"num" : range(1, 10),
|
||||||
@@ -63,11 +55,28 @@ https://www.melonbooks.co.jp/detail/detail.php?product_id=1872452""",
|
|||||||
"published_at" : "2023-04-16T23:25:29.000+09:00",
|
"published_at" : "2023-04-16T23:25:29.000+09:00",
|
||||||
"purchase_limit" : None,
|
"purchase_limit" : None,
|
||||||
"report_url" : "https://wanoazayaka.booth.pm/items/4693741/report",
|
"report_url" : "https://wanoazayaka.booth.pm/items/4693741/report",
|
||||||
|
"shipping_info" : "支払いから発送までの日数:4日以内",
|
||||||
|
"small_stock" : None,
|
||||||
|
"sound" : None,
|
||||||
|
"tracks" : None,
|
||||||
|
"url" : str,
|
||||||
|
"wish_list_url" : "https://booth.pm/items/4693741/wish_list",
|
||||||
|
"wish_lists_count": range(80, 120),
|
||||||
|
"wished" : False,
|
||||||
|
"tag_banners" : "len:list:5",
|
||||||
|
"booth_category" : {
|
||||||
|
"id" : 56,
|
||||||
|
"name" : "漫画・マンガ",
|
||||||
|
"url" : "https://booth.pm/ja/browse/%E6%BC%AB%E7%94%BB%E3%83%BB%E3%83%9E%E3%83%B3%E3%82%AC",
|
||||||
|
"parent": {
|
||||||
|
"name": "漫画",
|
||||||
|
"url" : "https://booth.pm/ja/browse/%E6%BC%AB%E7%94%BB",
|
||||||
|
},
|
||||||
|
},
|
||||||
"share" : {
|
"share" : {
|
||||||
"hashtags": ["booth_pm"],
|
"hashtags": ["booth_pm"],
|
||||||
"text" : "※英語版※ I can no longer hear the railway crossing.【Bocchi the rock!】 | ふたりぼっちのSolitude",
|
"text" : "※英語版※ I can no longer hear the railway crossing.【Bocchi the rock!】 | ふたりぼっちのSolitude",
|
||||||
},
|
},
|
||||||
"shipping_info" : "支払いから発送までの日数:4日以内",
|
|
||||||
"shop" : {
|
"shop" : {
|
||||||
"id" : 5742915,
|
"id" : 5742915,
|
||||||
"uuid" : "792d497b-6e82-4df3-86de-31577e10f476",
|
"uuid" : "792d497b-6e82-4df3-86de-31577e10f476",
|
||||||
@@ -77,14 +86,6 @@ https://www.melonbooks.co.jp/detail/detail.php?product_id=1872452""",
|
|||||||
"url" : "https://wanoazayaka.booth.pm/",
|
"url" : "https://wanoazayaka.booth.pm/",
|
||||||
"verified" : False,
|
"verified" : False,
|
||||||
},
|
},
|
||||||
"small_stock" : None,
|
|
||||||
"sound" : None,
|
|
||||||
"tracks" : None,
|
|
||||||
"url" : str,
|
|
||||||
"wish_list_url" : "https://booth.pm/items/4693741/wish_list",
|
|
||||||
"wish_lists_count": range(80, 120),
|
|
||||||
"wished" : False,
|
|
||||||
"tag_banners" : "len:list:5",
|
|
||||||
"tag_combination" : {
|
"tag_combination" : {
|
||||||
"category": "漫画・マンガ",
|
"category": "漫画・マンガ",
|
||||||
"tag" : "ぼっち・ざ・ろっく!",
|
"tag" : "ぼっち・ざ・ろっく!",
|
||||||
@@ -124,9 +125,10 @@ https://www.melonbooks.co.jp/detail/detail.php?product_id=1872452""",
|
|||||||
"#results" : (
|
"#results" : (
|
||||||
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/131bf61c-0534-4af3-9408-f19f08cb3622.jpg",
|
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/131bf61c-0534-4af3-9408-f19f08cb3622.jpg",
|
||||||
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/fb65233a-7a93-4219-ba9f-b63e11329fda.jpg",
|
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/fb65233a-7a93-4219-ba9f-b63e11329fda.jpg",
|
||||||
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/e18c16a0-b285-4cd8-aacc-6b3c4f4c6ce3.jpg",
|
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/e18c16a0-b285-4cd8-aacc-6b3c4f4c6ce3.jpeg",
|
||||||
),
|
),
|
||||||
|
|
||||||
|
"!_fallback" : ...,
|
||||||
"count" : 3,
|
"count" : 3,
|
||||||
"date" : "dt:2025-07-28 07:00:43",
|
"date" : "dt:2025-07-28 07:00:43",
|
||||||
"description" : """C106新作おっぱいマウスパッドです
|
"description" : """C106新作おっぱいマウスパッドです
|
||||||
@@ -184,6 +186,19 @@ https://www.melonbooks.co.jp/detail/detail.php?product_id=1872452""",
|
|||||||
],
|
],
|
||||||
},
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://caramel-crunch.booth.pm/items/7236173",
|
||||||
|
"#class" : booth.BoothItemExtractor,
|
||||||
|
"#options" : {"strategy": "fallback"},
|
||||||
|
"#results" : (
|
||||||
|
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/131bf61c-0534-4af3-9408-f19f08cb3622.jpg",
|
||||||
|
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/fb65233a-7a93-4219-ba9f-b63e11329fda.jpg",
|
||||||
|
"https://booth.pximg.net/74488d0d-e533-443c-82ce-fa961d5cbaf0/i/7236173/e18c16a0-b285-4cd8-aacc-6b3c4f4c6ce3.jpg",
|
||||||
|
),
|
||||||
|
|
||||||
|
"_fallback": "len:3",
|
||||||
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "https://wanoazayaka.booth.pm/",
|
"#url" : "https://wanoazayaka.booth.pm/",
|
||||||
"#class" : booth.BoothShopExtractor,
|
"#class" : booth.BoothShopExtractor,
|
||||||
|
|||||||
Reference in New Issue
Block a user