[skeb] add 'search_tags' metadata to search results (#2945)

This commit is contained in:
Mike Fährmann
2022-09-23 13:56:00 +02:00
parent 1378cbb8dd
commit 68f11e02a9

View File

@@ -26,8 +26,11 @@ class SkebExtractor(Extractor):
self.article = self.config("article", False) self.article = self.config("article", False)
def items(self): def items(self):
metadata = self.metadata()
for user_name, post_num in self.posts(): for user_name, post_num in self.posts():
response, post = self._get_post_data(user_name, post_num) response, post = self._get_post_data(user_name, post_num)
if metadata:
post.update(metadata)
yield Message.Directory, post yield Message.Directory, post
for data in self._get_urls_from_post(response, post): for data in self._get_urls_from_post(response, post):
url = data["file_url"] url = data["file_url"]
@@ -36,6 +39,9 @@ class SkebExtractor(Extractor):
def posts(self): def posts(self):
"""Return post number""" """Return post number"""
def metadata(self):
    """Return additional metadata

    items() calls this hook once, before iterating over posts, and when
    the result is truthy merges it into every post dict with
    post.update(). This base implementation consists of the docstring
    only, so it returns None and no extra keys are added.
    """
def _pagination(self, url, params): def _pagination(self, url, params):
headers = {"Referer": self.root, "Authorization": "Bearer null"} headers = {"Referer": self.root, "Authorization": "Bearer null"}
params["offset"] = 0 params["offset"] = 0
@@ -229,8 +235,12 @@ class SkebSearchExtractor(SkebExtractor):
pattern = r"(?:https?://)?skeb\.jp/search\?q=([^&#]+)" pattern = r"(?:https?://)?skeb\.jp/search\?q=([^&#]+)"
test = ("https://skeb.jp/search?q=bunny%20tree&t=works", { test = ("https://skeb.jp/search?q=bunny%20tree&t=works", {
"count": ">= 18", "count": ">= 18",
"keyword": {"search_tags": "bunny tree"},
}) })
def metadata(self):
    """Return 'search_tags' metadata for this search

    The value is self.user_name passed through text.unquote(); for the
    test URL's 'q=bunny%20tree' the expected result is "bunny tree".
    """
    search_tags = text.unquote(self.user_name)
    return {"search_tags": search_tags}
def posts(self): def posts(self):
url = "https://hb1jt3kre9-2.algolianet.com/1/indexes/*/queries" url = "https://hb1jt3kre9-2.algolianet.com/1/indexes/*/queries"
params = { params = {
@@ -243,10 +253,10 @@ class SkebSearchExtractor(SkebExtractor):
"x-algolia-application-id": "HB1JT3KRE9", "x-algolia-application-id": "HB1JT3KRE9",
} }
page = 0
pams = ("hitsPerPage=40&filters=genre%3Aart%20OR%20genre%3Avoice%20OR" pams = ("hitsPerPage=40&filters=genre%3Aart%20OR%20genre%3Avoice%20OR"
"%20genre%3Anovel%20OR%20genre%3Avideo%20OR%20genre%3Amusic%2" "%20genre%3Anovel%20OR%20genre%3Avideo%20OR%20genre%3Amusic%2"
"0OR%20genre%3Acorrection&page=") "0OR%20genre%3Acorrection&page=")
page = 0
request = { request = {
"indexName": "Request", "indexName": "Request",
@@ -262,9 +272,7 @@ class SkebSearchExtractor(SkebExtractor):
for post in result["hits"]: for post in result["hits"]:
parts = post["path"].split("/") parts = post["path"].split("/")
user_name = parts[1][1:] yield parts[1][1:], parts[3]
post_num = parts[3]
yield user_name, post_num
if page >= result["nbPages"]: if page >= result["nbPages"]:
return return
@@ -287,8 +295,8 @@ class SkebFollowingExtractor(SkebExtractor):
def users(self): def users(self):
url = "{}/api/users/{}/following_creators".format( url = "{}/api/users/{}/following_creators".format(
self.root, self.user_name) self.root, self.user_name)
headers = {"Referer": self.root, "Authorization": "Bearer null"}
params = {"sort": "date", "offset": 0, "limit": 90} params = {"sort": "date", "offset": 0, "limit": 90}
headers = {"Referer": self.root, "Authorization": "Bearer null"}
while True: while True:
data = self.request(url, params=params, headers=headers).json() data = self.request(url, params=params, headers=headers).json()