[xasiat] add support (#4161 #5929 #7934)

* [xasiat] Album extractor
* [xasiat] Tags, categories, Models
* [xasiat] Tests
* update 'album' extractor
    - provide 'album_id' metadata
    - use redirected 'album_url' value
    - update metadata extraction in general
* extend test result data
This commit is contained in:
missionfloyd
2025-07-30 11:38:33 -06:00
committed by GitHub
parent 3b93184997
commit 952fcb1b8b
4 changed files with 195 additions and 0 deletions

View File

@@ -1175,6 +1175,12 @@ Consider all listed sites to potentially be NSFW.
<td>Galleries</td>
<td></td>
</tr>
<tr>
<td>Xasiat</td>
<td>https://www.xasiat.com</td>
<td>Albums, Categories, Models, Tag Searches</td>
<td></td>
</tr>
<tr>
<td>Xfolio</td>
<td>https://xfolio.jp/</td>

View File

@@ -211,6 +211,7 @@ modules = [
"wikiart",
"wikifeet",
"wikimedia",
"xasiat",
"xfolio",
"xhamster",
"xvideos",

View File

@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://www.xasiat.com"""
from .common import Extractor, Message
from .. import text, util
import time
# Shared URL prefix for the extractor patterns in this module.
# The capture group opened here is intentionally left unclosed: each pattern
# built from it appends a suffix that ends with the closing ")", so group 1
# becomes the site-relative path, including an optional "/fr" or "/ja" prefix.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?xasiat\.com((?:/fr|/ja)?/albums"
class XasiatExtractor(Extractor):
    """Base class for xasiat extractors"""
    category = "xasiat"
    directory_fmt = ("{category}", "{title}")
    archive_fmt = "{album_url}_{num}"
    root = "https://www.xasiat.com"

    def items(self):
        """Queue every album URL from posts() for the album extractor"""
        queue_data = {"_extractor": XasiatAlbumExtractor}
        for album_url in self.posts():
            yield Message.Queue, album_url, queue_data

    def posts(self):
        """Return an iterable of album URLs for the matched listing path"""
        listing_path = self.groups[0]
        return self._pagination(listing_path)

    def _pagination(self, path, pnum=1):
        """Yield album URLs from the async album-list block, page by page

        'path' is the site-relative listing path; 'pnum' is the first page
        number to request.
        """
        endpoint = f"{self.root}{path}/"
        album_links = util.re(r'class="item ">\s*<a href="([^"]+)')

        while True:
            query = {
                "mode": "async",
                "function": "get_block",
                "block_id": "list_albums_common_albums_list",
                "sort_by": "post_date",
                "from": pnum,
                # millisecond timestamp, regenerated for every request
                "_": int(time.time() * 1000)
            }
            html = self.request(endpoint, params=query).text
            yield from album_links.findall(html)

            # NOTE(review): presumably "Next" is rendered as a plain <span>
            # (instead of a link) only on the last page - confirm
            if "<span>Next</span>" in html:
                break
            pnum += 1
class XasiatAlbumExtractor(XasiatExtractor):
    """Extractor for a single xasiat album"""
    subcategory = "album"
    pattern = BASE_PATTERN + r"/(\d+)/[^/?#]+)"
    example = "https://www.xasiat.com/albums/12345/TITLE/"

    def items(self):
        """Yield one Directory message followed by one Url message per image

        Metadata is scraped from the album page: 'title', 'model' (list),
        'tags' (list), 'album_category' (first category link, or an empty
        string when the page has none), 'album_url' (URL of the final
        response), 'album_id', 'count' and a 1-based 'num' per image.
        """
        path, album_id = self.groups
        url = f"{self.root}{path}/"
        response = self.request(url)
        extr = text.extract_from(response.text)

        # the three extr() calls are kept in their original order
        title = extr("<h1>", "<")
        info = extr('class="info-content"', "</div>")
        images = extr('class="images"', "</div>")

        urls = list(text.extract_iter(images, 'href="', '"'))

        # 'model' and 'tags' already tolerate an empty result;
        # do the same for the category instead of raising IndexError
        categories = util.re(
            r'categories/[^"]+\">\s*(.+)\s*</a').findall(info)

        data = {
            "title": text.unescape(title),
            "model": util.re(
                r'top_models1"></i>\s*(.+)\s*</span').findall(info),
            "tags": util.re(
                r'tags/[^"]+\">\s*(.+)\s*</a').findall(info),
            "album_category": categories[0] if categories else "",
            # use the response URL rather than the requested one
            "album_url": response.url,
            "album_id": text.parse_int(album_id),
            "count": len(urls),
        }

        yield Message.Directory, data
        for data["num"], url in enumerate(urls, 1):
            # drop the last character of the URL before deriving
            # filename and extension (image URLs end in ".jpg/")
            yield Message.Url, url, text.nameext_from_url(url[:-1], data)
class XasiatTagExtractor(XasiatExtractor):
    """Extractor for album listings under a xasiat tag"""
    example = "https://www.xasiat.com/albums/tags/TAG/"
    pattern = BASE_PATTERN + r"/tags/[^/?#]+)"
    subcategory = "tag"
class XasiatCategoryExtractor(XasiatExtractor):
    """Extractor for album listings under a xasiat category"""
    example = "https://www.xasiat.com/albums/categories/CATEGORY/"
    pattern = BASE_PATTERN + r"/categories/[^/?#]+)"
    subcategory = "category"
class XasiatModelExtractor(XasiatExtractor):
    """Extractor for album listings under a xasiat model"""
    example = "https://www.xasiat.com/albums/models/MODEL/"
    pattern = BASE_PATTERN + r"/models/[^/?#]+)"
    subcategory = "model"

85
test/results/xasiat.py Normal file
View File

@@ -0,0 +1,85 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import xasiat
# Expected results for the xasiat extractors.
# Keys starting with "#" describe the test itself (URL, extractor class,
# URL pattern, count, range); the remaining keys are expected metadata values.
__tests__ = (
# Album under the unprefixed "/albums" path, with full metadata expectations.
{
"#url" : "https://www.xasiat.com/albums/28156/photobook-2024-12-09-bomb/",
"#class" : xasiat.XasiatAlbumExtractor,
"#pattern": r"https://www.xasiat.com/get_image/2/\w{32}/sources/28000/28156/\d+.jpg/",
"#count" : 61,
"title" : "[Photobook] 2024.12.09 白濱美兎『忘れられない恋の味』BOMBデジタル写真集",
"album_category": "JAV & AV Models",
"album_id" : 28156,
"album_url" : "https://www.xasiat.com/albums/28156/photobook-2024-12-09-bomb/",
"count" : 61,
"num" : range(1, 61),
"extension" : "jpg",
"filename" : r"re:\d+",
"model" : [],
"tags" : [
"BOMB Photobook",
"Photobook",
],
},
# Album under the "/ja" path prefix; "album_url" keeps that prefix.
{
"#url" : "https://www.xasiat.com/ja/albums/28155/cosplay1813/",
"#class": xasiat.XasiatAlbumExtractor,
"#count": 40,
"title" : "[Cosplay] 喜欢爱理吗 - 早濑优香",
"album_category": "グラビアアイドル",
"album_id" : 28155,
"album_url" : "https://www.xasiat.com/ja/albums/28155/cosplay1813/",
"count" : 40,
"num" : range(1, 40),
"model" : [],
"tags" : ["コスプレ"],
},
# Album under the "/fr" path prefix; the only album case with a non-empty "model".
{
"#url" : "https://www.xasiat.com/fr/albums/23354/friday-impact-beauty-col-1/",
"#class": xasiat.XasiatAlbumExtractor,
"#count": 51,
"title" : "FRIDAYデジタル写真集 下村明香『Impact Beauty col.1』全カット",
"album_category": "Gravure Idols",
"model" : ["Sayaka Shimomura"],
"tags" : [
"FRIDAY Digital Photobook",
"De Toute Beauté",
],
},
# Category listing; "#pattern" is the album extractor's own URL pattern.
{
"#url" : "https://www.xasiat.com/albums/categories/gravure-idols/",
"#class" : xasiat.XasiatCategoryExtractor,
"#pattern": xasiat.XasiatAlbumExtractor.pattern,
"#range" : "1-50",
"#count" : 50,
},
# Tag listing; same expectations as the category case.
{
"#url" : "https://www.xasiat.com/albums/tags/japan/",
"#class" : xasiat.XasiatTagExtractor,
"#pattern": xasiat.XasiatAlbumExtractor.pattern,
"#range" : "1-50",
"#count" : 50,
},
# Model listing; limited to the first 15 results.
{
"#url" : "https://www.xasiat.com/albums/models/remu-suzumori/",
"#class" : xasiat.XasiatModelExtractor,
"#pattern": xasiat.XasiatAlbumExtractor.pattern,
"#range" : "1-15",
"#count" : 15,
},
)