145 lines
4.9 KiB
Python
145 lines
4.9 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright 2025 Mike Fährmann
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
# published by the Free Software Foundation.
|
|
|
|
"""Extractors for https://xfolio.jp/"""
|
|
|
|
from .common import Extractor, Message
|
|
from .. import text, exception
|
|
|
|
# Shared URL prefix for the extractor patterns in this module: an optional
# http(s) scheme, the xfolio.jp host, and optionally one leading path
# segment. Each extractor appends its own "/portfolio/..." suffix.
BASE_PATTERN = r"(?:https?://)?xfolio\.jp(?:/[^/?#]+)?"
|
|
|
|
|
|
class XfolioExtractor(Extractor):
    """Base class for xfolio extractors"""
    category = "xfolio"
    root = "https://xfolio.jp"
    cookies_domain = ".xfolio.jp"
    # The format fields below refer to metadata keys that
    # XfolioWorkExtractor puts on every file it yields
    directory_fmt = ("{category}", "{creator_slug}", "{work_id}")
    filename_fmt = "{work_id}_{image_id}.{extension}"
    archive_fmt = "{work_id}_{image_id}"
    request_interval = (0.5, 1.5)

    def _init(self):
        """Log an error when the 'xfolio_session' cookie is not available."""
        # Rebind the hook on the class to the base implementation, so that
        # instances initialized afterwards skip the cookie check below
        XfolioExtractor._init = Extractor._init
        if self.cookies_check(("xfolio_session",)):
            return
        self.log.error("'xfolio_session' cookie required")

    def items(self):
        """Queue every URL returned by works() for XfolioWorkExtractor."""
        queue_data = {"_extractor": XfolioWorkExtractor}
        for work_url in self.works():
            yield Message.Queue, work_url, queue_data

    def request(self, url, **kwargs):
        """Send a request through Extractor.request() and return its response.

        Raises exception.AbortExtraction when the final response URL
        contains the site's reCAPTCHA path.
        """
        response = Extractor.request(self, url, **kwargs)
        if "/system/recaptcha" not in response.url:
            return response
        raise exception.AbortExtraction("Bot check / CAPTCHA page")
|
|
|
|
|
|
class XfolioWorkExtractor(XfolioExtractor):
    # Extractor for a single work page and the images embedded in it
    subcategory = "work"
    pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)/works/(\d+)"
    example = "https://xfolio.jp/portfolio/USER/works/12345"

    def items(self):
        """Yield one Directory message, then one Url message per image."""
        slug, wid = self.groups
        work_url = f"{self.root}/portfolio/{slug}/works/{wid}"
        page = self.request(work_url).text

        work = self._extract_data(page)
        files = self._extract_files(page, work)
        work["count"] = len(files)

        yield Message.Directory, "", work

        for index, file in enumerate(files, 1):
            # 'num' is written to the shared work dict first and then
            # merged into the per-file metadata together with the rest
            work["num"] = index
            file.update(work)
            yield Message.Url, file["url"], file

    def _extract_data(self, html):
        """Collect work and creator metadata from a work page's HTML."""
        slug, wid = self.groups
        extr = text.extract_from(html)

        # NOTE(review): the extr() calls are kept in their original order;
        # text.extract_from() presumably scans forward through the page,
        # so reordering them could change the results -- confirm first
        og_title = extr('property="og:title" content="', '"')
        # keep only the part in front of the last " - " separator
        title = text.unescape(og_title.rpartition(" - ")[0])
        description = text.unescape(
            extr('property="og:description" content="', '"'))
        creator_id = extr(' data-creator-id="', '"')
        creator_userid = extr(' data-creator-user-id="', '"')
        creator_name = extr(' data-creator-name="', '"')
        creator_profile = text.unescape(
            extr(' data-creator-profile="', '"'))
        series_id = extr("/series/", '"')

        return {
            "title"          : title,
            "description"    : description,
            "creator_id"     : creator_id,
            "creator_userid" : creator_userid,
            "creator_name"   : creator_name,
            "creator_profile": creator_profile,
            "series_id"      : series_id,
            "creator_slug"   : slug,
            "work_id"        : wid,
        }

    def _extract_files(self, html, work):
        """Return a list with one file dict per full-scale image link.

        Image wrappers without a 'fullscale_image' link are reported with
        a warning and left out of the result.
        """
        work_id = work["work_id"]
        results = []

        wrappers = text.extract_iter(
            html, 'class="article__wrap_img', "</div>")
        for wrapper in wrappers:
            image_id = text.extr(wrapper, "/fullscale_image?image_id=", "&")
            if image_id:
                asset_url = (
                    f"{self.root}/user_asset.php?id={image_id}&work_id="
                    f"{work_id}&work_image_id={image_id}&type=work_image")
                # the asset request carries the full-scale viewer page
                # as its Referer header
                referer = (
                    f"{self.root}/fullscale_image"
                    f"?image_id={image_id}&work_id={work_id}")
                results.append({
                    "image_id" : image_id,
                    "extension": "jpg",
                    "url": asset_url,
                    "_http_headers": {"Referer": referer},
                })
            else:
                self.log.warning(
                    "%s: 'fullscale_image' not available", work_id)

        return results
|
|
|
|
|
|
class XfolioUserExtractor(XfolioExtractor):
    # Extractor for all works listed on a creator's portfolio
    subcategory = "user"
    pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)(?:/works)?/?(?:$|\?|#)"
    example = "https://xfolio.jp/portfolio/USER"

    def works(self):
        """Yield the link of every post item, following the 'next' pager."""
        page_url = f"{self.root}/portfolio/{self.groups[0]}/works"

        while True:
            page = self.request(page_url).text

            posts = text.extract_iter(
                page, '<div class="postItem', "</div>")
            yield from (text.extr(post, ' href="', '"') for post in posts)

            # locate the 'next' pager entry and take its link target
            next_item = text.extr(page, ' class="pager__list_next', "</li>")
            href = text.extr(next_item, ' href="', '"')
            if not href:
                # no link to a further page: listing is complete
                break
            page_url = text.unescape(href)
|
|
|
|
|
|
class XfolioSeriesExtractor(XfolioExtractor):
    # Extractor for the works linked from a series page
    subcategory = "series"
    pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)/series/(\d+)"
    example = "https://xfolio.jp/portfolio/USER/series/12345"

    def works(self):
        """Return the links of all title entries on the series page."""
        slug, series_id = self.groups
        series_url = f"{self.root}/portfolio/{slug}/series/{series_id}"
        page = self.request(series_url).text

        entries = text.extract_iter(
            page, 'class="listWrap--title">', "</a>")
        return [text.extr(entry, ' href="', '"') for entry in entries]
|