[turbo] update 'saint' extractors (#8893 #8896)

* Implements turbo.py & removes the turbo domain pattern from saints.py
* Remove leftover commented pattern from saints.py
* Make turbo.py comply with flake8
* Add album support
* Improved metadata extraction for albums and single files & created turbo.py tests using saints.py tests
* Align turbo.py extractor with flake8 rules
* Fix #class name on turbo.py tests
* Fix #category test
* Fix #category test x2
* Fix #category tests
* Fix #category tests
* Fix TurboMediaExtractor self.groups unpacking

* update basic module formatting
* replace 'saint' with 'turbo' in modules list
* remove saint extractors and tests
* update & simplify 'media' extractor
* update & simplify 'album' extractor
* update tests
* update supportedsites
* update 'category-map' & 'config-map'

---------

Co-authored-by: Mike Fährmann <mike_faehrmann@web.de>
This commit is contained in:
brerk
2026-01-19 15:20:13 -06:00
committed by GitHub
parent cc5bfa6eb0
commit e00c717b15
11 changed files with 230 additions and 282 deletions

View File

@@ -321,6 +321,7 @@ def main():
catmap = {
"coomer" : "coomerparty",
"kemono" : "kemonoparty",
"turbo" : "saint",
"schalenetwork": "koharu",
"naver-blog" : "naver",
"naver-chzzk" : "chzzk",

View File

@@ -175,6 +175,7 @@ def remap_categories():
("chzzk" , "naver-chzzk"),
("naverwebtoon", "naver-webtoon"),
("pixiv" , "pixiv-novel"),
("saint" , "turbo"),
)
elif not cmap:
return

View File

@@ -181,7 +181,6 @@ modules = [
"rule34vault",
"rule34xyz",
"s3ndpics",
"saint",
"sankaku",
"sankakucomplex",
"schalenetwork",
@@ -215,6 +214,7 @@ modules = [
"tumblr",
"tumblrgallery",
"tungsten",
"turbo",
"twibooru",
"twitter",
"urlgalleries",

View File

@@ -1,119 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2024-2026 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://saint2.su/ and https://turbovid.cr/"""
from .lolisafe import LolisafeAlbumExtractor
from .. import text
# Matches turbo.cr / turbovid.cr as well as the legacy numbered
# saint domains (saint.su, saint2.su, saint.pk, saint.cr, saint.to, ...)
BASE_PATTERN = r"(?:https?://)?(?:turbo(?:vid)?\.cr|saint\d*\.(?:su|pk|cr|to))"
class SaintAlbumExtractor(LolisafeAlbumExtractor):
    """Extractor for saint albums (/a/<album_id> pages)"""
    category = "saint"
    root = "https://saint2.su"
    # group 1: album ID
    pattern = BASE_PATTERN + r"/a/([^/?#]+)"
    example = "https://saint2.su/a/ID"

    def fetch_album(self, album_id):
        """Fetch and parse an album page; return (files, album_metadata).

        NOTE: parsing relies on the strictly in-order cursor of
        'text.extract_from' — the sequence of 'extr(...)' calls must
        match the order in which the markers appear in the page HTML.
        """
        # album metadata
        response = self.request(self.root + "/a/" + album_id)
        extr = text.extract_from(response.text)
        title = extr("<title>", "</title")
        descr = extr('name="description" content="', '"/>')

        files = []
        while True:
            # each file entry begins with a thumbnail URL whose stem
            # looks like "<id2>-<unix timestamp>"
            id2 = extr("/thumbs/", '"')
            if not id2:
                break  # no further thumbnails -> all files collected
            id2, sep, ts = id2.rpartition(".")[0].rpartition("-")
            if sep:
                date = self.parse_timestamp(ts)
            else:
                # no '-' separator: the whole stem is the ID, no timestamp
                date = None
                id2 = ts
            files.append({
                "id" : extr("/embed/", '"'),
                "id2" : id2,
                "date" : date,
                # "extension": extr("<td>", "</"),
                "size" : text.parse_int(extr('data="', '"')),
                "file" : text.unescape(extr(
                    "onclick=\"play(", ")").strip("\"'")),
                "id_dl": extr("/d/", ")").rstrip("\"'"),
            })

        return files, {
            "album_id" : album_id,
            # page <title> is "<album name> - <site name>"; keep only the name
            "album_name" : text.unescape(title.rpartition(" - ")[0]),
            "album_size" : sum(file["size"] for file in files),
            "description" : text.unescape(descr),
            "count" : len(files),
            # downloads require the album URL as Referer
            "_http_headers": {"Referer": response.url}
        }
class SaintMediaExtractor(SaintAlbumExtractor):
    """Extractor for individual saint media links (/embed/ and /d/ URLs)"""
    subcategory = "media"
    directory_fmt = ("{category}",)
    # group 1: full URL path, group 2: 'embe' when it is an /embed/ URL,
    # group 3: media ID
    pattern = BASE_PATTERN + r"(/(embe)?d/([^/?#]+))"
    example = "https://saint2.su/embed/ID"

    def fetch_album(self, album_id):
        """Return a single-file tuple plus placeholder album metadata.

        Any exception while fetching/parsing is logged and swallowed so
        one broken link does not abort the rest of a job.
        """
        try:
            # self.groups holds the match groups of 'pattern' (see above)
            path, embed, _ = self.groups
            url = self.root + path
            response = self.request(url)
            extr = text.extract_from(response.text)

            if embed:
                # /embed/ page: thumbnail stem is "<id2>-<unix timestamp>"
                id2, sep, ts = extr(
                    "/thumbs/", '"').rpartition(".")[0].rpartition("-")
                if sep:
                    date = self.parse_timestamp(ts)
                else:
                    # no '-' separator: whole stem is the ID, no timestamp
                    date = None
                    id2 = ts
                file = {
                    "id" : album_id,
                    "id2" : id2,
                    "date" : date,
                    "file" : text.unescape(extr('<source src="', '"')),
                    "id_dl": extr("/d/", "'"),
                }
            else:  # /d/ — direct download page; minimal metadata available
                file = {
                    "file" : text.unescape(extr('<a href="', '"')),
                    "id" : album_id,
                    "id_dl" : album_id,
                    "name" : album_id,
                    "filename" : album_id,
                    "extension": "mp4",
                }
            # downloads require the media page URL as Referer
            file["_http_headers"] = {"Referer": response.url}
        except Exception as exc:
            self.log.error("%s: %s", exc.__class__.__name__, exc)
            return (), {}

        # standalone media has no real album context; use neutral defaults
        return (file,), {
            "album_id" : "",
            "album_name" : "",
            "album_size" : -1,
            "description": "",
            "count" : 1,
        }

View File

@@ -0,0 +1,94 @@
# -*- coding: utf-8 -*-
# Copyright 2024-2026 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://turbo.cr/"""
from .lolisafe import LolisafeAlbumExtractor
from .. import text
# Matches turbo.cr / turbovid.cr (optionally with 'www.') as well as the
# legacy numbered saint domains (saint.su, saint2.su, .pk, .cr, .to, ...)
BASE_PATTERN = (r"(?:https?://)?(?:"
                r"(?:www\.)?turbo(?:vid)?\.cr|"
                r"saint\d*\.(?:su|pk|cr|to))")
class TurboAlbumExtractor(LolisafeAlbumExtractor):
    """Extractor for turbo.cr albums (/a/<album_id> pages)"""
    category = "turbo"
    root = "https://turbo.cr"
    # group 1: album ID
    pattern = BASE_PATTERN + r"/a/([^/?#]+)"
    example = "https://turbo.cr/a/ID"

    def fetch_album(self, album_id):
        """Fetch and parse an album page; return (files, album_metadata).

        NOTE: parsing relies on the strictly in-order cursor of
        'text.extract_from' — title, description, and the file table
        must appear in this order in the page HTML.
        """
        url = f"{self.root}/a/{album_id}"
        extr = text.extract_from(self.request(url).text)
        title = extr("<h1 ", "<")
        descr = extr("<p ", "<")
        tbody = extr('id="fileTbody"', '</tbody>')
        # downloads and API calls require the album URL as Referer
        headers = {"Referer": url}

        return self._extract_files(tbody, headers), {
            "album_id" : album_id,
            # 'title'/'descr' still carry the tag's remaining attributes;
            # drop everything up to and including the closing '>'
            "album_name" : text.unescape(title[title.find(">")+1:]),
            "description" : text.unescape(descr[descr.find(">")+1:]),
            "album_size" : sum(map(text.parse_int, text.extract_iter(
                tbody, 'data-size="', '"'))),
            "count" : tbody.count("data-id="),
            "_http_headers": headers,
        }

    def _extract_files(self, body, headers):
        """Yield one metadata dict per <tr> row in the album file table.

        Each file requires an extra '/api/sign' request to obtain its
        signed download URL and original filename.
        """
        for file in text.extract_iter(body, "<tr", "</tr>"):
            data_id = text.extr(file, 'data-id="', '"')

            url = f"{self.root}/api/sign?v={data_id}"
            data = self.request_json(url, headers=headers)
            # prefer the uploader's original filename when available
            name = data.get("original_filename") or data.get("filename")

            yield text.nameext_from_name(name, {
                "id" : data_id,
                "file": data.get("url"),
                "size": text.parse_int(text.extr(file, 'data-size="', '"')),
                "_http_headers": headers,
            })
class TurboMediaExtractor(TurboAlbumExtractor):
    """Extractor for individual turbo.cr media links (/embed/, /d/, /v/)"""
    subcategory = "media"
    directory_fmt = ("{category}",)
    # group 1: media ID
    pattern = BASE_PATTERN + r"/(?:embe)?[dv]/([^/?#]+)"
    example = "https://turbo.cr/embed/ID"

    def fetch_album(self, album_id):
        """Return a single-file tuple plus placeholder album metadata.

        Standalone media has no real album context, so neutral defaults
        are used. Any exception while fetching/parsing is logged and
        swallowed so one broken link does not abort the rest of a job.
        """
        try:
            return (self._extract_file(album_id),), {
                "album_id" : "",
                "album_name" : "",
                "album_size" : -1,
                "description": "",
                "count" : 1,
            }
        except Exception as exc:
            self.log.error("%s: %s", exc.__class__.__name__, exc)
            return (), {}

    def _extract_file(self, data_id):
        """Build the metadata dict for a single media file.

        Scrapes size and upload date from the /d/ page, then calls
        '/api/sign' for the signed download URL and original filename.
        """
        url = f"{self.root}/d/{data_id}"
        headers = {"Referer": url}
        page = self.request(url).text

        size = text.extr(page, 'id="fileSizeBytes">', '<')
        # upload date: first <span> after the "File ID:" label
        date = text.extract(page, "<span>", "<", page.find("File ID:"))[0]

        url = f"{self.root}/api/sign?v={data_id}"
        data = self.request_json(url, headers=headers)
        # prefer the uploader's original filename when available
        name = data.get("original_filename") or data.get("filename")

        return text.nameext_from_name(name, {
            "id" : data_id,
            "file": data.get("url"),
            # '&#43;' is the HTML entity for '+'; decode it before parsing —
            # presumably the page renders the byte count in scientific
            # notation (e.g. "1.2e&#43;07"). TODO confirm against live page.
            "size": int(text.parse_float(size.replace("&#43;", "+"))),
            "date": self.parse_datetime_iso(date),
            "_http_headers": headers,
        })