Files
gallery-dl/gallery_dl/extractor/scrolller.py
2025-12-20 22:07:37 +01:00

350 lines
10 KiB
Python

# -*- coding: utf-8 -*-
# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://scrolller.com/"""
from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
# Regex prefix shared by the URL patterns of the extractors in this module:
# an optional http(s) scheme, an optional "www." and the scrolller.com host.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?scrolller\.com"
class ScrolllerExtractor(Extractor):
    """Common base for the scrolller.com extractors

    Bundles login handling, the GraphQL request and pagination helpers,
    and the generic loop that turns posts into directory/URL messages.
    Subclasses provide the posts to process by overriding posts().
    """
    category = "scrolller"
    root = "https://scrolller.com"
    directory_fmt = ("{category}", "{subredditTitle}")
    filename_fmt = "{id}{num:?_//>03}{title:? //[:230]}.{extension}"
    archive_fmt = "{id}_{num}"
    request_interval = (0.5, 1.5)

    def _init(self):
        """Start without an auth token; login() may set one later"""
        self.auth_token = None

    def items(self):
        """Yield one Directory message per post and one Url message per file

        The selected media source's fields are merged into the post
        dict before each Url message is emitted.
        """
        self.login()

        for post in self.posts():
            media_files = self._extract_files(post)
            post["count"] = len(media_files)
            yield Message.Directory, "", post

            for media in media_files:
                media_url = media["url"]
                post.update(media)
                yield (Message.Url, media_url,
                       text.nameext_from_url(media_url, post))

    def posts(self):
        """Return the posts to process; the base class has none"""
        return ()

    def _extract_files(self, post):
        """Select the preferred media source for each file of 'post'

        Removes 'albumContent' from 'post'. For an album, returns a list
        with the best source of every entry, numbered from 1; entries
        without sources are skipped with a warning. Otherwise returns a
        1-tuple holding the best of the post's own 'mediaSources' with
        'num' set to 0, or an empty tuple (plus a warning) if there are
        none.
        """
        album = post.pop("albumContent", None)
        rank = self._sort_key

        if album:
            selected = []
            for index, entry in enumerate(album, 1):
                candidates = entry.get("mediaSources")
                if candidates:
                    best = max(candidates, key=rank)
                    best["num"] = index
                    selected.append(best)
                else:
                    self.log.warning("%s/%s: Missing media file",
                                     post.get("id"), index)
            return selected

        # no album: the post itself carries the media sources
        candidates = post.get("mediaSources")
        if not candidates:
            self.log.warning("%s: No media files", post.get("id"))
            return ()

        best = max(candidates, key=rank)
        best["num"] = 0
        return (best,)

    def login(self):
        """Store an auth token if _get_auth_info() provides a username"""
        username, password = self._get_auth_info()
        if not username:
            return
        self.auth_token = self._login_impl(username, password)

    @cache(maxage=28 * 86400, keyarg=1)
    def _login_impl(self, username, password):
        """Send the 'LoginQuery' operation and return the received token

        Raises AuthenticationError when the request fails with HTTP
        status 403; any other HttpError is re-raised unchanged.
        The result goes through the 'cache' decorator
        (presumably keyed on 'username' -- confirm against ..cache).
        """
        self.log.info("Logging in as %s", username)

        credentials = {
            "username": username,
            "password": password,
        }

        try:
            result = self._request_graphql("LoginQuery", credentials, False)
        except exception.HttpError as exc:
            if exc.status != 403:
                raise
            raise exception.AuthenticationError()

        return result["login"]["token"]

    def _request_graphql(self, opname, variables, admin=True):
        """POST the query registered as 'opname' and return its 'data' field

        'admin' selects the endpoint: the '/admin' URL with a JSON
        content type when true, otherwise the '/api/v2/graphql' URL
        with a text/plain content type. The current auth token (None
        if there is none) is sent in the body's 'authorization' field.
        """
        if admin:
            endpoint = "https://api.scrolller.com/admin"
            content_type = "application/json"
        else:
            endpoint = "https://api.scrolller.com/api/v2/graphql"
            content_type = "text/plain;charset=UTF-8"

        # 'Content-Type' stays the first header entry
        headers = {
            "Content-Type"  : content_type,
            "Origin"        : self.root,
            "Sec-Fetch-Dest": "empty",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Site": "same-site",
        }
        payload = {
            "query"        : QUERIES[opname],
            "variables"    : variables,
            "authorization": self.auth_token,
        }

        response = self.request_json(
            endpoint, method="POST", headers=headers,
            data=util.json_dumps(payload))
        return response["data"]

    def _pagination(self, opname, variables, data=None):
        """Yield the items of every result page of operation 'opname'

        'data' may hold an already fetched first page; it is used only
        if it has a non-empty 'items' value, otherwise the first page
        is requested. The 'iterator' value of each page is written
        into 'variables' for the next request, and iteration ends
        when a page's 'iterator' is falsy.
        """
        page = data
        if page is None or not page.get("items"):
            page = self._request_graphql(opname, variables)

        while True:
            # descend through wrapper objects until reaching the one
            # that holds 'items'
            while "items" not in page:
                _, page = page.popitem()

            yield from page["items"]

            cursor = page["iterator"]
            if not cursor:
                return
            variables["iterator"] = cursor
            page = self._request_graphql(opname, variables)

    def _sort_key(self, src):
        """Rank media sources by width, then non-optimized over optimized"""
        width = src["width"]
        unoptimized = not src["isOptimized"]
        return (width, unoptimized)
class ScrolllerSubredditExtractor(ScrolllerExtractor):
    """Extractor for the media posts of a single scrolller subreddit"""
    subcategory = "subreddit"
    pattern = BASE_PATTERN + r"(/r/[^/?#]+)(?:/?\?([^#]+))?"
    example = "https://scrolller.com/r/SUBREDDIT"

    def posts(self):
        """Return a generator over all posts of the matched subreddit

        Requests the subreddit via 'SubredditQuery' and then paginates
        'SubredditChildrenQuery', handing over the 'children' object of
        the first response as the initial page. A 'filter' parameter in
        the URL query string is upper-cased and stripped of trailing
        "S" characters before being sent along.
        """
        path, query = self.groups
        sort_by = "RANDOM"
        page_size = 50

        params = text.parse_query(query) if query else {}
        if "filter" in params:
            media_filter = params["filter"].upper().rstrip("S")
        else:
            media_filter = None

        lookup = {
            "url"   : path,
            "filter": media_filter,
            "sortBy": sort_by,
            "limit" : page_size,
        }
        response = self._request_graphql("SubredditQuery", lookup)
        subreddit = response["getSubreddit"]

        children_vars = {
            "subredditId": subreddit["id"],
            "iterator"   : None,
            "filter"     : media_filter,
            "sortBy"     : sort_by,
            "limit"      : page_size,
            "isNsfw"     : subreddit["isNsfw"],
        }
        first_page = subreddit["children"]
        return self._pagination(
            "SubredditChildrenQuery", children_vars, first_page)
class ScrolllerFollowingExtractor(ScrolllerExtractor):
    """Extractor for the subreddits followed by the logged-in account"""
    subcategory = "following"
    pattern = BASE_PATTERN + r"/following"
    example = "https://scrolller.com/following"

    def items(self):
        """Yield a Queue message for every followed subreddit

        Each message is tagged with ScrolllerSubredditExtractor as its
        '_extractor'. Raises AuthorizationError if no auth token is
        available after login().
        """
        self.login()
        if not self.auth_token:
            raise exception.AuthorizationError("Login required")

        query_vars = {
            "iterator": None,
            "filter"  : None,
            "limit"   : 10,
            "isNsfw"  : False,
            "sortBy"  : "RANDOM",
        }
        followed = self._pagination("GetFollowingSubreddits", query_vars)

        for entry in followed:
            target = self.root + entry["url"]
            entry["_extractor"] = ScrolllerSubredditExtractor
            yield Message.Queue, target, entry
class ScrolllerPostExtractor(ScrolllerExtractor):
    """Extractor for the media of one individual scrolller post"""
    subcategory = "post"
    pattern = BASE_PATTERN + r"/(?!r/|following$)([^/?#]+)"
    example = "https://scrolller.com/TITLE-SLUG-a1b2c3d4f5"

    def posts(self):
        """Return a 1-tuple with the post fetched via 'SubredditPostQuery'"""
        slug = self.groups[0]
        response = self._request_graphql(
            "SubredditPostQuery", {"url": "/" + slug})
        post = response["getPost"]
        return (post,)
# GraphQL query documents, keyed by operation name.
# ScrolllerExtractor._request_graphql() looks an entry up by its 'opname'
# argument and sends the text as the "query" field of the request body.
# The query texts are runtime data and must not be altered.
QUERIES = {
# Single post by URL path; used by ScrolllerPostExtractor.posts()
"SubredditPostQuery": """\
query SubredditPostQuery(
$url: String!
) {
getPost(
data: { url: $url }
) {
__typename id url title subredditId subredditTitle subredditUrl
redditPath isNsfw hasAudio fullLengthSource gfycatSource redgifsSource
ownerAvatar username displayName favoriteCount isPaid tags
commentsCount commentsRepliesCount isFavorite
albumContent { mediaSources { url width height isOptimized } }
mediaSources { url width height isOptimized }
blurredMediaSources { url width height isOptimized }
}
}
""",
# Subreddit metadata plus its first page of children;
# used by ScrolllerSubredditExtractor.posts()
"SubredditQuery": """\
query SubredditQuery(
$url: String!
$iterator: String
$sortBy: GallerySortBy
$filter: GalleryFilter
$limit: Int!
) {
getSubreddit(
data: {
url: $url,
iterator: $iterator,
filter: $filter,
limit: $limit,
sortBy: $sortBy
}
) {
__typename id url title secondaryTitle description createdAt isNsfw
subscribers isComplete itemCount videoCount pictureCount albumCount
isPaid username tags isFollowing
banner { url width height isOptimized }
children {
iterator items {
__typename id url title subredditId subredditTitle subredditUrl
redditPath isNsfw hasAudio fullLengthSource gfycatSource
redgifsSource ownerAvatar username displayName favoriteCount
isPaid tags commentsCount commentsRepliesCount isFavorite
albumContent { mediaSources { url width height isOptimized } }
mediaSources { url width height isOptimized }
blurredMediaSources { url width height isOptimized }
}
}
}
}
""",
# Further pages of a subreddit's posts, selected by subreddit ID and
# iterator; paginated by ScrolllerSubredditExtractor.posts()
"SubredditChildrenQuery": """\
query SubredditChildrenQuery(
$subredditId: Int!
$iterator: String
$filter: GalleryFilter
$sortBy: GallerySortBy
$limit: Int!
$isNsfw: Boolean
) {
getSubredditChildren(
data: {
subredditId: $subredditId,
iterator: $iterator,
filter: $filter,
sortBy: $sortBy,
limit: $limit,
isNsfw: $isNsfw
},
) {
iterator items {
__typename id url title subredditId subredditTitle subredditUrl
redditPath isNsfw hasAudio fullLengthSource gfycatSource
redgifsSource ownerAvatar username displayName favoriteCount isPaid
tags commentsCount commentsRepliesCount isFavorite
albumContent { mediaSources { url width height isOptimized } }
mediaSources { url width height isOptimized }
blurredMediaSources { url width height isOptimized }
}
}
}
""",
# Followed subreddits; paginated by ScrolllerFollowingExtractor.items()
"GetFollowingSubreddits": """\
query GetFollowingSubreddits(
$iterator: String,
$limit: Int!,
$filter: GalleryFilter,
$isNsfw: Boolean,
$sortBy: GallerySortBy
) {
getFollowingSubreddits(
data: {
isNsfw: $isNsfw
limit: $limit
filter: $filter
iterator: $iterator
sortBy: $sortBy
}
) {
iterator items {
__typename id url title secondaryTitle description createdAt isNsfw
subscribers isComplete itemCount videoCount pictureCount albumCount
isFollowing
}
}
}
""",
# Username/password login; ScrolllerExtractor._login_impl() reads the
# "token" field of the result
"LoginQuery": """\
query LoginQuery(
$username: String!,
$password: String!
) {
login(
username: $username,
password: $password
) {
username token expiresAt isAdmin status isPremium
}
}
""",
# NOTE(review): no caller of this entry is visible in this part of the file
"ItemTypeQuery": """\
query ItemTypeQuery(
$url: String!
) {
getItemType(
url: $url
)
}
""",
}