Files
gallery-dl/gallery_dl/extractor/pholder.py
Amar Paul b552cdba04 [pholder] add support (#2568 #9067)
* feat: extractor for pholder.com
    Closes #2568
* feat[pholder]: support gallery_id properly and tags
* doc[text.nameext_from_name]: minor typo in docstring

* remove '__init__' & 'request' methods and 'json' import
* use 'text.nameext_from_url' to ensure a 'filename' value
* fix 'imgur' links by disabling auto-Referer
* fix 'data["id"].partition()' call
    'partition' returns 3 elements
* use 'item["_source"]' data directly
* remove unused supportedsites overwrite
* catch all exceptions in '_thumb_resolution'
    fixes "KeyError: 'width'"
* use 'author' name for user folders

---------

Co-authored-by: Mike Fährmann <mike_faehrmann@web.de>
2026-02-15 09:46:18 +01:00

125 lines
4.4 KiB
Python

# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://pholder.com/"""
from .common import Extractor, Message
from .. import text, util, exception
# Matches pholder.com URLs with or without scheme and "www." prefix
BASE_PATTERN = r"(?:https?://)?(?:www\.)?pholder\.com"
def _thumb_resolution(thumbnail):
try:
return int(thumbnail["width"]) * int(thumbnail["height"])
except Exception:
return 0
class PholderExtractor(Extractor):
    """Base class for pholder extractors"""
    category = "pholder"
    root = "https://pholder.com"
    directory_fmt = ("{category}", "{subredditTitle}")
    filename_fmt = "{id}{gallery_id:? / /}{title:? //[:225]}.{extension}"
    archive_fmt = "{id}_(unknown)_{gallery_id:? / /}"
    request_interval = (2.0, 4.0)
    # disable automatic Referer header; imgur links break otherwise
    referer = False

    def _parse_window_data(self, html):
        """Extract and decode the 'window.data' JSON payload from 'html'

        The payload is sometimes split across multiple script blocks;
        once an initial parse fails, keep concatenating subsequent
        blocks and retry until the combined text parses.

        Raises AbortExtraction when no parsable payload is found.
        """
        # NOTE: was len("window_data = "); only worked because both
        # strings happen to be 14 characters long
        prefix_len = len("window.data = ")
        collected = ""
        collecting = False

        for tag in text.split_html(html):
            if tag.startswith("window.data = "):
                try:
                    return util.json_loads(tag[prefix_len:])
                except ValueError:
                    # truncated JSON; start accumulating script blocks
                    collecting = True
            if collecting:
                try:
                    collected += tag
                    return util.json_loads(collected[prefix_len:])
                except ValueError:
                    pass
        raise exception.AbortExtraction("Could not locate window.data JSON.")

    def _posts(self, page_url):
        """Yield Directory/Url messages for all posts on 'page_url'

        Follows pagination via the 'page' query parameter until a page
        returns fewer than 150 media entries.
        """
        params = {"page": 1}
        while True:
            html = self.request(page_url, params=params).text
            window_data = self._parse_window_data(html)

            for item in window_data["media"]:
                data = item["_source"]
                data["id"] = item["_id"]
                data["date"] = self.parse_timestamp(data.get("submitted_utc"))

                if ":" in data["id"]:
                    # this is a gallery (can also see from
                    # item["is_gallery"]); pholder does not preserve
                    # gallery order, but assigns each image a sub-id
                    # encoded as "<id>:<sub-id>"
                    data["id"], _, data["gallery_id"] = \
                        data["id"].partition(":")
                else:
                    data["gallery_id"] = ""

                yield Message.Directory, "", data

                thumbnails = data["thumbnails"]
                if thumbnails:
                    # prefer the highest-resolution thumbnail URL
                    # (max() returns the first maximal entry, matching
                    # the previous stable reverse-sort behavior)
                    url = max(thumbnails, key=_thumb_resolution)["url"]
                    if url.rindex(":") > url.index(":"):
                        # sometimes, thumbnail image URLs end with
                        # ":large" or ":small", so we have to strip out
                        # any trailing ":word" bits
                        url = url.rpartition(":")[0]
                else:
                    # fall back to the origin URL
                    url = data["origin"]
                yield Message.Url, url, text.nameext_from_url(url, data)

            if len(window_data["media"]) < 150:
                # a short page means this was the last one
                break
            params["page"] += 1

    def items(self):
        url = f"{self.root}/{self.groups[0]}"
        return self._posts(url)
class PholderSubredditExtractor(PholderExtractor):
    """Extractor for media from pholder-stored posts for a subreddit"""
    subcategory = "subreddit"
    example = "https://pholder.com/r/SUBREDDIT"
    # path group is consumed by PholderExtractor.items(); an optional
    # query string may follow
    pattern = BASE_PATTERN + r"(/r/([^/?#]+))(?:/?\?([^#]+))?"
class PholderUserExtractor(PholderExtractor):
    """Extractor for URLs from pholder-stored posts for a reddit user"""
    subcategory = "user"
    example = "https://www.pholder.com/u/USER"
    # use the 'author' name for user folders
    directory_fmt = ("{category}", "u_{author}")
    pattern = BASE_PATTERN + r"(/u/[^/?#]+)(?:/?\?([^#]+))?"
class PholderSearchExtractor(PholderExtractor):
    """Extractor for URLs from pholder-stored posts for a search"""
    subcategory = "search"
    example = "https://www.pholder.com/SEARCH"
    # catch-all; must come after the more specific /r/ and /u/ patterns
    pattern = BASE_PATTERN + r"/(.*)"