[pholder] add support (#2568 #9067)

* feat: extractor for pholder.com
    Closes #2568
* feat[pholder]: support gallery_id properly and tags
* doc[text.nameext_from_name]: minor typo in docstring

* remove '__init__' & 'request' methods and 'json' import
* use 'text.nameext_from_url' to ensure a 'filename' value
* fix 'imgur' links by disabling auto-Referer
* fix 'data["id"].partition()' call
    'partition' returns 3 elements
* use 'item["_source"]' data directly
* remove unused supportedsites overwrite
* catch all exceptions in '_thumb_resolution'
    fixes "KeyError: 'width'"
* use 'author' name for user folders

---------

Co-authored-by: Mike Fährmann <mike_faehrmann@web.de>
This commit is contained in:
Amar Paul
2026-02-15 03:46:18 -05:00
committed by GitHub
parent 01cb378baa
commit b552cdba04
7 changed files with 181 additions and 1 deletions

View File

@@ -1006,6 +1006,7 @@ Default
``4chanarchives`` | ``4chanarchives`` |
``archivedmoe`` | ``archivedmoe`` |
``nsfwalbum`` | ``nsfwalbum`` |
``pholder`` |
``tumblrgallery`` ``tumblrgallery``
``true`` ``true``
otherwise otherwise

View File

@@ -799,6 +799,12 @@ Consider all listed sites to potentially be NSFW.
<td>Collections, individual Images, Search Results, User Profiles</td> <td>Collections, individual Images, Search Results, User Profiles</td>
<td></td> <td></td>
</tr> </tr>
<tr id="pholder" title="pholder">
<td>pholder</td>
<td>https://pholder.com/</td>
<td>Search Results, Subreddits, User Profiles</td>
<td></td>
</tr>
<tr id="photovogue" title="photovogue"> <tr id="photovogue" title="photovogue">
<td>PhotoVogue</td> <td>PhotoVogue</td>
<td>https://www.vogue.com/photovogue/</td> <td>https://www.vogue.com/photovogue/</td>

View File

@@ -157,6 +157,7 @@ modules = [
"patreon", "patreon",
"pexels", "pexels",
"philomena", "philomena",
"pholder",
"photovogue", "photovogue",
"picarto", "picarto",
"picazor", "picazor",

View File

@@ -0,0 +1,124 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://pholder.com/"""
from .common import Extractor, Message
from .. import text, util, exception
# Common URL prefix for the extractor patterns in this module:
# an optional "http://" or "https://" scheme and an optional "www."
# in front of the "pholder.com" host name.
BASE_PATTERN = r"(?:https?://)?(?:www\.)?pholder\.com"
def _thumb_resolution(thumbnail):
try:
return int(thumbnail["width"]) * int(thumbnail["height"])
except Exception:
return 0
class PholderExtractor(Extractor):
    """Base class for pholder extractors"""
    category = "pholder"
    root = "https://pholder.com"
    directory_fmt = ("{category}", "{subredditTitle}")
    filename_fmt = "{id}{gallery_id:? / /}{title:? //[:225]}.{extension}"
    archive_fmt = "{id}_{filename}_{gallery_id:? / /}"
    request_interval = (2.0, 4.0)
    # automatic Referer headers are disabled; they broke 'imgur' links
    referer = False

    def _parse_window_data(self, html):
        """Return the decoded 'window.data' JSON object found in 'html'

        Raises exception.AbortExtraction when no text chunk, or run of
        accumulated chunks, starting at a 'window.data = ' assignment
        can be decoded as JSON.
        """
        marker = "window.data = "
        start = len(marker)
        # sometimes, window.data content is split across multiple script
        # blocks; once a chunk fails to parse on its own, it and all
        # following chunks are accumulated and parsed as a whole.
        buffered = ""
        collecting = False

        for chunk in text.split_html(html):
            if chunk.startswith(marker):
                try:
                    return util.json_loads(chunk[start:])
                except ValueError:
                    collecting = True
            if collecting:
                buffered += chunk
                try:
                    return util.json_loads(buffered[start:])
                except ValueError:
                    pass

        raise exception.AbortExtraction("Could not locate window.data JSON.")

    @staticmethod
    def _strip_size_suffix(url):
        """Remove a trailing ':word' suffix (':large', ':small') from 'url'

        Only a suffix after the last '/' is removed, so the scheme
        separator and 'host:port/path' URLs are left untouched, and
        URLs without any ':' are returned unchanged.
        """
        head, sep, suffix = url.rpartition(":")
        if sep and "/" in head and "/" not in suffix:
            return head
        return url

    def _posts(self, page_url):
        """Yield Directory and Url messages for all media of 'page_url'

        Requests 'page_url' with an increasing 'page' parameter until
        a page lists fewer than 150 media entries.
        """
        params = {"page": 1}

        while True:
            html = self.request(page_url, params=params).text
            media = self._parse_window_data(html)["media"]

            for item in media:
                data = item["_source"]
                data["date"] = self.parse_timestamp(data.get("submitted_utc"))
                # an ID containing ':' belongs to a gallery
                # (can also see from item["is_gallery"])
                # pholder does not preserve gallery order, but assigns
                # each image a sub-id.
                # partition() leaves 'gallery_id' empty for plain IDs
                data["id"], _, data["gallery_id"] = item["_id"].partition(":")

                yield Message.Directory, "", data

                # try to use the highest-resolution URL from thumbnails
                # first; a missing or empty list falls back to origin
                best = max(data.get("thumbnails") or (),
                           key=_thumb_resolution, default=None)
                if best is None:
                    url = data["origin"]
                else:
                    url = self._strip_size_suffix(best["url"])
                yield Message.Url, url, text.nameext_from_url(url, data)

            if len(media) < 150:
                break
            params["page"] += 1

    def items(self):
        # the subreddit and user patterns capture the path including its
        # leading slash ("/r/NAME", "/u/NAME"); strip it so the joined
        # URL does not contain "//" after the host name
        path = self.groups[0].lstrip("/")
        return self._posts(f"{self.root}/{path}")
class PholderSubredditExtractor(PholderExtractor):
    """Extractor for pholder-stored media posted to a subreddit"""
    subcategory = "subreddit"
    example = "https://pholder.com/r/SUBREDDIT"
    # group 1: "/r/NAME" path, group 2: NAME, group 3: optional query string
    pattern = BASE_PATTERN + r"(/r/([^/?#]+))(?:/?\?([^#]+))?"
class PholderUserExtractor(PholderExtractor):
    """Extractor for pholder-stored media posted by a reddit user"""
    subcategory = "user"
    example = "https://www.pholder.com/u/USER"
    # group 1: "/u/NAME" path, group 2: optional query string
    pattern = BASE_PATTERN + r"(/u/[^/?#]+)(?:/?\?([^#]+))?"
    # user folders are named after the post's 'author' field
    directory_fmt = ("{category}", "u_{author}")
class PholderSearchExtractor(PholderExtractor):
    """Extractor for pholder-stored media matching a search term"""
    subcategory = "search"
    example = "https://www.pholder.com/SEARCH"
    # catch-all: group 1 is everything after the first "/" of the URL
    # NOTE(review): this also matches "/r/..." and "/u/..." URLs, so it
    # presumably relies on being tried after the other extractors - confirm
    pattern = BASE_PATTERN + r"/(.*)"

View File

@@ -120,7 +120,7 @@ def nameext_from_url(url, data=None):
def nameext_from_name(filename, data=None): def nameext_from_name(filename, data=None):
"""Extract the last part of an URL and fill 'data' accordingly""" """Extract the last part of a file name and fill 'data' accordingly"""
if data is None: if data is None:
data = {} data = {}

View File

@@ -153,6 +153,7 @@ CATEGORY_MAP = {
"nudostarforum" : "NudoStar Forums", "nudostarforum" : "NudoStar Forums",
"okporn" : "OK.PORN", "okporn" : "OK.PORN",
"paheal" : "Rule 34", "paheal" : "Rule 34",
"pholder" : "pholder",
"photovogue" : "PhotoVogue", "photovogue" : "PhotoVogue",
"picstate" : "PicState", "picstate" : "PicState",
"pidgiwiki" : "PidgiWiki", "pidgiwiki" : "PidgiWiki",

47
test/results/pholder.py Normal file
View File

@@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
from gallery_dl.extractor import pholder
# Test cases for the pholder extractors.
# Each entry pairs an input URL ("#url") with the category tuple and
# extractor class ("#category", "#class") listed for it; subreddit,
# user, and search URLs are covered.
# NOTE(review): "#range" / "#count" presumably limit and check the number
# of extracted results - confirm against the project's test runner.
__tests__ = (
{
"#url" : "https://pholder.com/r/lavaporn",
"#category": ("", "pholder", "subreddit"),
"#class" : pholder.PholderSubredditExtractor,
"#range" : "1-20",
"#count" : ">= 20",
},
# same subreddit URL with a trailing slash; no "#range"/"#count" given
{
"#url" : "https://pholder.com/r/lavaporn/",
"#category": ("", "pholder", "subreddit"),
"#class" : pholder.PholderSubredditExtractor,
},
{
"#url" : "https://pholder.com/u/automoderator",
"#category": ("", "pholder", "user"),
"#class" : pholder.PholderUserExtractor,
"#range" : "1-20",
"#count" : ">= 20",
},
# same user URL with a trailing slash; no "#range"/"#count" given
{
"#url" : "https://pholder.com/u/automoderator/",
"#category": ("", "pholder", "user"),
"#class" : pholder.PholderUserExtractor,
},
# any other path is expected to resolve to the search extractor
{
"#url" : "https://pholder.com/search-text",
"#category": ("", "pholder", "search"),
"#class" : pholder.PholderSearchExtractor,
"#range" : "1-10",
"#count" : "== 10",
},
)