* feat: extractor for pholder.com
Closes #2568
* feat[pholder]: support gallery_id properly and tags
* doc[text.nameext_from_name]: minor typo in docstring
* remove '__init__' & 'request' methods and 'json' import
* use 'text.nameext_from_url' to ensure a 'filename' value
* fix 'imgur' links by disabling auto-Referer
* fix 'data["id"].partition()' call
'partition' always returns a 3-tuple (head, separator, tail)
* use 'item["_source"]' data directly
* remove unused supportedsites overwrite
* catch all exceptions in '_thumb_resolution'
fixes "KeyError: 'width'"
* use 'author' name for user folders
---------
Co-authored-by: Mike Fährmann <mike_faehrmann@web.de>
This commit is contained in:
@@ -1006,6 +1006,7 @@ Default
|
|||||||
``4chanarchives`` |
|
``4chanarchives`` |
|
||||||
``archivedmoe`` |
|
``archivedmoe`` |
|
||||||
``nsfwalbum`` |
|
``nsfwalbum`` |
|
||||||
|
``pholder`` |
|
||||||
``tumblrgallery``
|
``tumblrgallery``
|
||||||
``true``
|
``true``
|
||||||
otherwise
|
otherwise
|
||||||
|
|||||||
@@ -799,6 +799,12 @@ Consider all listed sites to potentially be NSFW.
|
|||||||
<td>Collections, individual Images, Search Results, User Profiles</td>
|
<td>Collections, individual Images, Search Results, User Profiles</td>
|
||||||
<td></td>
|
<td></td>
|
||||||
</tr>
|
</tr>
|
||||||
|
<tr id="pholder" title="pholder">
|
||||||
|
<td>pholder</td>
|
||||||
|
<td>https://pholder.com/</td>
|
||||||
|
<td>Search Results, Subreddits, User Profiles</td>
|
||||||
|
<td></td>
|
||||||
|
</tr>
|
||||||
<tr id="photovogue" title="photovogue">
|
<tr id="photovogue" title="photovogue">
|
||||||
<td>PhotoVogue</td>
|
<td>PhotoVogue</td>
|
||||||
<td>https://www.vogue.com/photovogue/</td>
|
<td>https://www.vogue.com/photovogue/</td>
|
||||||
|
|||||||
@@ -157,6 +157,7 @@ modules = [
|
|||||||
"patreon",
|
"patreon",
|
||||||
"pexels",
|
"pexels",
|
||||||
"philomena",
|
"philomena",
|
||||||
|
"pholder",
|
||||||
"photovogue",
|
"photovogue",
|
||||||
"picarto",
|
"picarto",
|
||||||
"picazor",
|
"picazor",
|
||||||
|
|||||||
124
gallery_dl/extractor/pholder.py
Normal file
124
gallery_dl/extractor/pholder.py
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
|
# published by the Free Software Foundation.
|
||||||
|
|
||||||
|
"""Extractors for https://pholder.com/"""
|
||||||
|
|
||||||
|
from .common import Extractor, Message
|
||||||
|
from .. import text, util, exception
|
||||||
|
|
||||||
|
BASE_PATTERN = r"(?:https?://)?(?:www\.)?pholder\.com"
|
||||||
|
|
||||||
|
|
||||||
|
def _thumb_resolution(thumbnail):
|
||||||
|
try:
|
||||||
|
return int(thumbnail["width"]) * int(thumbnail["height"])
|
||||||
|
except Exception:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
|
class PholderExtractor(Extractor):
    """Base class for pholder extractors"""
    category = "pholder"
    root = "https://pholder.com"
    directory_fmt = ("{category}", "{subredditTitle}")
    filename_fmt = "{id}{gallery_id:? / /}{title:? //[:225]}.{extension}"
    archive_fmt = "{id}_(unknown)_{gallery_id:? / /}"
    request_interval = (2.0, 4.0)
    # disable automatic Referer headers; they break 'imgur' links
    referer = False

    def _parse_window_data(self, html):
        """Extract and decode the 'window.data' JSON payload from 'html'.

        The payload is sometimes split across multiple <script> blocks;
        in that case the pieces are concatenated until they parse as
        valid JSON.

        Raises exception.AbortExtraction if no parsable payload exists.
        """
        # BUG FIX: the prefix length was previously computed from
        # "window_data = ", which only worked because it happens to be
        # the same length as the real prefix "window.data = ".
        prefix = "window.data = "
        prefix_len = len(prefix)
        buffer = ""
        collecting = False

        for tag in text.split_html(html):
            if tag.startswith(prefix):
                try:
                    return util.json_loads(tag[prefix_len:])
                except ValueError:
                    # incomplete JSON; start accumulating script blocks
                    collecting = True

            if collecting:
                buffer += tag
                try:
                    return util.json_loads(buffer[prefix_len:])
                except ValueError:
                    pass

        raise exception.AbortExtraction("Could not locate window.data JSON.")

    def _posts(self, page_url):
        """Yield Message.Directory / Message.Url tuples for all posts
        found on 'page_url', following pagination via the 'page' query
        parameter."""
        params = {"page": 1}
        while True:
            html = self.request(page_url, params=params).text
            window_data = self._parse_window_data(html)
            media = window_data["media"]

            for item in media:
                data = item["_source"]
                data["id"] = item["_id"]
                data["date"] = self.parse_timestamp(data.get("submitted_utc"))

                # gallery posts (item["is_gallery"]) use IDs of the form
                # 'ID:SUB_ID'; pholder does not preserve gallery order,
                # but assigns each image a sub-id. For non-gallery posts
                # partition() leaves 'id' unchanged and yields "" for
                # 'gallery_id', so no separate branch is needed.
                data["id"], _, data["gallery_id"] = \
                    data["id"].partition(":")

                yield Message.Directory, "", data

                thumbnails = data.get("thumbnails")
                if thumbnails:
                    # prefer the highest-resolution thumbnail URL
                    url = max(thumbnails, key=_thumb_resolution)["url"]
                    if url.count(":") > 1:
                        # strip trailing ':large' / ':small' size
                        # suffixes (count() instead of rindex()/index()
                        # avoids a ValueError on colon-free URLs)
                        url = url.rpartition(":")[0]
                else:
                    # fall back to the original source URL
                    url = data["origin"]
                yield Message.Url, url, text.nameext_from_url(url, data)

            if len(media) < 150:
                # a full results page holds 150 entries;
                # fewer means this was the last page
                break
            params["page"] += 1

    def items(self):
        url = f"{self.root}/{self.groups[0]}"
        return self._posts(url)
|
||||||
|
|
||||||
|
|
||||||
|
class PholderSubredditExtractor(PholderExtractor):
    """Extractor for pholder-archived posts of a single subreddit"""
    subcategory = "subreddit"
    example = "https://pholder.com/r/SUBREDDIT"
    pattern = BASE_PATTERN + r"(/r/([^/?#]+))(?:/?\?([^#]+))?"
|
||||||
|
|
||||||
|
|
||||||
|
class PholderUserExtractor(PholderExtractor):
    """Extractor for pholder-archived posts submitted by a reddit user"""
    subcategory = "user"
    example = "https://www.pholder.com/u/USER"
    pattern = BASE_PATTERN + r"(/u/[^/?#]+)(?:/?\?([^#]+))?"
    # group files by submitting author instead of subreddit
    directory_fmt = ("{category}", "u_{author}")
|
||||||
|
|
||||||
|
|
||||||
|
class PholderSearchExtractor(PholderExtractor):
    """Extractor for pholder-archived posts matching a search query"""
    subcategory = "search"
    example = "https://www.pholder.com/SEARCH"
    # catch-all pattern; must be declared after the more specific
    # subreddit/user extractors so those match first
    pattern = BASE_PATTERN + r"/(.*)"
|
||||||
@@ -120,7 +120,7 @@ def nameext_from_url(url, data=None):
|
|||||||
|
|
||||||
|
|
||||||
def nameext_from_name(filename, data=None):
|
def nameext_from_name(filename, data=None):
|
||||||
"""Extract the last part of an URL and fill 'data' accordingly"""
|
"""Extract the last part of a file name and fill 'data' accordingly"""
|
||||||
if data is None:
|
if data is None:
|
||||||
data = {}
|
data = {}
|
||||||
|
|
||||||
|
|||||||
@@ -153,6 +153,7 @@ CATEGORY_MAP = {
|
|||||||
"nudostarforum" : "NudoStar Forums",
|
"nudostarforum" : "NudoStar Forums",
|
||||||
"okporn" : "OK.PORN",
|
"okporn" : "OK.PORN",
|
||||||
"paheal" : "Rule 34",
|
"paheal" : "Rule 34",
|
||||||
|
"pholder" : "pholder",
|
||||||
"photovogue" : "PhotoVogue",
|
"photovogue" : "PhotoVogue",
|
||||||
"picstate" : "PicState",
|
"picstate" : "PicState",
|
||||||
"pidgiwiki" : "PidgiWiki",
|
"pidgiwiki" : "PidgiWiki",
|
||||||
|
|||||||
47
test/results/pholder.py
Normal file
47
test/results/pholder.py
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
# This program is free software; you can redistribute it and/or modify
|
||||||
|
# it under the terms of the GNU General Public License version 2 as
|
||||||
|
# published by the Free Software Foundation.
|
||||||
|
|
||||||
|
from gallery_dl.extractor import pholder
|
||||||
|
|
||||||
|
|
||||||
|
__tests__ = (
    {
        "#url": "https://pholder.com/r/lavaporn",
        "#category": ("", "pholder", "subreddit"),
        "#class": pholder.PholderSubredditExtractor,
        "#range": "1-20",
        "#count": ">= 20",
    },
    {
        "#url": "https://pholder.com/r/lavaporn/",
        "#category": ("", "pholder", "subreddit"),
        "#class": pholder.PholderSubredditExtractor,
    },
    {
        "#url": "https://pholder.com/u/automoderator",
        "#category": ("", "pholder", "user"),
        "#class": pholder.PholderUserExtractor,
        "#range": "1-20",
        "#count": ">= 20",
    },
    {
        "#url": "https://pholder.com/u/automoderator/",
        "#category": ("", "pholder", "user"),
        "#class": pholder.PholderUserExtractor,
    },
    {
        "#url": "https://pholder.com/search-text",
        "#category": ("", "pholder", "search"),
        "#class": pholder.PholderSearchExtractor,
        "#range": "1-10",
        "#count": "== 10",
    },
)
|
||||||
Reference in New Issue
Block a user