* feat: extractor for pholder.com
Closes #2568
* feat[pholder]: properly support gallery_id and tags
* doc[text.nameext_from_name]: minor typo in docstring
* remove '__init__' & 'request' methods and 'json' import
* use 'text.nameext_from_url' to ensure a 'filename' value
* fix 'imgur' links by disabling auto-Referer
* fix 'data["id"].partition()' call
'partition' returns 3 elements
* use 'item["_source"]' data directly
* remove unused supportedsites overwrite
* catch all exceptions in '_thumb_resolution'
fixes "KeyError: 'width'"
* use 'author' name for user folders
---------
Co-authored-by: Mike Fährmann <mike_faehrmann@web.de>
125 lines · 4.4 KiB · Python
# -*- coding: utf-8 -*-
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
# published by the Free Software Foundation.
|
|
|
|
"""Extractors for https://pholder.com/"""
|
|
|
|
from .common import Extractor, Message
|
|
from .. import text, util, exception
|
|
|
|
# URL prefix shared by all pholder extractor patterns
# (optional scheme, optional "www." subdomain)
BASE_PATTERN = r"(?:https?://)?(?:www\.)?pholder\.com"
|
|
|
|
|
|
def _thumb_resolution(thumbnail):
|
|
try:
|
|
return int(thumbnail["width"]) * int(thumbnail["height"])
|
|
except Exception:
|
|
return 0
|
|
|
|
|
|
class PholderExtractor(Extractor):
    """Base class for pholder extractors"""
    category = "pholder"
    root = "https://pholder.com"
    directory_fmt = ("{category}", "{subredditTitle}")
    filename_fmt = "{id}{gallery_id:? / /}{title:? //[:225]}.{extension}"
    # NOTE(review): the "(unknown)" literal looks like an extraction
    # artifact rather than an intentional value — verify against upstream
    archive_fmt = "{id}_(unknown)_{gallery_id:? / /}"
    request_interval = (2.0, 4.0)
    # disable automatic Referer header; imgur links fail otherwise
    referer = False

    def _parse_window_data(self, html):
        """Extract and decode the 'window.data' JSON object from 'html'

        The payload is sometimes split across multiple script blocks;
        in that case the fragments are concatenated until they form
        valid JSON.

        Raises exception.AbortExtraction when no usable payload exists.
        """
        # compute the slice offset from the *actual* prefix string;
        # the original derived it from a different spelling
        # ("window_data = ") that only matched by length coincidence
        prefix = "window.data = "
        prefix_len = len(prefix)
        content = ""
        split_data = False

        for tag in text.split_html(html):
            if tag.startswith(prefix):
                try:
                    return util.json_loads(tag[prefix_len:])
                except ValueError:
                    # incomplete fragment; start accumulating pieces
                    split_data = True

            if split_data:
                try:
                    content += tag
                    return util.json_loads(content[prefix_len:])
                except ValueError:
                    pass

        raise exception.AbortExtraction("Could not locate window.data JSON.")

    def _posts(self, page_url):
        """Yield Directory/Url messages for every post at 'page_url',
        following pagination until a short page signals the end."""
        params = {"page": 1}
        while True:
            html = self.request(page_url, params=params).text
            window_data = self._parse_window_data(html)

            for item in window_data["media"]:
                data = item["_source"]
                data["id"] = item["_id"]
                data["date"] = self.parse_timestamp(data.get("submitted_utc"))

                if ":" in data["id"]:
                    # gallery post (also indicated by item["is_gallery"]);
                    # pholder does not preserve gallery order, but assigns
                    # each image a sub-id after a ':' separator
                    data["id"], _, data["gallery_id"] = \
                        data["id"].partition(":")
                else:
                    data["gallery_id"] = ""

                yield Message.Directory, "", data

                # prefer the highest-resolution thumbnail URL;
                # use get() since "thumbnails" may be absent entirely
                for thumb in sorted(data.get("thumbnails") or (),
                                    key=_thumb_resolution, reverse=True):
                    url = thumb["url"]
                    # some thumbnail URLs carry a ":large"/":small" size
                    # suffix that must be stripped; only strip purely
                    # alphabetic suffixes so scheme colons and port
                    # numbers (e.g. "host:8080/x.jpg") stay intact
                    base, sep, suffix = url.rpartition(":")
                    if sep and "/" not in suffix and suffix.isalpha():
                        url = base
                    yield Message.Url, url, text.nameext_from_url(url, data)
                    break
                else:
                    # no usable thumbnail; fall back to the origin URL
                    url = data["origin"]
                    yield Message.Url, url, text.nameext_from_url(url, data)

            # a page with fewer than 150 entries is the last one
            if len(window_data["media"]) < 150:
                break

            params["page"] += 1

    def items(self):
        # groups[0] is the path component captured by the subclass pattern
        url = f"{self.root}/{self.groups[0]}"
        return self._posts(url)
|
|
|
|
|
|
class PholderSubredditExtractor(PholderExtractor):
    """Extractor for media from pholder-stored posts for a subreddit"""
    subcategory = "subreddit"
    # group 1: path ("/r/NAME"), group 2: subreddit name,
    # group 3: optional query string
    pattern = BASE_PATTERN + r"(/r/([^/?#]+))(?:/?\?([^#]+))?"
    example = "https://pholder.com/r/SUBREDDIT"
|
|
|
|
|
|
class PholderUserExtractor(PholderExtractor):
    """Extractor for URLs from pholder-stored posts for a reddit user"""
    subcategory = "user"
    # store files under the reddit author's name instead of a subreddit
    directory_fmt = ("{category}", "u_{author}")
    # group 1: path ("/u/NAME"), group 2: optional query string
    pattern = BASE_PATTERN + r"(/u/[^/?#]+)(?:/?\?([^#]+))?"
    example = "https://www.pholder.com/u/USER"
|
|
|
|
|
|
class PholderSearchExtractor(PholderExtractor):
    """Extractor for URLs from pholder-stored posts for a search"""
    subcategory = "search"
    # catch-all: any remaining path is treated as a search term
    # NOTE(review): presumably relies on the more specific /r/ and /u/
    # patterns being tried first — confirm extractor registration order
    pattern = BASE_PATTERN + r"/(.*)"
    example = "https://www.pholder.com/SEARCH"
|