# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://pholder.com/"""

# Patch context: this module (gallery_dl/extractor/pholder.py) was added
# together with the following changes to other files:
#   - gallery_dl/extractor/__init__.py: "pholder" added to 'modules'
#     (between "philomena" and "photovogue")
#   - scripts/supportedsites.py: "pholder" : "pholder" added to CATEGORY_MAP
#   - docs/configuration.rst: ``pholder`` added to a list of site names
#   - docs/supportedsites.md: row "pholder / https://pholder.com/ /
#     Search Results, Subreddits, User Profiles"
#   - gallery_dl/text.py: nameext_from_name() docstring changed to
#     "Extract the last part of a file name and fill 'data' accordingly"
#   - test/results/pholder.py: result tests for the extractors below

from .common import Extractor, Message
from .. import text, util, exception

BASE_PATTERN = r"(?:https?://)?(?:www\.)?pholder\.com"

# Text that precedes the JSON payload inside a page's script content
_WINDOW_DATA_PREFIX = "window.data = "
# A result page with fewer "media" entries than this ends the pagination
_PAGE_SIZE = 150


def _thumb_resolution(thumbnail):
    """Return the pixel count (width * height) of a thumbnail entry

    Entries whose "width" / "height" values are missing or cannot be
    converted to integers are rated 0 instead of raising.
    """
    try:
        return int(thumbnail["width"]) * int(thumbnail["height"])
    except (LookupError, TypeError, ValueError, OverflowError):
        return 0


def _strip_size_suffix(url):
    """Return 'url' without a trailing ":word" marker such as ":large"

    Only a colon inside the path part counts as a marker. The colon of
    the scheme ("https:") or of a port ("host:8080") is left alone, and a
    string without any colon is returned unchanged.
    """
    head, sep, tail = url.rpartition(":")
    if sep and "/" not in tail and "/" in head.partition("://")[2]:
        return head
    return url


class PholderExtractor(Extractor):
    """Base class for pholder extractors"""
    category = "pholder"
    root = "https://pholder.com"
    directory_fmt = ("{category}", "{subredditTitle}")
    filename_fmt = "{id}{gallery_id:? / /}{title:? //[:225]}.{extension}"
    archive_fmt = "{id}_{filename}_{gallery_id:? / /}"
    request_interval = (2.0, 4.0)
    referer = False

    def _parse_window_data(self, html):
        """Return the decoded JSON assigned to 'window.data' in 'html'

        Raises exception.AbortExtraction when no segment (or run of
        segments) starting with "window.data = " decodes as JSON.
        """
        # sometimes, window.data content is split across multiple script
        # blocks, so undecodable content is buffered and retried with each
        # following segment appended.
        prefix_len = len(_WINDOW_DATA_PREFIX)
        buffered = ""

        for tag in text.split_html(html):
            if tag.startswith(_WINDOW_DATA_PREFIX):
                try:
                    return util.json_loads(tag[prefix_len:])
                except ValueError:
                    if not buffered:
                        # first incomplete segment: start buffering;
                        # re-parsing the same text would fail again
                        buffered = tag
                        continue

            if buffered:
                buffered += tag
                try:
                    return util.json_loads(buffered[prefix_len:])
                except ValueError:
                    pass

        raise exception.AbortExtraction("Could not locate window.data JSON.")

    def _posts(self, page_url):
        """Yield Directory and Url messages for all media of 'page_url'

        Pages are requested with an increasing "page" parameter until one
        of them lists fewer than _PAGE_SIZE "media" entries.
        """
        params = {"page": 1}
        while True:
            html = self.request(page_url, params=params).text
            media = self._parse_window_data(html)["media"]

            for item in media:
                data = item["_source"]
                data["date"] = self.parse_timestamp(data.get("submitted_utc"))

                # gallery images have IDs of the form "<id>:<sub-id>"
                # (can also see from item["is_gallery"])
                # pholder does not preserve gallery order, but assigns
                # each image a sub-id.
                # partition() leaves "gallery_id" empty for plain IDs.
                data["id"], _, data["gallery_id"] = \
                    item["_id"].partition(":")

                yield Message.Directory, "", data

                # try to use the highest-resolution thumbnail URL first.
                # max() returns the first of several equally rated entries.
                thumb = max(
                    data.get("thumbnails") or (),
                    key=_thumb_resolution, default=None)
                if thumb:
                    # sometimes, thumbnail image URLs end with ":large" or
                    # ":small", so any trailing ":word" bit is stripped.
                    url = _strip_size_suffix(thumb["url"])
                else:
                    # Fallback to origin
                    url = data["origin"]
                yield Message.Url, url, text.nameext_from_url(url, data)

            if len(media) < _PAGE_SIZE:
                break

            params["page"] += 1

    def items(self):
        """Return the message generator for the matched URL path"""
        # the first group of the subreddit / user patterns already starts
        # with "/"; strip it so the URL does not contain "//" after root
        path = self.groups[0].lstrip("/")
        return self._posts(f"{self.root}/{path}")


class PholderSubredditExtractor(PholderExtractor):
    """Extractor for media from pholder-stored posts for a subreddit"""
    subcategory = "subreddit"
    pattern = BASE_PATTERN + r"(/r/([^/?#]+))(?:/?\?([^#]+))?"
    example = "https://pholder.com/r/SUBREDDIT"


class PholderUserExtractor(PholderExtractor):
    """Extractor for URLs from pholder-stored posts for a reddit user"""
    subcategory = "user"
    directory_fmt = ("{category}", "u_{author}")
    pattern = BASE_PATTERN + r"(/u/[^/?#]+)(?:/?\?([^#]+))?"
    example = "https://www.pholder.com/u/USER"


class PholderSearchExtractor(PholderExtractor):
    """Extractor for URLs from pholder-stored posts for a search"""
    subcategory = "search"
    # NOTE(review): this pattern also matches "/r/..." and "/u/..." paths;
    # presumably the more specific classes above win because they are
    # defined first - confirm against the extractor lookup order.
    pattern = BASE_PATTERN + r"/(.*)"
    example = "https://www.pholder.com/SEARCH"
# -*- coding: utf-8 -*-

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

from gallery_dl.extractor import pholder


# Result expectations for the pholder extractors, one dict per test URL.
# "#category" and "#class" name the extractor expected for "#url".
# NOTE(review): the exact meaning of the "#..." keys ("#range", "#count")
# is defined by the test runner, which is not part of this file - confirm
# there before relying on the per-entry remarks below.
__tests__ = (
# subreddit URL; entries 1-20 are selected and ">= 20" is the expected count
{
    "#url"     : "https://pholder.com/r/lavaporn",
    "#category": ("", "pholder", "subreddit"),
    "#class"   : pholder.PholderSubredditExtractor,
    "#range"   : "1-20",
    "#count"   : ">= 20",
},

# same subreddit URL with a trailing slash; lists category / class only
{
    "#url"     : "https://pholder.com/r/lavaporn/",
    "#category": ("", "pholder", "subreddit"),
    "#class"   : pholder.PholderSubredditExtractor,
},

# user URL; entries 1-20 are selected and ">= 20" is the expected count
{
    "#url"     : "https://pholder.com/u/automoderator",
    "#category": ("", "pholder", "user"),
    "#class"   : pholder.PholderUserExtractor,
    "#range"   : "1-20",
    "#count"   : ">= 20",
},

# same user URL with a trailing slash; lists category / class only
{
    "#url"     : "https://pholder.com/u/automoderator/",
    "#category": ("", "pholder", "user"),
    "#class"   : pholder.PholderUserExtractor,
},

# any other path is expected to be handled by the search extractor;
# entries 1-10 are selected and "== 10" is the expected count
{
    "#url"     : "https://pholder.com/search-text",
    "#category": ("", "pholder", "search"),
    "#class"   : pholder.PholderSearchExtractor,
    "#range"   : "1-10",
    "#count"   : "== 10",
},

)