From c73c2cda50c20c51f8117b04ee76d3724ec9dff8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 7 Jun 2019 16:31:20 +0200 Subject: [PATCH] [pornhub] add gallery & user extractor (#282) --- docs/supportedsites.rst | 1 + gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/pornhub.py | 157 +++++++++++++++++++++++++++++++ 3 files changed, 159 insertions(+) create mode 100644 gallery_dl/extractor/pornhub.py diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index 43f20534..fa4f554c 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -78,6 +78,7 @@ Pinterest https://www.pinterest.com/ Boards, Pins, pin.it Li Pixiv https://www.pixiv.net/ |pixiv-C| Required Pixnet https://www.pixnet.net/ |pixnet-C| Plurk https://www.plurk.com/ Posts, Timelines +Pornhub https://www.pornhub.com/ Images from Users, Galleries Pornreactor http://pornreactor.cc/ |pornreactor-C| PowerManga https://read.powermanga.org/ Chapters, Manga Pururin https://pururin.io/ Galleries diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 2ceb009d..ac64d513 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -72,6 +72,7 @@ modules = [ "pixiv", "pixnet", "plurk", + "pornhub", "pururin", "reactor", "readcomiconline", diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py new file mode 100644 index 00000000..40816b30 --- /dev/null +++ b/gallery_dl/extractor/pornhub.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.pornhub.com/""" + +from .common import Extractor, Message +from .. 
import text, exception + + +BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?pornhub\.com" + + +class PornhubExtractor(Extractor): + """Base class for pornhub extractors""" + category = "pornhub" + root = "https://www.pornhub.com" + + +class PornhubGalleryExtractor(PornhubExtractor): + """Extractor for image galleries on pornhub.com""" + subcategory = "gallery" + directory_fmt = ("{category}", "{user}", "{gallery[id]} {gallery[title]}") + filename_fmt = "{num:>03}_{id}.{extension}" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/album/(\d+)" + test = ( + ("https://www.pornhub.com/album/1708982", { + "pattern": r"https://\w+.phncdn.com/pics/albums/\d+/\d+/\d+/\d+/", + "count": 93, + "keyword": { + "id": int, + "num": int, + "score": int, + "views": int, + "caption": str, + "user": "Unknown", + "gallery": { + "id" : 1708982, + "score": int, + "views": int, + "tags" : list, + "title": "Random Hentai", + }, + }, + }), + ("https://www.pornhub.com/album/37180171", { + "exception": exception.AuthorizationError, + }), + ) + + def __init__(self, match): + PornhubExtractor.__init__(self, match) + self.gallery_id = match.group(1) + self._first = None + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + for num, image in enumerate(self.images(), 1): + url = image["url"] + image.update(data) + image["num"] = num + yield Message.Url, url, text.nameext_from_url(url, image) + + def metadata(self): + url = "{}/album/{}".format( + self.root, self.gallery_id) + extr = text.extract_from(self.request(url).text) + + title = extr("", "") + score = extr('
', '<') + tags = extr('
= 8",
+        }),
+        ("https://www.pornhub.com/users/flyings0l0/"),
+        ("https://www.pornhub.com/users/flyings0l0/photos/public"),
+        ("https://www.pornhub.com/users/flyings0l0/photos/private"),
+        ("https://www.pornhub.com/users/flyings0l0/photos/favorites"),
+        ("https://www.pornhub.com/model/bossgirl/photos"),
+    )
+
+    def __init__(self, match):
+        """Store the three groups captured by the URL pattern.
+
+        The groups are kept as ``self.type``, ``self.user`` and
+        ``self.cat``; ``items()`` inserts them, in that order, into the
+        path of the listing URL.
+        """
+        PornhubExtractor.__init__(self, match)
+        self.type, self.user, self.cat = match.groups()
+
+    def items(self):
+        """Queue every album found on the paginated photo listing.
+
+        Pages are requested via POST from
+        ``<root>/<type>/<user>/photos/<cat>/ajax`` (``cat`` falls back to
+        ``"public"`` when it is empty or None), starting at ``page=1``.
+        Each album ID found yields a ``Message.Queue`` entry for
+        ``<root>/album/<id>`` whose metadata names
+        ``PornhubGalleryExtractor`` under the ``_extractor`` key.
+
+        Iteration ends at the first page that is empty or that contains
+        no album IDs.
+        """
+        url = "{}/{}/{}/photos/{}/ajax".format(
+            self.root, self.type, self.user, self.cat or "public")
+        params = {"page": 1}
+        headers = {
+            # the listing URL without its trailing "/ajax" (5 characters)
+            "Referer": url[:-5],
+            "X-Requested-With": "XMLHttpRequest",
+        }
+
+        data = {"_extractor": PornhubGalleryExtractor}
+        yield Message.Version, 1
+        while True:
+            page = self.request(
+                url, method="POST", headers=headers, params=params).text
+            if not page:
+                return
+
+            found = False
+            for gid in text.extract_iter(page, 'id="albumphoto', '"'):
+                found = True
+                yield Message.Queue, self.root + "/album/" + gid, data
+
+            # A non-empty page without any album IDs also marks the end
+            # of the listing; without this check the loop would keep
+            # requesting further pages indefinitely.
+            if not found:
+                return
+            params["page"] += 1