From df082e923cb2b04dbaa8c79c930a671561ba7cd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Wed, 1 Aug 2018 21:46:55 +0200 Subject: [PATCH] [behance] add gallery extractor (#95) --- docs/supportedsites.rst | 1 + gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/behance.py | 111 +++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+) create mode 100644 gallery_dl/extractor/behance.py diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst index 9cd14d58..57a48399 100644 --- a/docs/supportedsites.rst +++ b/docs/supportedsites.rst @@ -12,6 +12,7 @@ arch.b4k.co https://arch.b4k.co/ Threads Archive of Sins https://archiveofsins.com/ Threads Archived.Moe https://archived.moe/ Threads ArtStation https://www.artstation.com/ |Images from Use-0| +Behance https://www.behance.net/ Galleries Danbooru https://danbooru.donmai.us/ Pools, Popular Images, Posts, Tag-Searches Desuarchive https://desuarchive.org/ Threads DeviantArt https://www.deviantart.com/ |Collections, De-1| Optional (OAuth) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 2bbe87fb..7146c581 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -19,6 +19,7 @@ modules = [ "archiveofsins", "artstation", "b4k", + "behance", "danbooru", "desuarchive", "deviantart", diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py new file mode 100644 index 00000000..d1a65ba5 --- /dev/null +++ b/gallery_dl/extractor/behance.py @@ -0,0 +1,111 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.behance.net/""" + +from .common import Extractor, Message +from .. import text + + +class BehanceGalleryExtractor(Extractor): + """Extractor for image galleries from www.behance.net""" + category = "behance" + subcategory = "gallery" + directory_fmt = ["{category}", "{user}", "{gallery_id} {title}"] + filename_fmt = "{category}_{gallery_id}_{num:>02}.{extension}" + archive_fmt = "{gallery_id}_{num}" + root = "https://www.behance.net" + pattern = [r"(?:https?://)?(?:www\.)?behance\.net/gallery/(\d+)"] + test = [ + ("https://www.behance.net/gallery/17386197", { + "count": 2, + "keyword": { + "title": str, + "user": "Place Studio, Julio César Velazquez", + "fields": ["Animation", "Character Design", "Directing"], + "date": 1401810111, + "views": int, + "votes": int, + "comments": int, + }, + }), + ] + + def __init__(self, match): + Extractor.__init__(self) + self.gallery_id = match.group(1) + + def items(self): + url = "{}/gallery/{}/a".format(self.root, self.gallery_id) + page = self.request(url, cookies={"ilo0": "true"}).text + + data = self.get_metadata(page) + imgs = self.get_images(page) + data["count"] = len(imgs) + + yield Message.Version, 1 + yield Message.Directory, data + for data["num"], (url, external) in enumerate(imgs, 1): + if external: + yield Message.Queue, url, data + else: + yield Message.Url, url, text.nameext_from_url(url, data) + + def get_metadata(self, page): + """Collect metadata for extractor-job""" + users, pos = text.extract( + page, 'class="project-owner-info ', 'class="project-owner-actions') + title, pos = text.extract( + page, '
', '
', pos) + fields, pos = text.extract( + page, '', pos) + stats, pos = text.extract( + page, '
', 'Published', pos) + date, pos = text.extract( + page, ' data-timestamp="', '"', pos) + + users = self._parse_userinfo(users) + stats = text.split_html(stats) + + return { + "gallery_id": text.parse_int(self.gallery_id), + "title": text.unescape(title), + "user": ", ".join(users), + "fields": text.split_html(fields), + "date": text.parse_int(date), + "views": text.parse_int(stats[0]), + "votes": text.parse_int(stats[1]), + "comments": text.parse_int(stats[2]), + } + + @staticmethod + def get_images(page): + """Extract and return a list of all image- and external urls""" + results = [] + for p in text.extract_iter(page, "js-lightbox-slide-content", "', '', + ) + ] + + user = text.extract(users, ' class="profile-list-name"', '')[0] + return (user.rpartition(">")[2],)