HTML structure for gallery pages changed quite a bit, so it is now using the embedded JSON data. This changes a lot of metadata field names, but 'gallery_id', 'title', and 'user' are still provided for backwards compatibility. The internal API endpoint for user galleries also changed its data structure, but nothing too major.
179 lines
6.0 KiB
Python
179 lines
6.0 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright 2018-2019 Mike Fährmann
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
# published by the Free Software Foundation.
|
|
|
|
"""Extract images from https://www.behance.net/"""
|
|
|
|
from .common import Extractor, Message
|
|
from .. import text
|
|
import json
|
|
|
|
|
|
class BehanceExtractor(Extractor):
|
|
"""Base class for behance extractors"""
|
|
category = "behance"
|
|
root = "https://www.behance.net"
|
|
|
|
def items(self):
|
|
yield Message.Version, 1
|
|
for gallery in self.galleries():
|
|
yield Message.Queue, gallery["url"], self._update(gallery)
|
|
|
|
def galleries(self):
|
|
"""Return all relevant gallery URLs"""
|
|
return ()
|
|
|
|
@staticmethod
|
|
def _update(data):
|
|
# compress data to simple lists
|
|
data["fields"] = [field["name"] for field in data["fields"]]
|
|
data["owners"] = [owner["display_name"] for owner in data["owners"]]
|
|
if "tags" in data:
|
|
data["tags"] = [tag["title"] for tag in data["tags"]]
|
|
|
|
# backwards compatibility
|
|
data["gallery_id"] = data["id"]
|
|
data["title"] = data["name"]
|
|
data["user"] = ", ".join(data["owners"])
|
|
|
|
return data
|
|
|
|
|
|
class BehanceGalleryExtractor(BehanceExtractor):
|
|
"""Extractor for image galleries from www.behance.net"""
|
|
subcategory = "gallery"
|
|
directory_fmt = ["{category}", "{owners:J, }", "{id} {name}"]
|
|
filename_fmt = "{category}_{id}_{num:>02}.{extension}"
|
|
archive_fmt = "{id}_{num}"
|
|
pattern = [r"(?:https?://)?(?:www\.)?behance\.net/gallery/(\d+)"]
|
|
test = [
|
|
("https://www.behance.net/gallery/17386197/A-Short-Story", {
|
|
"count": 2,
|
|
"url": "ab79bd3bef8d3ae48e6ac74fd995c1dfaec1b7d2",
|
|
"keyword": {
|
|
"id": 17386197,
|
|
"name": 're:"Hi". A short story about the important things ',
|
|
"owners": ["Place Studio", "Julio César Velazquez"],
|
|
"fields": ["Animation", "Character Design", "Directing"],
|
|
"tags": list,
|
|
"module": dict,
|
|
},
|
|
}),
|
|
("https://www.behance.net/gallery/21324767/Nevada-City", {
|
|
"count": 6,
|
|
"url": "0258fe194fe7d828d6f2c7f6086a9a0a4140db1d",
|
|
"keyword": {"owners": ["Alex Strohl"]},
|
|
}),
|
|
]
|
|
|
|
def __init__(self, match):
|
|
BehanceExtractor.__init__(self)
|
|
self.gallery_id = match.group(1)
|
|
|
|
def items(self):
|
|
data = self.get_gallery_data()
|
|
imgs = self.get_images(data)
|
|
data["count"] = len(imgs)
|
|
|
|
yield Message.Version, 1
|
|
yield Message.Directory, data
|
|
for data["num"], (url, module) in enumerate(imgs, 1):
|
|
data["module"] = module
|
|
data["extension"] = text.ext_from_url(url)
|
|
yield Message.Url, url, data
|
|
|
|
def get_gallery_data(self):
|
|
"""Collect gallery info dict"""
|
|
url = "{}/gallery/{}/a".format(self.root, self.gallery_id)
|
|
cookies = {
|
|
"_evidon_consent_cookie":
|
|
'{"consent_date":"2019-01-31T09:41:15.132Z"}',
|
|
"bcp": "815b5eee-8bdf-4898-ac79-33c2bcc0ed19",
|
|
"gk_suid": "66981391",
|
|
"gki": '{"feature_project_view":false,'
|
|
'"feature_discover_login_prompt":false,'
|
|
'"feature_project_login_prompt":false}',
|
|
"ilo0": "true",
|
|
}
|
|
page = self.request(url, cookies=cookies).text
|
|
|
|
data = json.loads(text.extract(
|
|
page, 'id="beconfig-store_state">', '</script>')[0])
|
|
return self._update(data["project"]["project"])
|
|
|
|
def get_images(self, data):
|
|
""" """
|
|
results = []
|
|
|
|
for module in data["modules"]:
|
|
|
|
if module["type"] == "image":
|
|
url = module["sizes"]["original"]
|
|
results.append((url, module))
|
|
|
|
elif module["type"] == "embed":
|
|
embed = module.get("original_embed") or module.get("embed")
|
|
url = "ytdl:" + text.extract(embed, 'src="', '"')[0]
|
|
results.append((url, module))
|
|
|
|
return results
|
|
|
|
|
|
class BehanceUserExtractor(BehanceExtractor):
|
|
"""Extractor for a user's galleries from www.behance.net"""
|
|
subcategory = "user"
|
|
categorytransfer = True
|
|
pattern = [r"(?:https?://)?(?:www\.)?behance\.net/([^/?&#]+)/?$"]
|
|
test = [("https://www.behance.net/alexstrohl", {
|
|
"count": ">= 8",
|
|
"pattern": BehanceGalleryExtractor.pattern[0],
|
|
})]
|
|
|
|
def __init__(self, match):
|
|
BehanceExtractor.__init__(self)
|
|
self.user = match.group(1)
|
|
|
|
def galleries(self):
|
|
url = "{}/{}/projects".format(self.root, self.user)
|
|
headers = {"X-Requested-With": "XMLHttpRequest"}
|
|
params = {"offset": 0}
|
|
|
|
while True:
|
|
data = self.request(url, headers=headers, params=params).json()
|
|
work = data["profile"]["activeSection"]["work"]
|
|
yield from work["projects"]
|
|
if not work["hasMore"]:
|
|
return
|
|
params["offset"] += len(work["projects"])
|
|
|
|
|
|
class BehanceCollectionExtractor(BehanceExtractor):
|
|
"""Extractor for a collection's galleries from www.behance.net"""
|
|
subcategory = "collection"
|
|
categorytransfer = True
|
|
pattern = [r"(?:https?://)?(?:www\.)?behance\.net/collection/(\d+)"]
|
|
test = [("https://www.behance.net/collection/170615607/Sky", {
|
|
"count": ">= 13",
|
|
"pattern": BehanceGalleryExtractor.pattern[0],
|
|
})]
|
|
|
|
def __init__(self, match):
|
|
BehanceExtractor.__init__(self)
|
|
self.collection_id = match.group(1)
|
|
|
|
def galleries(self):
|
|
url = "{}/collection/{}/a".format(self.root, self.collection_id)
|
|
headers = {"X-Requested-With": "XMLHttpRequest"}
|
|
params = {}
|
|
|
|
while True:
|
|
data = self.request(url, headers=headers, params=params).json()
|
|
yield from data["output"]
|
|
if not data.get("offset"):
|
|
return
|
|
params["offset"] = data["offset"]
|