From 0a9a07a6e1191929800c729095cd3dc71a78db02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Wed, 13 Dec 2017 21:15:05 +0100
Subject: [PATCH] [slideshare] improve metadata; flake8
- added 'views' and 'published' keywords
- fixed longer titles and descriptions
---
CHANGELOG.md | 2 ++
docs/supportedsites.rst | 1 +
gallery_dl/extractor/slideshare.py | 48 +++++++++++++++++++++---------
scripts/build_supportedsites.py | 1 +
test/test_extractors.py | 2 ++
5 files changed, 40 insertions(+), 14 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a1a53bf1..a652f561 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,8 @@
# Changelog
## Unreleased
+- Added support for:
+ - `slideshare` - https://www.slideshare.net/ ([#54](https://github.com/mikf/gallery-dl/issues/54))
- Added pool- and post-extractors for `sankaku`
## 1.1.0 - 2017-12-08
diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst
index a619d834..8b5cae01 100644
--- a/docs/supportedsites.rst
+++ b/docs/supportedsites.rst
@@ -66,6 +66,7 @@ Sankaku Channel https://chan.sankakucomplex.com/ Pools, Posts, Tag-Searc
Sea Otter Scans https://reader.seaotterscans.com/ Chapters, Manga
Sen Manga http://raw.senmanga.com/ Chapters
Sense-Scans http://sensescans.com/ Chapters, Manga
+SlideShare https://www.slideshare.net/ Presentations
Spectrum Nexus |http://www.thes-0| Chapters, Manga
The /b/ Archive https://thebarchive.com/ Threads
Tumblr https://www.tumblr.com/ Images from Users, Posts, Tag-Searches
diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py
index 5d1248f9..41bf8651 100644
--- a/gallery_dl/extractor/slideshare.py
+++ b/gallery_dl/extractor/slideshare.py
@@ -9,7 +9,7 @@
"""Extract images from https://www.slideshare.net/"""
from .common import Extractor, Message
-from .. import text
+from .. import text, util
class SlideshareExtractor(Extractor):
@@ -17,14 +17,21 @@ class SlideshareExtractor(Extractor):
category = "slideshare"
subcategory = "presentation"
directory_fmt = ["{category}", "{user}"]
- filename_fmt = "{presentation}-{num}.{extension}"
+ filename_fmt = "{presentation}-{num:>02}.{extension}"
pattern = [r"(?:https?://)?(?:www\.)?slideshare\.net/"
- r"([^/]+)/([^/]+)"]
+ r"([^/?]+)/([^/?]+)"]
test = [
- ("https://www.slideshare.net/Slideshare/get-started-with-slide-share", {
+ (("https://www.slideshare.net/"
+ "Slideshare/get-started-with-slide-share"), {
"url": "23685fb9b94b32c77a547d45dc3a82fe7579ea18",
"content": "ee54e54898778e92696a7afec3ffabdbd98eb0cc",
}),
+
+ # long title
+ (("https://www.slideshare.net/pragmaticsolutions/warum-sie-nicht-ihren"
+ "-mitarbeitenden-ndern-sollten-sondern-ihr-managementsystem"), {
+ "url": "cf70ca99f57f61affab47ebf8583eb564b21e3a7",
+ }),
]
def __init__(self, match):
@@ -32,7 +39,8 @@ class SlideshareExtractor(Extractor):
self.user, self.presentation = match.groups()
def items(self):
- page = self.request("https://www.slideshare.net/" + self.user + "/" + self.presentation).text
+ page = self.request("https://www.slideshare.net/" + self.user +
+ "/" + self.presentation).text
data = self.get_job_metadata(page)
imgs = self.get_image_urls(page)
data["count"] = len(imgs)
@@ -43,17 +51,29 @@ class SlideshareExtractor(Extractor):
def get_job_metadata(self, page):
"""Collect metadata for extractor-job"""
- metadata = {}
+ descr, pos = text.extract(
+ page, '', '', pos)
+ views, pos = text.extract(
+ page, '', 'views<', pos)
+ published, pos = text.extract(
+ page, '
', pos)
- text.extract_all(page, (
- ('title', '', ''),
- ('description', ''),
- ), values=metadata)
+ if descr.endswith("…") and alt_descr:
+ descr = text.remove_html(alt_descr).strip()
- metadata["presentation"] = self.presentation
- metadata["user"] = self.user
-
- return metadata
+ return {
+ "user": self.user,
+ "presentation": self.presentation,
+ "title": text.unescape(title.strip()),
+ "description": text.unescape(descr),
+ "views": util.safe_int(views.replace(",", "")),
+ "published": published,
+ }
@staticmethod
def get_image_urls(page):
diff --git a/scripts/build_supportedsites.py b/scripts/build_supportedsites.py
index 21eea89b..e2b34824 100755
--- a/scripts/build_supportedsites.py
+++ b/scripts/build_supportedsites.py
@@ -52,6 +52,7 @@ CATEGORY_MAP = {
"seiga" : "Niconico Seiga",
"senmanga" : "Sen Manga",
"sensescans" : "Sense-Scans",
+ "slideshare" : "SlideShare",
"spectrumnexus" : "Spectrum Nexus",
"thebarchive" : "The /b/ Archive",
"worldthree" : "World Three",
diff --git a/test/test_extractors.py b/test/test_extractors.py
index 3e5cee7c..11c00ea8 100644
--- a/test/test_extractors.py
+++ b/test/test_extractors.py
@@ -82,6 +82,8 @@ skip = [
"exhentai", "kissmanga", "mangafox", "dynastyscans", "nijie",
"archivedmoe", "archiveofsins", "thebarchive",
# temporary issues
+ "mangareader",
+ "mangapanda",
]
# enable selective testing for direct calls
if __name__ == '__main__' and len(sys.argv) > 1: