[slideshare] improve metadata; flake8
- added 'views' and 'published' keywords
- fixed longer titles and descriptions
This commit is contained in:
@@ -1,6 +1,8 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
## Unreleased
|
## Unreleased
|
||||||
|
- Added support for:
|
||||||
|
- `slideshare` - https://www.slideshare.net/ ([#54](https://github.com/mikf/gallery-dl/issues/54))
|
||||||
- Added pool- and post-extractors for `sankaku`
|
- Added pool- and post-extractors for `sankaku`
|
||||||
|
|
||||||
## 1.1.0 - 2017-12-08
|
## 1.1.0 - 2017-12-08
|
||||||
|
|||||||
@@ -66,6 +66,7 @@ Sankaku Channel https://chan.sankakucomplex.com/ Pools, Posts, Tag-Searc
|
|||||||
Sea Otter Scans https://reader.seaotterscans.com/ Chapters, Manga
|
Sea Otter Scans https://reader.seaotterscans.com/ Chapters, Manga
|
||||||
Sen Manga http://raw.senmanga.com/ Chapters
|
Sen Manga http://raw.senmanga.com/ Chapters
|
||||||
Sense-Scans http://sensescans.com/ Chapters, Manga
|
Sense-Scans http://sensescans.com/ Chapters, Manga
|
||||||
|
SlideShare https://www.slideshare.net/ Presentations
|
||||||
Spectrum Nexus |http://www.thes-0| Chapters, Manga
|
Spectrum Nexus |http://www.thes-0| Chapters, Manga
|
||||||
The /b/ Archive https://thebarchive.com/ Threads
|
The /b/ Archive https://thebarchive.com/ Threads
|
||||||
Tumblr https://www.tumblr.com/ Images from Users, Posts, Tag-Searches
|
Tumblr https://www.tumblr.com/ Images from Users, Posts, Tag-Searches
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
"""Extract images from https://www.slideshare.net/"""
|
"""Extract images from https://www.slideshare.net/"""
|
||||||
|
|
||||||
from .common import Extractor, Message
|
from .common import Extractor, Message
|
||||||
from .. import text
|
from .. import text, util
|
||||||
|
|
||||||
|
|
||||||
class SlideshareExtractor(Extractor):
|
class SlideshareExtractor(Extractor):
|
||||||
@@ -17,14 +17,21 @@ class SlideshareExtractor(Extractor):
|
|||||||
category = "slideshare"
|
category = "slideshare"
|
||||||
subcategory = "presentation"
|
subcategory = "presentation"
|
||||||
directory_fmt = ["{category}", "{user}"]
|
directory_fmt = ["{category}", "{user}"]
|
||||||
filename_fmt = "{presentation}-{num}.{extension}"
|
filename_fmt = "{presentation}-{num:>02}.{extension}"
|
||||||
pattern = [r"(?:https?://)?(?:www\.)?slideshare\.net/"
|
pattern = [r"(?:https?://)?(?:www\.)?slideshare\.net/"
|
||||||
r"([^/]+)/([^/]+)"]
|
r"([^/?&#]+)/([^/?&#]+)"]
|
||||||
test = [
|
test = [
|
||||||
("https://www.slideshare.net/Slideshare/get-started-with-slide-share", {
|
(("https://www.slideshare.net/"
|
||||||
|
"Slideshare/get-started-with-slide-share"), {
|
||||||
"url": "23685fb9b94b32c77a547d45dc3a82fe7579ea18",
|
"url": "23685fb9b94b32c77a547d45dc3a82fe7579ea18",
|
||||||
"content": "ee54e54898778e92696a7afec3ffabdbd98eb0cc",
|
"content": "ee54e54898778e92696a7afec3ffabdbd98eb0cc",
|
||||||
}),
|
}),
|
||||||
|
|
||||||
|
# long title
|
||||||
|
(("https://www.slideshare.net/pragmaticsolutions/warum-sie-nicht-ihren"
|
||||||
|
"-mitarbeitenden-ndern-sollten-sondern-ihr-managementsystem"), {
|
||||||
|
"url": "cf70ca99f57f61affab47ebf8583eb564b21e3a7",
|
||||||
|
}),
|
||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self, match):
|
def __init__(self, match):
|
||||||
@@ -32,7 +39,8 @@ class SlideshareExtractor(Extractor):
|
|||||||
self.user, self.presentation = match.groups()
|
self.user, self.presentation = match.groups()
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
page = self.request("https://www.slideshare.net/" + self.user + "/" + self.presentation).text
|
page = self.request("https://www.slideshare.net/" + self.user +
|
||||||
|
"/" + self.presentation).text
|
||||||
data = self.get_job_metadata(page)
|
data = self.get_job_metadata(page)
|
||||||
imgs = self.get_image_urls(page)
|
imgs = self.get_image_urls(page)
|
||||||
data["count"] = len(imgs)
|
data["count"] = len(imgs)
|
||||||
@@ -43,17 +51,29 @@ class SlideshareExtractor(Extractor):
|
|||||||
|
|
||||||
def get_job_metadata(self, page):
|
def get_job_metadata(self, page):
|
||||||
"""Collect metadata for extractor-job"""
|
"""Collect metadata for extractor-job"""
|
||||||
metadata = {}
|
descr, pos = text.extract(
|
||||||
|
page, '<meta name="description" content="', '"')
|
||||||
|
title, pos = text.extract(
|
||||||
|
page, '<span class="j-title-breadcrumb">', '</span>', pos)
|
||||||
|
views, pos = text.extract(
|
||||||
|
page, '<span class="notranslate pippin-data">', 'views<', pos)
|
||||||
|
published, pos = text.extract(
|
||||||
|
page, '<time datetime="', '"', pos)
|
||||||
|
alt_descr, pos = text.extract(
|
||||||
|
page, 'id="slideshow-description-paragraph" class="notranslate">',
|
||||||
|
'</p>', pos)
|
||||||
|
|
||||||
text.extract_all(page, (
|
if descr.endswith("…") and alt_descr:
|
||||||
('title', '<title>', '</title>'),
|
descr = text.remove_html(alt_descr).strip()
|
||||||
('description', '<meta name="description" content="', '">'),
|
|
||||||
), values=metadata)
|
|
||||||
|
|
||||||
metadata["presentation"] = self.presentation
|
return {
|
||||||
metadata["user"] = self.user
|
"user": self.user,
|
||||||
|
"presentation": self.presentation,
|
||||||
return metadata
|
"title": text.unescape(title.strip()),
|
||||||
|
"description": text.unescape(descr),
|
||||||
|
"views": util.safe_int(views.replace(",", "")),
|
||||||
|
"published": published,
|
||||||
|
}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_image_urls(page):
|
def get_image_urls(page):
|
||||||
|
|||||||
@@ -52,6 +52,7 @@ CATEGORY_MAP = {
|
|||||||
"seiga" : "Niconico Seiga",
|
"seiga" : "Niconico Seiga",
|
||||||
"senmanga" : "Sen Manga",
|
"senmanga" : "Sen Manga",
|
||||||
"sensescans" : "Sense-Scans",
|
"sensescans" : "Sense-Scans",
|
||||||
|
"slideshare" : "SlideShare",
|
||||||
"spectrumnexus" : "Spectrum Nexus",
|
"spectrumnexus" : "Spectrum Nexus",
|
||||||
"thebarchive" : "The /b/ Archive",
|
"thebarchive" : "The /b/ Archive",
|
||||||
"worldthree" : "World Three",
|
"worldthree" : "World Three",
|
||||||
|
|||||||
@@ -82,6 +82,8 @@ skip = [
|
|||||||
"exhentai", "kissmanga", "mangafox", "dynastyscans", "nijie",
|
"exhentai", "kissmanga", "mangafox", "dynastyscans", "nijie",
|
||||||
"archivedmoe", "archiveofsins", "thebarchive",
|
"archivedmoe", "archiveofsins", "thebarchive",
|
||||||
# temporary issues
|
# temporary issues
|
||||||
|
"mangareader",
|
||||||
|
"mangapanda",
|
||||||
]
|
]
|
||||||
# enable selective testing for direct calls
|
# enable selective testing for direct calls
|
||||||
if __name__ == '__main__' and len(sys.argv) > 1:
|
if __name__ == '__main__' and len(sys.argv) > 1:
|
||||||
|
|||||||
Reference in New Issue
Block a user