# -*- coding: utf-8 -*- # Copyright 2016-2017 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extract images from https://www.tumblr.com/""" from .common import Extractor, Message from .. import text, exception from ..cache import memcache import re def _original_image(url): if url.endswith(".gif") and "_inline_" in url: return url return re.sub( (r"https?://\d+\.media\.tumblr\.com" r"/([0-9a-f]+)/tumblr_([^/?&#.]+)_\d+\.([0-9a-z]+)"), r"http://data.tumblr.com/\1/tumblr_\2_raw.\3", url ) def _original_video(url): return re.sub( (r"https?://vt\.media\.tumblr\.com" r"/tumblr_([^_]+)_\d+\.([0-9a-z]+)"), r"https://vt.media.tumblr.com/tumblr_\1.\2", url ) POST_TYPES = frozenset(( "text", "quote", "link", "answer", "video", "audio", "photo", "chat")) class TumblrExtractor(Extractor): """Base class for tumblr extractors""" category = "tumblr" directory_fmt = ["{category}", "{name}"] filename_fmt = "{category}_{blog[name]}_{id}{offset:?o//}.{extension}" def __init__(self, match): Extractor.__init__(self) self.user = match.group(1) self.api = TumblrAPI(self) self.types = self._setup_posttypes() self.inline = self.config("inline", False) self.external = self.config("external", False) if len(self.types) == 1: self.api.params["type"] = next(iter(self.types)) elif not self.types: self.log.warning("no valid post types selected") def items(self): blog = self.api.info(self.user) yield Message.Version, 1 yield Message.Directory, blog for post in self.posts(): if post["type"] not in self.types: continue post["blog"] = blog post["offset"] = 0 if "trail" in post: del post["trail"] if "photos" in post: # type "photo" or "link" photos = post["photos"] del post["photos"] for photo in photos: post["photo"] = photo photo.update(photo["original_size"]) del photo["original_size"] del photo["alt_sizes"] yield self._prepare(_original_image(photo["url"]), post) if "audio_url" in post: # type: "audio" yield self._prepare(post["audio_url"], post) if "video_url" in post: # type: "video" yield self._prepare(_original_video(post["video_url"]), post) if self.inline: # inline images for key in ("body", "description"): if key in post: for url in re.findall('= data["total_posts"]: return