[twitter] add experimental 'videos' option (#99)

Enabling this option will detect videos in tweets and output them as
"unsupported" URLs, so that they can then be downloaded with youtube-dl.

There are a lot of improvements to be made to the current
implementation, but it works and does what it is supposed to, even if
it is as inefficient as can be ...
This commit is contained in:
Mike Fährmann
2018-09-30 18:41:39 +02:00
parent 5507f5ce2e
commit f8b3b00249
3 changed files with 24 additions and 8 deletions

View File

@@ -654,6 +654,15 @@ Description Extract images from retweets.
=========== ===== =========== =====
extractor.twitter.videos
------------------------
=========== =====
Type ``bool``
Default ``false``
Description Output video tweets as unsupported URLs.
=========== =====
extractor.[booru].tags extractor.[booru].tags
---------------------- ----------------------
=========== ===== =========== =====

View File

@@ -110,7 +110,8 @@
}, },
"twitter": "twitter":
{ {
"retweets": true "retweets": true,
"videos": false
}, },
"booru": "booru":
{ {

View File

@@ -9,7 +9,7 @@
"""Extract images from https://twitter.com/""" """Extract images from https://twitter.com/"""
from .common import Extractor, Message from .common import Extractor, Message
from .. import text from .. import text, extractor
class TwitterExtractor(Extractor): class TwitterExtractor(Extractor):
@@ -24,32 +24,38 @@ class TwitterExtractor(Extractor):
Extractor.__init__(self) Extractor.__init__(self)
self.user = match.group(1) self.user = match.group(1)
self.retweets = self.config("retweets", True) self.retweets = self.config("retweets", True)
self.videos = self.config("videos", False)
if self.videos:
self._blacklist = extractor.blacklist(("twitter",))
def items(self): def items(self):
yield Message.Version, 1 yield Message.Version, 1
yield Message.Directory, self.metadata() yield Message.Directory, self.metadata()
for tweet in self.tweets(): for tweet in self.tweets():
images = list(text.extract_iter(
tweet, 'data-image-url="', '"'))
if not images:
continue
data = self._data_from_tweet(tweet) data = self._data_from_tweet(tweet)
if not self.retweets and data["retweet_id"]: if not self.retweets and data["retweet_id"]:
continue continue
images = text.extract_iter(
tweet, 'data-image-url="', '"')
for data["num"], url in enumerate(images, 1): for data["num"], url in enumerate(images, 1):
text.nameext_from_url(url, data) text.nameext_from_url(url, data)
yield Message.Url, url + ":orig", data yield Message.Url, url + ":orig", data
if self.videos and "-videoContainer" in tweet:
url = "{}/{}/status/{}".format(
self.root, data["user"], data["tweet_id"])
with self._blacklist:
yield Message.Queue, url, data
def metadata(self): def metadata(self):
"""Return general metadata""" """Return general metadata"""
return {"user": self.user} return {"user": self.user}
def tweets(self): def tweets(self):
"""Yield HTML content of all relevant tweets""" """Yield HTML content of all relevant tweets"""
return ()
@staticmethod @staticmethod
def _data_from_tweet(tweet): def _data_from_tweet(tweet):