diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 4c7b757c..f0e02254 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -11,114 +11,96 @@
from .common import Extractor, Message
from .. import text, exception
from ..cache import cache, memcache
-import json
-import re
+import hashlib
+import time
class TwitterExtractor(Extractor):
"""Base class for twitter extractors"""
category = "twitter"
- directory_fmt = ("{category}", "{user[name]}")
- filename_fmt = "{tweet_id}_{num}.{extension}"
- archive_fmt = "{tweet_id}_{retweet_id}_{num}"
+ directory_fmt = ("{category}", "{user[screen_name]}")
+ filename_fmt = "{id_str}_{num}.{extension}"
+ archive_fmt = "{id_str}_{num}"
cookiedomain = ".twitter.com"
root = "https://twitter.com"
sizes = (":orig", ":large", ":medium", ":small")
- user_agent = ("Mozilla/5.0 (Windows NT 6.1; WOW64; "
- "Trident/7.0; rv:11.0) like Gecko")
def __init__(self, match):
Extractor.__init__(self, match)
self.user = match.group(1)
- self._user_dict = None
- self.logged_in = False
self.retweets = self.config("retweets", True)
self.replies = self.config("replies", True)
self.twitpic = self.config("twitpic", False)
- self.content = self.config("content", False)
self.videos = self.config("videos", True)
- if self.content:
- self._emoji_sub = re.compile(
- r']*>').sub
-
def items(self):
self.login()
metadata = self.metadata()
yield Message.Version, 1
for tweet in self.tweets():
- data = self._data_from_tweet(tweet)
- if not data or \
- not self.retweets and data["retweet_id"] or \
- not self.replies and data["reply"]:
+ if not self.retweets and "retweeted_status_id_str" in tweet or \
+ not self.replies and "in_reply_to_user_id_str" in tweet:
continue
- data.update(metadata)
- if self.videos and "-videoContainer" in tweet:
- yield Message.Directory, data
+ if "extended_entities" not in tweet:
+ continue
+
+ tweet.update(metadata)
+ yield Message.Directory, tweet
+ for tweet["num"], media in enumerate(
+ tweet["extended_entities"]["media"], 1):
+ tweet.update(media["original_info"])
+
+ if "video_info" in media and self.videos:
+
+ if self.videos == "ytdl":
+ url = "ytdl:{}/i/web/status/{}".format(
+ self.root, tweet["id_str"])
+ tweet["extension"] = None
+ yield Message.Url, url, tweet
+
+ else:
+ video_info = media["video_info"]
+ variant = max(
+ video_info["variants"],
+ key=lambda v: v.get("bitrate", 0),
+ )
+ tweet["duration"] = video_info.get(
+ "duration_millis", 0) / 1000
+ tweet["bitrate"] = variant.get("bitrate", 0)
+
+ url = variant["url"]
+ text.nameext_from_url(url, tweet)
+ yield Message.Url, url, tweet
- if self.videos == "ytdl":
- data["extension"] = None
- url = "ytdl:{}/i/web/status/{}".format(
- self.root, data["tweet_id"])
else:
- url = self._video_from_tweet(data["tweet_id"])
- if not url:
- continue
- text.nameext_from_url(url, data)
- if data["extension"] == "m3u8":
- url = "ytdl:" + url
- data["extension"] = "mp4"
- data["_ytdl_extra"] = {"protocol": "m3u8_native"}
- data["num"] = 1
- yield Message.Url, url, data
-
- elif "data-image-url=" in tweet:
- yield Message.Directory, data
-
- images = text.extract_iter(
- tweet, 'data-image-url="', '"')
- for data["num"], url in enumerate(images, 1):
- text.nameext_from_url(url, data)
+ url = media["media_url_https"]
urls = [url + size for size in self.sizes]
- yield Message.Urllist, urls, data
-
- if self.twitpic and "//twitpic.com/" in tweet:
- urls = [
- url for url in text.extract_iter(
- tweet, 'data-expanded-url="', '"')
- if "//twitpic.com/" in url
- ]
-
- if "num" not in data:
- if urls:
- yield Message.Directory, data
- data["num"] = 0
-
- for data["num"], url in enumerate(urls, data["num"]+1):
- response = self.request(url, fatal=False)
- if response.status_code >= 400:
- continue
- url = text.extract(
- response.text, 'name="twitter:image" value="', '"')[0]
- yield Message.Url, url, text.nameext_from_url(url, data)
+ text.nameext_from_url(url, tweet)
+ yield Message.Urllist, urls, tweet
def metadata(self):
"""Return general metadata"""
return {}
def tweets(self):
- """Yield HTML content of all relevant tweets"""
+ """Yield all relevant tweet objects"""
def login(self):
username, password = self._get_auth_info()
if username:
self._update_cookies(self._login_impl(username, password))
- self.logged_in = True
+ # self.logged_in = True
@cache(maxage=360*24*3600, keyarg=1)
def _login_impl(self, username, password):
+ self.log.warning(
+ 'Logging in with username and password is currently not possible. '
+ 'Use cookies from your browser session instead.')
+ return {}
+
+ """
self.log.info("Logging in as %s", username)
headers = {"User-Agent": self.user_agent}
@@ -130,7 +112,7 @@ class TwitterExtractor(Extractor):
data = {
"session[username_or_email]": username,
"session[password]" : password,
- "authenticity_token" : token,
+ # "authenticity_token" : token,
"ui_metrics" : '{"rf":{},"s":""}',
"scribe_log" : "",
"redirect_after_login" : "",
@@ -145,138 +127,7 @@ class TwitterExtractor(Extractor):
for cookie in self.session.cookies
if cookie.domain and "twitter.com" in cookie.domain
}
-
- def _data_from_tweet(self, tweet):
- extr = text.extract_from(tweet)
- data = {
- "tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')),
- "reply" : bool(extr('data-is-reply-to="' , '"')),
- "retweet_id": text.parse_int(extr('data-retweet-id="', '"')),
- "retweeter" : extr('data-retweeter="' , '"'),
- "author" : {
- "name" : extr('data-screen-name="', '"'),
- "nick" : text.unescape(extr('data-name="' , '"')),
- "id" : text.parse_int(extr('data-user-id="' , '"')),
- },
- }
-
- if not self._user_dict:
- if data["retweet_id"]:
- for user in json.loads(text.unescape(extr(
- 'data-reply-to-users-json="', '"'))):
- if user["screen_name"] == data["retweeter"]:
- break
- else:
- self.log.warning("Unable to extract user info")
- return None
- self._user_dict = {
- "name": user["screen_name"],
- "nick": text.unescape(user["name"]),
- "id" : text.parse_int(user["id_str"]),
- }
- else:
- self._user_dict = data["author"]
-
- data["user"] = self._user_dict
- data["date"] = text.parse_timestamp(extr('data-time="', '"'))
-
- if self.content:
- content = extr('