[twitter] improve 'cards-blacklist' (#2875)
Allow blacklisting card domains and 'name:domain' pairs, where 'domain' is taken from the card's 'vanity_url' value.
This commit is contained in:
@@ -2362,9 +2362,15 @@ extractor.twitter.cards-blacklist
|
||||
Type
|
||||
``list`` of ``strings``
|
||||
Example
|
||||
``["player", "summary"]``
|
||||
``["summary", "youtube.com", "player:twitch.tv"]``
|
||||
Description
|
||||
List of card types to ignore
|
||||
List of card types to ignore.
|
||||
|
||||
Possible values are
|
||||
|
||||
* card names
|
||||
* card domains
|
||||
* ``<card name>:<card domain>``
|
||||
|
||||
|
||||
extractor.twitter.conversations
|
||||
|
||||
@@ -41,7 +41,7 @@ class TwitterExtractor(Extractor):
|
||||
self.quoted = self.config("quoted", False)
|
||||
self.videos = self.config("videos", True)
|
||||
self.cards = self.config("cards", False)
|
||||
self.cards_blacklist = self.config("cards-blacklist") or ()
|
||||
self.cards_blacklist = self.config("cards-blacklist")
|
||||
self._user = self._user_obj = None
|
||||
self._user_cache = {}
|
||||
self._init_sizes()
|
||||
@@ -180,16 +180,21 @@ class TwitterExtractor(Extractor):
|
||||
card = card["legacy"]
|
||||
|
||||
name = card["name"].rpartition(":")[2]
|
||||
if name in self.cards_blacklist:
|
||||
bvals = card["binding_values"]
|
||||
if isinstance(bvals, list):
|
||||
bvals = {bval["key"]: bval["value"]
|
||||
for bval in card["binding_values"]}
|
||||
|
||||
cbl = self.cards_blacklist
|
||||
if cbl:
|
||||
if name in cbl:
|
||||
return
|
||||
if "vanity_url" in bvals:
|
||||
domain = bvals["vanity_url"]["string_value"]
|
||||
if domain in cbl or name + ":" + domain in cbl:
|
||||
return
|
||||
|
||||
if name in ("summary", "summary_large_image"):
|
||||
bvals = card["binding_values"]
|
||||
if isinstance(bvals, list):
|
||||
bvals = {
|
||||
bval["key"]: bval["value"]
|
||||
for bval in card["binding_values"]
|
||||
}
|
||||
for prefix in ("photo_image_full_size_",
|
||||
"summary_photo_image_",
|
||||
"thumbnail_image_"):
|
||||
@@ -206,15 +211,7 @@ class TwitterExtractor(Extractor):
|
||||
files.append(value)
|
||||
return
|
||||
elif name == "unified_card":
|
||||
bvals = card["binding_values"]
|
||||
if isinstance(bvals, list):
|
||||
for bval in card["binding_values"]:
|
||||
if bval["key"] == "unified_card":
|
||||
bval = bval["value"]["string_value"]
|
||||
break
|
||||
else:
|
||||
bval = bvals["unified_card"]["string_value"]
|
||||
data = json.loads(bval)
|
||||
data = json.loads(bvals["unified_card"]["string_value"])
|
||||
self._extract_media(tweet, data["media_entities"].values(), files)
|
||||
return
|
||||
|
||||
@@ -761,6 +758,12 @@ class TwitterTweetExtractor(TwitterExtractor):
|
||||
("https://twitter.com/i/web/status/1466183847628865544", {
|
||||
"count": 0,
|
||||
}),
|
||||
# 'cards-blacklist' option
|
||||
("https://twitter.com/i/web/status/1571141912295243776", {
|
||||
"options": (("cards", "ytdl"),
|
||||
("cards-blacklist", ("twitch.tv",))),
|
||||
"count": 0,
|
||||
}),
|
||||
# original retweets (#1026)
|
||||
("https://twitter.com/jessica_3978/status/1296304589591810048", {
|
||||
"options": (("retweets", "original"),),
|
||||
|
||||
Reference in New Issue
Block a user