[twitter] improve 'cards-blacklist' (#2875)
Allow blacklisting cards by domain and by 'name:domain' pair, where 'domain' is taken from the card's 'vanity_url' value.
This commit is contained in:
@@ -2362,9 +2362,15 @@ extractor.twitter.cards-blacklist
|
|||||||
Type
|
Type
|
||||||
``list`` of ``strings``
|
``list`` of ``strings``
|
||||||
Example
|
Example
|
||||||
``["player", "summary"]``
|
``["summary", "youtube.com", "player:twitch.tv"]``
|
||||||
Description
|
Description
|
||||||
List of card types to ignore
|
List of cards to ignore, selected by card name and/or domain.
|
||||||
|
|
||||||
|
Possible values are:
|
||||||
|
|
||||||
|
* card names
|
||||||
|
* card domains
|
||||||
|
* ``<card name>:<card domain>``
|
||||||
|
|
||||||
|
|
||||||
extractor.twitter.conversations
|
extractor.twitter.conversations
|
||||||
|
|||||||
@@ -41,7 +41,7 @@ class TwitterExtractor(Extractor):
|
|||||||
self.quoted = self.config("quoted", False)
|
self.quoted = self.config("quoted", False)
|
||||||
self.videos = self.config("videos", True)
|
self.videos = self.config("videos", True)
|
||||||
self.cards = self.config("cards", False)
|
self.cards = self.config("cards", False)
|
||||||
self.cards_blacklist = self.config("cards-blacklist") or ()
|
self.cards_blacklist = self.config("cards-blacklist")
|
||||||
self._user = self._user_obj = None
|
self._user = self._user_obj = None
|
||||||
self._user_cache = {}
|
self._user_cache = {}
|
||||||
self._init_sizes()
|
self._init_sizes()
|
||||||
@@ -180,16 +180,21 @@ class TwitterExtractor(Extractor):
|
|||||||
card = card["legacy"]
|
card = card["legacy"]
|
||||||
|
|
||||||
name = card["name"].rpartition(":")[2]
|
name = card["name"].rpartition(":")[2]
|
||||||
if name in self.cards_blacklist:
|
bvals = card["binding_values"]
|
||||||
return
|
if isinstance(bvals, list):
|
||||||
|
bvals = {bval["key"]: bval["value"]
|
||||||
|
for bval in card["binding_values"]}
|
||||||
|
|
||||||
|
cbl = self.cards_blacklist
|
||||||
|
if cbl:
|
||||||
|
if name in cbl:
|
||||||
|
return
|
||||||
|
if "vanity_url" in bvals:
|
||||||
|
domain = bvals["vanity_url"]["string_value"]
|
||||||
|
if domain in cbl or name + ":" + domain in cbl:
|
||||||
|
return
|
||||||
|
|
||||||
if name in ("summary", "summary_large_image"):
|
if name in ("summary", "summary_large_image"):
|
||||||
bvals = card["binding_values"]
|
|
||||||
if isinstance(bvals, list):
|
|
||||||
bvals = {
|
|
||||||
bval["key"]: bval["value"]
|
|
||||||
for bval in card["binding_values"]
|
|
||||||
}
|
|
||||||
for prefix in ("photo_image_full_size_",
|
for prefix in ("photo_image_full_size_",
|
||||||
"summary_photo_image_",
|
"summary_photo_image_",
|
||||||
"thumbnail_image_"):
|
"thumbnail_image_"):
|
||||||
@@ -206,15 +211,7 @@ class TwitterExtractor(Extractor):
|
|||||||
files.append(value)
|
files.append(value)
|
||||||
return
|
return
|
||||||
elif name == "unified_card":
|
elif name == "unified_card":
|
||||||
bvals = card["binding_values"]
|
data = json.loads(bvals["unified_card"]["string_value"])
|
||||||
if isinstance(bvals, list):
|
|
||||||
for bval in card["binding_values"]:
|
|
||||||
if bval["key"] == "unified_card":
|
|
||||||
bval = bval["value"]["string_value"]
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
bval = bvals["unified_card"]["string_value"]
|
|
||||||
data = json.loads(bval)
|
|
||||||
self._extract_media(tweet, data["media_entities"].values(), files)
|
self._extract_media(tweet, data["media_entities"].values(), files)
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -761,6 +758,12 @@ class TwitterTweetExtractor(TwitterExtractor):
|
|||||||
("https://twitter.com/i/web/status/1466183847628865544", {
|
("https://twitter.com/i/web/status/1466183847628865544", {
|
||||||
"count": 0,
|
"count": 0,
|
||||||
}),
|
}),
|
||||||
|
# 'cards-blacklist' option
|
||||||
|
("https://twitter.com/i/web/status/1571141912295243776", {
|
||||||
|
"options": (("cards", "ytdl"),
|
||||||
|
("cards-blacklist", ("twitch.tv",))),
|
||||||
|
"count": 0,
|
||||||
|
}),
|
||||||
# original retweets (#1026)
|
# original retweets (#1026)
|
||||||
("https://twitter.com/jessica_3978/status/1296304589591810048", {
|
("https://twitter.com/jessica_3978/status/1296304589591810048", {
|
||||||
"options": (("retweets", "original"),),
|
"options": (("retweets", "original"),),
|
||||||
|
|||||||
Reference in New Issue
Block a user