[facebook] decode surrogate pairs in metadata values (#6599)

This commit is contained in:
Mike Fährmann
2024-12-12 20:20:30 +01:00
parent a33065be86
commit 85a37ca039
2 changed files with 11 additions and 1 deletion

View File

@@ -40,7 +40,8 @@ class FacebookExtractor(Extractor):
# Diff hunk for decode_all(): the original +/- markers and indentation were
# lost in this rendering. Per the hunk header (7 lines before, 8 after) one
# line was removed and two were added; they are labelled inline below.
#
# decode_all(txt) resolves backslash escapes in `txt`, passes the result to
# text.unescape() (helper not shown here; presumably HTML-entity unescaping,
# TODO confirm), and finally turns every escaped slash "\/" into "/".
@staticmethod
def decode_all(txt):
return text.unescape(
txt.encode("utf-8").decode("unicode_escape")  # removed by this commit (old version)
# Added by this commit: same escape decoding as the removed line
# (encode() defaults to UTF-8).
# NOTE(review): "unicode_escape" reads the bytes as Latin-1, so non-ASCII
# characters already present in txt would not survive -- confirm callers
# only pass ASCII with \uXXXX escapes.
txt.encode().decode("unicode_escape")
# Added by this commit: a pair like "\ud83d\udcf8" decodes to two lone
# surrogates; encoding with "surrogatepass" and decoding as UTF-16 joins
# each pair into the single character it represents.
.encode("utf_16", "surrogatepass").decode("utf_16")
).replace("\\/", "/")
@staticmethod

View File

@@ -113,6 +113,15 @@ __tests__ = (
"username" : "Facebook",
},
{
"#url" : "https://www.facebook.com/photo.php?fbid=1156625586261770",
"#comment" : "surrogate pair in 'caption' data (#6599)",
"#category": ("", "facebook", "photo"),
"#class" : facebook.FacebookPhotoExtractor,
"caption" : "A century of innovation parked side by side.\n\n📸: Vocabutesla via X",
},
{
"#url" : "https://www.facebook.com/watch/?v=1165557851291824",
"#category": ("", "facebook", "video"),