From 85a37ca039e982f465d6ca77f176dcce479fafd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 12 Dec 2024 20:20:30 +0100 Subject: [PATCH] [facebook] decode surrogate pairs in metadata values (#6599) --- gallery_dl/extractor/facebook.py | 3 ++- test/results/facebook.py | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py index 04acfc52..2f3fdbf3 100644 --- a/gallery_dl/extractor/facebook.py +++ b/gallery_dl/extractor/facebook.py @@ -40,7 +40,8 @@ class FacebookExtractor(Extractor): @staticmethod def decode_all(txt): return text.unescape( - txt.encode("utf-8").decode("unicode_escape") + txt.encode().decode("unicode_escape") + .encode("utf_16", "surrogatepass").decode("utf_16") ).replace("\\/", "/") @staticmethod diff --git a/test/results/facebook.py b/test/results/facebook.py index 7b34a248..165a2239 100644 --- a/test/results/facebook.py +++ b/test/results/facebook.py @@ -113,6 +113,15 @@ __tests__ = ( "username" : "Facebook", }, +{ + "#url" : "https://www.facebook.com/photo.php?fbid=1156625586261770", + "#comment" : "surrogate pair in 'caption' data (#6599)", + "#category": ("", "facebook", "photo"), + "#class" : facebook.FacebookPhotoExtractor, + + "caption" : "A century of innovation parked side by side.\n\n📸: Vocabutesla via X", +}, + { "#url" : "https://www.facebook.com/watch/?v=1165557851291824", "#category": ("", "facebook", "video"),