diff --git a/docs/configuration.rst b/docs/configuration.rst index 0dce61f2..63b72ea1 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -2600,6 +2600,27 @@ description Extract comments that include photo attachments made by the author of the post. +extractor.facebook.include +-------------------------- +Type + * ``string`` + * ``list`` of ``strings`` +Default + ``"photos"`` +Example + * ``"avatar,photos"`` + * ``["avatar", "photos"]`` +Description + A (comma-separated) list of subcategories to include + when processing a user profile. + + Supported values are + * ``"avatar"`` + * ``"photos"`` + + It is possible to use ``"all"`` instead of listing all values separately. + + extractor.facebook.videos ------------------------- Type diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index eb4a657c..45cade66 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -293,6 +293,14 @@ "limits-action" : "stop", "fallback-retries": 2 }, + "facebook": + { + "cookies": null, + + "author-followups": false, + "include": "photos", + "videos" : true + }, "fanbox": { "cookies" : null, diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 5a2f5425..90669ed9 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -274,7 +274,7 @@ Consider all listed sites to potentially be NSFW. Facebook https://www.facebook.com/ - Photos, Profiles, Sets, Videos + Avatars, Photos, Profile Photos, Sets, User Profiles, Videos Cookies diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py index d8bb2f00..069ed994 100644 --- a/gallery_dl/extractor/facebook.py +++ b/gallery_dl/extractor/facebook.py @@ -6,10 +6,14 @@ """Extractors for https://www.facebook.com/""" -from .common import Extractor, Message +from .common import Extractor, Message, Dispatch from .. 
@memcache(keyarg=1)
def _extract_profile_photos_page(self, profile):
    """Request a profile's 'photos_by' page and pick out its key values

    Returns a '(set_id, avatar_page_url)' tuple, where 'avatar_page_url'
    has its escaped slashes ('\\/') turned back into plain '/'.

    The page is requested up to 'fallback_retries + 1' times, stopping
    at the first response a set ID can be read from. If none of them
    yields one, 'exception.AbortExtraction' is raised.

    NOTE(review): 'keyarg=1' presumably makes 'memcache' key its
    entries on 'profile' -- confirm against the cache module.
    """
    url = f"{self.root}/{profile}/photos_by"

    for _ in range(self.fallback_retries + 1):
        page = self.request(url).text
        set_id = self._extract_profile_set_id(page)

        if not set_id:
            self.log.debug("Got empty profile photos page, retrying...")
            continue

        # only a page that produced a set ID is searched for the avatar
        avatar_url = text.extr(page, ',"profilePhoto":{"url":"', '"')
        return set_id, avatar_url.replace("\\/", "/")

    raise exception.AbortExtraction("Failed to extract profile data")


def _extract_profile_set_id(self, profile_photos_page):
    """Return the photo set ID found in a profile's photos page

    Only the part of the page between '"pageItems"' and '"page_info"'
    is searched. A falsy result means no ID was found.
    """
    items = text.extr(profile_photos_page, '"pageItems"', '"page_info"')

    # first choice: the value of a 'set=' parameter,
    # without its last '&'-separated component (if it has one)
    from_param = text.extr(items, 'set=', '"').rsplit("&", 1)[0]
    if from_param:
        return from_param

    # otherwise: the path segment following an escaped '/photos/'
    return text.extr(items, '\\/photos\\/', '\\/')
class FacebookPhotosExtractor(FacebookExtractor):
    """Extractor for Facebook Profile Photos"""
    subcategory = "photos"
    pattern = USER_PATTERN + r"/photos(?:_by)?"
    example = "https://www.facebook.com/USERNAME/photos"

    def items(self):
        """Return the results of the photo set linked from the profile

        The set ID comes from '_extract_profile_photos_page()'; the set
        page is then requested, parsed and handed to 'extract_set()'.
        """
        set_id = self._extract_profile_photos_page(self.groups[0])[0]
        set_url = f"{self.root}/media/set/?set={set_id}"
        set_page = self.request(set_url).text
        set_data = self.parse_set_page(set_page)
        return self.extract_set(set_data)


class FacebookAvatarExtractor(FacebookExtractor):
    """Extractor for Facebook Profile Avatars"""
    subcategory = "avatar"
    pattern = USER_PATTERN + r"/avatar"
    example = "https://www.facebook.com/USERNAME/avatar"

    def items(self):
        """Yield one Directory and one Url message for a profile's avatar

        Nothing is yielded (and a warning is logged) when the profile
        page does not contain an avatar page URL.
        """
        avatar_page_url = self._extract_profile_photos_page(self.groups[0])[1]
        if not avatar_page_url:
            # '_extract_profile_photos_page()' only verifies the set ID;
            # the avatar URL is empty if its marker was not in the page,
            # and requesting an empty URL cannot succeed
            self.log.warning(
                "Unable to find avatar of '%s'", self.groups[0])
            return

        avatar_page = self.photo_page_request_wrapper(avatar_page_url).text

        avatar = self.parse_photo_page(avatar_page)
        # an avatar is always a single file
        avatar["count"] = avatar["num"] = 1
        avatar["type"] = "avatar"

        # directory metadata is taken from the set the avatar belongs to
        set_url = f"{self.root}/media/set/?set={avatar['set_id']}"
        set_page = self.request(set_url).text
        directory = self.parse_set_page(set_page)

        yield Message.Directory, directory
        yield Message.Url, avatar["url"], avatar


class FacebookUserExtractor(Dispatch, FacebookExtractor):
    """Extractor for Facebook Profiles"""
    pattern = USER_PATTERN + r"/?(?:$|\?|#)"
    example = "https://www.facebook.com/USERNAME"

    def items(self):
        """Hand the profile over to the avatar and photos extractors

        Both get a URL built from the captured profile name/ID.
        NOTE(review): '("photos",)' is presumably the selection used
        when no 'include' option is set -- confirm in 'Dispatch'.
        """
        base = f"{self.root}/{self.groups[0]}/"
        return self._dispatch_extractors((
            (FacebookAvatarExtractor, base + "avatar"),
            (FacebookPhotosExtractor, base + "photos"),
        ), ("photos",))
"https://www.facebook.com/facebook/photos", - "#category": ("", "facebook", "profile"), - "#class" : facebook.FacebookProfileExtractor, + "#url" : "https://www.facebook.com/100064860875397/photos", + "#class" : facebook.FacebookPhotosExtractor, + + "#range" : "1-3", + "#count" : 3, }, { "#url" : "https://www.facebook.com/facebook/photos_by", - "#category": ("", "facebook", "profile"), - "#class" : facebook.FacebookProfileExtractor, + "#class" : facebook.FacebookPhotosExtractor, }, { - "#url" : "https://www.facebook.com/people/facebook/100064860875397/?sk=photos", - "#category": ("", "facebook", "profile"), - "#class" : facebook.FacebookProfileExtractor, -}, + "#url" : "https://www.facebook.com/facebook/avatar", + "#class" : facebook.FacebookAvatarExtractor, + "#pattern" : r"https://scontent-[^7?#]+\.fbcdn\.net/v/t39.30808-6/380700650_10162533193146729_2379134611963304810_n.jpg?.+", + "#count" : 1, -{ - "#url" : "https://www.facebook.com/profile.php?id=100064860875397", - "#category": ("", "facebook", "profile"), - "#class" : facebook.FacebookProfileExtractor, + "caption" : "", + "count" : 1, + "date" : "dt:2023-10-06 21:13:59", + "extension": "jpg", + "filename" : str, + "id" : "736550615183628", + "num" : 1, + "set_id" : "a.736550601850296", + "type" : "avatar", + "url" : str, + "user_id" : "100064860875397", + "username" : "Facebook", }, { "#url" : "https://www.facebook.com/media/set/?set=a.10152716010956729&type=3", - "#category": ("", "facebook", "set"), "#class" : facebook.FacebookSetExtractor, "#count" : 6, }, { "#url" : "https://www.facebook.com/joho.press.jp/posts/pfbid02mfFRpVkErLQxQ8cpD2f1hwXEVsFzK8kfNBKdK2Jndnx6AkmMQZuXhovwDgwvoDNil", - "#category": ("", "facebook", "set"), "#class" : facebook.FacebookSetExtractor, "#range" : "1-3", "#count" : 3, @@ -62,20 +87,17 @@ __tests__ = ( { "#url" : "https://www.facebook.com/photo/?fbid=10152716011076729&set=a.10152716010956729&setextract", - "#category": ("", "facebook", "set"), "#class" : 
facebook.FacebookSetExtractor, "#count" : 4, }, { "#url" : "https://www.facebook.com/photo.php?fbid=10165113568399554&set=t.100064860875397&type=3", - "#category": ("", "facebook", "photo"), "#class" : facebook.FacebookPhotoExtractor, }, { "#url" : "https://www.facebook.com/photo/?fbid=10160743390456729", - "#category": ("", "facebook", "photo"), "#class" : facebook.FacebookPhotoExtractor, "#count" : 1, @@ -92,13 +114,11 @@ __tests__ = ( { "#url" : "https://www.facebook.com/photo/?fbs=home&fbid=10160743390456729", - "#category": ("", "facebook", "photo"), "#class" : facebook.FacebookPhotoExtractor, }, { "#url" : "https://www.facebook.com/Facebook/photos/a.10152716010956729/10152716011076729", - "#category": ("", "facebook", "photo"), "#class" : facebook.FacebookPhotoExtractor, "#count" : 1, @@ -116,7 +136,6 @@ __tests__ = ( { "#url" : "https://www.facebook.com/photo.php?fbid=1156625586261770", "#comment" : "surrogate pair in 'caption' data (#6599)", - "#category": ("", "facebook", "photo"), "#class" : facebook.FacebookPhotoExtractor, "caption" : "A century of innovation parked side by side.\n\n📸: Vocabutesla via X", @@ -125,7 +144,6 @@ __tests__ = ( { "#url" : "https://www.facebook.com/photo.php?fbid=989340003138066&set=pb.100061862277212.-2207520000&type=3", "#comment" : "no 'publish_time' (#7151)", - "#category": ("", "facebook", "photo"), "#class" : facebook.FacebookPhotoExtractor, "date" : "dt:2025-02-25 15:00:09", @@ -133,7 +151,6 @@ __tests__ = ( { "#url" : "https://www.facebook.com/watch/?v=1165557851291824", - "#category": ("", "facebook", "video"), "#class" : facebook.FacebookVideoExtractor, "#count" : 1, @@ -147,7 +164,6 @@ __tests__ = ( { "#url" : "https://www.facebook.com/100064860875397/videos/644342003942740", - "#category": ("", "facebook", "video"), "#class" : facebook.FacebookVideoExtractor, "#count" : 2,