[facebook] add 'info' extractor (#6582)

https://github.com/mikf/gallery-dl/issues/6582#issuecomment-3151899420

Currently relies on the profile having a /photos_by page.
This commit is contained in:
Mike Fährmann
2025-08-09 17:41:49 +02:00
parent 5bc198a7e6
commit 0d2b8f53cc
4 changed files with 64 additions and 11 deletions

View File

@@ -2678,6 +2678,7 @@ Description
Supported values are
* ``info``
* ``avatar``
* ``photos``
* ``albums``

View File

@@ -280,7 +280,7 @@ Consider all listed sites to potentially be NSFW.
<tr>
<td>Facebook</td>
<td>https://www.facebook.com/</td>
<td>Albums, Avatars, Photos, Profile Photos, Sets, User Profiles, Videos</td>
<td>Albums, Avatars, User Profile Information, Photos, Profile Photos, Sets, User Profiles, Videos</td>
<td><a href="https://github.com/mikf/gallery-dl#cookies">Cookies</a></td>
</tr>
<tr>

View File

@@ -322,17 +322,39 @@ class FacebookExtractor(Extractor):
"authenticated cookies", "profile",
"This content isn't available right now")
set_id = self._extract_profile_set_id(profile_photos_page)
avatar_page_url = text.extr(
profile_photos_page, ',"profilePhoto":{"url":"', '"')
set_id = self._extract_profile_set_id(
profile_photos_page)
user_data = text.extr(
profile_photos_page, '","user":{"', '},"viewer":{')
if set_id or avatar_page_url:
if set_id or user_data:
break
self.log.debug("Got empty profile photos page, retrying...")
else:
raise exception.AbortExtraction("Failed to extract profile data")
return set_id, avatar_page_url.replace("\\/", "/")
try:
data = util.json_loads(f'{{"{user_data}}}')
except Exception:
data = {}
self.log.debug(user_data)
try:
data["profile_tabs"] = [
edge["node"]
for edge in (data["profile_tabs"]["profile_user"]
["timeline_nav_app_sections"]["edges"])
]
except Exception:
pass
data["set_id"] = set_id
data["vanity"] = (
text.extr(profile_photos_page, '"userVanity":"', '"') or
text.extr(profile_photos_page, '"vanity":"', '"')
)
return data
def _extract_profile_set_id(self, profile_photos_page):
set_ids_raw = text.extr(
@@ -440,6 +462,17 @@ class FacebookVideoExtractor(FacebookExtractor):
yield Message.Url, audio["url"], audio
class FacebookInfoExtractor(FacebookExtractor):
    """Extractor that reports a Facebook profile's data without files.

    Matches ``<user URL>/info`` and emits exactly one directory message
    carrying whatever ``_extract_profile_photos_page()`` returns for the
    matched profile.
    """
    subcategory = "info"
    pattern = USER_PATTERN + r"/info"
    example = "https://www.facebook.com/USERNAME/info"

    def items(self):
        """Return an iterator over a single ``Message.Directory`` entry."""
        # First pattern group identifies the profile to look up.
        profile = self._extract_profile_photos_page(self.groups[0])
        # No URL messages are produced; only the profile data is reported.
        directory_message = (Message.Directory, profile)
        return iter((directory_message,))
class FacebookAlbumsExtractor(FacebookExtractor):
"""Extractor for Facebook Profile albums"""
subcategory = "albums"
@@ -480,7 +513,7 @@ class FacebookPhotosExtractor(FacebookExtractor):
example = "https://www.facebook.com/USERNAME/photos"
def items(self):
set_id = self._extract_profile_photos_page(self.groups[0])[0]
set_id = self._extract_profile_photos_page(self.groups[0])["set_id"]
if not set_id:
return iter(())
@@ -497,7 +530,8 @@ class FacebookAvatarExtractor(FacebookExtractor):
example = "https://www.facebook.com/USERNAME/avatar"
def items(self):
avatar_page_url = self._extract_profile_photos_page(self.groups[0])[1]
user = self._extract_profile_photos_page(self.groups[0])
avatar_page_url = user["profilePhoto"]["url"]
avatar_page = self.photo_page_request_wrapper(avatar_page_url).text
avatar = self.parse_photo_page(avatar_page)
@@ -520,6 +554,7 @@ class FacebookUserExtractor(Dispatch, FacebookExtractor):
def items(self):
base = f"{self.root}/{self.groups[0]}/"
return self._dispatch_extractors((
(FacebookInfoExtractor , base + "info"),
(FacebookAvatarExtractor, base + "avatar"),
(FacebookPhotosExtractor, base + "photos"),
(FacebookAlbumsExtractor, base + "photos_albums"),

View File

@@ -28,6 +28,18 @@ __tests__ = (
"#results" : "https://www.facebook.com/100064860875397/photos"
},
{
"#url" : "https://www.facebook.com/facebook",
"#class" : facebook.FacebookUserExtractor,
"#options" : {"include": "all"},
"#results" : [
"https://www.facebook.com/facebook/info",
"https://www.facebook.com/facebook/avatar",
"https://www.facebook.com/facebook/photos",
"https://www.facebook.com/facebook/photos_albums",
],
},
{
"#url" : "https://www.facebook.com/facebook/photos",
"#class" : facebook.FacebookPhotosExtractor,
@@ -51,7 +63,7 @@ __tests__ = (
"#range" : "1",
"user_id" : "100074229772340",
"user_pfbid": r"re:pfbid0x\w{64}",
"user_pfbid": r"re:pfbid\w{66}",
},
{
@@ -104,7 +116,7 @@ __tests__ = (
"set_id" : "a.104622317759666",
"type" : "avatar",
"user_id" : "100046356937542",
"user_pfbid": r"re:pfbid0x\w{64}",
"user_pfbid": r"re:pfbid\w{66}",
"username" : "Throwaway Idk",
},
@@ -198,7 +210,7 @@ __tests__ = (
"id" : "221820450302279",
"set_id" : "a.109762038174788",
"user_id" : "100074229772340",
"user_pfbid": r"re:pfbid0x\w{64}",
"user_pfbid": r"re:pfbid\w{66}",
"username": "Throwaway Kwon",
},
@@ -278,4 +290,9 @@ __tests__ = (
"url" : "https://www.facebook.com/media/set/?set=a.736550611850295&type=3",
},
{
"#url" : "https://www.facebook.com/brando.cha.3/info",
"#class" : facebook.FacebookInfoExtractor,
},
)