[facebook] implement 'include' option & add 'avatar' extractor (#7848)
rename 'profile' extractor to 'photos'
This commit is contained in:
@@ -2600,6 +2600,27 @@ description
|
|||||||
Extract comments that include photo attachments made by the author of the post.
|
Extract comments that include photo attachments made by the author of the post.
|
||||||
|
|
||||||
|
|
||||||
|
extractor.facebook.include
|
||||||
|
--------------------------
|
||||||
|
Type
|
||||||
|
* ``string``
|
||||||
|
* ``list`` of ``strings``
|
||||||
|
Default
|
||||||
|
``"photos"``
|
||||||
|
Example
|
||||||
|
* ``"avatar,photos"``
|
||||||
|
* ``["avatar", "photos"]``
|
||||||
|
Description
|
||||||
|
A (comma-separated) list of subcategories to include
|
||||||
|
when processing a user profile.
|
||||||
|
|
||||||
|
Supported values are
|
||||||
|
* ``"avatar"``
|
||||||
|
* ``"photos"``
|
||||||
|
|
||||||
|
It is possible to use ``"all"`` instead of listing all values separately.
|
||||||
|
|
||||||
|
|
||||||
extractor.facebook.videos
|
extractor.facebook.videos
|
||||||
-------------------------
|
-------------------------
|
||||||
Type
|
Type
|
||||||
|
|||||||
@@ -293,6 +293,14 @@
|
|||||||
"limits-action" : "stop",
|
"limits-action" : "stop",
|
||||||
"fallback-retries": 2
|
"fallback-retries": 2
|
||||||
},
|
},
|
||||||
|
"facebook":
|
||||||
|
{
|
||||||
|
"cookies": null,
|
||||||
|
|
||||||
|
"author-followups": false,
|
||||||
|
"include": "photos",
|
||||||
|
"videos" : true
|
||||||
|
},
|
||||||
"fanbox":
|
"fanbox":
|
||||||
{
|
{
|
||||||
"cookies" : null,
|
"cookies" : null,
|
||||||
|
|||||||
@@ -274,7 +274,7 @@ Consider all listed sites to potentially be NSFW.
|
|||||||
<tr>
|
<tr>
|
||||||
<td>Facebook</td>
|
<td>Facebook</td>
|
||||||
<td>https://www.facebook.com/</td>
|
<td>https://www.facebook.com/</td>
|
||||||
<td>Photos, Profiles, Sets, Videos</td>
|
<td>Avatars, Photos, Profile Photos, Sets, User Profiles, Videos</td>
|
||||||
<td><a href="https://github.com/mikf/gallery-dl#cookies">Cookies</a></td>
|
<td><a href="https://github.com/mikf/gallery-dl#cookies">Cookies</a></td>
|
||||||
</tr>
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
|
|||||||
@@ -6,10 +6,14 @@
|
|||||||
|
|
||||||
"""Extractors for https://www.facebook.com/"""
|
"""Extractors for https://www.facebook.com/"""
|
||||||
|
|
||||||
from .common import Extractor, Message
|
from .common import Extractor, Message, Dispatch
|
||||||
from .. import text, exception
|
from .. import text, exception
|
||||||
|
from ..cache import memcache
|
||||||
|
|
||||||
BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?facebook\.com"
|
BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?facebook\.com"
|
||||||
|
USER_PATTERN = (BASE_PATTERN +
|
||||||
|
r"/(?!media/|photo/|photo.php|watch/)"
|
||||||
|
r"(?:profile\.php\?id=|people/[^/?#]+/)?([^/?&#]+)")
|
||||||
|
|
||||||
|
|
||||||
class FacebookExtractor(Extractor):
|
class FacebookExtractor(Extractor):
|
||||||
@@ -291,6 +295,36 @@ class FacebookExtractor(Extractor):
|
|||||||
|
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
|
@memcache(keyarg=1)
|
||||||
|
def _extract_profile_photos_page(self, profile):
|
||||||
|
profile_photos_url = f"{self.root}/{profile}/photos_by"
|
||||||
|
|
||||||
|
for _ in range(self.fallback_retries + 1):
|
||||||
|
profile_photos_page = self.request(profile_photos_url).text
|
||||||
|
if set_id := self._extract_profile_set_id(profile_photos_page):
|
||||||
|
break
|
||||||
|
self.log.debug("Got empty profile photos page, retrying...")
|
||||||
|
else:
|
||||||
|
raise exception.AbortExtraction("Failed to extract profile data")
|
||||||
|
|
||||||
|
avatar_page_url = text.extr(
|
||||||
|
profile_photos_page, ',"profilePhoto":{"url":"', '"')
|
||||||
|
|
||||||
|
return set_id, avatar_page_url.replace("\\/", "/")
|
||||||
|
|
||||||
|
def _extract_profile_set_id(self, profile_photos_page):
|
||||||
|
set_ids_raw = text.extr(
|
||||||
|
profile_photos_page, '"pageItems"', '"page_info"'
|
||||||
|
)
|
||||||
|
|
||||||
|
set_id = text.extr(
|
||||||
|
set_ids_raw, 'set=', '"'
|
||||||
|
).rsplit("&", 1)[0] or text.extr(
|
||||||
|
set_ids_raw, '\\/photos\\/', '\\/'
|
||||||
|
)
|
||||||
|
|
||||||
|
return set_id
|
||||||
|
|
||||||
|
|
||||||
class FacebookSetExtractor(FacebookExtractor):
|
class FacebookSetExtractor(FacebookExtractor):
|
||||||
"""Base class for Facebook Set extractors"""
|
"""Base class for Facebook Set extractors"""
|
||||||
@@ -384,47 +418,50 @@ class FacebookVideoExtractor(FacebookExtractor):
|
|||||||
yield Message.Url, audio["url"], audio
|
yield Message.Url, audio["url"], audio
|
||||||
|
|
||||||
|
|
||||||
class FacebookProfileExtractor(FacebookExtractor):
|
class FacebookPhotosExtractor(FacebookExtractor):
|
||||||
"""Base class for Facebook Profile Photos Set extractors"""
|
"""Extractor for Facebook Profile Photos"""
|
||||||
subcategory = "profile"
|
subcategory = "photos"
|
||||||
pattern = (
|
pattern = USER_PATTERN + r"/photos(?:_by)?"
|
||||||
BASE_PATTERN +
|
example = "https://www.facebook.com/USERNAME/photos"
|
||||||
r"/(?!media/|photo/|photo.php|watch/)"
|
|
||||||
r"(?:profile\.php\?id=|people/[^/?#]+/)?"
|
|
||||||
r"([^/?&#]+)(?:/photos(?:_by)?|/videos|/posts)?/?(?:$|\?|#)"
|
|
||||||
)
|
|
||||||
example = "https://www.facebook.com/USERNAME"
|
|
||||||
|
|
||||||
def get_profile_photos_set_id(self, profile_photos_page):
|
|
||||||
set_ids_raw = text.extr(
|
|
||||||
profile_photos_page, '"pageItems"', '"page_info"'
|
|
||||||
)
|
|
||||||
|
|
||||||
set_id = text.extr(
|
|
||||||
set_ids_raw, 'set=', '"'
|
|
||||||
).rsplit("&", 1)[0] or text.extr(
|
|
||||||
set_ids_raw, '\\/photos\\/', '\\/'
|
|
||||||
)
|
|
||||||
|
|
||||||
return set_id
|
|
||||||
|
|
||||||
def items(self):
|
def items(self):
|
||||||
profile_photos_url = (
|
set_id = self._extract_profile_photos_page(self.groups[0])[0]
|
||||||
self.root + "/" + self.groups[0] + "/photos_by"
|
set_url = f"{self.root}/media/set/?set={set_id}"
|
||||||
)
|
set_page = self.request(set_url).text
|
||||||
|
set_data = self.parse_set_page(set_page)
|
||||||
|
return self.extract_set(set_data)
|
||||||
|
|
||||||
for _ in range(self.fallback_retries + 1):
|
|
||||||
profile_photos_page = self.request(profile_photos_url).text
|
|
||||||
set_id = self.get_profile_photos_set_id(profile_photos_page)
|
|
||||||
if set_id:
|
|
||||||
break
|
|
||||||
self.log.debug("Failed to find profile photos set ID, retrying...")
|
|
||||||
|
|
||||||
if set_id:
|
class FacebookAvatarExtractor(FacebookExtractor):
|
||||||
set_url = f"{self.root}/media/set/?set={set_id}"
|
"""Extractor for Facebook Profile Avatars"""
|
||||||
set_page = self.request(set_url).text
|
subcategory = "avatar"
|
||||||
set_data = self.parse_set_page(set_page)
|
pattern = USER_PATTERN + r"/avatar"
|
||||||
return self.extract_set(set_data)
|
example = "https://www.facebook.com/USERNAME/avatar"
|
||||||
|
|
||||||
self.log.debug("Profile photos set ID not found.")
|
def items(self):
|
||||||
return iter(())
|
avatar_page_url = self._extract_profile_photos_page(self.groups[0])[1]
|
||||||
|
avatar_page = self.photo_page_request_wrapper(avatar_page_url).text
|
||||||
|
|
||||||
|
avatar = self.parse_photo_page(avatar_page)
|
||||||
|
avatar["count"] = avatar["num"] = 1
|
||||||
|
avatar["type"] = "avatar"
|
||||||
|
|
||||||
|
set_url = f"{self.root}/media/set/?set={avatar['set_id']}"
|
||||||
|
set_page = self.request(set_url).text
|
||||||
|
directory = self.parse_set_page(set_page)
|
||||||
|
|
||||||
|
yield Message.Directory, directory
|
||||||
|
yield Message.Url, avatar["url"], avatar
|
||||||
|
|
||||||
|
|
||||||
|
class FacebookUserExtractor(Dispatch, FacebookExtractor):
|
||||||
|
"""Extractor for Facebook Profiles"""
|
||||||
|
pattern = USER_PATTERN + r"/?(?:$|\?|#)"
|
||||||
|
example = "https://www.facebook.com/USERNAME"
|
||||||
|
|
||||||
|
def items(self):
|
||||||
|
base = f"{self.root}/{self.groups[0]}/"
|
||||||
|
return self._dispatch_extractors((
|
||||||
|
(FacebookAvatarExtractor, base + "avatar"),
|
||||||
|
(FacebookPhotosExtractor, base + "photos"),
|
||||||
|
), ("photos",))
|
||||||
|
|||||||
@@ -260,6 +260,9 @@ SUBCATEGORY_MAP = {
|
|||||||
"discord": {
|
"discord": {
|
||||||
"direct-message" : "",
|
"direct-message" : "",
|
||||||
},
|
},
|
||||||
|
"facebook": {
|
||||||
|
"photos" : "Profile Photos",
|
||||||
|
},
|
||||||
"fanbox": {
|
"fanbox": {
|
||||||
"supporting": "Supported User Feed",
|
"supporting": "Supported User Feed",
|
||||||
"redirect" : "Pixiv Redirects",
|
"redirect" : "Pixiv Redirects",
|
||||||
|
|||||||
@@ -11,46 +11,71 @@ import datetime
|
|||||||
__tests__ = (
|
__tests__ = (
|
||||||
{
|
{
|
||||||
"#url" : "https://www.facebook.com/facebook",
|
"#url" : "https://www.facebook.com/facebook",
|
||||||
"#category": ("", "facebook", "profile"),
|
"#class" : facebook.FacebookUserExtractor,
|
||||||
"#class" : facebook.FacebookProfileExtractor,
|
"#results" : "https://www.facebook.com/facebook/photos"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://www.facebook.com/people/facebook/100064860875397/?sk=photos",
|
||||||
|
"#class" : facebook.FacebookUserExtractor,
|
||||||
|
"#results" : "https://www.facebook.com/100064860875397/photos"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://www.facebook.com/profile.php?id=100064860875397",
|
||||||
|
"#class" : facebook.FacebookUserExtractor,
|
||||||
|
"#results" : "https://www.facebook.com/100064860875397/photos"
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"#url" : "https://www.facebook.com/facebook/photos",
|
||||||
|
"#class" : facebook.FacebookPhotosExtractor,
|
||||||
|
|
||||||
"#range" : "1-3",
|
"#range" : "1-3",
|
||||||
"#count" : 3,
|
"#count" : 3,
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "https://www.facebook.com/facebook/photos",
|
"#url" : "https://www.facebook.com/100064860875397/photos",
|
||||||
"#category": ("", "facebook", "profile"),
|
"#class" : facebook.FacebookPhotosExtractor,
|
||||||
"#class" : facebook.FacebookProfileExtractor,
|
|
||||||
|
"#range" : "1-3",
|
||||||
|
"#count" : 3,
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "https://www.facebook.com/facebook/photos_by",
|
"#url" : "https://www.facebook.com/facebook/photos_by",
|
||||||
"#category": ("", "facebook", "profile"),
|
"#class" : facebook.FacebookPhotosExtractor,
|
||||||
"#class" : facebook.FacebookProfileExtractor,
|
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "https://www.facebook.com/people/facebook/100064860875397/?sk=photos",
|
"#url" : "https://www.facebook.com/facebook/avatar",
|
||||||
"#category": ("", "facebook", "profile"),
|
"#class" : facebook.FacebookAvatarExtractor,
|
||||||
"#class" : facebook.FacebookProfileExtractor,
|
"#pattern" : r"https://scontent-[^7?#]+\.fbcdn\.net/v/t39.30808-6/380700650_10162533193146729_2379134611963304810_n.jpg?.+",
|
||||||
},
|
"#count" : 1,
|
||||||
|
|
||||||
{
|
"caption" : "",
|
||||||
"#url" : "https://www.facebook.com/profile.php?id=100064860875397",
|
"count" : 1,
|
||||||
"#category": ("", "facebook", "profile"),
|
"date" : "dt:2023-10-06 21:13:59",
|
||||||
"#class" : facebook.FacebookProfileExtractor,
|
"extension": "jpg",
|
||||||
|
"filename" : str,
|
||||||
|
"id" : "736550615183628",
|
||||||
|
"num" : 1,
|
||||||
|
"set_id" : "a.736550601850296",
|
||||||
|
"type" : "avatar",
|
||||||
|
"url" : str,
|
||||||
|
"user_id" : "100064860875397",
|
||||||
|
"username" : "Facebook",
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "https://www.facebook.com/media/set/?set=a.10152716010956729&type=3",
|
"#url" : "https://www.facebook.com/media/set/?set=a.10152716010956729&type=3",
|
||||||
"#category": ("", "facebook", "set"),
|
|
||||||
"#class" : facebook.FacebookSetExtractor,
|
"#class" : facebook.FacebookSetExtractor,
|
||||||
"#count" : 6,
|
"#count" : 6,
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "https://www.facebook.com/joho.press.jp/posts/pfbid02mfFRpVkErLQxQ8cpD2f1hwXEVsFzK8kfNBKdK2Jndnx6AkmMQZuXhovwDgwvoDNil",
|
"#url" : "https://www.facebook.com/joho.press.jp/posts/pfbid02mfFRpVkErLQxQ8cpD2f1hwXEVsFzK8kfNBKdK2Jndnx6AkmMQZuXhovwDgwvoDNil",
|
||||||
"#category": ("", "facebook", "set"),
|
|
||||||
"#class" : facebook.FacebookSetExtractor,
|
"#class" : facebook.FacebookSetExtractor,
|
||||||
"#range" : "1-3",
|
"#range" : "1-3",
|
||||||
"#count" : 3,
|
"#count" : 3,
|
||||||
@@ -62,20 +87,17 @@ __tests__ = (
|
|||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "https://www.facebook.com/photo/?fbid=10152716011076729&set=a.10152716010956729&setextract",
|
"#url" : "https://www.facebook.com/photo/?fbid=10152716011076729&set=a.10152716010956729&setextract",
|
||||||
"#category": ("", "facebook", "set"),
|
|
||||||
"#class" : facebook.FacebookSetExtractor,
|
"#class" : facebook.FacebookSetExtractor,
|
||||||
"#count" : 4,
|
"#count" : 4,
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "https://www.facebook.com/photo.php?fbid=10165113568399554&set=t.100064860875397&type=3",
|
"#url" : "https://www.facebook.com/photo.php?fbid=10165113568399554&set=t.100064860875397&type=3",
|
||||||
"#category": ("", "facebook", "photo"),
|
|
||||||
"#class" : facebook.FacebookPhotoExtractor,
|
"#class" : facebook.FacebookPhotoExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "https://www.facebook.com/photo/?fbid=10160743390456729",
|
"#url" : "https://www.facebook.com/photo/?fbid=10160743390456729",
|
||||||
"#category": ("", "facebook", "photo"),
|
|
||||||
"#class" : facebook.FacebookPhotoExtractor,
|
"#class" : facebook.FacebookPhotoExtractor,
|
||||||
"#count" : 1,
|
"#count" : 1,
|
||||||
|
|
||||||
@@ -92,13 +114,11 @@ __tests__ = (
|
|||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "https://www.facebook.com/photo/?fbs=home&fbid=10160743390456729",
|
"#url" : "https://www.facebook.com/photo/?fbs=home&fbid=10160743390456729",
|
||||||
"#category": ("", "facebook", "photo"),
|
|
||||||
"#class" : facebook.FacebookPhotoExtractor,
|
"#class" : facebook.FacebookPhotoExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "https://www.facebook.com/Facebook/photos/a.10152716010956729/10152716011076729",
|
"#url" : "https://www.facebook.com/Facebook/photos/a.10152716010956729/10152716011076729",
|
||||||
"#category": ("", "facebook", "photo"),
|
|
||||||
"#class" : facebook.FacebookPhotoExtractor,
|
"#class" : facebook.FacebookPhotoExtractor,
|
||||||
"#count" : 1,
|
"#count" : 1,
|
||||||
|
|
||||||
@@ -116,7 +136,6 @@ __tests__ = (
|
|||||||
{
|
{
|
||||||
"#url" : "https://www.facebook.com/photo.php?fbid=1156625586261770",
|
"#url" : "https://www.facebook.com/photo.php?fbid=1156625586261770",
|
||||||
"#comment" : "surrogate pair in 'caption' data (#6599)",
|
"#comment" : "surrogate pair in 'caption' data (#6599)",
|
||||||
"#category": ("", "facebook", "photo"),
|
|
||||||
"#class" : facebook.FacebookPhotoExtractor,
|
"#class" : facebook.FacebookPhotoExtractor,
|
||||||
|
|
||||||
"caption" : "A century of innovation parked side by side.\n\n📸: Vocabutesla via X",
|
"caption" : "A century of innovation parked side by side.\n\n📸: Vocabutesla via X",
|
||||||
@@ -125,7 +144,6 @@ __tests__ = (
|
|||||||
{
|
{
|
||||||
"#url" : "https://www.facebook.com/photo.php?fbid=989340003138066&set=pb.100061862277212.-2207520000&type=3",
|
"#url" : "https://www.facebook.com/photo.php?fbid=989340003138066&set=pb.100061862277212.-2207520000&type=3",
|
||||||
"#comment" : "no 'publish_time' (#7151)",
|
"#comment" : "no 'publish_time' (#7151)",
|
||||||
"#category": ("", "facebook", "photo"),
|
|
||||||
"#class" : facebook.FacebookPhotoExtractor,
|
"#class" : facebook.FacebookPhotoExtractor,
|
||||||
|
|
||||||
"date" : "dt:2025-02-25 15:00:09",
|
"date" : "dt:2025-02-25 15:00:09",
|
||||||
@@ -133,7 +151,6 @@ __tests__ = (
|
|||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "https://www.facebook.com/watch/?v=1165557851291824",
|
"#url" : "https://www.facebook.com/watch/?v=1165557851291824",
|
||||||
"#category": ("", "facebook", "video"),
|
|
||||||
"#class" : facebook.FacebookVideoExtractor,
|
"#class" : facebook.FacebookVideoExtractor,
|
||||||
"#count" : 1,
|
"#count" : 1,
|
||||||
|
|
||||||
@@ -147,7 +164,6 @@ __tests__ = (
|
|||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "https://www.facebook.com/100064860875397/videos/644342003942740",
|
"#url" : "https://www.facebook.com/100064860875397/videos/644342003942740",
|
||||||
"#category": ("", "facebook", "video"),
|
|
||||||
"#class" : facebook.FacebookVideoExtractor,
|
"#class" : facebook.FacebookVideoExtractor,
|
||||||
"#count" : 2,
|
"#count" : 2,
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user