[bluesky] add 'quoted' option (#6323)

This commit is contained in:
Mike Fährmann
2024-10-25 17:18:30 +02:00
parent d34e2d56aa
commit 8f396cfc57
3 changed files with 106 additions and 39 deletions

View File

@@ -42,62 +42,76 @@ class BlueskyExtractor(Extractor):
self._user = self._user_did = None
self.instance = self.root.partition("://")[2]
self.videos = self.config("videos", True)
self.quoted = self.config("quoted", False)
def items(self):
    """Yield a directory message and URL messages for each post.

    Iterates over self.posts(), skips posts whose author differs from
    self._user_did (when that is set), merges each post's "record" into
    the post dict, and emits one Message.Directory followed by one
    Message.Url per extracted file.

    When the 'quoted' option is enabled and a post embeds another
    record, the embedded record is processed the same way, tagged with
    "quote_id" and "quote_by" referring to the quoting post, and the
    loop continues with that record's own embed.
    """
    for post in self.posts():
        # Unwrap entries that nest the actual post under a "post" key.
        if "post" in post:
            post = post["post"]
        if self._user_did and post["author"]["did"] != self._user_did:
            self.log.debug("Skipping %s (repost)", self._pid(post))
            continue

        # Remember the embed before the record's fields are merged in,
        # since the merge may overwrite the "embed" key.
        embed = post.get("embed")
        post.update(post.pop("record"))

        while True:
            self._prepare(post)
            files = self._extract_files(post)

            yield Message.Directory, post
            if files:
                base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob"
                        "?did={}&cid=".format(post["author"]["did"]))
                # 'post["num"]' is the loop target on purpose: it stores
                # the 1-based file index directly in the metadata dict.
                for post["num"], file in enumerate(files, 1):
                    post.update(file)
                    yield Message.Url, base + file["filename"], post

            if not self.quoted or not embed or "record" not in embed:
                break

            quote = embed["record"]
            # Some embeds wrap the record one level deeper.
            if "record" in quote:
                quote = quote["record"]

            # Embedded records without a "value" carry no post data to
            # merge (presumably not-found / blocked / detached quotes
            # -- TODO confirm against the API), so stop here instead of
            # raising KeyError.
            value = quote.pop("value", None)
            if value is None:
                break

            quote["quote_id"] = self._pid(post)
            quote["quote_by"] = post["author"]
            embed = quote.get("embed")
            quote.update(value)
            post = quote
def posts(self):
    """Return the posts to process.

    This implementation returns an empty tuple; presumably subclasses
    override it to supply actual posts -- verify against the callers.
    """
    return ()
def _pid(self, post):
return post["uri"].rpartition("/")[2]
def _prepare(self, post):
    """Add derived metadata fields to 'post' in place.

    Depending on the extractor's settings this sets:
      - "hashtags", "mentions", "uris": collected from the post's
        facets when self._metadata_facets is enabled (empty tuples if
        the post has no "facets" key)
      - "user": self._user, or the post's author as fallback, when
        self._metadata_user is enabled
    and always sets "instance", "post_id" and "date".
    """
    if self._metadata_facets:
        if "facets" in post:
            post["hashtags"] = tags = []
            post["mentions"] = dids = []
            post["uris"] = uris = []
            for facet in post["facets"]:
                # Only the first feature of each facet is inspected.
                features = facet["features"][0]
                if "tag" in features:
                    tags.append(features["tag"])
                elif "did" in features:
                    dids.append(features["did"])
                elif "uri" in features:
                    uris.append(features["uri"])
        else:
            post["hashtags"] = post["mentions"] = post["uris"] = ()

    if self._metadata_user:
        post["user"] = self._user or post["author"]

    post["instance"] = self.instance
    post["post_id"] = self._pid(post)
    # Keep only the first 19 characters ("YYYY-MM-DDTHH:MM:SS") so any
    # trailing fraction / timezone suffix does not break the format.
    post["date"] = text.parse_datetime(
        post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")
def _extract_files(self, post):
if "embed" not in post:
post["count"] = 0
return ()
files = []
@@ -111,6 +125,7 @@ class BlueskyExtractor(Extractor):
if "video" in media and self.videos:
files.append(self._extract_media(media, "video"))
post["count"] = len(files)
return files
def _extract_media(self, media, key):