[fantia] improve metadata extraction (#4126)

extract all metadata and file URLs of a post content before starting to download its files
This commit is contained in:
Mike Fährmann
2023-07-31 22:16:10 +02:00
parent d50c312ff0
commit c79359eb3a

View File

@@ -19,13 +19,13 @@ class FantiaExtractor(Extractor):
archive_fmt = "{post_id}_{file_id}" archive_fmt = "{post_id}_{file_id}"
_warning = True _warning = True
def items(self): def _init(self):
self.headers = { self.headers = {
"Accept" : "application/json, text/plain, */*", "Accept" : "application/json, text/plain, */*",
"Referer": self.root, "Referer": self.root,
"X-Requested-With": "XMLHttpRequest", "X-Requested-With": "XMLHttpRequest",
} }
_empty_plan = { self._empty_plan = {
"id" : 0, "id" : 0,
"price": 0, "price": 0,
"limit": 0, "limit": 0,
@@ -33,22 +33,18 @@ class FantiaExtractor(Extractor):
"description": "", "description": "",
"thumb": self.root + "/images/fallback/plan/thumb_default.png", "thumb": self.root + "/images/fallback/plan/thumb_default.png",
} }
if self._warning: if self._warning:
if not self.cookies_check(("_session_id",)): if not self.cookies_check(("_session_id",)):
self.log.warning("no '_session_id' cookie set") self.log.warning("no '_session_id' cookie set")
FantiaExtractor._warning = False FantiaExtractor._warning = False
def items(self):
for post_id in self.posts(): for post_id in self.posts():
post = self._get_post_data(post_id) post = self._get_post_data(post_id)
post["num"] = 0 post["num"] = 0
for content in self._get_post_contents(post): for content in self._get_post_contents(post):
post["content_category"] = content["category"] files = self._process_content(post, content)
post["content_title"] = content["title"]
post["content_filename"] = content.get("filename", "")
post["content_id"] = content["id"]
post["plan"] = content["plan"] or _empty_plan
yield Message.Directory, post yield Message.Directory, post
if content["visible_status"] != "visible": if content["visible_status"] != "visible":
@@ -57,12 +53,12 @@ class FantiaExtractor(Extractor):
"%s#post-content-id-%s", content["visible_status"], "%s#post-content-id-%s", content["visible_status"],
post["post_url"], content["id"]) post["post_url"], content["id"])
for url in self._get_content_urls(post, content): for file in files:
text.nameext_from_url( post.update(file)
post["content_filename"] or url, post)
post["file_url"] = url
post["num"] += 1 post["num"] += 1
yield Message.Url, url, post text.nameext_from_url(
post["content_filename"] or file["file_url"], post)
yield Message.Url, file["file_url"], post
def posts(self): def posts(self):
"""Return post IDs""" """Return post IDs"""
@@ -132,42 +128,45 @@ class FantiaExtractor(Extractor):
return contents return contents
def _get_content_urls(self, post, content): def _process_content(self, post, content):
"""Extract individual URL data from the response""" post["content_category"] = content["category"]
if "comment" in content: post["content_title"] = content["title"]
post["content_comment"] = content["comment"] post["content_filename"] = content.get("filename") or ""
post["content_id"] = content["id"]
post["content_comment"] = content.get("comment") or ""
post["plan"] = content["plan"] or self._empty_plan
files = []
if "post_content_photos" in content: if "post_content_photos" in content:
for photo in content["post_content_photos"]: for photo in content["post_content_photos"]:
post["file_id"] = photo["id"] files.append({"file_id" : photo["id"],
yield photo["url"]["original"] "file_url": photo["url"]["original"]})
if "download_uri" in content: if "download_uri" in content:
post["file_id"] = content["id"]
url = content["download_uri"] url = content["download_uri"]
if url[0] == "/": if url[0] == "/":
url = self.root + url url = self.root + url
yield url files.append({"file_id" : content["id"],
"file_url": url})
if content["category"] == "blog" and "comment" in content: if content["category"] == "blog" and "comment" in content:
comment_json = util.json_loads(content["comment"]) comment_json = util.json_loads(content["comment"])
ops = comment_json.get("ops") or ()
# collect blogpost text first
blog_text = "" blog_text = ""
for op in ops: for op in comment_json.get("ops") or ():
insert = op.get("insert") insert = op.get("insert")
if isinstance(insert, str): if isinstance(insert, str):
blog_text += insert blog_text += insert
post["blogpost_text"] = blog_text elif isinstance(insert, dict) and "fantiaImage" in insert:
# collect images
for op in ops:
insert = op.get("insert")
if isinstance(insert, dict) and "fantiaImage" in insert:
img = insert["fantiaImage"] img = insert["fantiaImage"]
post["file_id"] = img["id"] files.append({"file_id" : img["id"],
yield self.root + img["original_url"] "file_url": self.root + img["original_url"]})
post["blogpost_text"] = blog_text
else:
post["blogpost_text"] = ""
return files
class FantiaCreatorExtractor(FantiaExtractor): class FantiaCreatorExtractor(FantiaExtractor):
@@ -200,6 +199,42 @@ class FantiaPostExtractor(FantiaExtractor):
subcategory = "post" subcategory = "post"
pattern = r"(?:https?://)?(?:www\.)?fantia\.jp/posts/(\d+)" pattern = r"(?:https?://)?(?:www\.)?fantia\.jp/posts/(\d+)"
test = ( test = (
("https://fantia.jp/posts/1166373", {
"pattern": r"https://("
r"c\.fantia\.jp/uploads/post/file/1166373/|"
r"cc\.fantia\.jp/uploads/post_content_photo"
r"/file/732549[01]|"
r"fantia\.jp/posts/1166373/album_image\?)",
"keyword": {
"blogpost_text": r"re:^$|"
r"This is a test.\n\nThis is a test.\n\n|"
r"Link to video:\nhttps://www.youtube.com"
r"/watch\?v=5SSdvNcAagI\n\nhtml img from "
r"another site:\n\n\n\n\n\n",
"comment": "\n\n",
"content_category": "re:thumb|blog|photo_gallery",
"content_comment": str,
"content_filename": "re:|",
"content_title": r"re:Test (Blog Content \d+|Image Gallery)"
r"|thumb",
"date": "dt:2022-03-09 16:46:12",
"fanclub_id": 356320,
"fanclub_name": "Test Fantia",
"fanclub_url": "https://fantia.jp/fanclubs/356320",
"fanclub_user_id": 7487131,
"fanclub_user_name": "2022/03/08 15:13:52の名無し",
"file_url": str,
"filename": str,
"num": int,
"plan": dict,
"post_id": 1166373,
"post_title": "Test Fantia Post",
"post_url": "https://fantia.jp/posts/1166373",
"posted_at": "Thu, 10 Mar 2022 01:46:12 +0900",
"rating": "general",
"tags": [],
},
}),
("https://fantia.jp/posts/508363", { ("https://fantia.jp/posts/508363", {
"count": 6, "count": 6,
"keyword": { "keyword": {