From c79359eb3a5e04f2830cb9929e5d67119bf89482 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 31 Jul 2023 22:16:10 +0200 Subject: [PATCH] [fantia] improve metadata extraction (#4126) extract all metadata and URLs before starting to download --- gallery_dl/extractor/fantia.py | 99 +++++++++++++++++++++++----------- 1 file changed, 67 insertions(+), 32 deletions(-) diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index 3679e375..ca1eeef1 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -19,13 +19,13 @@ class FantiaExtractor(Extractor): archive_fmt = "{post_id}_{file_id}" _warning = True - def items(self): + def _init(self): self.headers = { "Accept" : "application/json, text/plain, */*", "Referer": self.root, "X-Requested-With": "XMLHttpRequest", } - _empty_plan = { + self._empty_plan = { "id" : 0, "price": 0, "limit": 0, @@ -33,22 +33,18 @@ class FantiaExtractor(Extractor): "description": "", "thumb": self.root + "/images/fallback/plan/thumb_default.png", } - if self._warning: if not self.cookies_check(("_session_id",)): self.log.warning("no '_session_id' cookie set") FantiaExtractor._warning = False + def items(self): for post_id in self.posts(): post = self._get_post_data(post_id) post["num"] = 0 for content in self._get_post_contents(post): - post["content_category"] = content["category"] - post["content_title"] = content["title"] - post["content_filename"] = content.get("filename", "") - post["content_id"] = content["id"] - post["plan"] = content["plan"] or _empty_plan + files = self._process_content(post, content) yield Message.Directory, post if content["visible_status"] != "visible": @@ -57,12 +53,12 @@ class FantiaExtractor(Extractor): "%s#post-content-id-%s", content["visible_status"], post["post_url"], content["id"]) - for url in self._get_content_urls(post, content): - text.nameext_from_url( - post["content_filename"] or url, post) - post["file_url"] = url + for file 
in files: + post.update(file) post["num"] += 1 - yield Message.Url, url, post + text.nameext_from_url( + post["content_filename"] or file["file_url"], post) + yield Message.Url, file["file_url"], post def posts(self): """Return post IDs""" @@ -132,42 +128,45 @@ class FantiaExtractor(Extractor): return contents - def _get_content_urls(self, post, content): - """Extract individual URL data from the response""" - if "comment" in content: - post["content_comment"] = content["comment"] + def _process_content(self, post, content): + post["content_category"] = content["category"] + post["content_title"] = content["title"] + post["content_filename"] = content.get("filename") or "" + post["content_id"] = content["id"] + post["content_comment"] = content.get("comment") or "" + post["plan"] = content["plan"] or self._empty_plan + + files = [] if "post_content_photos" in content: for photo in content["post_content_photos"]: - post["file_id"] = photo["id"] - yield photo["url"]["original"] + files.append({"file_id" : photo["id"], + "file_url": photo["url"]["original"]}) if "download_uri" in content: - post["file_id"] = content["id"] url = content["download_uri"] if url[0] == "/": url = self.root + url - yield url + files.append({"file_id" : content["id"], + "file_url": url}) if content["category"] == "blog" and "comment" in content: comment_json = util.json_loads(content["comment"]) - ops = comment_json.get("ops") or () - # collect blogpost text first blog_text = "" - for op in ops: + for op in comment_json.get("ops") or (): insert = op.get("insert") if isinstance(insert, str): blog_text += insert - post["blogpost_text"] = blog_text - - # collect images - for op in ops: - insert = op.get("insert") - if isinstance(insert, dict) and "fantiaImage" in insert: + elif isinstance(insert, dict) and "fantiaImage" in insert: img = insert["fantiaImage"] - post["file_id"] = img["id"] - yield self.root + img["original_url"] + files.append({"file_id" : img["id"], + "file_url": self.root + 
img["original_url"]}) + post["blogpost_text"] = blog_text + else: + post["blogpost_text"] = "" + + return files class FantiaCreatorExtractor(FantiaExtractor): @@ -200,6 +199,42 @@ class FantiaPostExtractor(FantiaExtractor): subcategory = "post" pattern = r"(?:https?://)?(?:www\.)?fantia\.jp/posts/(\d+)" test = ( + ("https://fantia.jp/posts/1166373", { + "pattern": r"https://(" + r"c\.fantia\.jp/uploads/post/file/1166373/|" + r"cc\.fantia\.jp/uploads/post_content_photo" + r"/file/732549[01]|" + r"fantia\.jp/posts/1166373/album_image\?)", + "keyword": { + "blogpost_text": r"re:^$|" + r"This is a test.\n\nThis is a test.\n\n|" + r"Link to video:\nhttps://www.youtube.com" + r"/watch\?v=5SSdvNcAagI\n\nhtml img from " + r"another site:\n\n\n\n\n\n", + "comment": "\n\n", + "content_category": "re:thumb|blog|photo_gallery", + "content_comment": str, + "content_filename": "re:|", + "content_title": r"re:Test (Blog Content \d+|Image Gallery)" + r"|thumb", + "date": "dt:2022-03-09 16:46:12", + "fanclub_id": 356320, + "fanclub_name": "Test Fantia", + "fanclub_url": "https://fantia.jp/fanclubs/356320", + "fanclub_user_id": 7487131, + "fanclub_user_name": "2022/03/08 15:13:52の名無し", + "file_url": str, + "filename": str, + "num": int, + "plan": dict, + "post_id": 1166373, + "post_title": "Test Fantia Post", + "post_url": "https://fantia.jp/posts/1166373", + "posted_at": "Thu, 10 Mar 2022 01:46:12 +0900", + "rating": "general", + "tags": [], + }, + }), ("https://fantia.jp/posts/508363", { "count": 6, "keyword": {