From 8fbbaa54ff3c6c8be46387a31ade010fd64b0fa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 28 Feb 2020 22:57:10 +0100 Subject: [PATCH] [bcy] fix partial image URLs (#613) Images from new posts can have incomplete/partial URLs (1) without any filename extension when fetching their data from '/apiv3/user/selfPosts', so now all data gets taken from '/item/detail/ID' pages. It is currently unknown how to get the non-watermarked original version of these images, or if that is possible at all. (2) Images with a watermark will have their 'filter' metadata field set to "watermark". For original images this field is an empty string "". Enabling the 'noop' option will, in addition to the watermarked version, yield the the '~noop.image' filter version (3), where 'filter' is set to "noop". (1) "https://img-bcy-qn.pstatp.com/banciyuan/3ccdff22479c4060aadc86718209b281" (2) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~tplv-banciyuan-logo-v3:wqnpnLLlhZLlpKfprZTnjotfCuWNiuasoeWFgyAtIEFDR-eIseWlveiAheekvuWMug==.image" (3) "https://p1-bcy.byteimg.com/img/banciyuan/3ccdff22479c4060aadc86718209b281~noop.image" --- gallery_dl/extractor/bcy.py | 109 +++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 45 deletions(-) diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py index 634ff186..f1fcfc62 100644 --- a/gallery_dl/extractor/bcy.py +++ b/gallery_dl/extractor/bcy.py @@ -29,41 +29,72 @@ class BcyExtractor(Extractor): def items(self): sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub iroot = "https://img-bcy-qn.pstatp.com" + noop = self.config("noop") - for post in self.posts(): - if not post["image_list"]: + for post_id in self.posts(): + post = self._parse_post(post_id) + if not post: continue - data = { - "user": { - "id" : post["uid"], - "name" : post["uname"], - "avatar" : sub(iroot, post["avatar"].partition("~")[0]), - }, - "post": { - "id" : text.parse_int(post["item_id"]), - "tags" : [t["tag_name"] for t in post["post_tags"]], - "date" : text.parse_timestamp(post["ctime"]), - "parody" : post["work"], - "content": post["plain"], - "likes" : post["like_count"], - "shares" : post["share_count"], - "replies": post["reply_count"], - }, - } + yield Message.Directory, post + for post["num"], image in enumerate(post["_multi"], 1): + post["id"] = image["mid"] + post["width"] = image["w"] + post["height"] = image["h"] - yield Message.Directory, data - for data["num"], image in enumerate(post["image_list"], 1): - data["id"] = image["mid"] - data["width"] = image["w"] - data["height"] = image["h"] + url = image["path"].partition("~")[0] + text.nameext_from_url(url, post) - url = image["path"] - if not url.startswith(iroot): - url = sub(iroot, url.partition("~")[0]) - data["url"] = url + if post["extension"]: + if not url.startswith(iroot): + url = sub(iroot, url) + post["filter"] = "" + yield Message.Url, url, post - yield Message.Url, url, text.nameext_from_url(url, data) + else: + post["filter"] = "watermark" + yield Message.Url, image["origin"], post + + if noop: + post["extension"] = "" + post["filter"] = "noop" + yield Message.Url, image["original_path"], post + + def _parse_post(self, post_id): + url = "{}/item/detail/{}".format(self.root, post_id) + response = self.request(url) + if response.status_code >= 400: + return None + + data = json.loads( + text.extract(response.text, 'JSON.parse("', '");')[0] + .replace('\\\\u002F', '/') + .replace('\\"', '"') + )["detail"] + + post = data["post_data"] + if not post["multi"]: + return None + user = data["detail_user"] + + return { + "user": { + "id" : user["uid"], + "name" : user["uname"], + "avatar" : user["avatar"], + }, + "post": { + "id" : text.parse_int(post["item_id"]), + "tags" : [t["tag_name"] for t in post["post_tags"]], + "date" : text.parse_timestamp(post["ctime"]), + "parody" : text.parse_unicode_escapes(post["work"]), + "content": post["plain"], + "likes" : post["like_count"], + "shares" : post["share_count"], + "replies": post["reply_count"], + }, + "_multi": post["multi"], + } class BcyUserExtractor(BcyExtractor): @@ -88,7 +119,8 @@ class BcyUserExtractor(BcyExtractor): item = None for item in data["data"]["items"]: - yield item["item_detail"] + if item["item_detail"]["multi"]: + yield item["item_detail"]["item_id"] if not item: return @@ -106,7 +138,7 @@ class BcyPostExtractor(BcyExtractor): "user": { "id" : 1933712, "name" : "wukloo", - "avatar" : "re:https://img-bcy-qn.pstatp.com/Public/Upload/", + "avatar" : str, }, "post": { "id" : 6355835481002893070, @@ -128,17 +160,4 @@ class BcyPostExtractor(BcyExtractor): }) def posts(self): - url = self.root + "/item/detail/" + self.item_id - page = self.request(url).text - - data = json.loads( - text.extract(page, 'JSON.parse("', '");')[0] - .replace('\\\\u002F', '/') - .replace('\\"', '"') - )["detail"] - - post = data["post_data"] - post["image_list"] = post["multi"] - post["plain"] = text.parse_unicode_escapes(post["plain"]) - post.update(data["detail_user"]) - return (post,) + return (self.item_id,)