From f64fb8f239744c912ad4b15c3accf9e3bc6b7018 Mon Sep 17 00:00:00 2001 From: Johann Hong <57867081+986569200-johann-Hong@users.noreply.github.com> Date: Mon, 29 Jan 2024 00:23:09 +0900 Subject: [PATCH 1/2] [naver] EUC-KR encoding issue in old image URLs Fix Around October 2010, the image server URL format and file name encoding changed from EUC-KR to UTF-8. Modified to detect the old URL format and percent-decode such image URLs as EUC-KR - (lint with flake8) Customize conditions Wrap lines to fewer than 79 characters - (lint with flake8) Customize conditions (2nd try) - One import per line - Indent on consecutive lines - (lint with flake8) Customize conditions (3rd try) - E128 continuation line under-indented for visual indent - E123 closing bracket does not match indentation of opening bracket's line - Update naver.py Check encoding for all image URLs --- gallery_dl/extractor/naver.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/naver.py b/gallery_dl/extractor/naver.py index 55faf9e7..25801c7e 100644 --- a/gallery_dl/extractor/naver.py +++ b/gallery_dl/extractor/naver.py @@ -10,6 +10,7 @@ from .common import GalleryExtractor, Extractor, Message from .. 
import text +from urllib.parse import unquote class NaverBase(): @@ -63,7 +64,13 @@ class NaverPostExtractor(NaverBase, GalleryExtractor): def images(self, page): return [ - (url.replace("://post", "://blog", 1).partition("?")[0], None) + (unquote(url, encoding="EUC-KR") + .replace("://post", "://blog", 1) + .partition("?")[0], None) + if "\ufffd" in unquote(url) + else + (url.replace("://post", "://blog", 1) + .partition("?")[0], None) for url in text.extract_iter(page, 'data-lazy-src="', '"') ] From a8d3efbb99815d1b0a33d760c53fa46d8d7d4924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Tue, 5 Mar 2024 23:18:20 +0100 Subject: [PATCH 2/2] [naver] simplify code + add test --- gallery_dl/extractor/naver.py | 18 +++++++----------- test/results/naver.py | 27 +++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/gallery_dl/extractor/naver.py b/gallery_dl/extractor/naver.py index 25801c7e..5062cb26 100644 --- a/gallery_dl/extractor/naver.py +++ b/gallery_dl/extractor/naver.py @@ -10,7 +10,6 @@ from .common import GalleryExtractor, Extractor, Message from .. 
import text -from urllib.parse import unquote class NaverBase(): @@ -63,16 +62,13 @@ class NaverPostExtractor(NaverBase, GalleryExtractor): return data def images(self, page): - return [ - (unquote(url, encoding="EUC-KR") - .replace("://post", "://blog", 1) - .partition("?")[0], None) - if "\ufffd" in unquote(url) - else - (url.replace("://post", "://blog", 1) - .partition("?")[0], None) - for url in text.extract_iter(page, 'data-lazy-src="', '"') - ] + results = [] + for url in text.extract_iter(page, 'data-lazy-src="', '"'): + url = url.replace("://post", "://blog", 1).partition("?")[0] + if "\ufffd" in text.unquote(url): + url = text.unquote(url, encoding="EUC-KR") + results.append((url, None)) + return results class NaverBlogExtractor(NaverBase, Extractor): diff --git a/test/results/naver.py b/test/results/naver.py index 81d18efd..9a8f92ec 100644 --- a/test/results/naver.py +++ b/test/results/naver.py @@ -24,6 +24,33 @@ __tests__ = ( "#sha1_metadata": "a6e23d19afbee86b37d6e7ad934650c379d2cb1e", }, +{ + "#url" : "https://blog.naver.com/PostView.nhn?blogId=rlfqjxm0&logNo=70161391809", + "#comment" : "filenames in EUC-KR encoding (#5126)", + "#category": ("", "naver", "post"), + "#class" : naver.NaverPostExtractor, + "#urls": ( + "https://blogfiles.pstatic.net/20130305_23/ping9303_1362411028002Dpz9z_PNG/1_사본.png", + "https://blogfiles.pstatic.net/20130305_46/rlfqjxm0_1362473322580x33zi_PNG/오마갓합작.png", + ), + + "blog": { + "id" : "rlfqjxm0", + "num" : 43030507, + "user": "에나", + }, + "post": { + "date" : "dt:2013-03-05 17:48:00", + "description": "&nbsp;◈ &nbsp; &nbsp; PROMOTER&nbsp;:핑수 ˚ 아담 EDITOR:핑수 &nbsp; 넵:이크:핑수...", + "num" : 70161391809, + "title" : "[공유] { 합작}  OH, MY GOD! ~ 아 또 무슨 종말을 한다 그래~" + }, + "count" : 2, + "num" : range(1, 2), + "filename" : r"re:1_사본|오마갓합작", + "extension": "png", +}, + { "#url" : "https://blog.naver.com/gukjung", "#category": ("", "naver", "blog"),