[naver] Fix EUC-KR encoding issue in old image URLs
Around October 2010, the image server's URL format and file name encoding changed from EUC-KR to UTF-8. This change detects the old URL format and percent-decodes those image URLs using EUC-KR.
- (lint with flake8) Adjust code to meet lint conditions: wrap lines to at most 79 characters
- (lint with flake8) Adjust code to meet lint conditions (2nd try): one import per line; indent continuation lines
- (lint with flake8) Adjust code to meet lint conditions (3rd try): E128 continuation line under-indented for visual indent; E123 closing bracket does not match indentation of opening bracket's line
- Update naver.py: check the encoding for all image URLs
This commit is contained in:
committed by
Mike Fährmann
parent
36fc510d3a
commit
f64fb8f239
@@ -10,6 +10,7 @@
|
|||||||
|
|
||||||
from .common import GalleryExtractor, Extractor, Message
|
from .common import GalleryExtractor, Extractor, Message
|
||||||
from .. import text
|
from .. import text
|
||||||
|
from urllib.parse import unquote
|
||||||
|
|
||||||
|
|
||||||
class NaverBase():
|
class NaverBase():
|
||||||
@@ -63,7 +64,13 @@ class NaverPostExtractor(NaverBase, GalleryExtractor):
|
|||||||
|
|
||||||
def images(self, page):
|
def images(self, page):
|
||||||
return [
|
return [
|
||||||
(url.replace("://post", "://blog", 1).partition("?")[0], None)
|
(unquote(url, encoding="EUC-KR")
|
||||||
|
.replace("://post", "://blog", 1)
|
||||||
|
.partition("?")[0], None)
|
||||||
|
if "\ufffd" in unquote(url)
|
||||||
|
else
|
||||||
|
(url.replace("://post", "://blog", 1)
|
||||||
|
.partition("?")[0], None)
|
||||||
for url in text.extract_iter(page, 'data-lazy-src="', '"')
|
for url in text.extract_iter(page, 'data-lazy-src="', '"')
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user