From 34a7fab0e2be42d10489b858be6f8218d1f0745a Mon Sep 17 00:00:00 2001 From: ClosedPort22 <44864697+ClosedPort22@users.noreply.github.com> Date: Mon, 6 Mar 2023 19:51:25 +0800 Subject: [PATCH] [generic] add support for IDNs (internationalized domain name) --- gallery_dl/extractor/directlink.py | 4 ++++ gallery_dl/extractor/generic.py | 26 ++++++++++++++++++++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py index 8b90250f..4827be52 100644 --- a/gallery_dl/extractor/directlink.py +++ b/gallery_dl/extractor/directlink.py @@ -44,6 +44,10 @@ class DirectlinkExtractor(Extractor): ("https://post-phinf.pstatic.net/MjAxOTA1MjlfMTQ4/MDAxNTU5MTI2NjcyNTkw" ".JUzkGb4V6dj9DXjLclrOoqR64uDxHFUO5KDriRdKpGwg.88mCtd4iT1NHlpVKSCaUpP" "mZPiDgT8hmQdQ5K_gYyu0g.JPEG/2.JPG"), + # internationalized domain name + ("https://räksmörgås.josefsson.org/raksmorgas.jpg", { + "content": "f7e00768ab009c969e70d775047cdd302ca51762", + }), ) def __init__(self, match): diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py index 9292da3d..d4276e62 100644 --- a/gallery_dl/extractor/generic.py +++ b/gallery_dl/extractor/generic.py @@ -26,12 +26,34 @@ class GenericExtractor(Extractor): # Based on: https://tools.ietf.org/html/rfc3986#appendix-B pattern += r""" (?P<scheme>https?://)? # optional http(s) scheme - (?P<domain>[-\w\.]+) # required domain + (?P<domain>[^/?#]+) # required domain (?P<path>/[^?#]*)? # optional path (?:\?(?P<query>[^#]*))? # optional query (?:\#(?P<fragment>.*))? 
# optional fragment """ + test = ( + ("generic:https://www.nongnu.org/lzip/", { + "count": 1, + "content": "40be5c77773d3e91db6e1c5df720ee30afb62368", + "keyword": { + "description": "Lossless data compressor", + "imageurl": "https://www.nongnu.org/lzip/lzip.png", + "keywords": "lzip, clzip, plzip, lzlib, LZMA, bzip2, " + "gzip, data compression, GNU, free software", + "pageurl": "https://www.nongnu.org/lzip/", + }, + }), + # internationalized domain name + ("generic:https://räksmörgås.josefsson.org/", { + "count": 2, + "pattern": "^https://räksmörgås.josefsson.org/", + }), + ("generic:https://en.wikipedia.org/Main_Page"), + ("generic:https://example.org/path/to/file?que=1?&ry=2/#fragment"), + ("generic:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E"), + ) + def __init__(self, match): """Init.""" Extractor.__init__(self, match) @@ -56,7 +78,7 @@ class GenericExtractor(Extractor): self.root = self.scheme + match.group('domain') def items(self): - """Get page, extract metadata & images, yield them in suitable messages. + """Get page, extract metadata & images, yield them in suitable messages Adapted from common.GalleryExtractor.items()