From a552fb5bcd6c6a1ff24b153072b6605356a5cd54 Mon Sep 17 00:00:00 2001 From: Allen <64094914+allendema@users.noreply.github.com> Date: Thu, 29 Aug 2024 21:31:15 +0200 Subject: [PATCH] [generic] better directory formatting/md --- gallery_dl/extractor/generic.py | 8 ++++++-- test/results/generic.py | 16 ++++++++-------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py index 16d4340c..a6c1d5ae 100644 --- a/gallery_dl/extractor/generic.py +++ b/gallery_dl/extractor/generic.py @@ -15,7 +15,7 @@ import re class GenericExtractor(Extractor): """Extractor for images in a generic web page.""" category = "generic" - directory_fmt = ("{category}", "{pageurl}") + directory_fmt = ("{category}", "{subcategory}", "{path}") archive_fmt = "{imageurl}" # By default, the generic extractor is disabled @@ -52,7 +52,10 @@ class GenericExtractor(Extractor): self.scheme = match.group('scheme') else: self.scheme = 'https://' - self.url = self.scheme + self.url + self.url = text.ensure_http_scheme(self.url, self.scheme) + + self.subcategory = match.group('domain') + self.path = match.group('path') # Used to resolve relative image urls self.root = self.scheme + match.group('domain') @@ -87,6 +90,7 @@ class GenericExtractor(Extractor): def metadata(self, page): """Extract generic webpage metadata, return them in a dict.""" data = {} + data['path'] = self.path.replace("/", "") data['pageurl'] = self.url data['title'] = text.extr(page, '', "") data['description'] = text.extr( diff --git a/test/results/generic.py b/test/results/generic.py index 5ec0400e..4d940afe 100644 --- a/test/results/generic.py +++ b/test/results/generic.py @@ -10,7 +10,7 @@ from gallery_dl.extractor import generic __tests__ = ( { "#url" : "generic:https://www.nongnu.org/lzip/", - "#category": ("", "generic", ""), + "#category": ("", "generic", "www.nongnu.org"), "#class" : generic.GenericExtractor, "#count" : 1, "#sha1_content": "40be5c77773d3e91db6e1c5df720ee30afb62368", @@ -23,7 +23,7 @@ __tests__ = ( { "#url" : "generic:https://räksmörgås.josefsson.org/", - "#category": ("", "generic", ""), + "#category": ("", "generic", "räksmörgås.josefsson.org"), "#class" : generic.GenericExtractor, "#pattern" : "^https://räksmörgås.josefsson.org/", "#count" : 2, @@ -31,37 +31,37 @@ __tests__ = ( { "#url" : "generic:https://en.wikipedia.org/Main_Page", - "#category": ("", "generic", ""), + "#category": ("", "generic", "en.wikipedia.org"), "#class" : generic.GenericExtractor, }, { "#url" : "generic:https://example.org/path/to/file?que=1?&ry=2/#fragment", - "#category": ("", "generic", ""), + "#category": ("", "generic", "example.org"), "#class" : generic.GenericExtractor, }, { "#url" : "generic:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E", - "#category": ("", "generic", ""), + "#category": ("", "generic", "example.org"), "#class" : generic.GenericExtractor, }, { "#url" : "generic:https://en.wikipedia.org/Main_Page", - "#category": ("", "generic", ""), + "#category": ("", "generic", "en.wikipedia.org"), "#class" : generic.GenericExtractor, }, { "#url" : "generic:https://example.org/path/to/file?que=1?&ry=2/#fragment", - "#category": ("", "generic", ""), + "#category": ("", "generic", "example.org"), "#class" : generic.GenericExtractor, }, { "#url" : "generic:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E", - "#category": ("", "generic", ""), + "#category": ("", "generic", "example.org"), "#class" : generic.GenericExtractor, },