[generic] better directory formatting/md
This commit is contained in:
@@ -15,7 +15,7 @@ import re
|
|||||||
class GenericExtractor(Extractor):
|
class GenericExtractor(Extractor):
|
||||||
"""Extractor for images in a generic web page."""
|
"""Extractor for images in a generic web page."""
|
||||||
category = "generic"
|
category = "generic"
|
||||||
directory_fmt = ("{category}", "{pageurl}")
|
directory_fmt = ("{category}", "{subcategory}", "{path}")
|
||||||
archive_fmt = "{imageurl}"
|
archive_fmt = "{imageurl}"
|
||||||
|
|
||||||
# By default, the generic extractor is disabled
|
# By default, the generic extractor is disabled
|
||||||
@@ -52,7 +52,10 @@ class GenericExtractor(Extractor):
|
|||||||
self.scheme = match.group('scheme')
|
self.scheme = match.group('scheme')
|
||||||
else:
|
else:
|
||||||
self.scheme = 'https://'
|
self.scheme = 'https://'
|
||||||
self.url = self.scheme + self.url
|
self.url = text.ensure_http_scheme(self.url, self.scheme)
|
||||||
|
|
||||||
|
self.subcategory = match.group('domain')
|
||||||
|
self.path = match.group('path')
|
||||||
|
|
||||||
# Used to resolve relative image urls
|
# Used to resolve relative image urls
|
||||||
self.root = self.scheme + match.group('domain')
|
self.root = self.scheme + match.group('domain')
|
||||||
@@ -87,6 +90,7 @@ class GenericExtractor(Extractor):
|
|||||||
def metadata(self, page):
|
def metadata(self, page):
|
||||||
"""Extract generic webpage metadata, return them in a dict."""
|
"""Extract generic webpage metadata, return them in a dict."""
|
||||||
data = {}
|
data = {}
|
||||||
|
data['path'] = self.path.replace("/", "")
|
||||||
data['pageurl'] = self.url
|
data['pageurl'] = self.url
|
||||||
data['title'] = text.extr(page, '<title>', "</title>")
|
data['title'] = text.extr(page, '<title>', "</title>")
|
||||||
data['description'] = text.extr(
|
data['description'] = text.extr(
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ from gallery_dl.extractor import generic
|
|||||||
__tests__ = (
|
__tests__ = (
|
||||||
{
|
{
|
||||||
"#url" : "generic:https://www.nongnu.org/lzip/",
|
"#url" : "generic:https://www.nongnu.org/lzip/",
|
||||||
"#category": ("", "generic", ""),
|
"#category": ("", "generic", "www.nongnu.org"),
|
||||||
"#class" : generic.GenericExtractor,
|
"#class" : generic.GenericExtractor,
|
||||||
"#count" : 1,
|
"#count" : 1,
|
||||||
"#sha1_content": "40be5c77773d3e91db6e1c5df720ee30afb62368",
|
"#sha1_content": "40be5c77773d3e91db6e1c5df720ee30afb62368",
|
||||||
@@ -23,7 +23,7 @@ __tests__ = (
|
|||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "generic:https://räksmörgås.josefsson.org/",
|
"#url" : "generic:https://räksmörgås.josefsson.org/",
|
||||||
"#category": ("", "generic", ""),
|
"#category": ("", "generic", "räksmörgås.josefsson.org"),
|
||||||
"#class" : generic.GenericExtractor,
|
"#class" : generic.GenericExtractor,
|
||||||
"#pattern" : "^https://räksmörgås.josefsson.org/",
|
"#pattern" : "^https://räksmörgås.josefsson.org/",
|
||||||
"#count" : 2,
|
"#count" : 2,
|
||||||
@@ -31,37 +31,37 @@ __tests__ = (
|
|||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "generic:https://en.wikipedia.org/Main_Page",
|
"#url" : "generic:https://en.wikipedia.org/Main_Page",
|
||||||
"#category": ("", "generic", ""),
|
"#category": ("", "generic", "en.wikipedia.org"),
|
||||||
"#class" : generic.GenericExtractor,
|
"#class" : generic.GenericExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "generic:https://example.org/path/to/file?que=1?&ry=2/#fragment",
|
"#url" : "generic:https://example.org/path/to/file?que=1?&ry=2/#fragment",
|
||||||
"#category": ("", "generic", ""),
|
"#category": ("", "generic", "example.org"),
|
||||||
"#class" : generic.GenericExtractor,
|
"#class" : generic.GenericExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "generic:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E",
|
"#url" : "generic:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E",
|
||||||
"#category": ("", "generic", ""),
|
"#category": ("", "generic", "example.org"),
|
||||||
"#class" : generic.GenericExtractor,
|
"#class" : generic.GenericExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "generic:https://en.wikipedia.org/Main_Page",
|
"#url" : "generic:https://en.wikipedia.org/Main_Page",
|
||||||
"#category": ("", "generic", ""),
|
"#category": ("", "generic", "en.wikipedia.org"),
|
||||||
"#class" : generic.GenericExtractor,
|
"#class" : generic.GenericExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "generic:https://example.org/path/to/file?que=1?&ry=2/#fragment",
|
"#url" : "generic:https://example.org/path/to/file?que=1?&ry=2/#fragment",
|
||||||
"#category": ("", "generic", ""),
|
"#category": ("", "generic", "example.org"),
|
||||||
"#class" : generic.GenericExtractor,
|
"#class" : generic.GenericExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
{
|
{
|
||||||
"#url" : "generic:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E",
|
"#url" : "generic:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E",
|
||||||
"#category": ("", "generic", ""),
|
"#category": ("", "generic", "example.org"),
|
||||||
"#class" : generic.GenericExtractor,
|
"#class" : generic.GenericExtractor,
|
||||||
},
|
},
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user