[lensdump] fix extraction (#6313)
- support custom sort orders & query parameters - rewrite pagination logic
This commit is contained in:
@@ -17,42 +17,30 @@ class LensdumpBase():
|
||||
category = "lensdump"
|
||||
root = "https://lensdump.com"
|
||||
|
||||
def nodes(self, page=None):
|
||||
if page is None:
|
||||
page = self.request(self.url).text
|
||||
def _pagination(self, page, begin, end):
|
||||
while True:
|
||||
yield from text.extract_iter(page, begin, end)
|
||||
|
||||
# go through all pages starting from the oldest
|
||||
page_url = text.urljoin(self.root, text.extr(
|
||||
text.extr(page, ' id="list-most-oldest-link"', '>'),
|
||||
'href="', '"'))
|
||||
while page_url is not None:
|
||||
if page_url == self.url:
|
||||
current_page = page
|
||||
else:
|
||||
current_page = self.request(page_url).text
|
||||
next = text.extr(page, ' data-pagination="next"', '>')
|
||||
if not next:
|
||||
return
|
||||
|
||||
for node in text.extract_iter(
|
||||
current_page, ' class="list-item ', '>'):
|
||||
yield node
|
||||
|
||||
# find url of next page
|
||||
page_url = text.extr(
|
||||
text.extr(current_page, ' data-pagination="next"', '>'),
|
||||
'href="', '"')
|
||||
if page_url is not None and len(page_url) > 0:
|
||||
page_url = text.urljoin(self.root, page_url)
|
||||
else:
|
||||
page_url = None
|
||||
url = text.urljoin(self.root, text.extr(next, 'href="', '"'))
|
||||
page = self.request(url).text
|
||||
|
||||
|
||||
class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor):
|
||||
subcategory = "album"
|
||||
pattern = BASE_PATTERN + r"/(?:((?!\w+/albums|a/|i/)\w+)|a/(\w+))"
|
||||
pattern = BASE_PATTERN + r"/a/(\w+)(?:/?\?([^#]+))?"
|
||||
example = "https://lensdump.com/a/ID"
|
||||
|
||||
def __init__(self, match):
|
||||
GalleryExtractor.__init__(self, match, match.string)
|
||||
self.gallery_id = match.group(1) or match.group(2)
|
||||
self.gallery_id, query = match.groups()
|
||||
if query:
|
||||
url = "{}/a/{}/?{}".format(self.root, self.gallery_id, query)
|
||||
else:
|
||||
url = "{}/a/{}".format(self.root, self.gallery_id)
|
||||
GalleryExtractor.__init__(self, match, url)
|
||||
|
||||
def metadata(self, page):
|
||||
return {
|
||||
@@ -62,40 +50,48 @@ class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor):
|
||||
}
|
||||
|
||||
def images(self, page):
|
||||
for node in self.nodes(page):
|
||||
# get urls and filenames of images in current page
|
||||
json_data = util.json_loads(text.unquote(
|
||||
text.extr(node, "data-object='", "'") or
|
||||
text.extr(node, 'data-object="', '"')))
|
||||
image_id = json_data.get('name')
|
||||
image_url = json_data.get('url')
|
||||
image_title = json_data.get('title')
|
||||
for image in self._pagination(page, ' class="list-item ', '>'):
|
||||
|
||||
data = util.json_loads(text.unquote(
|
||||
text.extr(image, "data-object='", "'") or
|
||||
text.extr(image, 'data-object="', '"')))
|
||||
image_id = data.get("name")
|
||||
image_url = data.get("url")
|
||||
image_title = data.get("title")
|
||||
if image_title is not None:
|
||||
image_title = text.unescape(image_title)
|
||||
|
||||
yield (image_url, {
|
||||
'id': image_id,
|
||||
'url': image_url,
|
||||
'title': image_title,
|
||||
'name': json_data.get('filename'),
|
||||
'filename': image_id,
|
||||
'extension': json_data.get('extension'),
|
||||
'height': text.parse_int(json_data.get('height')),
|
||||
'width': text.parse_int(json_data.get('width')),
|
||||
"id" : image_id,
|
||||
"url" : image_url,
|
||||
"title" : image_title,
|
||||
"name" : data.get("filename"),
|
||||
"filename" : image_id,
|
||||
"extension": data.get("extension"),
|
||||
"width" : text.parse_int(data.get("width")),
|
||||
"height" : text.parse_int(data.get("height")),
|
||||
})
|
||||
|
||||
|
||||
class LensdumpAlbumsExtractor(LensdumpBase, Extractor):
|
||||
"""Extractor for album list from lensdump.com"""
|
||||
subcategory = "albums"
|
||||
pattern = BASE_PATTERN + r"/\w+/albums"
|
||||
example = "https://lensdump.com/USER/albums"
|
||||
pattern = BASE_PATTERN + r"/(?![ai]/)([^/?#]+)(?:/?\?([^#]+))?"
|
||||
example = "https://lensdump.com/USER"
|
||||
|
||||
def items(self):
|
||||
for node in self.nodes():
|
||||
album_url = text.urljoin(self.root, text.extr(
|
||||
node, 'data-url-short="', '"'))
|
||||
yield Message.Queue, album_url, {
|
||||
"_extractor": LensdumpAlbumExtractor}
|
||||
user, query = self.groups
|
||||
url = "{}/{}/".format(self.root, user)
|
||||
if query:
|
||||
params = text.parse_query(query)
|
||||
else:
|
||||
params = {"sort": "date_asc", "page": "1"}
|
||||
page = self.request(url, params=params).text
|
||||
|
||||
data = {"_extractor": LensdumpAlbumExtractor}
|
||||
for album_path in self._pagination(page, 'data-url-short="', '"'):
|
||||
album_url = text.urljoin(self.root, album_path)
|
||||
yield Message.Queue, album_url, data
|
||||
|
||||
|
||||
class LensdumpImageExtractor(LensdumpBase, Extractor):
|
||||
@@ -107,16 +103,13 @@ class LensdumpImageExtractor(LensdumpBase, Extractor):
|
||||
pattern = r"(?:https?://)?(?:(?:i\d?\.)?lensdump\.com|\w\.l3n\.co)/i/(\w+)"
|
||||
example = "https://lensdump.com/i/ID"
|
||||
|
||||
def __init__(self, match):
|
||||
Extractor.__init__(self, match)
|
||||
self.key = match.group(1)
|
||||
|
||||
def items(self):
|
||||
url = "{}/i/{}".format(self.root, self.key)
|
||||
key = self.groups[0]
|
||||
url = "{}/i/{}".format(self.root, key)
|
||||
extr = text.extract_from(self.request(url).text)
|
||||
|
||||
data = {
|
||||
"id" : self.key,
|
||||
"id" : key,
|
||||
"title" : text.unescape(extr(
|
||||
'property="og:title" content="', '"')),
|
||||
"url" : extr(
|
||||
|
||||
@@ -10,7 +10,6 @@ from gallery_dl.extractor import lensdump
|
||||
__tests__ = (
|
||||
{
|
||||
"#url" : "https://lensdump.com/a/1IhJr",
|
||||
"#category": ("", "lensdump", "album"),
|
||||
"#class" : lensdump.LensdumpAlbumExtractor,
|
||||
"#pattern" : r"https://[abcd]\.l3n\.co/i/tq\w{4}\.png",
|
||||
|
||||
@@ -22,15 +21,44 @@ __tests__ = (
|
||||
"width" : int,
|
||||
},
|
||||
|
||||
{
|
||||
"#url" : "https://lensdump.com/a/tA4lA",
|
||||
"#comment" : "2 pages",
|
||||
"#class" : lensdump.LensdumpAlbumExtractor,
|
||||
"#pattern" : r"https://[abcd]\.l3n\.co/i/\w{6}\.(jpe?g|png)",
|
||||
"#count" : 64,
|
||||
},
|
||||
|
||||
{
|
||||
"#url" : "https://lensdump.com/vstar925",
|
||||
"#class" : lensdump.LensdumpAlbumsExtractor,
|
||||
"#urls" : (
|
||||
"https://lensdump.com/a/tX1uA",
|
||||
"https://lensdump.com/a/R0gfK",
|
||||
"https://lensdump.com/a/RSOMv",
|
||||
"https://lensdump.com/a/9TbdT",
|
||||
),
|
||||
},
|
||||
|
||||
{
|
||||
"#url" : "https://lensdump.com/vstar925/?sort=likes_desc&page=1",
|
||||
"#comment" : "custom sort order",
|
||||
"#class" : lensdump.LensdumpAlbumsExtractor,
|
||||
"#urls" : (
|
||||
"https://lensdump.com/a/9TbdT",
|
||||
"https://lensdump.com/a/RSOMv",
|
||||
"https://lensdump.com/a/R0gfK",
|
||||
"https://lensdump.com/a/tX1uA",
|
||||
),
|
||||
},
|
||||
|
||||
{
|
||||
"#url" : "https://lensdump.com/vstar925/albums",
|
||||
"#category": ("", "lensdump", "albums"),
|
||||
"#class" : lensdump.LensdumpAlbumsExtractor,
|
||||
},
|
||||
|
||||
{
|
||||
"#url" : "https://lensdump.com/i/tyoAyM",
|
||||
"#category": ("", "lensdump", "image"),
|
||||
"#class" : lensdump.LensdumpImageExtractor,
|
||||
"#urls" : "https://c.l3n.co/i/tyoAyM.webp",
|
||||
"#sha1_content": "1aa749ed2c0cf679ec8e1df60068edaf3875de46",
|
||||
@@ -47,7 +75,6 @@ __tests__ = (
|
||||
|
||||
{
|
||||
"#url" : "https://c.l3n.co/i/tyoAyM.webp",
|
||||
"#category": ("", "lensdump", "image"),
|
||||
"#class" : lensdump.LensdumpImageExtractor,
|
||||
"#urls" : "https://c.l3n.co/i/tyoAyM.webp",
|
||||
|
||||
@@ -63,13 +90,11 @@ __tests__ = (
|
||||
|
||||
{
|
||||
"#url" : "https://i.lensdump.com/i/tyoAyM",
|
||||
"#category": ("", "lensdump", "image"),
|
||||
"#class" : lensdump.LensdumpImageExtractor,
|
||||
},
|
||||
|
||||
{
|
||||
"#url" : "https://i3.lensdump.com/i/tyoAyM",
|
||||
"#category": ("", "lensdump", "image"),
|
||||
"#class" : lensdump.LensdumpImageExtractor,
|
||||
},
|
||||
|
||||
|
||||
Reference in New Issue
Block a user