From 9b2326e4e19e9de365357508bc7dc44ba721b4d7 Mon Sep 17 00:00:00 2001 From: chio0hai <94094996+chio0hai@users.noreply.github.com> Date: Fri, 26 May 2023 03:22:23 -0400 Subject: [PATCH 1/4] [lensdump] add lensdump.com extractor --- gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/lensdump.py | 152 +++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 gallery_dl/extractor/lensdump.py diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 1f77f94a..3e47c3ec 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -81,6 +81,7 @@ modules = [ "kemonoparty", "khinsider", "komikcast", + "lensdump", "lexica", "lightroom", "lineblog", diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py new file mode 100644 index 00000000..a88f5a64 --- /dev/null +++ b/gallery_dl/extractor/lensdump.py @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- + +"""Extractors for https://lensdump.com/""" + +import json + +from .common import GalleryExtractor, Extractor, Message +from .. import text + + +class LensdumpExtractor(GalleryExtractor): + """Extractor for lensdump.com""" + category = "lensdump" + root = "https://lensdump.com" + + def get_meta_prop(self, page, name): + return text.extr(page, f'property="{name}" content="', '"') + + def nodes(self, page=None): + if page is None: + page = self.request(self.url).text + + # go through all pages starting from the oldest + page_url = text.urljoin(self.root, text.extr( + text.extr(page, ' id="list-most-oldest-link"', '>'), + 'href="', '"')) + while page_url is not None: + if page_url == self.url: + current_page = page + else: + current_page = self.request(page_url).text + + for node in text.extract_iter( + current_page, ' class="list-item ', '>'): + yield node + + # find url of next page + page_url = text.extr( + text.extr(current_page, ' data-pagination="next"', '>'), + 'href="', '"') + if page_url is not None and len(page_url) > 0: + page_url = text.urljoin(self.root, page_url) + else: + page_url = None + + +class LensdumpAlbumExtractor(LensdumpExtractor): + subcategory = "album" + pattern = (r"(?:https?://)?lensdump\.com/" + r"(?:((?!\w+/albums|a/|i/)\w+)|a/(\w+))") + test = ( + ("https://lensdump.com/a/1IhJr", { + "url": "7428cc906e7b291c778d446a11c602b81ba72840", + "keyword": { + "extension": "png", + "name": str, + "num": int, + "title": str, + "url": str, + "width": int, + }, + }), + ) + + def __init__(self, match): + GalleryExtractor.__init__(self, match, match.string) + self.gallery_id = match.group(1) or match.group(2) + + def metadata(self, page): + return { + "gallery_id": self.gallery_id, + "title": text.unescape(text.extr( + page, 'property="og:title" content="', '"').strip()) + } + + def images(self, page): + for node in self.nodes(page): + # get urls and filenames of images in current page + json_data = json.loads(text.unquote( + text.extr(node, 'data-object="', '"'))) + image_id = json_data.get('name') + image_url = json_data.get('url') + image_title = json_data.get('title') + if image_title is not None: + image_title = text.unescape(image_title) + yield (image_url, { + 'id': image_id, + 'url': image_url, + 'title': image_title, + 'name': json_data.get('filename'), + 'filename': image_id, + 'extension': json_data.get('extension'), + 'height': text.parse_int(json_data.get('height')), + 'width': text.parse_int(json_data.get('width')), + }) + + +class LensdumpAlbumsExtractor(LensdumpExtractor): + """Extractor for album list from lensdump.com""" + pattern = r"(?:https?://)?lensdump\.com/\w+/albums" + + def __init__(self, match): + Extractor.__init__(self, match) + + def items(self): + for node in self.nodes(): + album_url = text.urljoin(self.root, text.extr( + node, 'data-url-short="', '"')) + yield Message.Queue, album_url, { + "_extractor": LensdumpAlbumExtractor} + + +class LensdumpImageExtractor(LensdumpExtractor): + """Extractor for individual images on lensdump.com""" + subcategory = "image" + filename_fmt = "{category}_{id}{title:?_//}.{extension}" + directory_fmt = ("{category}",) + archive_fmt = "{id}" + pattern = r"(?:https?://)?lensdump\.com/i/(\w+)" + test = ( + ("https://lensdump.com/i/tyoAyM", { + "url": "ae9933f5f3bd9497bfc34e3e70a0fbef6c562d38", + "content": "1aa749ed2c0cf679ec8e1df60068edaf3875de46", + "keyword": { + "extension": "webp", + "filename": "tyoAyM", + "height": "400", + "id": "tyoAyM", + "title": "MYOBI clovis bookcaseset", + "url": "https://i2.lensdump.com/i/tyoAyM.webp", + "width": "620", + }, + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.key = match.group(1) + + def items(self): + page = self.request(self.url).text + image_url = text.extr(page, 'property="og:image" content="', '"') + data = text.nameext_from_url(image_url) + data.update({ + 'id': self.key, + 'url': image_url, + 'title': self.get_meta_prop(page, "og:title"), + 'height': self.get_meta_prop(page, "image:height"), + 'width': self.get_meta_prop(page, "image:width"), + }) + yield Message.Directory, data + yield Message.Url, image_url, data From 82ba6bfdc0ff9cfd1932520a52751f3a6236dd4f Mon Sep 17 00:00:00 2001 From: chio0hai <94094996+chio0hai@users.noreply.github.com> Date: Fri, 26 May 2023 03:46:12 -0400 Subject: [PATCH 2/4] [lensdump] f-string fix --- gallery_dl/extractor/lensdump.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py index a88f5a64..b0545ca3 100644 --- a/gallery_dl/extractor/lensdump.py +++ b/gallery_dl/extractor/lensdump.py @@ -14,7 +14,7 @@ class LensdumpExtractor(GalleryExtractor): root = "https://lensdump.com" def get_meta_prop(self, page, name): - return text.extr(page, f'property="{name}" content="', '"') + return text.extr(page, 'property="{}" content="'.format(name), '"') def nodes(self, page=None): if page is None: From d5300cf381024728d6405815e88383609837f632 Mon Sep 17 00:00:00 2001 From: chio0hai <94094996+chio0hai@users.noreply.github.com> Date: Fri, 26 May 2023 03:51:42 -0400 Subject: [PATCH 3/4] [lensdump] subcategory --- gallery_dl/extractor/lensdump.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py index b0545ca3..c35c33ef 100644 --- a/gallery_dl/extractor/lensdump.py +++ b/gallery_dl/extractor/lensdump.py @@ -97,6 +97,7 @@ class LensdumpAlbumExtractor(LensdumpExtractor): class LensdumpAlbumsExtractor(LensdumpExtractor): """Extractor for album list from lensdump.com""" + subcategory = "albums" pattern = r"(?:https?://)?lensdump\.com/\w+/albums" def __init__(self, match): From 58f7480d46dec2cc5b3d07789e29bc76a9cfe1f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 26 May 2023 23:39:17 +0200 Subject: [PATCH 4/4] [lensdump] update - update docs/supportedsites.md - add GPL2 header - use BASE_PATTERN - improve LensdumpImageExtractor --- docs/supportedsites.md | 6 +++ gallery_dl/extractor/lensdump.py | 72 ++++++++++++++++++-------------- scripts/supportedsites.py | 3 ++ 3 files changed, 49 insertions(+), 32 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 27bb0bbe..995f2519 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -463,6 +463,12 @@ Consider all sites to be NSFW unless otherwise known. Chapters, Manga + + Lensdump + https://lensdump.com/ + Albums, individual Images + + Lexica https://lexica.art/ diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py index c35c33ef..89906215 100644 --- a/gallery_dl/extractor/lensdump.py +++ b/gallery_dl/extractor/lensdump.py @@ -1,21 +1,22 @@ # -*- coding: utf-8 -*- +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + """Extractors for https://lensdump.com/""" -import json - from .common import GalleryExtractor, Extractor, Message -from .. import text +from .. import text, util + +BASE_PATTERN = r"(?:https?://)?lensdump\.com" -class LensdumpExtractor(GalleryExtractor): - """Extractor for lensdump.com""" +class LensdumpBase(): + """Base class for lensdump extractors""" category = "lensdump" root = "https://lensdump.com" - def get_meta_prop(self, page, name): - return text.extr(page, 'property="{}" content="'.format(name), '"') - def nodes(self, page=None): if page is None: page = self.request(self.url).text @@ -44,10 +45,9 @@ class LensdumpExtractor(GalleryExtractor): page_url = None -class LensdumpAlbumExtractor(LensdumpExtractor): +class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor): subcategory = "album" - pattern = (r"(?:https?://)?lensdump\.com/" - r"(?:((?!\w+/albums|a/|i/)\w+)|a/(\w+))") + pattern = BASE_PATTERN + r"/(?:((?!\w+/albums|a/|i/)\w+)|a/(\w+))" test = ( ("https://lensdump.com/a/1IhJr", { "url": "7428cc906e7b291c778d446a11c602b81ba72840", @@ -76,7 +76,7 @@ class LensdumpAlbumExtractor(LensdumpExtractor): def images(self, page): for node in self.nodes(page): # get urls and filenames of images in current page - json_data = json.loads(text.unquote( + json_data = util.json_loads(text.unquote( text.extr(node, 'data-object="', '"'))) image_id = json_data.get('name') image_url = json_data.get('url') @@ -95,13 +95,11 @@ class LensdumpAlbumExtractor(LensdumpExtractor): }) -class LensdumpAlbumsExtractor(LensdumpExtractor): +class LensdumpAlbumsExtractor(LensdumpBase, Extractor): """Extractor for album list from lensdump.com""" subcategory = "albums" - pattern = r"(?:https?://)?lensdump\.com/\w+/albums" - - def __init__(self, match): - Extractor.__init__(self, match) + pattern = BASE_PATTERN + r"/\w+/albums" + test = ("https://lensdump.com/vstar925/albums",) def items(self): for node in self.nodes(): @@ -111,25 +109,27 @@ class LensdumpAlbumsExtractor(LensdumpExtractor): "_extractor": LensdumpAlbumExtractor} -class LensdumpImageExtractor(LensdumpExtractor): +class LensdumpImageExtractor(LensdumpBase, Extractor): """Extractor for individual images on lensdump.com""" subcategory = "image" filename_fmt = "{category}_{id}{title:?_//}.{extension}" directory_fmt = ("{category}",) archive_fmt = "{id}" - pattern = r"(?:https?://)?lensdump\.com/i/(\w+)" + pattern = BASE_PATTERN + r"/i/(\w+)" test = ( ("https://lensdump.com/i/tyoAyM", { + "pattern": r"https://i\d\.lensdump\.com/i/tyoAyM\.webp", "url": "ae9933f5f3bd9497bfc34e3e70a0fbef6c562d38", "content": "1aa749ed2c0cf679ec8e1df60068edaf3875de46", "keyword": { + "date": "dt:2022-08-01 08:24:28", "extension": "webp", "filename": "tyoAyM", - "height": "400", + "height": 400, "id": "tyoAyM", "title": "MYOBI clovis bookcaseset", "url": "https://i2.lensdump.com/i/tyoAyM.webp", - "width": "620", + "width": 620, }, }), ) @@ -139,15 +139,23 @@ class LensdumpImageExtractor(LensdumpExtractor): self.key = match.group(1) def items(self): - page = self.request(self.url).text - image_url = text.extr(page, 'property="og:image" content="', '"') - data = text.nameext_from_url(image_url) - data.update({ - 'id': self.key, - 'url': image_url, - 'title': self.get_meta_prop(page, "og:title"), - 'height': self.get_meta_prop(page, "image:height"), - 'width': self.get_meta_prop(page, "image:width"), - }) + url = "{}/i/{}".format(self.root, self.key) + extr = text.extract_from(self.request(url).text) + + data = { + "id" : self.key, + "title" : text.unescape(extr( + 'property="og:title" content="', '"')), + "url" : extr( + 'property="og:image" content="', '"'), + "width" : text.parse_int(extr( + 'property="image:width" content="', '"')), + "height": text.parse_int(extr( + 'property="image:height" content="', '"')), + "date" : text.parse_datetime(extr( + '