From 86f0597c951d999b1989504ddcf99f2c39de75d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Mon, 5 Dec 2022 19:28:50 +0100 Subject: [PATCH 01/13] [kissgoddess] remove module site does not host albums anymore --- docs/supportedsites.md | 6 --- gallery_dl/extractor/__init__.py | 1 - gallery_dl/extractor/kissgoddess.py | 82 ----------------------------- gallery_dl/version.py | 2 +- scripts/supportedsites.py | 1 - 5 files changed, 1 insertion(+), 91 deletions(-) delete mode 100644 gallery_dl/extractor/kissgoddess.py diff --git a/docs/supportedsites.md b/docs/supportedsites.md index fffe3ac6..439fcd30 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -433,12 +433,6 @@ Consider all sites to be NSFW unless otherwise known. Soundtracks - - Kiss Goddess - https://kissgoddess.com/ - Galleries, Models - - Kohlchan https://kohlchan.net/ diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index d2bbcbb7..3b553c84 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -74,7 +74,6 @@ modules = [ "keenspot", "kemonoparty", "khinsider", - "kissgoddess", "kohlchan", "komikcast", "lightroom", diff --git a/gallery_dl/extractor/kissgoddess.py b/gallery_dl/extractor/kissgoddess.py deleted file mode 100644 index 4ec685c2..00000000 --- a/gallery_dl/extractor/kissgoddess.py +++ /dev/null @@ -1,82 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2022 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://kissgoddess.com/""" - -from .common import GalleryExtractor, Extractor, Message -from .. import text, exception - - -class KissgoddessGalleryExtractor(GalleryExtractor): - """Extractor for image galleries on kissgoddess.com""" - category = "kissgoddess" - root = "https://kissgoddess.com" - pattern = r"(?:https?://)?(?:www\.)?kissgoddess\.com/album/(\d+)" - test = ("https://kissgoddess.com/album/18285.html", { - "pattern": r"https://pic\.kissgoddess\.com" - r"/gallery/16473/18285/s/\d+\.jpg", - "count": 19, - "keyword": { - "gallery_id": 18285, - "title": "[Young Champion Extra] 2016.02 No.03 菜乃花 安枝瞳 葉月あや", - }, - }) - - def __init__(self, match): - self.gallery_id = match.group(1) - url = "{}/album/{}.html".format(self.root, self.gallery_id) - GalleryExtractor.__init__(self, match, url) - - def metadata(self, page): - return { - "gallery_id": text.parse_int(self.gallery_id), - "title" : text.extr( - page, '', "<")[0].rpartition(" | "), - } - - def images(self, page): - pnum = 1 - - while page: - for url in text.extract_iter(page, "<img src='", "'"): - yield url, None - for url in text.extract_iter(page, "<img data-original='", "'"): - yield url, None - - pnum += 1 - url = "{}/album/{}_{}.html".format( - self.root, self.gallery_id, pnum) - try: - page = self.request(url).text - except exception.HttpError: - return - - -class KissgoddessModelExtractor(Extractor): - """Extractor for all galleries of a model on kissgoddess.com""" - category = "kissgoddess" - subcategory = "model" - root = "https://kissgoddess.com" - pattern = r"(?:https?://)?(?:www\.)?kissgoddess\.com/people/([^./?#]+)" - test = ("https://kissgoddess.com/people/aya-hazuki.html", { - "pattern": KissgoddessGalleryExtractor.pattern, - "count": ">= 7", - }) - - def __init__(self, match): - Extractor.__init__(self, match) - self.model = match.group(1) - - def items(self): - url = "{}/people/{}.html".format(self.root, self.model) - page = self.request(url).text - - data = {"_extractor": KissgoddessGalleryExtractor} - for path in text.extract_iter(page, 'thumb"><a href="/album/', '"'): - url = self.root + "/album/" + path - yield Message.Queue, url, data diff --git a/gallery_dl/version.py b/gallery_dl/version.py index d2890098..6975192a 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.24.1" +__version__ = "1.24.2-dev" diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index d8106095..edd65546 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -61,7 +61,6 @@ CATEGORY_MAP = { "kabeuchi" : "かべうち", "kemonoparty" : "Kemono", "kireicake" : "Kirei Cake", - "kissgoddess" : "Kiss Goddess", "lineblog" : "LINE BLOG", "livedoor" : "livedoor Blog", "omgmiamiswimwear": "Omg Miami Swimwear", From a42ba25ca19978c69a0ff5b265ea23988e7bf792 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 5 Dec 2022 19:38:31 +0100 Subject: [PATCH 02/13] [foolslide] remove 'kireicake' site redirects to (unclaimed) mangadex group --- docs/supportedsites.md | 6 ------ gallery_dl/extractor/foolslide.py | 12 ------------ scripts/supportedsites.py | 1 - 3 files changed, 19 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 439fcd30..a0aded23 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -1263,12 +1263,6 @@ Consider all sites to be NSFW unless otherwise known. <tr> <td colspan="4"><strong>FoOlSlide Instances</strong></td> </tr> -<tr> - <td>Kirei Cake</td> - <td>https://reader.kireicake.com/</td> - <td>Chapters, Manga</td> - <td></td> -</tr> <tr> <td>PowerManga</td> <td>https://read.powermanga.org/</td> diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index 81671ecd..2290cc25 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -39,10 +39,6 @@ class FoolslideExtractor(BaseExtractor): BASE_PATTERN = FoolslideExtractor.update({ - "kireicake": { - "root": "https://reader.kireicake.com", - "pattern": r"reader\.kireicake\.com", - }, "powermanga": { "root": "https://read.powermanga.org", "pattern": r"read(?:er)?\.powermanga\.org", @@ -64,10 +60,6 @@ class FoolslideChapterExtractor(FoolslideExtractor): archive_fmt = "{id}" pattern = BASE_PATTERN + r"(/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)" test = ( - ("https://reader.kireicake.com/read/wonderland/en/1/1/", { - "url": "b2d36bc0bc67e4c461c3a4d6444a2fd339f5d07e", - "keyword": "9f80947920a325e33aea7f5cd69ea669171903b6", - }), (("https://read.powermanga.org" "/read/one_piece_digital_colour_comics/en/0/75/"), { "url": "854c5817f8f767e1bccd05fa9d58ffb5a4b09384", @@ -123,10 +115,6 @@ class FoolslideMangaExtractor(FoolslideExtractor): categorytransfer = True pattern = BASE_PATTERN + r"(/series/[^/?#]+)" test = ( - ("https://reader.kireicake.com/series/wonderland/", { - "url": "d067b649af1cc88fa8c8b698fde04a10909fd169", - "keyword": "268f43772fb239888ca5c5f6a4f65f99ffb3eefb", - }), (("https://read.powermanga.org" "/series/one_piece_digital_colour_comics/"), { "count": ">= 1", diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index edd65546..8cff63c3 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -60,7 +60,6 @@ CATEGORY_MAP = { "joyreactor" : "JoyReactor", "kabeuchi" : "かべうち", "kemonoparty" : "Kemono", - "kireicake" : "Kirei Cake", "lineblog" : "LINE BLOG", "livedoor" : "livedoor Blog", "omgmiamiswimwear": "Omg Miami Swimwear", From 5f57a27ba6dc99b418d52b97cadbf64e66605584 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 5 Dec 2022 22:15:59 +0100 Subject: [PATCH 03/13] [imagetwist] fix extraction --- gallery_dl/extractor/imagehosts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index 6fcfc555..207562a3 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -200,7 +200,7 @@ class ImagetwistImageExtractor(ImagehostImageExtractor): return self.request(self.page_url).cookies def get_info(self, page): - url , pos = text.extract(page, 'center;"><img src="', '"') + url , pos = text.extract(page, '<img src="', '"') filename, pos = text.extract(page, ' alt="', '"', pos) return url, filename From 4a3a1f4c87febe5a92ece61e6e7982c59b98313d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Mon, 5 Dec 2022 22:36:49 +0100 Subject: [PATCH 04/13] [komikcast] update domain and fix extraction --- docs/supportedsites.md | 2 +- gallery_dl/extractor/komikcast.py | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/supportedsites.md b/docs/supportedsites.md index a0aded23..3c7d6cf2 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -441,7 +441,7 @@ Consider all sites to be NSFW unless otherwise known. </tr> <tr> <td>Komikcast</td> - <td>https://komikcast.me/</td> + <td>https://komikcast.site/</td> <td>Chapters, Manga</td> <td></td> </tr> diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py index a9eebf40..04373c4b 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -6,19 +6,19 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://komikcast.me/""" +"""Extractors for https://komikcast.site/""" from .common import ChapterExtractor, MangaExtractor from .. import text import re -BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:me|com)" +BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:site|me|com)" class KomikcastBase(): """Base class for komikcast extractors""" category = "komikcast" - root = "https://komikcast.me" + root = "https://komikcast.site" @staticmethod def parse_chapter_string(chapter_string, data=None): @@ -46,23 +46,23 @@ class KomikcastBase(): class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): - """Extractor for manga-chapters from komikcast.me""" + """Extractor for manga-chapters from komikcast.site""" pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)" test = ( - (("https://komikcast.me/chapter" + (("https://komikcast.site/chapter" "/apotheosis-chapter-02-2-bahasa-indonesia/"), { - "url": "74eca5c9b27b896816497f9b2d847f2a1fcfc209", + "url": "f6b43fbc027697749b3ea1c14931c83f878d7936", "keyword": "f3938e1aff9ad1f302f52447e9781b21f6da26d4", }), (("https://komikcast.me/chapter" "/soul-land-ii-chapter-300-1-bahasa-indonesia/"), { - "url": "243a5250e210b40d17217e83b7547cefea5638bd", + "url": "efd00a9bd95461272d51990d7bc54b79ff3ff2e6", "keyword": "cb646cfed3d45105bd645ab38b2e9f7d8c436436", }), ) def metadata(self, page): - info = text.extr(page, "<title>", " – Komikcast<") + info = text.extr(page, "<title>", " - Komikcast<") return self.parse_chapter_string(info) @staticmethod @@ -76,12 +76,12 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): - """Extractor for manga from komikcast.me""" + """Extractor for manga from komikcast.site""" chapterclass = KomikcastChapterExtractor pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$" test = ( - ("https://komikcast.me/komik/090-eko-to-issho/", { - "url": "08204f0a703ec5272121abcf0632ecacba1e588f", + ("https://komikcast.site/komik/090-eko-to-issho/", { + "url": "19d3d50d532e84be6280a3d61ff0fd0ca04dd6b4", "keyword": "837a7e96867344ff59d840771c04c20dc46c0ab1", }), ("https://komikcast.me/tonari-no-kashiwagi-san/"), @@ -101,7 +101,7 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): @staticmethod def metadata(page): """Return a dict with general metadata""" - manga , pos = text.extract(page, "<title>" , " – Komikcast<") + manga , pos = text.extract(page, "<title>" , " - Komikcast<") genres, pos = text.extract( page, 'class="komik_info-content-genre">', "</span>", pos) author, pos = text.extract(page, ">Author:", "</span>", pos) From 6afb3cc766f7da7624c5aa0c2040b133b287f0bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 6 Dec 2022 16:25:59 +0100 Subject: [PATCH 05/13] restore paths for archived files (#3362) --- gallery_dl/path.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/gallery_dl/path.py b/gallery_dl/path.py index 7d599ee2..77a33277 100644 --- a/gallery_dl/path.py +++ b/gallery_dl/path.py @@ -212,14 +212,19 @@ class PathFormat(): def fix_extension(self, _=None): """Fix filenames without a given filename extension""" - if not self.extension: - self.kwdict["extension"] = self.prefix + self.extension_map("", "") - self.build_path() - if self.path[-1] == ".": - self.path = self.path[:-1] - self.temppath = self.realpath = self.realpath[:-1] - elif not self.temppath: + try: + if not self.extension: + self.kwdict["extension"] = \ + self.prefix + self.extension_map("", "") + self.build_path() + if self.path[-1] == ".": + self.path = self.path[:-1] + self.temppath = self.realpath = self.realpath[:-1] + elif not self.temppath: + self.build_path() + except Exception: self.path = self.directory + "?" + self.realpath = self.temppath = self.realdirectory + "?" return True def build_filename(self, kwdict): From 43c211f1a7cb2870e9f2a4a5a6584b499b8c66b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 6 Dec 2022 18:44:46 +0100 Subject: [PATCH 06/13] extend and rename util.CustomNone --- gallery_dl/util.py | 24 +++++++++++++++++++++--- test/test_util.py | 11 +++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 8ce1fb40..23d5bc8e 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -528,8 +528,8 @@ def parse_inputfile(file, log): yield line -class UniversalNone(): - """None-style object that supports more operations than None itself""" +class CustomNone(): + """None-style type that supports more operations than regular None""" __slots__ = () def __getattribute__(self, _): @@ -538,10 +538,28 @@ class UniversalNone(): def __getitem__(self, _): return self + def __iter__(self): + return self + + def __call__(self, *args, **kwargs): + return self + + @staticmethod + def __next__(): + raise StopIteration + @staticmethod def __bool__(): return False + @staticmethod + def __len__(): + return 0 + + @staticmethod + def __format__(_): + return "None" + @staticmethod def __str__(): return "None" @@ -549,7 +567,7 @@ class UniversalNone(): __repr__ = __str__ -NONE = UniversalNone() +NONE = CustomNone() EPOCH = datetime.datetime(1970, 1, 1) SECOND = datetime.timedelta(0, 1) WINDOWS = (os.name == "nt") diff --git a/test/test_util.py b/test/test_util.py index 2921ea23..4b8f9ae4 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -618,10 +618,21 @@ class TestOther(unittest.TestCase): obj = util.NONE self.assertFalse(obj) + self.assertEqual(len(obj), 0) self.assertEqual(str(obj), str(None)) self.assertEqual(repr(obj), repr(None)) + self.assertEqual(format(obj), str(None)) + self.assertEqual(format(obj, "%F"), str(None)) self.assertIs(obj.attr, obj) self.assertIs(obj["key"], obj) + self.assertIs(obj(), obj) + self.assertIs(obj(1, "a"), obj) + self.assertIs(obj(foo="bar"), obj) + + i = 0 + for _ in obj: + i += 1 + self.assertEqual(i, 0) class TestExtractor(): From ca4742200b90965897bc7d4ea073116e03c4dd6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 6 Dec 2022 22:26:46 +0100 Subject: [PATCH 07/13] use util.NONE as 'keyword-default' default value --- gallery_dl/formatter.py | 14 ++++++++------ gallery_dl/path.py | 2 ++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py index ca05fa5a..5fe7f32a 100644 --- a/gallery_dl/formatter.py +++ b/gallery_dl/formatter.py @@ -18,8 +18,10 @@ import operator import functools from . import text, util +NONE = util.NONE -def parse(format_string, default=None, fmt=format): + +def parse(format_string, default=NONE, fmt=format): key = format_string, default, fmt try: @@ -88,7 +90,7 @@ class StringFormatter(): Example: {f:R /_/} -> "f_o_o_b_a_r" (if "f" is "f o o b a r") """ - def __init__(self, format_string, default=None, fmt=format): + def __init__(self, format_string, default=NONE, fmt=format): self.default = default self.format = fmt self.result = [] @@ -193,7 +195,7 @@ class StringFormatter(): class TemplateFormatter(StringFormatter): """Read format_string from file""" - def __init__(self, path, default=None, fmt=format): + def __init__(self, path, default=NONE, fmt=format): with open(util.expand_path(path)) as fp: format_string = fp.read() StringFormatter.__init__(self, format_string, default, fmt) @@ -202,14 +204,14 @@ class TemplateFormatter(StringFormatter): class ExpressionFormatter(): """Generate text by evaluating a Python expression""" - def __init__(self, expression, default=None, fmt=None): + def __init__(self, expression, default=NONE, fmt=None): self.format_map = util.compile_expression(expression) class ModuleFormatter(): """Generate text by calling an external function""" - def __init__(self, function_spec, default=None, fmt=None): + def __init__(self, function_spec, default=NONE, fmt=None): module_name, _, function_name = function_spec.partition(":") module = __import__(module_name) self.format_map = getattr(module, function_name) @@ -218,7 +220,7 @@ class ModuleFormatter(): class FStringFormatter(): """Generate text by evaluaring an f-string literal""" - def __init__(self, fstring, default=None, fmt=None): + def __init__(self, fstring, default=NONE, fmt=None): self.format_map = util.compile_expression("f'''" + fstring + "'''") diff --git a/gallery_dl/path.py b/gallery_dl/path.py index 77a33277..3b360e99 100644 --- a/gallery_dl/path.py +++ b/gallery_dl/path.py @@ -29,6 +29,8 @@ class PathFormat(): def __init__(self, extractor): config = extractor.config kwdefault = config("keywords-default") + if kwdefault is None: + kwdefault = util.NONE filename_fmt = config("filename") try: From 202c1210d5ceefe4a32071278901d28080133700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Tue, 6 Dec 2022 22:39:13 +0100 Subject: [PATCH 08/13] [exhentai] fix pagination --- gallery_dl/extractor/exhentai.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index a546f684..01375d81 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -516,7 +516,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): data["gallery_token"] = gallery.group(3) yield Message.Queue, url + "/", data - next_url = text.extr(page, 'nexturl = "', '"', None) + next_url = text.extr(page, 'nexturl="', '"', None) if next_url is not None: if not next_url: return From 79e52f3539d397ad19ba6c9fced45fa6f47305b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 7 Dec 2022 00:17:15 +0100 Subject: [PATCH 09/13] [imgth] rewrite - inherit from GalleryExtractor - fix image URLs - better metadata --- gallery_dl/extractor/imgth.py | 83 ++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/gallery_dl/extractor/imgth.py b/gallery_dl/extractor/imgth.py index 7e4cce4e..9ae22a91 100644 --- a/gallery_dl/extractor/imgth.py +++ b/gallery_dl/extractor/imgth.py @@ -1,60 +1,73 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://imgth.com/""" +"""Extractors for https://imgth.com/""" -from .common import Extractor, Message +from .common import GalleryExtractor from .. import text -class ImgthGalleryExtractor(Extractor): +class ImgthGalleryExtractor(GalleryExtractor): """Extractor for image galleries from imgth.com""" category = "imgth" - subcategory = "gallery" - directory_fmt = ("{category}", "{gallery_id} {title}") - filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" - archive_fmt = "{gallery_id}_{num}" - pattern = r"(?:https?://)?imgth\.com/gallery/(\d+)" - test = ("http://imgth.com/gallery/37/wallpaper-anime", { - "url": "4ae1d281ca2b48952cf5cca57e9914402ad72748", - "keyword": "6f8c00d6849ea89d1a028764675ec1fe9dbd87e2", - }) + root = "https://imgth.com" + pattern = r"(?:https?://)?(?:www\.)?imgth\.com/gallery/(\d+)" + test = ( + ("https://imgth.com/gallery/37/wallpaper-anime", { + "url": "4ae1d281ca2b48952cf5cca57e9914402ad72748", + "pattern": r"https://imgth\.com/images/2009/11/25" + r"/wallpaper-anime_\w+\.jpg", + "keyword": { + "count": 12, + "date": "dt:2009-11-25 18:21:00", + "extension": "jpg", + "filename": r"re:wallpaper-anime_\w+", + "gallery_id": 37, + "num": int, + "title": "Wallpaper anime", + "user": "celebrities", + }, + }), + ("https://www.imgth.com/gallery/37/wallpaper-anime"), + ) def __init__(self, match): - Extractor.__init__(self, match) - self.gid = match.group(1) - self.url_base = "https://imgth.com/gallery/" + self.gid + "/g/page/" + self.gallery_id = gid = match.group(1) + url = "{}/gallery/{}/g/".format(self.root, gid) + GalleryExtractor.__init__(self, match, url) - def items(self): - page = self.request(self.url_base + "0").text - data = self.metadata(page) - yield Message.Directory, data - for data["num"], url in enumerate(self.images(page), 1): - yield Message.Url, url, text.nameext_from_url(url, data) + def metadata(self, page): + extr = text.extract_from(page) + return { + "gallery_id": text.parse_int(self.gallery_id), + "title": text.unescape(extr("<h1>", "</h1>")), + "count": text.parse_int(extr( + "total of images in this gallery: ", " ")), + "date" : text.parse_datetime( + extr("created on ", " by <") + .replace("th, ", " ", 1).replace("nd, ", " ", 1) + .replace("st, ", " ", 1), "%B %d %Y at %H:%M"), + "user" : text.unescape(extr(">", "<")), + } def images(self, page): - """Yield all image urls for this gallery""" pnum = 0 + while True: thumbs = text.extr(page, '<ul class="thumbnails">', '</ul>') for url in text.extract_iter(thumbs, '<img src="', '"'): - yield "https://imgth.com/images" + url[24:] + path = url.partition("/thumbs/")[2] + yield ("{}/images/{}".format(self.root, path), None) + if '<li class="next">' not in page: return - pnum += 1 - page = self.request(self.url_base + str(pnum)).text - def metadata(self, page): - """Collect metadata for extractor-job""" - return text.extract_all(page, ( - ("title", '<h1>', '</h1>'), - ("count", 'total of images in this gallery: ', ' '), - ("date" , 'created on ', ' by <'), - (None , 'href="/users/', ''), - ("user" , '>', '<'), - ), values={"gallery_id": self.gid})[0] + pnum += 1 + url = "{}/gallery/{}/g/page/{}".format( + self.root, self.gallery_id, pnum) + page = self.request(url).text From 1c25cc7a3e4e9094ca6d333e8a559522dd222a91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= <mike_faehrmann@web.de> Date: Wed, 7 Dec 2022 21:23:45 +0100 Subject: [PATCH 10/13] [warosu] fix and update --- gallery_dl/extractor/warosu.py | 62 ++++++++++++++++------------------ 1 file changed, 30 insertions(+), 32 deletions(-) diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py index 677680fa..bdedfcbc 100644 --- a/gallery_dl/extractor/warosu.py +++ b/gallery_dl/extractor/warosu.py @@ -1,21 +1,22 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2019 Mike Fährmann +# Copyright 2017-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://warosu.org/""" +"""Extractors for https://warosu.org/""" from .common import Extractor, Message from .. import text class WarosuThreadExtractor(Extractor): - """Extractor for images from threads on warosu.org""" + """Extractor for threads on warosu.org""" category = "warosu" subcategory = "thread" + root = "https://warosu.org" directory_fmt = ("{category}", "{board}", "{thread} - {title}") filename_fmt = "{tim}-{filename}.{extension}" archive_fmt = "{board}_{thread}_{tim}" @@ -31,7 +32,6 @@ class WarosuThreadExtractor(Extractor): "content": "d48df0a701e6599312bfff8674f4aa5d4fb8db1c", }), ) - root = "https://warosu.org" def __init__(self, match): Extractor.__init__(self, match) @@ -40,12 +40,12 @@ class WarosuThreadExtractor(Extractor): def items(self): url = "{}/{}/thread/{}".format(self.root, self.board, self.thread) page = self.request(url).text - data = self.get_metadata(page) + data = self.metadata(page) posts = self.posts(page) if not data["title"]: - title = text.remove_html(posts[0]["com"]) - data["title"] = text.unescape(title)[:50] + data["title"] = text.unescape(text.remove_html( + posts[0]["com"]))[:50] yield Message.Directory, data for post in posts: @@ -55,25 +55,24 @@ class WarosuThreadExtractor(Extractor): post.update(data) yield Message.Url, post["image"], post - def get_metadata(self, page): - """Collect metadata for extractor-job""" + def metadata(self, page): boardname = text.extr(page, "<title>", "") title = text.extr(page, 'filetitle" itemprop="name">', '<') return { - "board": self.board, + "board" : self.board, "board_name": boardname.rpartition(" - ")[2], - "thread": self.thread, - "title": title, + "thread" : self.thread, + "title" : title, } def posts(self, page): - """Build a list of all post-objects""" + """Build a list of all post objects""" page = text.extr(page, '
', '') needle = '
' return [self.parse(post) for post in page.split(needle)] def parse(self, post): - """Build post-object by extracting data from an HTML post""" + """Build post object by extracting data from an HTML post""" data = self._extract_post(post) if "File:" in post: self._extract_image(post, data) @@ -84,24 +83,23 @@ class WarosuThreadExtractor(Extractor): @staticmethod def _extract_post(post): - data = text.extract_all(post, ( - ("no" , 'id="p', '"'), - ("name", '', ''), - ("time", ''), - ("now" , '', '<'), - ("com" , '

', '

'), - ))[0] - data["com"] = text.unescape(text.remove_html(data["com"].strip())) - return data + extr = text.extract_from(post) + return { + "no" : extr('id="p', '"'), + "name": extr('', ""), + "time": extr(''), + "now" : extr("", "<"), + "com" : text.unescape(text.remove_html(extr( + '

', '

' + ).strip())), + } @staticmethod def _extract_image(post, data): - text.extract_all(post, ( - ("fsize" , 'File: ', ', '), - ("w" , '', 'x'), - ("h" , '', ', '), - ("filename", '', '<'), - ("image" , '
\nFile: ", ", ") + data["w"] = extr("", "x") + data["h"] = extr("", ", ") + data["filename"] = text.unquote(extr("", "<").rpartition(".")[0]) + extr("
", "") + data["image"] = "https:" + extr('
Date: Wed, 7 Dec 2022 21:36:34 +0100 Subject: [PATCH 11/13] [khinsider] fix metadata extraction --- gallery_dl/extractor/khinsider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py index d5cca1c2..0c3b002f 100644 --- a/gallery_dl/extractor/khinsider.py +++ b/gallery_dl/extractor/khinsider.py @@ -65,7 +65,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor): "count": text.parse_int(extr("Number of Files: ", "<")), "size" : text.parse_bytes(extr("Total Filesize: ", "<")[:-1]), "date" : extr("Date Added: ", "<"), - "type" : extr("Album type: ", "<"), + "type" : text.remove_html(extr("Album type: ", "")), }} def tracks(self, page): From cd931e1139b2146375eb722bd6fa505e8a71b09c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 8 Dec 2022 18:58:29 +0100 Subject: [PATCH 12/13] update extractor test results --- gallery_dl/extractor/35photo.py | 1 + gallery_dl/extractor/8chan.py | 4 +-- gallery_dl/extractor/deviantart.py | 14 ----------- gallery_dl/extractor/erome.py | 37 +++++++++++++++++++--------- gallery_dl/extractor/exhentai.py | 8 ++++-- gallery_dl/extractor/gelbooru_v02.py | 3 ++- gallery_dl/extractor/mangadex.py | 2 +- gallery_dl/extractor/redgifs.py | 4 +-- gallery_dl/extractor/slickpic.py | 3 ++- gallery_dl/extractor/smugmug.py | 2 +- gallery_dl/extractor/twibooru.py | 2 +- gallery_dl/extractor/twitter.py | 6 ++--- gallery_dl/extractor/unsplash.py | 18 +++++++------- gallery_dl/extractor/webtoons.py | 1 + 14 files changed, 56 insertions(+), 49 deletions(-) diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py index 28acc3d5..f86691d4 100644 --- a/gallery_dl/extractor/35photo.py +++ b/gallery_dl/extractor/35photo.py @@ -146,6 +146,7 @@ class _35photoTagExtractor(_35photoExtractor): test = ("https://35photo.pro/tags/landscape/", { "range": "1-25", "count": 25, + "archive": False, }) def __init__(self, match): diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py index 1e020c25..0e128c3a 100644 --- a/gallery_dl/extractor/8chan.py +++ b/gallery_dl/extractor/8chan.py @@ -92,8 +92,8 @@ class _8chanThreadExtractor(_8chanExtractor): "uniquePosters": 9, "usesCustomCss": True, "usesCustomJs": False, - "wsPort": 8880, - "wssPort": 2087, + "?wsPort": 8880, + "?wssPort": 2087, }, }), ("https://8chan.se/vhs/res/4.html"), diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 45beddf3..df59be4a 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -896,20 +896,6 @@ class DeviantartDeviationExtractor(DeviantartExtractor): "range": "2-", "count": 4, }), - # video - ("https://www.deviantart.com/chi-u/art/-VIDEO-Brushes-330774593", { - "pattern": r"https://wixmp-.+wixmp.com/v/mp4/.+\.720p\.\w+.mp4", - "keyword": { - "filename": r"re:_video____brushes_\w+_by_chi_u-d5gxnb5", - "extension": "mp4", - "target": { - "duration": 306, - "filesize": 19367585, - "quality": "720p", - "src": str, - }, - } - }), # journal ("https://www.deviantart.com/shimoda7/journal/ARTility-583755752", { "url": "d34b2c9f873423e665a1b8ced20fcb75951694a3", diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index b4dadc7e..ad3f16ba 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021 Mike Fährmann +# Copyright 2021-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -92,16 +92,29 @@ class EromeAlbumExtractor(EromeExtractor): """Extractor for albums on erome.com""" subcategory = "album" pattern = BASE_PATTERN + r"/a/(\w+)" - test = ("https://www.erome.com/a/TyFMI7ik", { - "pattern": r"https://s\d+\.erome\.com/\d+/TyFMI7ik/\w+", - "count": 9, - "keyword": { - "album_id": "TyFMI7ik", - "num": int, - "title": "Ryan Ryans", - "user": "xanub", - }, - }) + test = ( + ("https://www.erome.com/a/NQgdlWvk", { + "pattern": r"https://v\d+\.erome\.com/\d+" + r"/NQgdlWvk/j7jlzmYB_480p\.mp4", + "count": 1, + "keyword": { + "album_id": "NQgdlWvk", + "num": 1, + "title": "porn", + "user": "yYgWBZw8o8qsMzM", + }, + }), + ("https://www.erome.com/a/TdbZ4ogi", { + "pattern": r"https://s\d+\.erome\.com/\d+/TdbZ4ogi/\w+", + "count": 6, + "keyword": { + "album_id": "TdbZ4ogi", + "num": int, + "title": "82e78cfbb461ad87198f927fcb1fda9a1efac9ff.", + "user": "yYgWBZw8o8qsMzM", + }, + }), + ) def albums(self): return (self.item,) @@ -110,7 +123,7 @@ class EromeAlbumExtractor(EromeExtractor): class EromeUserExtractor(EromeExtractor): subcategory = "user" pattern = BASE_PATTERN + r"/(?!a/|search\?)([^/?#]+)" - test = ("https://www.erome.com/xanub", { + test = ("https://www.erome.com/yYgWBZw8o8qsMzM", { "range": "1-25", "count": 25, }) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 01375d81..dccc74e4 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -117,9 +117,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): r"|/s/([\da-f]{10})/(\d+)-(\d+))") test = ( ("https://exhentai.org/g/1200119/d55c44d3d0/", { + "options": (("original", False),), "keyword": { "cost": int, - "date": "dt:2018-03-18 20:15:00", + "date": "dt:2018-03-18 20:14:00", "eh_category": "Non-H", "expunged": False, "favorites": r"re:^[12]\d$", @@ -150,7 +151,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "uploader": "klorpa", "width": int, }, - "content": "e9891a4c017ed0bb734cd1efba5cd03f594d31ff", + "content": ("2c68cff8a7ca540a78c36fdbf5fbae0260484f87", + "e9891a4c017ed0bb734cd1efba5cd03f594d31ff"), }), ("https://exhentai.org/g/960461/4f0e369d82/", { "exception": exception.NotFoundError, @@ -159,9 +161,11 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "exception": exception.AuthorizationError, }), ("https://exhentai.org/s/f68367b4c8/1200119-3", { + "options": (("original", False),), "count": 2, }), ("https://e-hentai.org/s/f68367b4c8/1200119-3", { + "options": (("original", False),), "count": 2, }), ("https://g.e-hentai.org/g/1200119/d55c44d3d0/"), diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index da87b8f1..facd3dbe 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -174,7 +174,8 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor): pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)" test = ( ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", { - "content": "5c6ae9ee13e6d4bc9cb8bdce224c84e67fbfa36c", + "content": ("5c6ae9ee13e6d4bc9cb8bdce224c84e67fbfa36c", + "622e80be3f496672c44aab5c47fbc6941c61bc79"), "pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg", "count": 2, }), diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 0bc35274..dae203e7 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -109,7 +109,7 @@ class MangadexChapterExtractor(MangadexExtractor): }), # 'externalUrl', but still downloadable (#2503) ("https://mangadex.org/chapter/364728a4-6909-4164-9eea-6b56354f7c78", { - "count": 39, + "count": 0, # 404 }), ) diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index 53e5e790..ad4282c8 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -72,7 +72,7 @@ class RedgifsUserExtractor(RedgifsExtractor): pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/users/([^/?#]+)" test = ("https://www.redgifs.com/users/Natalifiction", { "pattern": r"https://\w+\.redgifs\.com/[A-Za-z]+\.mp4", - "count": ">= 120", + "count": ">= 100", }) def metadata(self): @@ -89,7 +89,7 @@ class RedgifsSearchExtractor(RedgifsExtractor): pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/browse/?\?([^#]+)" test = ( ("https://www.redgifs.com/browse?tags=JAV", { - "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.mp4", + "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.(mp4|jpg)", "range": "1-10", "count": 10, }), diff --git a/gallery_dl/extractor/slickpic.py b/gallery_dl/extractor/slickpic.py index ae4e2e8a..3727c0b0 100644 --- a/gallery_dl/extractor/slickpic.py +++ b/gallery_dl/extractor/slickpic.py @@ -43,7 +43,8 @@ class SlickpicAlbumExtractor(SlickpicExtractor): }), ("https://mattcrandall.slickpic.com/albums/LamborghiniMurcielago/", { "range": "34", - "content": ("52b5a310587de1048030ab13a912f6a3a9cc7dab", + "content": ("276eb2c902187bb177ae8013e310e1d6641fba9a", + "52b5a310587de1048030ab13a912f6a3a9cc7dab", "cec6630e659dc72db1ee1a9a6f3b525189261988", "6f81e1e74c6cd6db36844e7211eef8e7cd30055d", "22e83645fc242bc3584eca7ec982c8a53a4d8a44"), diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py index 2264fe48..713d4c41 100644 --- a/gallery_dl/extractor/smugmug.py +++ b/gallery_dl/extractor/smugmug.py @@ -117,7 +117,7 @@ class SmugmugImageExtractor(SmugmugExtractor): # video ("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", { "url": "04d0ab1ff829ca7d78f5acb5548953df08e9a5ee", - "keyword": "4cef98133ace511adc874c9d9abac5817ba0d856", + "keyword": "2b545184592c282b365fcbb7df6ca7952b8a3173", }), ) diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py index f010f926..30bf2f15 100644 --- a/gallery_dl/extractor/twibooru.py +++ b/gallery_dl/extractor/twibooru.py @@ -83,7 +83,7 @@ class TwibooruPostExtractor(TwibooruExtractor): "tag_ids": list, "tags": list, "thumbnails_generated": True, - "updated_at": "2022-09-21T14:31:50.441Z", + "updated_at": "2022-11-27T00:34:50.483Z", "upvotes": int, "view_url": "https://cdn.twibooru.org/img/2020/7/8/1/full.png", "width": 576, diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 22d4a6ec..d0411acf 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -633,7 +633,7 @@ class TwitterEventExtractor(TwitterExtractor): pattern = BASE_PATTERN + r"/i/events/(\d+)" test = ("https://twitter.com/i/events/1484669206993903616", { "range": "1-20", - "count": ">5", + "count": ">=1", }) def metadata(self): @@ -759,7 +759,7 @@ class TwitterTweetExtractor(TwitterExtractor): # retweet with missing media entities (#1555) ("https://twitter.com/morino_ya/status/1392763691599237121", { "options": (("retweets", True),), - "count": 4, + "count": 0, # private }), # deleted quote tweet (#2225) ("https://twitter.com/i/web/status/1460044411165888515", { @@ -782,7 +782,7 @@ class TwitterTweetExtractor(TwitterExtractor): # '?format=...&name=...'-style URLs ("https://twitter.com/poco_dandy/status/1150646424461176832", { "options": (("cards", True),), - "pattern": r"https://pbs.twimg.com/card_img/157\d+/\w+" + "pattern": r"https://pbs.twimg.com/card_img/157\d+/[\w-]+" r"\?format=(jpg|png)&name=orig$", "range": "1-2", }), diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py index 8bea18c7..b298c27e 100644 --- a/gallery_dl/extractor/unsplash.py +++ b/gallery_dl/extractor/unsplash.py @@ -78,11 +78,11 @@ class UnsplashImageExtractor(UnsplashExtractor): pattern = BASE_PATTERN + r"/photos/([^/?#]+)" test = ("https://unsplash.com/photos/lsoogGC_5dg", { "pattern": r"https://images\.unsplash\.com/photo-1586348943529-" - r"beaae6c28db9\?ixid=\w+&ixlib=rb-1.2.1", + r"beaae6c28db9\?ixid=\w+&ixlib=rb-4.0.3", "keyword": { "alt_description": "re:silhouette of trees near body of water ", "blur_hash": "LZP4uQS4jboe%#o0WCa}2doJNaaz", - "categories": list, + "? categories": list, "color": "#f3c08c", "created_at": "2020-04-08T12:29:42Z", "date": "dt:2020-04-08 12:29:42", @@ -108,9 +108,8 @@ class UnsplashImageExtractor(UnsplashExtractor): "name": "Beaver Dam, WI 53916, USA", "position": { "latitude": 43.457769, - "longitude": -88.837329 + "longitude": -88.837329, }, - "title": "Beaver Dam, WI 53916, USA" }, "promoted_at": "2020-04-08T15:12:03Z", "sponsorship": None, @@ -149,7 +148,7 @@ class UnsplashUserExtractor(UnsplashExtractor): pattern = BASE_PATTERN + r"/@(\w+)/?$" test = ("https://unsplash.com/@davehoefler", { "pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+" - r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$", + r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-4\.0\.3$", "range": "1-30", "count": 30, }) @@ -166,7 +165,7 @@ class UnsplashFavoriteExtractor(UnsplashExtractor): pattern = BASE_PATTERN + r"/@(\w+)/likes" test = ("https://unsplash.com/@davehoefler/likes", { "pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+" - r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$", + r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-4\.0\.3$", "range": "1-30", "count": 30, }) @@ -184,7 +183,7 @@ class UnsplashCollectionExtractor(UnsplashExtractor): test = ( ("https://unsplash.com/collections/3178572/winter", { "pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+" - r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$", + r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-4\.0\.3$", "keyword": {"collection_id": "3178572", "collection_title": "winter"}, "range": "1-30", @@ -212,8 +211,9 @@ class UnsplashSearchExtractor(UnsplashExtractor): subcategory = "search" pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^#]+))?" test = ("https://unsplash.com/s/photos/hair-style", { - "pattern": r"https://images\.unsplash\.com/((flagged/)?photo-\d+-\w+" - r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$", + "pattern": r"https://(images|plus)\.unsplash\.com" + r"/((flagged/|premium_)?photo-\d+-\w+" + r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-4\.0\.3$", "range": "1-30", "count": 30, }) diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index 8a22fcb7..21f7c21e 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -57,6 +57,7 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): }), (("https://www.webtoons.com/en/challenge/punderworld" "/happy-earth-day-/viewer?title_no=312584&episode_no=40"), { + "exception": exception.NotFoundError, "keyword": { "comic": "punderworld", "description": str, From dfe7b23579092223356f8aa0a861f7d2a5600e93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Fri, 9 Dec 2022 19:43:55 +0100 Subject: [PATCH 13/13] support Firefox containers for --cookies-from-browser (#3346) --- docs/configuration.rst | 4 +- gallery_dl/__init__.py | 7 +++- gallery_dl/cookies.py | 86 +++++++++++++++++++++++++++++++----------- gallery_dl/option.py | 8 ++-- 4 files changed, 78 insertions(+), 27 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index 59fa8fc2..f2a3aa35 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -432,15 +432,17 @@ Description "isAdult" : "1" } - * A ``list`` with up to 3 entries specifying a browser profile. + * A ``list`` with up to 4 entries specifying a browser profile. * The first entry is the browser name * The optional second entry is a profile name or an absolute path to a profile directory * The optional third entry is the keyring to retrieve passwords for decrypting cookies from + * The optional fourth entry is a (Firefox) container name (``"none"`` for only cookies with no container) .. code:: json ["firefox"] + ["firefox", null, null, "Personal"] ["chromium", "Private", "kwallet"] diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 3701d6fd..611b2b92 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -66,7 +66,12 @@ def main(): if args.cookies_from_browser: browser, _, profile = args.cookies_from_browser.partition(":") browser, _, keyring = browser.partition("+") - config.set((), "cookies", (browser, profile, keyring)) + if profile.startswith(":"): + container = profile[1:] + profile = None + else: + profile, _, container = profile.partition("::") + config.set((), "cookies", (browser, profile, keyring, container)) for opts in args.options: config.set(*opts) diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 6f9a92db..ee00bf74 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -24,7 +24,7 @@ import tempfile from datetime import datetime, timedelta, timezone from hashlib import pbkdf2_hmac from http.cookiejar import Cookie -from . import aes +from . import aes, text SUPPORTED_BROWSERS_CHROMIUM = { @@ -35,11 +35,10 @@ logger = logging.getLogger("cookies") def load_cookies(cookiejar, browser_specification): - browser_name, profile, keyring = \ + browser_name, profile, keyring, container = \ _parse_browser_specification(*browser_specification) - if browser_name == "firefox": - load_cookies_firefox(cookiejar, profile) + load_cookies_firefox(cookiejar, profile, container) elif browser_name == "safari": load_cookies_safari(cookiejar, profile) elif browser_name in SUPPORTED_BROWSERS_CHROMIUM: @@ -48,12 +47,24 @@ def load_cookies(cookiejar, browser_specification): raise ValueError("unknown browser '{}'".format(browser_name)) -def load_cookies_firefox(cookiejar, profile=None): - set_cookie = cookiejar.set_cookie - with _firefox_cookies_database(profile) as db: +def load_cookies_firefox(cookiejar, profile=None, container=None): + path, container_id = _firefox_cookies_database(profile, container) + with DatabaseCopy(path) as db: + + sql = ("SELECT name, value, host, path, isSecure, expiry " + "FROM moz_cookies") + parameters = () + + if container_id is False: + sql += " WHERE NOT INSTR(originAttributes,'userContextId=')" + elif container_id: + sql += " WHERE originAttributes LIKE ? OR originAttributes LIKE ?" + uid = "%userContextId={}".format(container_id) + parameters = (uid, uid + "&%") + + set_cookie = cookiejar.set_cookie for name, value, domain, path, secure, expires in db.execute( - "SELECT name, value, host, path, isSecure, expiry " - "FROM moz_cookies"): + sql, parameters): set_cookie(Cookie( 0, name, value, None, False, domain, bool(domain), domain.startswith("."), @@ -79,9 +90,10 @@ def load_cookies_safari(cookiejar, profile=None): def load_cookies_chrome(cookiejar, browser_name, profile, keyring): config = _get_chromium_based_browser_settings(browser_name) + path = _chrome_cookies_database(profile, config) + logger.debug("Extracting cookies from %s", path) - with _chrome_cookies_database(profile, config) as db: - + with DatabaseCopy(path) as db: db.text_factory = bytes decryptor = get_cookie_decryptor( config["directory"], config["keyring"], keyring=keyring) @@ -134,8 +146,8 @@ def load_cookies_chrome(cookiejar, browser_name, profile, keyring): # -------------------------------------------------------------------- # firefox -def _firefox_cookies_database(profile=None): - if profile is None: +def _firefox_cookies_database(profile=None, container=None): + if not profile: search_root = _firefox_browser_directory() elif _is_path(profile): search_root = profile @@ -146,14 +158,45 @@ def _firefox_cookies_database(profile=None): if path is None: raise FileNotFoundError("Unable to find Firefox cookies database in " "{}".format(search_root)) - logger.debug("Extracting cookies from %s", path) - return DatabaseCopy(path) + + if container == "none": + container_id = False + logger.debug("Only loading cookies not belonging to any container") + + elif container: + containers_path = os.path.join( + os.path.dirname(path), "containers.json") + + try: + with open(containers_path) as containers: + identities = json.load(containers)["identities"] + except OSError: + logger.error("Unable to read Firefox container database at %s", + containers_path) + raise + except KeyError: + identities = () + + for context in identities: + if container == context.get("name") or container == text.extr( + context.get("l10nID", ""), "userContext", ".label"): + container_id = context["userContextId"] + break + else: + raise ValueError("Unable to find Firefox container {}".format( + container)) + logger.debug("Only loading cookies from container '%s' (ID %s)", + container, container_id) + else: + container_id = None + + return path, container_id def _firefox_browser_directory(): if sys.platform in ("win32", "cygwin"): - return os.path.expandvars(R"%APPDATA%\Mozilla\Firefox\Profiles") + return os.path.expandvars(r"%APPDATA%\Mozilla\Firefox\Profiles") if sys.platform == "darwin": return os.path.expanduser("~/Library/Application Support/Firefox") return os.path.expanduser("~/.mozilla/firefox") @@ -237,7 +280,7 @@ def _safari_parse_cookies_record(data, cookiejar): cookiejar.set_cookie(Cookie( 0, name, value, None, False, - domain, bool(domain), domain.startswith('.'), + domain, bool(domain), domain.startswith("."), path, bool(path), is_secure, expiration_date, False, None, None, {}, )) @@ -265,9 +308,7 @@ def _chrome_cookies_database(profile, config): if path is None: raise FileNotFoundError("Unable to find {} cookies database in " "'{}'".format(config["browser"], search_root)) - - logger.debug("Extracting cookies from %s", path) - return DatabaseCopy(path) + return path def _get_chromium_based_browser_settings(browser_name): @@ -937,11 +978,12 @@ def _is_path(value): return os.path.sep in value -def _parse_browser_specification(browser, profile=None, keyring=None): +def _parse_browser_specification( + browser, profile=None, keyring=None, container=None): if browser not in SUPPORTED_BROWSERS: raise ValueError("unsupported browser '{}'".format(browser)) if keyring and keyring not in SUPPORTED_KEYRINGS: raise ValueError("unsupported keyring '{}'".format(keyring)) if profile and _is_path(profile): profile = os.path.expanduser(profile) - return browser, profile, keyring + return browser, profile, keyring, container diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 4d9a3587..91e9169c 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -142,10 +142,12 @@ def build_parser(): ) general.add_argument( "--cookies-from-browser", - dest="cookies_from_browser", metavar="BROWSER[+KEYRING][:PROFILE]", + dest="cookies_from_browser", + metavar="BROWSER[+KEYRING][:PROFILE][::CONTAINER]", help=("Name of the browser to load cookies from, " - "with optional keyring name prefixed with '+' and " - "profile prefixed with ':'"), + "with optional keyring name prefixed with '+', " + "profile prefixed with ':', and " + "container prefixed with '::' ('none' for no container)"), ) output = parser.add_argument_group("Output Options")