From 09d872a2b1c3839a12b569dc6606f892aa3ceaff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 7 Mar 2019 22:55:26 +0100 Subject: [PATCH] generalize extractor creation code --- gallery_dl/extractor/common.py | 42 ++++++++++++++++++ gallery_dl/extractor/foolfuuka.py | 72 ++++++++++--------------------- gallery_dl/extractor/foolslide.py | 68 +++++++---------------------- gallery_dl/extractor/shopify.py | 69 +++++++---------------------- 4 files changed, 97 insertions(+), 154 deletions(-) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 3babcbfb..f984811d 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -337,6 +337,48 @@ class SharedConfigMixin(): return value +def generate_extractors(extractor_data, symtable, classes): + """Dynamically generate Extractor classes""" + extractors = config.get(("extractor", classes[0].basecategory)) + ckey = extractor_data.get("_ckey") + prev = None + + if extractors: + extractor_data.update(extractors) + + for category, info in extractor_data.items(): + + if not isinstance(info, dict): + continue + + root = info["root"] + domain = root[root.index(":") + 3:] + pattern = info.get("pattern") or re.escape(domain) + name = (info.get("name") or category).capitalize() + + for cls in classes: + + class Extr(cls): + pass + Extr.__module__ = cls.__module__ + Extr.__name__ = Extr.__qualname__ = \ + name + cls.subcategory.capitalize() + "Extractor" + Extr.__doc__ = \ + "Extractor for " + cls.subcategory + "s from " + domain + Extr.category = category + Extr.pattern = r"(?:https?://)?" + pattern + cls.pattern_fmt + Extr.test = info.get("test-" + cls.subcategory) + Extr.root = root + + if "extra" in info: + for key, value in info["extra"].items(): + setattr(Extr, key, value) + if prev and ckey: + setattr(Extr, ckey, prev) + + symtable[Extr.__name__] = prev = Extr + + # Reduce strictness of the expected magic string in cookiejar files. # (This allows the use of Wget-generated cookiejars without modification) diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 9c9b2d0b..ddb3b92b 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -8,11 +8,10 @@ """Extractors for 4chan archives based on FoolFuuka""" -from .common import Extractor, Message, SharedConfigMixin -from .. import text, config +from .common import Extractor, Message, SharedConfigMixin, generate_extractors +from .. import text import itertools import operator -import re class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor): @@ -23,12 +22,16 @@ class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor): "{thread_num}{title:? - //}") filename_fmt = "{media[media]}" archive_fmt = "{board[shortname]}_{num}_{timestamp}" + pattern_fmt = r"/([^/]+)/thread/(\d+)" + resolve = "default" root = "" def __init__(self, match): Extractor.__init__(self, match) self.board, self.thread = match.groups() self.session.headers["Referer"] = self.root + if self.resolve == "direct": + self.remote = self._remote_direct def items(self): op = True @@ -52,6 +55,7 @@ class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor): yield Message.Url, url, post def posts(self): + """Return an iterable with all posts in this thread""" url = self.root + "/_/api/chan/thread/" params = {"board": self.board, "num": self.thread} data = self.request(url, params=params).json()[self.thread] @@ -63,59 +67,28 @@ class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor): return itertools.chain((data["op"],), posts) def remote(self, media): + """Resolve a remote media link""" needle = '', '') @@ -116,52 +123,6 @@ class FoolslideMangaExtractor(FoolslideBase, MangaExtractor): }))) -def generate_extractors(): - """Dynamically generate Extractor classes for FoOlSlide instances""" - - symtable = globals() - extractors = config.get(("extractor", "foolslide")) - - if extractors: - EXTRACTORS.update(extractors) - - for category, info in EXTRACTORS.items(): - - if not isinstance(info, dict): - continue - - root = info["root"] - domain = root[root.index(":") + 3:] - pattern = info.get("pattern") or re.escape(domain) - name = (info.get("name") or category).capitalize() - - class ChExtr(FoolslideChapterExtractor): - pass - - ChExtr.__name__ = ChExtr.__qualname__ = name + "ChapterExtractor" - ChExtr.__doc__ = "Extractor for manga-chapters from " + domain - ChExtr.category = category - ChExtr.pattern = (r"(?:https?://)?" + pattern + - r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)") - ChExtr.test = info.get("test-chapter") - ChExtr.root = root - if "decode" in info: - ChExtr.decode = info["decode"] - symtable[ChExtr.__name__] = ChExtr - - class MaExtr(FoolslideMangaExtractor): - pass - - MaExtr.__name__ = MaExtr.__qualname__ = name + "MangaExtractor" - MaExtr.__doc__ = "Extractor for manga from " + domain - MaExtr.category = category - MaExtr.pattern = r"(?:https?://)?" + pattern + r"(/series/[^/?&#]+)" - MaExtr.test = info.get("test-manga") - MaExtr.root = root - MaExtr.chapterclass = ChExtr - symtable[MaExtr.__name__] = MaExtr - - EXTRACTORS = { "dokireader": { "root": "https://kobato.hologfx.com/reader", @@ -180,7 +141,7 @@ EXTRACTORS = { "jaiminisbox": { "root": "https://jaiminisbox.com/reader", "pattern": r"(?:www\.)?jaiminisbox\.com/reader", - "decode": "base64", + "extra": {"decode": "base64"}, "test-chapter": ( ("https://jaiminisbox.com/reader/read/uratarou/en/0/1/", { "keyword": "6009af77cc9c05528ab1fdda47b1ad9d4811c673", @@ -290,7 +251,10 @@ EXTRACTORS = { "keyword": "3a24f1088b4d7f3b798a96163f21ca251293a120", }), }, + "_ckey": "chapterclass", } - -generate_extractors() +generate_extractors(EXTRACTORS, globals(), ( + FoolslideChapterExtractor, + FoolslideMangaExtractor, +)) diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py index 2c05895d..f84fd8b9 100644 --- a/gallery_dl/extractor/shopify.py +++ b/gallery_dl/extractor/shopify.py @@ -8,8 +8,8 @@ """Extractors for Shopify instances""" -from .common import Extractor, Message, SharedConfigMixin -from .. import text, config +from .common import Extractor, Message, SharedConfigMixin, generate_extractors +from .. import text import time import re @@ -63,13 +63,13 @@ class ShopifyExtractor(SharedConfigMixin, Extractor): def products(self): """Return an iterable with all relevant product URLs""" - return () class ShopifyCollectionExtractor(ShopifyExtractor): """Base class for collection extractors for Shopify based sites""" subcategory = "collection" directory_fmt = ("{category}", "{collection[title]}") + pattern_fmt = r"(/collections/[\w-]+)/?(?:\?([^#]+))?(?:$|#)" def __init__(self, match): ShopifyExtractor.__init__(self, match) @@ -98,58 +98,23 @@ class ShopifyProductExtractor(ShopifyExtractor): """Base class for product extractors for Shopify based sites""" subcategory = "product" directory_fmt = ("{category}", "Products") + pattern_fmt = r"((?:/collections/[\w-]+)?/products/[\w-]+)" def products(self): return (self.item_url,) -def generate_extractors(): - """Dynamically generate Extractor classes for Shopify instances""" - symtable = globals() - extractors = config.get(("extractor", "shopify")) - - if extractors: - EXTRACTORS.update(extractors) - - for category, info in EXTRACTORS.items(): - - if not isinstance(info, dict): - continue - - root = info["root"] - domain = root[root.index(":") + 3:] - pattern = info.get("pattern") or re.escape(domain) - name = (info.get("name") or category).capitalize() - - class CoExtr(ShopifyCollectionExtractor): - pass - - CoExtr.__name__ = CoExtr.__qualname__ = name + "CollectionExtractor" - CoExtr.__doc__ = "Extractor for product collections from " + domain - CoExtr.category = category - CoExtr.pattern = (r"(?:https?://)?" + pattern + - r"(/collections/[\w-]+)/?(?:\?([^#]+))?(?:$|#)") - CoExtr.test = info.get("test-collection") - CoExtr.root = root - symtable[CoExtr.__name__] = CoExtr - - class PrExtr(ShopifyProductExtractor): - pass - - PrExtr.__name__ = PrExtr.__qualname__ = name + "ProductExtractor" - PrExtr.__doc__ = "Extractor for individual products from " + domain - PrExtr.category = category - PrExtr.pattern = (r"(?:https?://)?" + pattern + - r"((?:/collections/[\w-]+)?/products/[\w-]+)") - PrExtr.test = info.get("test-product") - PrExtr.root = root - symtable[PrExtr.__name__] = PrExtr - - EXTRACTORS = { "fashionnova": { "root": "https://www.fashionnova.com", "pattern": r"(?:www\.)?fashionnova\.com", + "test-product": ( + ("https://www.fashionnova.com/products/essential-slide-red", { + "pattern": r"https?://cdn\.shopify.com/", + "count": 3, + }), + ("https://www.fashionnova.com/collections/flats/products/name"), + ), "test-collection": ( ("https://www.fashionnova.com/collections/mini-dresses", { "range": "1-20", @@ -158,13 +123,11 @@ EXTRACTORS = { ("https://www.fashionnova.com/collections/mini-dresses/?page=1"), ("https://www.fashionnova.com/collections/mini-dresses#1"), ), - "test-product": ( - ("https://www.fashionnova.com/products" - "/only-here-tonight-cut-out-dress-black"), - ("https://www.fashionnova.com/collections/mini-dresses/products" - "/only-here-tonight-cut-out-dress-black"), - ) + }, } -generate_extractors() +generate_extractors(EXTRACTORS, globals(), ( + ShopifyProductExtractor, + ShopifyCollectionExtractor, +))