extend blacklist/whitelist syntax (#2025)

Each entry in such a list can now also include a subcategory
'<category>:<subcategory>'
and it is possible to use '*' or an empty string as placeholder
'*:<subcategory>', ':<subcategory>', '<category>:*'

For example
  "blacklist": "imgur,*:tag,gfycat:user" or
  "blacklist": ["imgur", "*:tag", "gfycat:user"]
will filter all 'imgur' extractors, all extractors with a 'tag'
subcategory (e.g. https://danbooru.donmai.us/posts?tags=bonocho),
and all 'gfycat' user extractors.
This commit is contained in:
Mike Fährmann
2021-11-23 19:23:02 +01:00
parent 11a3d96d13
commit 010d65dcec
4 changed files with 191 additions and 46 deletions

View File

@@ -11,7 +11,6 @@ import json
import time import time
import errno import errno
import logging import logging
import operator
import functools import functools
import collections import collections
from . import extractor, downloader, postprocessor from . import extractor, downloader, postprocessor
@@ -201,7 +200,6 @@ class DownloadJob(Job):
def __init__(self, url, parent=None): def __init__(self, url, parent=None):
Job.__init__(self, url, parent) Job.__init__(self, url, parent)
self.log = self.get_logger("download") self.log = self.get_logger("download")
self.blacklist = None
self.fallback = None self.fallback = None
self.archive = None self.archive = None
self.sleep = None self.sleep = None
@@ -209,6 +207,7 @@ class DownloadJob(Job):
self.downloaders = {} self.downloaders = {}
self.out = output.select() self.out = output.select()
self.visited = parent.visited if parent else set() self.visited = parent.visited if parent else set()
self._extractor_filter = None
self._skipcnt = 0 self._skipcnt = 0
def handle_url(self, url, kwdict): def handle_url(self, url, kwdict):
@@ -297,9 +296,9 @@ class DownloadJob(Job):
else: else:
extr = extractor.find(url) extr = extractor.find(url)
if extr: if extr:
if self.blacklist is None: if self._extractor_filter is None:
self.blacklist = self._build_blacklist() self._extractor_filter = self._build_extractor_filter()
if extr.category in self.blacklist: if not self._extractor_filter(extr):
extr = None extr = None
if extr: if extr:
@@ -444,22 +443,20 @@ class DownloadJob(Job):
self.hooks = collections.defaultdict(list) self.hooks = collections.defaultdict(list)
pp_log = self.get_logger("postprocessor") pp_log = self.get_logger("postprocessor")
pp_list = [] pp_list = []
category = self.extractor.category
basecategory = self.extractor.basecategory
pp_conf = config.get((), "postprocessor") or {} pp_conf = config.get((), "postprocessor") or {}
for pp_dict in postprocessors: for pp_dict in postprocessors:
if isinstance(pp_dict, str): if isinstance(pp_dict, str):
pp_dict = pp_conf.get(pp_dict) or {"name": pp_dict} pp_dict = pp_conf.get(pp_dict) or {"name": pp_dict}
whitelist = pp_dict.get("whitelist") clist = pp_dict.get("whitelist")
if whitelist and category not in whitelist and \ if clist is not None:
basecategory not in whitelist: negate = False
continue else:
clist = pp_dict.get("blacklist")
blacklist = pp_dict.get("blacklist") negate = True
if blacklist and ( if clist and not util.build_extractor_filter(
category in blacklist or basecategory in blacklist): clist, negate)(self.extractor):
continue continue
name = pp_dict.get("name") name = pp_dict.get("name")
@@ -500,38 +497,18 @@ class DownloadJob(Job):
if condition(pathfmt.kwdict): if condition(pathfmt.kwdict):
callback(pathfmt) callback(pathfmt)
def _build_extractor_filter(self):
    """Build the filter function applied to extractors of child URLs

    Reads the 'whitelist' and 'blacklist' options of the current
    extractor and returns a function that takes an Extractor class
    and returns True if that class is allowed.

    - 'whitelist' set: only matching extractors are allowed.
    - otherwise 'blacklist' set: matching extractors are rejected.
    - neither set: the current extractor's own category is rejected.
    """
    clist = self.extractor.config("whitelist")
    if clist is not None:
        negate = False
        # build_extractor_filter() merges 'special' into the set of
        # listed categories. In whitelist mode that would implicitly
        # allow every special extractor, so nothing extra is passed.
        special = None
    else:
        clist = self.extractor.config("blacklist")
        negate = True
        # special extractors are always rejected in blacklist mode
        special = util.SPECIAL_EXTRACTORS
        if clist is None:
            clist = (self.extractor.category,)

    return util.build_extractor_filter(clist, negate, special)
class SimulationJob(DownloadJob): class SimulationJob(DownloadJob):

View File

@@ -81,6 +81,16 @@ def identity(x):
return x return x
def true(_):
    """Return True, regardless of the given argument"""
    return True
def false(_):
    """Return False, regardless of the given argument"""
    return False
def noop(): def noop():
"""Does nothing""" """Does nothing"""
@@ -432,6 +442,66 @@ def build_duration_func(duration, min=0.0):
return functools.partial(identity, duration if duration > min else min) return functools.partial(identity, duration if duration > min else min)
def build_extractor_filter(categories, negate=True, special=None):
    """Build a function that takes an Extractor class as argument
    and returns True if that class is allowed by 'categories'

    'categories' is either a comma-separated string or an iterable of
    strings. Each entry has the form '<category>[:<subcategory>]', where
    '*' or an empty string acts as placeholder for either part.
    Whitespace around an entry or around either of its parts is ignored,
    so "imgur, gfycat:user" and "imgur,gfycat:user" are equivalent.

    With 'negate' enabled (the default), 'categories' acts as a blacklist
    and matching classes are rejected; otherwise it acts as a whitelist
    and only matching classes are accepted.

    'special' is an optional iterable of additional names that are
    handled as if they were listed as plain categories.
    """
    if isinstance(categories, str):
        categories = categories.split(",")

    catset = set()  # set of categories / basecategories
    subset = set()  # set of subcategories
    catsub = []     # list of category-subcategory pairs

    for item in categories:
        category, _, subcategory = item.partition(":")
        # names never contain whitespace; strip it so that entries
        # written as "a, b" or "a : b" still match
        category = category.strip()
        subcategory = subcategory.strip()

        if category and category != "*":
            if subcategory and subcategory != "*":
                catsub.append((category, subcategory))
            else:
                catset.add(category)
        elif subcategory and subcategory != "*":
            subset.add(subcategory)

    if special:
        catset.update(special)
    elif not catset and not subset and not catsub:
        # nothing to match against:
        # an empty blacklist allows everything,
        # an empty whitelist allows nothing
        return true if negate else false

    tests = []

    if negate:
        if catset:
            tests.append(lambda extr:
                         extr.category not in catset and
                         extr.basecategory not in catset)
        if subset:
            tests.append(lambda extr: extr.subcategory not in subset)
    else:
        if catset:
            tests.append(lambda extr:
                         extr.category in catset or
                         extr.basecategory in catset)
        if subset:
            tests.append(lambda extr: extr.subcategory in subset)

    if catsub:
        def test(extr):
            # a pair matches when its category equals the class's
            # category or basecategory and the subcategories are equal
            for category, subcategory in catsub:
                if category in (extr.category, extr.basecategory) and \
                        subcategory == extr.subcategory:
                    return not negate
            return negate
        tests.append(test)

    if len(tests) == 1:
        return tests[0]
    if negate:
        # blacklist: a class must pass every test to be allowed
        return lambda extr: all(t(extr) for t in tests)
    else:
        # whitelist: passing a single test is enough
        return lambda extr: any(t(extr) for t in tests)
def build_predicate(predicates): def build_predicate(predicates):
if not predicates: if not predicates:
return lambda url, kwdict: True return lambda url, kwdict: True

View File

@@ -37,6 +37,31 @@ class TestJob(unittest.TestCase):
return buffer.getvalue() return buffer.getvalue()
class TestDownloadJob(TestJob):
    jobclass = job.DownloadJob

    def test_extractor_filter(self):
        download_job = self.jobclass(TestExtractor.from_url("test:"))
        classes = (TestExtractor, TestExtractorParent, TestExtractorAlt)

        def check(expected):
            # rebuild the filter from the current config and compare
            # its verdict for each class with the expected one
            allowed = download_job._build_extractor_filter()
            for cls, result in zip(classes, expected):
                self.assertEqual(allowed(cls), result)

        # before this test sets any filter option
        check((False, False, True))

        # blacklist by subcategory only
        config.set((), "blacklist", ":test_subcategory")
        check((False, True, False))

        # whitelist with a category-subcategory pair
        config.set((), "whitelist", "test_category:test_subcategory")
        check((True, False, False))
class TestKeywordJob(TestJob): class TestKeywordJob(TestJob):
jobclass = job.KeywordJob jobclass = job.KeywordJob
@@ -334,5 +359,10 @@ class TestExtractorException(Extractor):
return 1/0 return 1/0
# Extractor with the subcategory "test_subcategory" under a different
# category; used as filter target in TestDownloadJob
class TestExtractorAlt(Extractor):
    subcategory = "test_subcategory"
    category = "test_category_alt"
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@@ -357,6 +357,58 @@ class TestOther(unittest.TestCase):
with self.assertRaises(exception.StopExtraction): with self.assertRaises(exception.StopExtraction):
expr() expr()
def test_extractor_filter(self):
    classes = (TestExtractor, TestExtractorParent, TestExtractorAlt)

    def check(expected, *args, **kwargs):
        # build a filter from the given arguments and compare its
        # result for each test class with the expected value
        func = util.build_extractor_filter(*args, **kwargs)
        for cls, result in zip(classes, expected):
            self.assertEqual(func(cls), result)

    # empty
    check((True, True, True), "")

    # category
    check((False, False, True), "test_category")

    # subcategory
    check((False, True, False), "*:test_subcategory")

    # basecategory
    check((False, False, False), "test_basecategory")

    # category-subcategory pair
    check((False, True, True), "test_category:test_subcategory")

    # combination
    check((False, False, False),
          ["test_category", "*:test_subcategory"])

    # whitelist
    check((True, False, False),
          "test_category:test_subcategory", negate=False)

    check((True, True, False),
          ["test_category:test_subcategory", "*:test_subcategory_parent"],
          negate=False)
def test_generate_token(self): def test_generate_token(self):
tokens = set() tokens = set()
for _ in range(100): for _ in range(100):
@@ -469,5 +521,21 @@ class TestOther(unittest.TestCase):
self.assertIs(obj["key"], obj) self.assertIs(obj["key"], obj)
# Minimal stand-in providing the three attributes
# read by util.build_extractor_filter()
class TestExtractor:
    basecategory = "test_basecategory"
    category = "test_category"
    subcategory = "test_subcategory"
# Same category as TestExtractor, but a different subcategory
class TestExtractorParent(TestExtractor):
    subcategory = "test_subcategory_parent"
    category = "test_category"
# Same subcategory as TestExtractor, but a different category
class TestExtractorAlt(TestExtractor):
    subcategory = "test_subcategory"
    category = "test_category_alt"
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()