From 268cfa3cfeab09a950e019d1fa332745864a303a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mike=20F=C3=A4hrmann?=
Date: Wed, 6 Sep 2017 17:08:50 +0200
Subject: [PATCH] filter duplicate URLs (#36)

Duplicate URLs might occur if, for example, an artist adds another image
to his gallery while an extractor is running and images are being
downloaded on sites like pixiv/nijie/hentaifoundry.
The next image on the next page will have already been downloaded and
will cause a premature end if '--abort-on-skip' is being used.
---
 gallery_dl/job.py  | 26 ++++++++++++++++++--------
 gallery_dl/util.py | 37 +++++++++++++++++++++++++++++++++++--
 2 files changed, 53 insertions(+), 10 deletions(-)

diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index d5a74e6c..7331f13d 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -25,17 +25,26 @@ class Job():
         self.extractor.log.debug(
             "Using %s for %s", self.extractor.__class__.__name__, url)
 
+        # url predicates
+        predicates = [util.UniquePredicate()]
+
         items = config.get(("images",))
         if items:
             pred = util.RangePredicate(items)
             if pred.lower > 1:
                 pred.index += self.extractor.skip(pred.lower - 1)
-            self.pred_url = pred
-        else:
-            self.pred_url = True
+            predicates.append(pred)
+
+        self.pred_url = util.build_predicate(predicates)
+
+        # queue predicates
+        predicates = []
 
         items = config.get(("chapters",))
-        self.pred_queue = util.RangePredicate(items) if items else True
+        if items:
+            predicates.append(util.RangePredicate(items))
+
+        self.pred_queue = util.build_predicate(predicates)
 
     def run(self):
         """Execute or run the job"""
@@ -73,16 +82,17 @@ class Job():
     def dispatch(self, msg):
         """Call the appropriate message handler"""
         if msg[0] == Message.Url:
-            if self.pred_url:
-                self.update_kwdict(msg[2])
-                self.handle_url(msg[1], msg[2])
+            _, url, kwds = msg
+            if self.pred_url(url, kwds):
+                self.update_kwdict(kwds)
+                self.handle_url(url, kwds)
 
         elif msg[0] == Message.Directory:
             self.update_kwdict(msg[1])
             self.handle_directory(msg[1])
 
         elif msg[0] == Message.Queue:
-            if self.pred_queue:
+            if self.pred_queue(msg[1], None):
                 self.handle_queue(msg[1])
 
         elif msg[0] == Message.Version:
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index cb99be68..cc2b98ea 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -139,8 +139,17 @@ CODES = {
 SPECIAL_EXTRACTORS = ("oauth", "recursive", "test")
 
 
+def build_predicate(predicates):
+    if not predicates:
+        return lambda url, kwds: True
+    elif len(predicates) == 1:
+        return predicates[0]
+    else:
+        return ChainPredicate(predicates)
+
+
 class RangePredicate():
-    """Predicate; is True if the current index is in the given range"""
+    """Predicate; True if the current index is in the given range"""
     def __init__(self, rangespec):
         self.ranges = optimize_range(parse_range(rangespec))
         self.index = 0
@@ -149,7 +158,7 @@ class RangePredicate():
         else:
             self.lower, self.upper = 0, 0
 
-    def __bool__(self):
+    def __call__(self, url, kwds):
         self.index += 1
 
         if self.index > self.upper:
@@ -161,6 +170,30 @@ class RangePredicate():
         return False
 
 
+class UniquePredicate():
+    """Predicate; True if given URL has not been encountered before"""
+    def __init__(self):
+        self.urls = set()
+
+    def __call__(self, url, kwds):
+        if url not in self.urls:
+            self.urls.add(url)
+            return True
+        return False
+
+
+class ChainPredicate():
+    """Predicate; True if all of its predicates return True"""
+    def __init__(self, predicates):
+        self.predicates = predicates
+
+    def __call__(self, url, kwds):
+        for pred in self.predicates:
+            if not pred(url, kwds):
+                return False
+        return True
+
+
 class PathFormat():
 
     def __init__(self, extractor):