filter duplicate URLs (#36)

Duplicate URLs might occur if, for example, an artist adds another
image to their gallery while an extractor is running and images are being
downloaded on sites like pixiv/nijie/hentaifoundry.
The first image on the next page will then have already been downloaded,
which causes a premature end if '--abort-on-skip' is used.
This commit is contained in:
Mike Fährmann
2017-09-06 17:08:50 +02:00
parent 00420ff202
commit 268cfa3cfe
2 changed files with 53 additions and 10 deletions

View File

@@ -25,17 +25,26 @@ class Job():
self.extractor.log.debug(
"Using %s for %s", self.extractor.__class__.__name__, url)
# url predicates
predicates = [util.UniquePredicate()]
items = config.get(("images",))
if items:
pred = util.RangePredicate(items)
if pred.lower > 1:
pred.index += self.extractor.skip(pred.lower - 1)
self.pred_url = pred
else:
self.pred_url = True
predicates.append(pred)
self.pred_url = util.build_predicate(predicates)
# queue predicates
predicates = []
items = config.get(("chapters",))
self.pred_queue = util.RangePredicate(items) if items else True
if items:
predicates.append(util.RangePredicate(items))
self.pred_queue = util.build_predicate(predicates)
def run(self):
"""Execute or run the job"""
@@ -73,16 +82,17 @@ class Job():
def dispatch(self, msg):
"""Call the appropriate message handler"""
if msg[0] == Message.Url:
if self.pred_url:
self.update_kwdict(msg[2])
self.handle_url(msg[1], msg[2])
_, url, kwds = msg
if self.pred_url(url, kwds):
self.update_kwdict(kwds)
self.handle_url(url, kwds)
elif msg[0] == Message.Directory:
self.update_kwdict(msg[1])
self.handle_directory(msg[1])
elif msg[0] == Message.Queue:
if self.pred_queue:
if self.pred_queue(msg[1], None):
self.handle_queue(msg[1])
elif msg[0] == Message.Version: