filter duplicate URLs (#36)
Duplicate URLs might occur if, for example, an artist adds another image to their gallery while an extractor is running and images are being downloaded from sites like pixiv/nijie/hentaifoundry. The new image shifts the existing ones by one position, so the first image on the next page will already have been downloaded, which causes a premature end if '--abort-on-skip' is being used.
This commit is contained in:
@@ -25,17 +25,26 @@ class Job():
|
|||||||
self.extractor.log.debug(
|
self.extractor.log.debug(
|
||||||
"Using %s for %s", self.extractor.__class__.__name__, url)
|
"Using %s for %s", self.extractor.__class__.__name__, url)
|
||||||
|
|
||||||
|
# url predicates
|
||||||
|
predicates = [util.UniquePredicate()]
|
||||||
|
|
||||||
items = config.get(("images",))
|
items = config.get(("images",))
|
||||||
if items:
|
if items:
|
||||||
pred = util.RangePredicate(items)
|
pred = util.RangePredicate(items)
|
||||||
if pred.lower > 1:
|
if pred.lower > 1:
|
||||||
pred.index += self.extractor.skip(pred.lower - 1)
|
pred.index += self.extractor.skip(pred.lower - 1)
|
||||||
self.pred_url = pred
|
predicates.append(pred)
|
||||||
else:
|
|
||||||
self.pred_url = True
|
self.pred_url = util.build_predicate(predicates)
|
||||||
|
|
||||||
|
# queue predicates
|
||||||
|
predicates = []
|
||||||
|
|
||||||
items = config.get(("chapters",))
|
items = config.get(("chapters",))
|
||||||
self.pred_queue = util.RangePredicate(items) if items else True
|
if items:
|
||||||
|
predicates.append(util.RangePredicate(items))
|
||||||
|
|
||||||
|
self.pred_queue = util.build_predicate(predicates)
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
"""Execute or run the job"""
|
"""Execute or run the job"""
|
||||||
@@ -73,16 +82,17 @@ class Job():
|
|||||||
def dispatch(self, msg):
|
def dispatch(self, msg):
|
||||||
"""Call the appropriate message handler"""
|
"""Call the appropriate message handler"""
|
||||||
if msg[0] == Message.Url:
|
if msg[0] == Message.Url:
|
||||||
if self.pred_url:
|
_, url, kwds = msg
|
||||||
self.update_kwdict(msg[2])
|
if self.pred_url(url, kwds):
|
||||||
self.handle_url(msg[1], msg[2])
|
self.update_kwdict(kwds)
|
||||||
|
self.handle_url(url, kwds)
|
||||||
|
|
||||||
elif msg[0] == Message.Directory:
|
elif msg[0] == Message.Directory:
|
||||||
self.update_kwdict(msg[1])
|
self.update_kwdict(msg[1])
|
||||||
self.handle_directory(msg[1])
|
self.handle_directory(msg[1])
|
||||||
|
|
||||||
elif msg[0] == Message.Queue:
|
elif msg[0] == Message.Queue:
|
||||||
if self.pred_queue:
|
if self.pred_queue(msg[1], None):
|
||||||
self.handle_queue(msg[1])
|
self.handle_queue(msg[1])
|
||||||
|
|
||||||
elif msg[0] == Message.Version:
|
elif msg[0] == Message.Version:
|
||||||
|
|||||||
@@ -139,8 +139,17 @@ CODES = {
|
|||||||
SPECIAL_EXTRACTORS = ("oauth", "recursive", "test")
|
SPECIAL_EXTRACTORS = ("oauth", "recursive", "test")
|
||||||
|
|
||||||
|
|
||||||
|
def build_predicate(predicates):
    """Return one callable predicate representing all given predicates

    An empty (or otherwise falsy) collection yields a predicate that
    accepts everything, a single predicate is returned unchanged, and
    two or more are wrapped in a ChainPredicate.
    """
    if predicates:
        if len(predicates) == 1:
            # no wrapper needed for a lone predicate
            return predicates[0]
        return ChainPredicate(predicates)
    # nothing to check: accept every (url, kwds) pair
    return lambda url, kwds: True
|
||||||
|
|
||||||
|
|
||||||
class RangePredicate():
|
class RangePredicate():
|
||||||
"""Predicate; is True if the current index is in the given range"""
|
"""Predicate; True if the current index is in the given range"""
|
||||||
def __init__(self, rangespec):
|
def __init__(self, rangespec):
|
||||||
self.ranges = optimize_range(parse_range(rangespec))
|
self.ranges = optimize_range(parse_range(rangespec))
|
||||||
self.index = 0
|
self.index = 0
|
||||||
@@ -149,7 +158,7 @@ class RangePredicate():
|
|||||||
else:
|
else:
|
||||||
self.lower, self.upper = 0, 0
|
self.lower, self.upper = 0, 0
|
||||||
|
|
||||||
def __bool__(self):
|
def __call__(self, url, kwds):
|
||||||
self.index += 1
|
self.index += 1
|
||||||
|
|
||||||
if self.index > self.upper:
|
if self.index > self.upper:
|
||||||
@@ -161,6 +170,30 @@ class RangePredicate():
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
class UniquePredicate():
    """Predicate; True only the first time a given URL is passed in"""

    def __init__(self):
        # every URL this predicate has been called with so far
        self.urls = set()

    def __call__(self, url, kwds):
        """Record 'url' and report whether it was new; 'kwds' is unused"""
        if url in self.urls:
            return False
        self.urls.add(url)
        return True
|
||||||
|
|
||||||
|
|
||||||
|
class ChainPredicate():
    """Predicate; True if every predicate it wraps returns a true value"""

    def __init__(self, predicates):
        self.predicates = predicates

    def __call__(self, url, kwds):
        """Evaluate the wrapped predicates in order for (url, kwds)

        Evaluation stops at the first predicate that rejects, so the
        predicates after it are not called for this (url, kwds) pair.
        """
        return all(check(url, kwds) for check in self.predicates)
|
||||||
|
|
||||||
|
|
||||||
class PathFormat():
|
class PathFormat():
|
||||||
|
|
||||||
def __init__(self, extractor):
|
def __init__(self, extractor):
|
||||||
|
|||||||
Reference in New Issue
Block a user