Improve error handling

This commit is contained in:
Daniel
2020-07-21 14:59:14 +02:00
parent 7b40e83aec
commit dda7cdf7d0
3 changed files with 29 additions and 137 deletions

View File

@@ -278,6 +278,10 @@ func resolveAndCache(ctx context.Context, q *Query) (rrCache *RRCache, err error
resolveLoop:
for i = 0; i < 2; i++ {
for _, resolver := range resolvers {
if module.IsStopping() {
return nil, errors.New("shutting down")
}
// check if resolver failed recently (on first run)
if i == 0 && resolver.Conn.IsFailing() {
log.Tracer(ctx).Tracef("resolver: skipping resolver %s, because it failed recently", resolver)
@@ -294,19 +298,25 @@ resolveLoop:
case errors.Is(err, ErrBlocked):
// some resolvers might also block
return nil, err
case errors.Is(err, ErrContinue):
continue
case netenv.GetOnlineStatus() == netenv.StatusOffline &&
!netenv.IsConnectivityDomain(q.FQDN):
log.Tracer(ctx).Debugf("resolver: not resolving %s, device is offline", q.FQDN)
// we are offline and this is not an online check query
return nil, ErrOffline
case errors.Is(err, ErrContinue):
continue
case errors.Is(err, ErrTimeout):
resolver.Conn.ReportFailure()
log.Tracer(ctx).Debugf("resolver: query to %s timed out", resolver.GetName())
continue
default:
// includes ErrTimeout
log.Tracer(ctx).Debugf("resolver: failed to resolve %s: %s", q.FQDN, err)
resolver.Conn.ReportFailure()
log.Tracer(ctx).Debugf("resolver: query to %s failed: %s", resolver.GetName(), err)
continue
}
}
if rrCache == nil {
// Defensive: This should normally not happen.
continue
}
break resolveLoop

View File

@@ -145,7 +145,6 @@ func (tr *TCPResolver) Query(ctx context.Context, q *Query) (*RRCache, error) {
select {
case reply = <-inFlight.Response:
case <-time.After(defaultRequestTimeout):
tr.ReportFailure()
return nil, ErrTimeout
}
@@ -180,6 +179,14 @@ func (tr *TCPResolver) startClient() {
func (mgr *tcpResolverConnMgr) run(workerCtx context.Context) error {
// connection lifecycle loop
for {
// check if we are shutting down
select {
case <-workerCtx.Done():
mgr.shutdown()
return nil
default:
}
// check if we are failing
if mgr.failCnt >= FailThreshold || mgr.tr.IsFailing() {
mgr.shutdown()

View File

@@ -2,13 +2,11 @@ package resolver
import (
"context"
"errors"
"net"
"sync"
"time"
"github.com/miekg/dns"
"github.com/safing/portbase/log"
"github.com/safing/portmaster/netenv"
)
@@ -106,11 +104,10 @@ type ResolverConn interface { //nolint:go-lint // TODO
type BasicResolverConn struct {
sync.Mutex // for lastFail
resolver *Resolver
clientManager *dnsClientManager
resolver *Resolver
lastFail time.Time
fails int
failingUntil time.Time
fails int
}
// ReportFailure reports that an error occurred with this resolver.
@@ -122,17 +119,12 @@ func (brc *BasicResolverConn) ReportFailure() {
brc.Lock()
defer brc.Unlock()
now := time.Now().UTC()
failDuration := time.Duration(nameserverRetryRate()) * time.Second
// reset fail counter if currently not failing
if now.Add(-failDuration).After(brc.lastFail) {
brc.fails++
if brc.fails > FailThreshold {
brc.failingUntil = time.Now().Add(time.Duration(nameserverRetryRate()) * time.Second)
brc.fails = 0
}
// update
brc.lastFail = now
brc.fails++
}
// IsFailing returns if this resolver is currently failing.
@@ -140,122 +132,5 @@ func (brc *BasicResolverConn) IsFailing() bool {
brc.Lock()
defer brc.Unlock()
failDuration := time.Duration(nameserverRetryRate()) * time.Second
return brc.fails >= FailThreshold && time.Now().UTC().Add(-failDuration).Before(brc.lastFail)
}
// Query executes the given query against the resolver.
func (brc *BasicResolverConn) Query(ctx context.Context, q *Query) (*RRCache, error) {
// convenience
resolver := brc.resolver
// create query
dnsQuery := new(dns.Msg)
dnsQuery.SetQuestion(q.FQDN, uint16(q.QType))
// start
var reply *dns.Msg
var ttl time.Duration
var err error
var conn *dns.Conn
var new bool
var tries int
for ; tries < 3; tries++ {
// first get connection
dc := brc.clientManager.getDNSClient()
conn, new, err = dc.getConn()
if err != nil {
log.Tracer(ctx).Tracef("resolver: failed to connect to %s: %s", resolver.Server, err)
// remove client from pool
dc.destroy()
// report that resolver had an error
brc.ReportFailure()
// hint network environment at failed connection
netenv.ReportFailedConnection()
// TODO: handle special cases
// 1. connect: network is unreachable
// 2. timeout
// try again
continue
}
if new {
log.Tracer(ctx).Tracef("resolver: created new connection to %s (%s)", resolver.Name, resolver.ServerAddress)
} else {
log.Tracer(ctx).Tracef("resolver: reusing connection to %s (%s)", resolver.Name, resolver.ServerAddress)
}
// query server
reply, ttl, err = dc.client.ExchangeWithConn(dnsQuery, conn)
log.Tracer(ctx).Tracef("resolver: query took %s", ttl)
// error handling
if err != nil {
log.Tracer(ctx).Tracef("resolver: query to %s encountered error: %s", resolver.Server, err)
// remove client from pool
dc.destroy()
// temporary error
if nerr, ok := err.(net.Error); ok && nerr.Timeout() {
log.Tracer(ctx).Tracef("resolver: retrying to resolve %s%s with %s, error is temporary", q.FQDN, q.QType, resolver.Server)
// try again
continue
}
// report failed if dns (nothing happens at getConn())
if resolver.ServerType == ServerTypeDNS {
// report that resolver had an error
brc.ReportFailure()
// hint network environment at failed connection
netenv.ReportFailedConnection()
}
// permanent error
break
} else if reply == nil {
// remove client from pool
dc.destroy()
log.Errorf("resolver: successful query for %s%s to %s, but reply was nil", q.FQDN, q.QType, resolver.Server)
return nil, errors.New("internal error")
}
// make client available (again)
dc.addToPool()
if resolver.IsBlockedUpstream(reply) {
return nil, &BlockedUpstreamError{resolver.GetName()}
}
// no error
break
}
if err != nil {
return nil, err
// TODO: mark as failed
} else if reply == nil {
log.Errorf("resolver: queried %s for %s%s (%d tries), but reply was nil", q.FQDN, q.QType, resolver.GetName(), tries+1)
return nil, errors.New("internal error")
}
// hint network environment at successful connection
netenv.ReportSuccessfulConnection()
newRecord := &RRCache{
Domain: q.FQDN,
Question: q.QType,
Answer: reply.Answer,
Ns: reply.Ns,
Extra: reply.Extra,
Server: resolver.Server,
ServerScope: resolver.ServerIPScope,
}
// TODO: check if reply.Answer is valid
return newRecord, nil
return time.Now().Before(brc.failingUntil)
}