Improve error handling
This commit is contained in:
@@ -278,6 +278,10 @@ func resolveAndCache(ctx context.Context, q *Query) (rrCache *RRCache, err error
|
||||
resolveLoop:
|
||||
for i = 0; i < 2; i++ {
|
||||
for _, resolver := range resolvers {
|
||||
if module.IsStopping() {
|
||||
return nil, errors.New("shutting down")
|
||||
}
|
||||
|
||||
// check if resolver failed recently (on first run)
|
||||
if i == 0 && resolver.Conn.IsFailing() {
|
||||
log.Tracer(ctx).Tracef("resolver: skipping resolver %s, because it failed recently", resolver)
|
||||
@@ -294,19 +298,25 @@ resolveLoop:
|
||||
case errors.Is(err, ErrBlocked):
|
||||
// some resolvers might also block
|
||||
return nil, err
|
||||
case errors.Is(err, ErrContinue):
|
||||
continue
|
||||
case netenv.GetOnlineStatus() == netenv.StatusOffline &&
|
||||
!netenv.IsConnectivityDomain(q.FQDN):
|
||||
log.Tracer(ctx).Debugf("resolver: not resolving %s, device is offline", q.FQDN)
|
||||
// we are offline and this is not an online check query
|
||||
return nil, ErrOffline
|
||||
case errors.Is(err, ErrContinue):
|
||||
continue
|
||||
case errors.Is(err, ErrTimeout):
|
||||
resolver.Conn.ReportFailure()
|
||||
log.Tracer(ctx).Debugf("resolver: query to %s timed out", resolver.GetName())
|
||||
continue
|
||||
default:
|
||||
// includes ErrTimeout
|
||||
log.Tracer(ctx).Debugf("resolver: failed to resolve %s: %s", q.FQDN, err)
|
||||
resolver.Conn.ReportFailure()
|
||||
log.Tracer(ctx).Debugf("resolver: query to %s failed: %s", resolver.GetName(), err)
|
||||
continue
|
||||
}
|
||||
}
|
||||
if rrCache == nil {
|
||||
// Defensive: This should normally not happen.
|
||||
continue
|
||||
}
|
||||
break resolveLoop
|
||||
|
||||
@@ -145,7 +145,6 @@ func (tr *TCPResolver) Query(ctx context.Context, q *Query) (*RRCache, error) {
|
||||
select {
|
||||
case reply = <-inFlight.Response:
|
||||
case <-time.After(defaultRequestTimeout):
|
||||
tr.ReportFailure()
|
||||
return nil, ErrTimeout
|
||||
}
|
||||
|
||||
@@ -180,6 +179,14 @@ func (tr *TCPResolver) startClient() {
|
||||
func (mgr *tcpResolverConnMgr) run(workerCtx context.Context) error {
|
||||
// connection lifecycle loop
|
||||
for {
|
||||
// check if we are shutting down
|
||||
select {
|
||||
case <-workerCtx.Done():
|
||||
mgr.shutdown()
|
||||
return nil
|
||||
default:
|
||||
}
|
||||
|
||||
// check if we are failing
|
||||
if mgr.failCnt >= FailThreshold || mgr.tr.IsFailing() {
|
||||
mgr.shutdown()
|
||||
|
||||
@@ -2,13 +2,11 @@ package resolver
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"net"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/miekg/dns"
|
||||
"github.com/safing/portbase/log"
|
||||
"github.com/safing/portmaster/netenv"
|
||||
)
|
||||
|
||||
@@ -106,11 +104,10 @@ type ResolverConn interface { //nolint:go-lint // TODO
|
||||
type BasicResolverConn struct {
|
||||
sync.Mutex // for lastFail
|
||||
|
||||
resolver *Resolver
|
||||
clientManager *dnsClientManager
|
||||
resolver *Resolver
|
||||
|
||||
lastFail time.Time
|
||||
fails int
|
||||
failingUntil time.Time
|
||||
fails int
|
||||
}
|
||||
|
||||
// ReportFailure reports that an error occurred with this resolver.
|
||||
@@ -122,17 +119,12 @@ func (brc *BasicResolverConn) ReportFailure() {
|
||||
|
||||
brc.Lock()
|
||||
defer brc.Unlock()
|
||||
now := time.Now().UTC()
|
||||
failDuration := time.Duration(nameserverRetryRate()) * time.Second
|
||||
|
||||
// reset fail counter if currently not failing
|
||||
if now.Add(-failDuration).After(brc.lastFail) {
|
||||
brc.fails++
|
||||
if brc.fails > FailThreshold {
|
||||
brc.failingUntil = time.Now().Add(time.Duration(nameserverRetryRate()) * time.Second)
|
||||
brc.fails = 0
|
||||
}
|
||||
|
||||
// update
|
||||
brc.lastFail = now
|
||||
brc.fails++
|
||||
}
|
||||
|
||||
// IsFailing returns if this resolver is currently failing.
|
||||
@@ -140,122 +132,5 @@ func (brc *BasicResolverConn) IsFailing() bool {
|
||||
brc.Lock()
|
||||
defer brc.Unlock()
|
||||
|
||||
failDuration := time.Duration(nameserverRetryRate()) * time.Second
|
||||
return brc.fails >= FailThreshold && time.Now().UTC().Add(-failDuration).Before(brc.lastFail)
|
||||
}
|
||||
|
||||
// Query executes the given query against the resolver.
|
||||
func (brc *BasicResolverConn) Query(ctx context.Context, q *Query) (*RRCache, error) {
|
||||
// convenience
|
||||
resolver := brc.resolver
|
||||
|
||||
// create query
|
||||
dnsQuery := new(dns.Msg)
|
||||
dnsQuery.SetQuestion(q.FQDN, uint16(q.QType))
|
||||
|
||||
// start
|
||||
var reply *dns.Msg
|
||||
var ttl time.Duration
|
||||
var err error
|
||||
var conn *dns.Conn
|
||||
var new bool
|
||||
var tries int
|
||||
|
||||
for ; tries < 3; tries++ {
|
||||
|
||||
// first get connection
|
||||
dc := brc.clientManager.getDNSClient()
|
||||
conn, new, err = dc.getConn()
|
||||
if err != nil {
|
||||
log.Tracer(ctx).Tracef("resolver: failed to connect to %s: %s", resolver.Server, err)
|
||||
// remove client from pool
|
||||
dc.destroy()
|
||||
// report that resolver had an error
|
||||
brc.ReportFailure()
|
||||
// hint network environment at failed connection
|
||||
netenv.ReportFailedConnection()
|
||||
|
||||
// TODO: handle special cases
|
||||
// 1. connect: network is unreachable
|
||||
// 2. timeout
|
||||
|
||||
// try again
|
||||
continue
|
||||
}
|
||||
if new {
|
||||
log.Tracer(ctx).Tracef("resolver: created new connection to %s (%s)", resolver.Name, resolver.ServerAddress)
|
||||
} else {
|
||||
log.Tracer(ctx).Tracef("resolver: reusing connection to %s (%s)", resolver.Name, resolver.ServerAddress)
|
||||
}
|
||||
|
||||
// query server
|
||||
reply, ttl, err = dc.client.ExchangeWithConn(dnsQuery, conn)
|
||||
log.Tracer(ctx).Tracef("resolver: query took %s", ttl)
|
||||
|
||||
// error handling
|
||||
if err != nil {
|
||||
log.Tracer(ctx).Tracef("resolver: query to %s encountered error: %s", resolver.Server, err)
|
||||
|
||||
// remove client from pool
|
||||
dc.destroy()
|
||||
|
||||
// temporary error
|
||||
if nerr, ok := err.(net.Error); ok && nerr.Timeout() {
|
||||
log.Tracer(ctx).Tracef("resolver: retrying to resolve %s%s with %s, error is temporary", q.FQDN, q.QType, resolver.Server)
|
||||
// try again
|
||||
continue
|
||||
}
|
||||
|
||||
// report failed if dns (nothing happens at getConn())
|
||||
if resolver.ServerType == ServerTypeDNS {
|
||||
// report that resolver had an error
|
||||
brc.ReportFailure()
|
||||
// hint network environment at failed connection
|
||||
netenv.ReportFailedConnection()
|
||||
}
|
||||
|
||||
// permanent error
|
||||
break
|
||||
} else if reply == nil {
|
||||
// remove client from pool
|
||||
dc.destroy()
|
||||
|
||||
log.Errorf("resolver: successful query for %s%s to %s, but reply was nil", q.FQDN, q.QType, resolver.Server)
|
||||
return nil, errors.New("internal error")
|
||||
}
|
||||
|
||||
// make client available (again)
|
||||
dc.addToPool()
|
||||
|
||||
if resolver.IsBlockedUpstream(reply) {
|
||||
return nil, &BlockedUpstreamError{resolver.GetName()}
|
||||
}
|
||||
|
||||
// no error
|
||||
break
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
// TODO: mark as failed
|
||||
} else if reply == nil {
|
||||
log.Errorf("resolver: queried %s for %s%s (%d tries), but reply was nil", q.FQDN, q.QType, resolver.GetName(), tries+1)
|
||||
return nil, errors.New("internal error")
|
||||
}
|
||||
|
||||
// hint network environment at successful connection
|
||||
netenv.ReportSuccessfulConnection()
|
||||
|
||||
newRecord := &RRCache{
|
||||
Domain: q.FQDN,
|
||||
Question: q.QType,
|
||||
Answer: reply.Answer,
|
||||
Ns: reply.Ns,
|
||||
Extra: reply.Extra,
|
||||
Server: resolver.Server,
|
||||
ServerScope: resolver.ServerIPScope,
|
||||
}
|
||||
|
||||
// TODO: check if reply.Answer is valid
|
||||
return newRecord, nil
|
||||
return time.Now().Before(brc.failingUntil)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user