diff --git a/resolver/resolve.go b/resolver/resolve.go index 42b755d6..316edd27 100644 --- a/resolver/resolve.go +++ b/resolver/resolve.go @@ -278,6 +278,10 @@ func resolveAndCache(ctx context.Context, q *Query) (rrCache *RRCache, err error resolveLoop: for i = 0; i < 2; i++ { for _, resolver := range resolvers { + if module.IsStopping() { + return nil, errors.New("shutting down") + } + // check if resolver failed recently (on first run) if i == 0 && resolver.Conn.IsFailing() { log.Tracer(ctx).Tracef("resolver: skipping resolver %s, because it failed recently", resolver) @@ -294,19 +298,25 @@ resolveLoop: case errors.Is(err, ErrBlocked): // some resolvers might also block return nil, err - case errors.Is(err, ErrContinue): - continue case netenv.GetOnlineStatus() == netenv.StatusOffline && !netenv.IsConnectivityDomain(q.FQDN): log.Tracer(ctx).Debugf("resolver: not resolving %s, device is offline", q.FQDN) // we are offline and this is not an online check query return nil, ErrOffline + case errors.Is(err, ErrContinue): + continue + case errors.Is(err, ErrTimeout): + resolver.Conn.ReportFailure() + log.Tracer(ctx).Debugf("resolver: query to %s timed out", resolver.GetName()) + continue default: - // includes ErrTimeout - log.Tracer(ctx).Debugf("resolver: failed to resolve %s: %s", q.FQDN, err) + resolver.Conn.ReportFailure() + log.Tracer(ctx).Debugf("resolver: query to %s failed: %s", resolver.GetName(), err) + continue } } if rrCache == nil { + // Defensive: This should normally not happen. continue } break resolveLoop diff --git a/resolver/resolver-tcp.go b/resolver/resolver-tcp.go index c9f40dcb..62d39706 100644 --- a/resolver/resolver-tcp.go +++ b/resolver/resolver-tcp.go @@ -145,7 +145,6 @@ func (tr *TCPResolver) Query(ctx context.Context, q *Query) (*RRCache, error) { select { case reply = <-inFlight.Response: case <-time.After(defaultRequestTimeout): - tr.ReportFailure() return nil, ErrTimeout } @@ -180,6 +179,14 @@ func (tr *TCPResolver) startClient() { func (mgr *tcpResolverConnMgr) run(workerCtx context.Context) error { // connection lifecycle loop for { + // check if we are shutting down + select { + case <-workerCtx.Done(): + mgr.shutdown() + return nil + default: + } + // check if we are failing if mgr.failCnt >= FailThreshold || mgr.tr.IsFailing() { mgr.shutdown() diff --git a/resolver/resolver.go b/resolver/resolver.go index 28c30f67..fd7461f5 100644 --- a/resolver/resolver.go +++ b/resolver/resolver.go @@ -2,13 +2,11 @@ package resolver import ( "context" - "errors" "net" "sync" "time" "github.com/miekg/dns" - "github.com/safing/portbase/log" "github.com/safing/portmaster/netenv" ) @@ -106,11 +104,10 @@ type ResolverConn interface { //nolint:go-lint // TODO type BasicResolverConn struct { sync.Mutex // for lastFail - resolver *Resolver - clientManager *dnsClientManager + resolver *Resolver - lastFail time.Time - fails int + failingUntil time.Time + fails int } // ReportFailure reports that an error occurred with this resolver. @@ -122,17 +119,12 @@ func (brc *BasicResolverConn) ReportFailure() { brc.Lock() defer brc.Unlock() - now := time.Now().UTC() - failDuration := time.Duration(nameserverRetryRate()) * time.Second - // reset fail counter if currently not failing - if now.Add(-failDuration).After(brc.lastFail) { + brc.fails++ + if brc.fails > FailThreshold { + brc.failingUntil = time.Now().Add(time.Duration(nameserverRetryRate()) * time.Second) brc.fails = 0 } - - // update - brc.lastFail = now - brc.fails++ } // IsFailing returns if this resolver is currently failing. @@ -140,122 +132,5 @@ func (brc *BasicResolverConn) IsFailing() bool { brc.Lock() defer brc.Unlock() - failDuration := time.Duration(nameserverRetryRate()) * time.Second - return brc.fails >= FailThreshold && time.Now().UTC().Add(-failDuration).Before(brc.lastFail) -} - -// Query executes the given query against the resolver. -func (brc *BasicResolverConn) Query(ctx context.Context, q *Query) (*RRCache, error) { - // convenience - resolver := brc.resolver - - // create query - dnsQuery := new(dns.Msg) - dnsQuery.SetQuestion(q.FQDN, uint16(q.QType)) - - // start - var reply *dns.Msg - var ttl time.Duration - var err error - var conn *dns.Conn - var new bool - var tries int - - for ; tries < 3; tries++ { - - // first get connection - dc := brc.clientManager.getDNSClient() - conn, new, err = dc.getConn() - if err != nil { - log.Tracer(ctx).Tracef("resolver: failed to connect to %s: %s", resolver.Server, err) - // remove client from pool - dc.destroy() - // report that resolver had an error - brc.ReportFailure() - // hint network environment at failed connection - netenv.ReportFailedConnection() - - // TODO: handle special cases - // 1. connect: network is unreachable - // 2. timeout - - // try again - continue - } - if new { - log.Tracer(ctx).Tracef("resolver: created new connection to %s (%s)", resolver.Name, resolver.ServerAddress) - } else { - log.Tracer(ctx).Tracef("resolver: reusing connection to %s (%s)", resolver.Name, resolver.ServerAddress) - } - - // query server - reply, ttl, err = dc.client.ExchangeWithConn(dnsQuery, conn) - log.Tracer(ctx).Tracef("resolver: query took %s", ttl) - - // error handling - if err != nil { - log.Tracer(ctx).Tracef("resolver: query to %s encountered error: %s", resolver.Server, err) - - // remove client from pool - dc.destroy() - - // temporary error - if nerr, ok := err.(net.Error); ok && nerr.Timeout() { - log.Tracer(ctx).Tracef("resolver: retrying to resolve %s%s with %s, error is temporary", q.FQDN, q.QType, resolver.Server) - // try again - continue - } - - // report failed if dns (nothing happens at getConn()) - if resolver.ServerType == ServerTypeDNS { - // report that resolver had an error - brc.ReportFailure() - // hint network environment at failed connection - netenv.ReportFailedConnection() - } - - // permanent error - break - } else if reply == nil { - // remove client from pool - dc.destroy() - - log.Errorf("resolver: successful query for %s%s to %s, but reply was nil", q.FQDN, q.QType, resolver.Server) - return nil, errors.New("internal error") - } - - // make client available (again) - dc.addToPool() - - if resolver.IsBlockedUpstream(reply) { - return nil, &BlockedUpstreamError{resolver.GetName()} - } - - // no error - break - } - - if err != nil { - return nil, err - // TODO: mark as failed - } else if reply == nil { - log.Errorf("resolver: queried %s for %s%s (%d tries), but reply was nil", q.FQDN, q.QType, resolver.GetName(), tries+1) - return nil, errors.New("internal error") - } - - // hint network environment at successful connection - netenv.ReportSuccessfulConnection() - - newRecord := &RRCache{ - Domain: q.FQDN, - Question: q.QType, - Answer: reply.Answer, - Ns: reply.Ns, - Extra: reply.Extra, - Server: resolver.Server, - ServerScope: resolver.ServerIPScope, - } - - // TODO: check if reply.Answer is valid - return newRecord, nil + return time.Now().Before(brc.failingUntil) }