LocalAI/core/services/nodes/probe_cache.go

package nodes

import (
	"sync"
	"time"

	"golang.org/x/sync/singleflight"
)

// probeCache memoizes recent successful gRPC HealthCheck results for
// (nodeID, addr) tuples so SmartRouter.probeHealth doesn't pay a round-trip
// on every inference request.
//
// Why this exists: with per-request routing (see pkg/model/loader.go), every
// inference call goes through SmartRouter.Route, which probes the backend
// before returning a client. Many gRPC backends (notably llama.cpp's server)
// serialize HealthCheck against active Predict on a shared goroutine, so a
// burst of new requests can stall behind a single long-running stream —
// exactly the "queue stalling" symptom observed in distributed clusters.
//
// The background HealthMonitor (perModelHealthCheck) is still the cluster-wide
// source of truth that reaps actually-dead backends within ~45s; this cache
// only saves the per-request hot path from re-asking when nothing has changed.
//
// TTL matches healthCheckTTL in pkg/model/model.go so the single-process
// IsRecentlyHealthy path and this distributed-mode path share the same
// staleness budget.
type probeCache struct {
	ttl    time.Duration        // freshness window; <= 0 disables caching (see IsFresh)
	mu     sync.Mutex           // guards seen
	seen   map[string]time.Time // key → last successful probe
	flight singleflight.Group   // coalesces concurrent probes for the same key
}

// newProbeCache builds an empty probeCache whose entries stay fresh for ttl.
// A zero ttl turns caching off, in which case DoOrCached runs the probe on
// every call.
func newProbeCache(ttl time.Duration) *probeCache {
	cache := new(probeCache)
	cache.ttl = ttl
	cache.seen = map[string]time.Time{}
	return cache
}

// IsFresh reports whether key has a recorded successful probe that is younger
// than the cache TTL. It always reports false when the TTL is zero or
// negative, i.e. when caching is disabled.
func (c *probeCache) IsFresh(key string) bool {
	if c.ttl <= 0 {
		return false
	}

	// Hold the lock only for the map read; the age comparison needs no
	// shared state.
	c.mu.Lock()
	probedAt, found := c.seen[key]
	c.mu.Unlock()

	if !found {
		return false
	}
	return time.Since(probedAt) < c.ttl
}

// markFresh records key as successfully probed at the current time.
//
// When caching is disabled (ttl <= 0) nothing is stored: IsFresh never
// consults the map in that mode, so an entry would only occupy memory.
//
// Every call also drops entries whose age has reached the TTL. IsFresh already
// treats those as absent, so removing them changes no observable result; it
// only keeps the map from accumulating keys for backends that have gone away
// or moved to a different address. The sweep is linear in the number of
// cached keys and runs only after an actual probe succeeded.
func (c *probeCache) markFresh(key string) {
	if c.ttl <= 0 {
		return
	}

	c.mu.Lock()
	defer c.mu.Unlock()

	now := time.Now()
	// Deleting from a map while ranging over it is permitted in Go.
	for k, probedAt := range c.seen {
		if now.Sub(probedAt) >= c.ttl {
			delete(c.seen, k)
		}
	}
	c.seen[key] = now
}

// Invalidate forgets any recorded successful probe for key, so the next
// freshness check for that key reports false and the caller probes again.
// Call it after a failed probe, or on any other hint that the backend might
// be down. Invalidating a key that is not cached is a no-op.
func (c *probeCache) Invalidate(key string) {
	c.mu.Lock()
	delete(c.seen, key)
	c.mu.Unlock()
}

// DoOrCached reports whether the backend identified by key is healthy.
//
// If key is still fresh it returns true without calling probe. Otherwise probe
// runs through the singleflight group, so concurrent callers for the same key
// share a single invocation and its result. A successful probe marks key
// fresh; a failed one clears any entry for key. Failures are therefore never
// cached, and the next call probes again.
func (c *probeCache) DoOrCached(key string, probe func() bool) bool {
	if c.IsFresh(key) {
		return true
	}

	runProbe := func() (any, error) {
		// Re-check inside the flight: an earlier flight for this key may have
		// finished, and refreshed the entry, after the check above.
		if c.IsFresh(key) {
			return true, nil
		}
		healthy := probe()
		if !healthy {
			c.Invalidate(key)
			return false, nil
		}
		c.markFresh(key)
		return true, nil
	}

	// runProbe never returns an error, and whether the result was shared with
	// other callers is irrelevant here, so both are discarded.
	result, _, _ := c.flight.Do(key, runProbe)
	return result.(bool)
}