feat(concurrency-groups): per-model exclusive groups for backend loading (#9662)

* feat(concurrency-groups): per-model exclusive groups for backend loading

Adds `concurrency_groups: [...]` to model YAML configs. Two models that share
a group cannot be loaded concurrently on the same node — loading one evicts
the others, reusing the existing pinned/busy/retry policy from LRU eviction.

Layered design:
- Watchdog (pkg/model): per-node correctness floor — on every Load(), evict
  any loaded model that shares a group with the requested one. Pinned skips
  surface NeedMore so the loader retries (and ultimately logs a clear
  warning), instead of silently allowing the rule to be violated.
- Distributed scheduler (core/services/nodes): soft anti-affinity hint —
  scheduleNewModel prefers nodes that don't already host a same-group
  model, falling back to eviction only if every candidate has a conflict.
  Composes with NodeSelector at the same point in the candidate pipeline.

Per-node, not cluster-wide: VRAM is a node-local resource, and two heavy
models running on different nodes is fine. The ConfigLoader is wired into
SmartRouter via a small ConcurrencyConflictResolver interface so the nodes
package keeps a narrow surface on core/config.

Refactors the inner LRU eviction body into a shared collectEvictionsLocked
helper and the loader retry loop into retryEnforce(fn, maxRetries, interval),
so both LRU and group enforcement share busy/pinned/retry semantics.

Closes #9659.

Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(watchdog): sync pinned + concurrency_groups at startup

The startup-time watchdog setup lives in initializeWatchdog (startup.go),
not in startWatchdog (watchdog.go). The latter is only invoked from the
runtime-settings RestartWatchdog path. As a result, neither
SyncPinnedModelsToWatchdog nor SyncModelGroupsToWatchdog ran at boot,
so `pinned: true` and `concurrency_groups: [...]` only became effective
after a settings-driven watchdog restart.

Fix by adding both sync calls to initializeWatchdog. Confirmed end-to-end:
loading model A in group "heavy", then C with no group (coexists),
then B in group "heavy" now correctly evicts A and leaves [B, C].

Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix(test): satisfy errcheck on new os.Remove in concurrency_groups spec

CI lint runs new-from-merge-base, so the pre-existing
`defer os.Remove(tmp.Name())` lines are baseline-grandfathered but the
one introduced by the concurrency_groups YAML round-trip test is held
to errcheck. Wrap the remove in a closure that discards the error.

Assisted-by: Claude:claude-opus-4-7 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-05-05 08:42:50 +02:00
committed by GitHub
parent 22ae415695
commit bbcaebc1ef
17 changed files with 981 additions and 76 deletions

View File

@@ -197,6 +197,35 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
return model.GRPC(o.parallelRequests, ml.wd), nil
}
// retryEnforce repeatedly invokes fn until it returns NeedMore=false or the
// retry budget is exhausted. It sleeps `retryInterval` between attempts and
// logs progress under `label`. Used by both LRU and group-exclusivity
// enforcement so the busy-model wait behaviour is identical.
//
// A non-positive maxRetries means fn is never called.
func retryEnforce(fn func() EnforceLRULimitResult, maxRetries int, retryInterval time.Duration, label string) {
	for i := 0; i < maxRetries; i++ {
		res := fn()
		if !res.NeedMore {
			// Enforcement settled; only log when something was evicted.
			if res.EvictedCount > 0 {
				xlog.Info("[ModelLoader] "+label+" enforcement complete", "evicted", res.EvictedCount)
			}
			return
		}
		if i == maxRetries-1 {
			// Budget exhausted with conflicts still outstanding — surface a
			// warning and let the caller proceed (the load may still fail).
			xlog.Warn("[ModelLoader] "+label+" enforcement incomplete after max retries",
				"evicted", res.EvictedCount,
				"reason", "conflicts are still busy or pinned")
			return
		}
		xlog.Info("[ModelLoader] Waiting for busy models to become idle before eviction",
			"label", label,
			"evicted", res.EvictedCount,
			"attempt", i+1,
			"maxRetries", maxRetries,
			"retryIn", retryInterval)
		time.Sleep(retryInterval)
	}
}
// enforceLRULimit enforces the LRU limit before loading a new model.
// This is called before loading a model to ensure we don't exceed the limit.
// It accounts for models that are currently being loaded by other goroutines.
@@ -206,41 +235,34 @@ func (ml *ModelLoader) enforceLRULimit() {
return
}
// Get the count of models currently being loaded to account for concurrent requests
pendingLoads := ml.GetLoadingCount()
// Get retry settings from ModelLoader
ml.mu.Lock()
maxRetries := ml.lruEvictionMaxRetries
retryInterval := ml.lruEvictionRetryInterval
ml.mu.Unlock()
for attempt := range maxRetries {
result := ml.wd.EnforceLRULimit(pendingLoads)
retryEnforce(func() EnforceLRULimitResult {
return ml.wd.EnforceLRULimit(pendingLoads)
}, maxRetries, retryInterval, "LRU")
}
if !result.NeedMore {
// Successfully evicted enough models (or no eviction needed)
if result.EvictedCount > 0 {
xlog.Info("[ModelLoader] LRU enforcement complete", "evicted", result.EvictedCount)
}
return
}
// Need more evictions but models are busy - wait and retry
if attempt < maxRetries-1 {
xlog.Info("[ModelLoader] Waiting for busy models to become idle before eviction",
"evicted", result.EvictedCount,
"attempt", attempt+1,
"maxRetries", maxRetries,
"retryIn", retryInterval)
time.Sleep(retryInterval)
} else {
// Last attempt - log warning but proceed (might fail to load, but at least we tried)
xlog.Warn("[ModelLoader] LRU enforcement incomplete after max retries",
"evicted", result.EvictedCount,
"reason", "models are still busy with active API calls")
}
// enforceGroupExclusivity evicts every loaded model that shares a concurrency
// group with modelID before loading proceeds. Reuses the LRU retry settings so
// busy conflicts wait for the same window as a busy LRU eviction.
func (ml *ModelLoader) enforceGroupExclusivity(modelID string) {
	// Without a watchdog there is no group registry to enforce against.
	if ml.wd == nil {
		return
	}

	// Snapshot the retry knobs under the lock; the retry loop itself must
	// run without holding it.
	ml.mu.Lock()
	retries, interval := ml.lruEvictionMaxRetries, ml.lruEvictionRetryInterval
	ml.mu.Unlock()

	enforce := func() EnforceLRULimitResult {
		return ml.wd.EnforceGroupExclusivity(modelID)
	}
	retryEnforce(enforce, retries, interval, "group-exclusivity")
}
// updateModelLastUsed updates the last used time for a model (for LRU tracking)
@@ -270,6 +292,12 @@ func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) {
return client, nil
}
// Evict any loaded model that shares a concurrency group with the
// requested one before applying the global LRU cap — group eviction may
// already make room, and otherwise LRU might evict an unrelated model
// only for the group check to immediately evict another.
ml.enforceGroupExclusivity(o.modelID)
// Enforce LRU limit before loading a new model
ml.enforceLRULimit()

View File

@@ -0,0 +1,50 @@
package model
import (
"sync/atomic"
"time"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Unit tests for the shared retryEnforce helper: it must stop as soon as
// NeedMore clears, honour the retry budget, and treat a non-positive budget
// as "do nothing".
var _ = Describe("retryEnforce", func() {
	It("returns immediately when the first attempt is satisfied", func() {
		var invoked atomic.Int32
		fn := func() EnforceLRULimitResult {
			invoked.Add(1)
			return EnforceLRULimitResult{}
		}
		retryEnforce(fn, 5, 1*time.Millisecond, "test")
		Expect(invoked.Load()).To(Equal(int32(1)))
	})

	It("retries until NeedMore clears", func() {
		var invoked atomic.Int32
		fn := func() EnforceLRULimitResult {
			// First two attempts report unresolved work; the third succeeds.
			if invoked.Add(1) < 3 {
				return EnforceLRULimitResult{NeedMore: true}
			}
			return EnforceLRULimitResult{EvictedCount: 1}
		}
		retryEnforce(fn, 5, 1*time.Millisecond, "test")
		Expect(invoked.Load()).To(Equal(int32(3)))
	})

	It("stops after maxRetries when NeedMore never clears", func() {
		var invoked atomic.Int32
		fn := func() EnforceLRULimitResult {
			invoked.Add(1)
			return EnforceLRULimitResult{NeedMore: true}
		}
		retryEnforce(fn, 4, 1*time.Millisecond, "test")
		Expect(invoked.Load()).To(Equal(int32(4)))
	})

	It("treats maxRetries <= 0 as a no-op (no calls)", func() {
		var invoked atomic.Int32
		fn := func() EnforceLRULimitResult {
			invoked.Add(1)
			return EnforceLRULimitResult{}
		}
		retryEnforce(fn, 0, 1*time.Millisecond, "test")
		Expect(invoked.Load()).To(Equal(int32(0)))
	})
})

View File

@@ -48,6 +48,11 @@ type WatchDog struct {
// Pinned models are excluded from idle, LRU, and memory-pressure eviction
pinnedModels map[string]bool
// modelGroups maps a model name to its declared concurrency groups.
// Two loaded models that share at least one group cannot coexist on this
// node — see EnforceGroupExclusivity.
modelGroups map[string][]string
}
type ProcessManager interface {
@@ -82,6 +87,7 @@ func NewWatchDog(opts ...WatchDogOption) *WatchDog {
lruLimit: o.lruLimit,
addressModelMap: make(map[string]string),
pinnedModels: make(map[string]bool),
modelGroups: make(map[string][]string),
stop: make(chan bool, 1),
done: make(chan bool, 1),
memoryReclaimerEnabled: o.memoryReclaimerEnabled,
@@ -145,6 +151,34 @@ func (wd *WatchDog) IsModelPinned(modelName string) bool {
return wd.pinnedModels[modelName]
}
// ReplaceModelGroups replaces the per-model concurrency-group registry. The
// supplied map is copied; callers may mutate it after the call. Passing an
// empty or nil map clears all entries.
func (wd *WatchDog) ReplaceModelGroups(groups map[string][]string) {
	wd.Lock()
	defer wd.Unlock()

	next := make(map[string][]string, len(groups))
	for name, memberships := range groups {
		// A model with no groups behaves like an absent entry — drop it.
		if len(memberships) == 0 {
			continue
		}
		next[name] = slices.Clone(memberships)
	}
	wd.modelGroups = next
}
// GetModelGroups returns a copy of the concurrency groups configured for
// the given model, or nil if the model has no groups. The result may be
// freely mutated by the caller.
func (wd *WatchDog) GetModelGroups(modelName string) []string {
	wd.Lock()
	defer wd.Unlock()

	if gs := wd.modelGroups[modelName]; len(gs) > 0 {
		// Clone so callers cannot mutate the registry through the result.
		return slices.Clone(gs)
	}
	// Unknown model, or a model registered with an empty list: no groups.
	return nil
}
func (wd *WatchDog) Shutdown() {
wd.Lock()
defer wd.Unlock()
@@ -327,31 +361,8 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) EnforceLRULimitResult {
})
// Collect models to evict (the oldest ones)
var modelsToShutdown []string
evictedCount := 0
skippedBusyCount := 0
for i := 0; evictedCount < modelsToEvict && i < len(models); i++ {
m := models[i]
// Skip pinned models
if wd.pinnedModels[m.model] {
xlog.Debug("[WatchDog] Skipping LRU eviction for pinned model", "model", m.model)
continue
}
// Check if model is busy
_, isBusy := wd.busyTime[m.address]
if isBusy && !forceEvictionWhenBusy {
// Skip eviction for busy models when forceEvictionWhenBusy is false
xlog.Warn("[WatchDog] Skipping LRU eviction for busy model", "model", m.model, "reason", "model has active API calls")
skippedBusyCount++
continue
}
xlog.Info("[WatchDog] LRU evicting model", "model", m.model, "lastUsed", m.lastUsed, "busy", isBusy)
modelsToShutdown = append(modelsToShutdown, m.model)
// Clean up the maps while we have the lock
wd.untrack(m.address)
evictedCount++
}
needMore := evictedCount < modelsToEvict && skippedBusyCount > 0
modelsToShutdown, skippedBusyCount := wd.collectEvictionsLocked(models, modelsToEvict, forceEvictionWhenBusy)
needMore := len(modelsToShutdown) < modelsToEvict && skippedBusyCount > 0
wd.Unlock()
// Now shutdown models without holding the watchdog lock to prevent deadlock
@@ -363,7 +374,7 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) EnforceLRULimitResult {
}
if needMore {
xlog.Warn("[WatchDog] LRU eviction incomplete", "evicted", evictedCount, "needed", modelsToEvict, "skippedBusy", skippedBusyCount, "reason", "some models are busy with active API calls")
xlog.Warn("[WatchDog] LRU eviction incomplete", "evicted", len(modelsToShutdown), "needed", modelsToEvict, "skippedBusy", skippedBusyCount, "reason", "some models are busy with active API calls")
}
return EnforceLRULimitResult{
@@ -372,6 +383,110 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) EnforceLRULimitResult {
}
}
// collectEvictionsLocked walks `candidates` (already in eviction order) and
// untracks up to `maxToEvict` models that are eligible for eviction. Pinned
// models are always skipped; busy models are skipped unless `force` is true.
// Returns the names of evicted models and the number skipped because they
// were busy. Must be called with wd.Lock() held.
func (wd *WatchDog) collectEvictionsLocked(candidates []modelUsageInfo, maxToEvict int, force bool) (evicted []string, skippedBusy int) {
	for _, candidate := range candidates {
		if len(evicted) >= maxToEvict {
			break
		}
		// Pinned models are never eviction targets.
		if wd.pinnedModels[candidate.model] {
			xlog.Debug("[WatchDog] Skipping eviction for pinned model", "model", candidate.model)
			continue
		}
		_, busy := wd.busyTime[candidate.address]
		if busy && !force {
			xlog.Warn("[WatchDog] Skipping eviction for busy model", "model", candidate.model, "reason", "model has active API calls")
			skippedBusy++
			continue
		}
		xlog.Info("[WatchDog] evicting model", "model", candidate.model, "busy", busy)
		evicted = append(evicted, candidate.model)
		// Drop tracking state while the lock is still held.
		wd.untrack(candidate.address)
	}
	return evicted, skippedBusy
}
// EnforceGroupExclusivity evicts every loaded model that shares at least one
// concurrency group with the requested model. The pinned/busy/retry semantics
// match EnforceLRULimit so the loader's retry loop can stay generic.
//
// Returns the number of models actually evicted and NeedMore=true when at
// least one conflicting model could not be evicted (busy or pinned).
func (wd *WatchDog) EnforceGroupExclusivity(requestedModel string) EnforceLRULimitResult {
	wd.Lock()
	requestedGroups := wd.modelGroups[requestedModel]
	if len(requestedGroups) == 0 {
		// The requested model declares no groups: nothing can conflict.
		wd.Unlock()
		return EnforceLRULimitResult{}
	}
	forceEvictionWhenBusy := wd.forceEvictionWhenBusy
	// Build the conflict candidate list: every loaded model whose groups
	// overlap with requestedGroups. Order doesn't affect correctness, but
	// sort by lastUsed (oldest first) so logs and behaviour are deterministic.
	var conflicts []modelUsageInfo
	for address, name := range wd.addressModelMap {
		if name == requestedModel {
			// Reloading the same model never conflicts with itself.
			continue
		}
		if !groupsOverlap(requestedGroups, wd.modelGroups[name]) {
			continue
		}
		conflicts = append(conflicts, modelUsageInfo{
			address: address,
			model: name,
			lastUsed: wd.lastUsed[address],
		})
	}
	if len(conflicts) == 0 {
		wd.Unlock()
		return EnforceLRULimitResult{}
	}
	slices.SortFunc(conflicts, func(a, b modelUsageInfo) int {
		return a.lastUsed.Compare(b.lastUsed)
	})
	xlog.Debug("[WatchDog] Group exclusivity triggered", "requested", requestedModel, "groups", requestedGroups, "conflicts", len(conflicts))
	// Every conflict must go, so maxToEvict == len(conflicts).
	modelsToShutdown, skippedBusyCount := wd.collectEvictionsLocked(conflicts, len(conflicts), forceEvictionWhenBusy)
	// For groups any unresolved conflict matters — busy *or* pinned. The loader
	// retries on NeedMore; pinned cases will eventually time out and the load
	// proceeds with a visible warning, which is the right signal for what is a
	// configuration mismatch.
	needMore := len(modelsToShutdown) < len(conflicts)
	wd.Unlock()
	// Shut the conflicting models down without holding the watchdog lock
	// (same ordering as EnforceLRULimit, which releases the lock before
	// calling ShutdownModel to prevent deadlock).
	for _, m := range modelsToShutdown {
		if err := wd.pm.ShutdownModel(m); err != nil {
			xlog.Error("[WatchDog] error shutting down model during group eviction", "error", err, "model", m)
		}
		xlog.Debug("[WatchDog] Group eviction complete", "model", m)
	}
	if needMore {
		xlog.Warn("[WatchDog] Group eviction incomplete", "requested", requestedModel, "evicted", len(modelsToShutdown), "needed", len(conflicts), "skippedBusy", skippedBusyCount, "reason", "some conflicts are busy or pinned")
	}
	return EnforceLRULimitResult{
		EvictedCount: len(modelsToShutdown),
		NeedMore: needMore,
	}
}
// groupsOverlap reports whether the two group lists share any name.
// Either list may be nil or empty, in which case there is no overlap.
func groupsOverlap(a, b []string) bool {
	for _, name := range a {
		if slices.Contains(b, name) {
			return true
		}
	}
	return false
}
func (wd *WatchDog) Run() {
xlog.Info("[WatchDog] starting watchdog")

View File

@@ -666,6 +666,182 @@ var _ = Describe("WatchDog", func() {
})
})
Context("Concurrency Groups", func() {
// Registry API tests: ReplaceModelGroups copies the map in, replaces all
// prior state on each call, and GetModelGroups hands back defensive copies.
Describe("ReplaceModelGroups / GetModelGroups", func() {
	It("returns nil for unknown models", func() {
		wd = model.NewWatchDog(model.WithProcessManager(pm))
		Expect(wd.GetModelGroups("nope")).To(BeNil())
	})
	It("stores and retrieves groups", func() {
		wd = model.NewWatchDog(model.WithProcessManager(pm))
		wd.ReplaceModelGroups(map[string][]string{
			"a": {"heavy", "vision"},
			"b": {"heavy"},
		})
		Expect(wd.GetModelGroups("a")).To(Equal([]string{"heavy", "vision"}))
		Expect(wd.GetModelGroups("b")).To(Equal([]string{"heavy"}))
		Expect(wd.GetModelGroups("c")).To(BeNil())
	})
	It("replaces previous state on subsequent calls", func() {
		wd = model.NewWatchDog(model.WithProcessManager(pm))
		// Replace is wholesale: "a" from the first call must disappear.
		wd.ReplaceModelGroups(map[string][]string{"a": {"heavy"}})
		wd.ReplaceModelGroups(map[string][]string{"b": {"vision"}})
		Expect(wd.GetModelGroups("a")).To(BeNil())
		Expect(wd.GetModelGroups("b")).To(Equal([]string{"vision"}))
	})
	It("clears state when called with an empty map", func() {
		wd = model.NewWatchDog(model.WithProcessManager(pm))
		wd.ReplaceModelGroups(map[string][]string{"a": {"heavy"}})
		wd.ReplaceModelGroups(nil)
		Expect(wd.GetModelGroups("a")).To(BeNil())
	})
	It("returns a defensive copy", func() {
		wd = model.NewWatchDog(model.WithProcessManager(pm))
		wd.ReplaceModelGroups(map[string][]string{"a": {"heavy"}})
		// Mutating the returned slice must not leak into the registry.
		got := wd.GetModelGroups("a")
		got[0] = "tampered"
		Expect(wd.GetModelGroups("a")).To(Equal([]string{"heavy"}))
	})
})
// Behavioural tests for group-exclusive eviction: conflicts (shared group
// names) are evicted, while pinned and busy models are skipped and reported
// via NeedMore so the loader can retry.
Describe("EnforceGroupExclusivity", func() {
	It("is a no-op when the requested model has no groups", func() {
		wd = model.NewWatchDog(model.WithProcessManager(pm))
		wd.AddAddressModelMap("addr1", "model1")
		wd.AddAddressModelMap("addr2", "model2")
		result := wd.EnforceGroupExclusivity("requested")
		Expect(result.EvictedCount).To(Equal(0))
		Expect(result.NeedMore).To(BeFalse())
		Expect(pm.getShutdownCalls()).To(BeEmpty())
	})
	It("is a no-op when no loaded model shares a group", func() {
		wd = model.NewWatchDog(model.WithProcessManager(pm))
		wd.ReplaceModelGroups(map[string][]string{
			"loaded": {"vision"},
			"requested": {"heavy"},
		})
		wd.AddAddressModelMap("addr1", "loaded")
		result := wd.EnforceGroupExclusivity("requested")
		Expect(result.EvictedCount).To(Equal(0))
		Expect(result.NeedMore).To(BeFalse())
		Expect(pm.getShutdownCalls()).To(BeEmpty())
	})
	It("evicts a loaded model that shares a single group", func() {
		wd = model.NewWatchDog(model.WithProcessManager(pm))
		wd.ReplaceModelGroups(map[string][]string{
			"a": {"heavy"},
			"b": {"heavy"},
		})
		wd.AddAddressModelMap("addrA", "a")
		// Mark then UnMark leaves the model loaded but idle (not busy).
		wd.Mark("addrA")
		wd.UnMark("addrA")
		result := wd.EnforceGroupExclusivity("b")
		Expect(result.EvictedCount).To(Equal(1))
		Expect(result.NeedMore).To(BeFalse())
		Expect(pm.getShutdownCalls()).To(ConsistOf("a"))
	})
	It("evicts when groups overlap on any single name", func() {
		wd = model.NewWatchDog(model.WithProcessManager(pm))
		// Only "y" is shared — one common name is enough to conflict.
		wd.ReplaceModelGroups(map[string][]string{
			"a": {"x", "y"},
			"b": {"y", "z"},
		})
		wd.AddAddressModelMap("addrA", "a")
		wd.Mark("addrA")
		wd.UnMark("addrA")
		result := wd.EnforceGroupExclusivity("b")
		Expect(result.EvictedCount).To(Equal(1))
		Expect(pm.getShutdownCalls()).To(ConsistOf("a"))
	})
	It("evicts every conflicting loaded model", func() {
		wd = model.NewWatchDog(
			model.WithProcessManager(pm),
			model.WithForceEvictionWhenBusy(true),
		)
		wd.ReplaceModelGroups(map[string][]string{
			"a": {"heavy"},
			"b": {"heavy"},
			"c": {"heavy"},
		})
		wd.AddAddressModelMap("addrA", "a")
		wd.Mark("addrA")
		wd.UnMark("addrA")
		wd.AddAddressModelMap("addrB", "b")
		wd.Mark("addrB")
		wd.UnMark("addrB")
		result := wd.EnforceGroupExclusivity("c")
		Expect(result.EvictedCount).To(Equal(2))
		Expect(pm.getShutdownCalls()).To(ConsistOf("a", "b"))
	})
	It("skips a pinned conflicting model and reports NeedMore", func() {
		wd = model.NewWatchDog(
			model.WithProcessManager(pm),
			model.WithForceEvictionWhenBusy(true),
		)
		wd.ReplaceModelGroups(map[string][]string{
			"a": {"heavy"},
			"b": {"heavy"},
		})
		// Pinning wins over group eviction even with force-eviction on.
		wd.SetPinnedModels([]string{"a"})
		wd.AddAddressModelMap("addrA", "a")
		wd.Mark("addrA")
		wd.UnMark("addrA")
		result := wd.EnforceGroupExclusivity("b")
		Expect(result.EvictedCount).To(Equal(0))
		Expect(result.NeedMore).To(BeTrue())
		Expect(pm.getShutdownCalls()).To(BeEmpty())
	})
	It("skips a busy conflict when forceEvictionWhenBusy is false", func() {
		wd = model.NewWatchDog(model.WithProcessManager(pm))
		wd.ReplaceModelGroups(map[string][]string{
			"a": {"heavy"},
			"b": {"heavy"},
		})
		wd.AddAddressModelMap("addrA", "a")
		wd.Mark("addrA") // leave busy
		result := wd.EnforceGroupExclusivity("b")
		Expect(result.EvictedCount).To(Equal(0))
		Expect(result.NeedMore).To(BeTrue())
		Expect(pm.getShutdownCalls()).To(BeEmpty())
	})
	It("evicts a busy conflict when forceEvictionWhenBusy is true", func() {
		wd = model.NewWatchDog(
			model.WithProcessManager(pm),
			model.WithForceEvictionWhenBusy(true),
		)
		wd.ReplaceModelGroups(map[string][]string{
			"a": {"heavy"},
			"b": {"heavy"},
		})
		wd.AddAddressModelMap("addrA", "a")
		wd.Mark("addrA") // leave busy
		result := wd.EnforceGroupExclusivity("b")
		Expect(result.EvictedCount).To(Equal(1))
		Expect(result.NeedMore).To(BeFalse())
		Expect(pm.getShutdownCalls()).To(ConsistOf("a"))
	})
})
})
Context("Functional Options", func() {
It("should use default options when none provided", func() {
wd = model.NewWatchDog(