mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-17 13:10:23 -04:00
* feat(concurrency-groups): per-model exclusive groups for backend loading Adds `concurrency_groups: [...]` to model YAML configs. Two models that share a group cannot be loaded concurrently on the same node — loading one evicts the others, reusing the existing pinned/busy/retry policy from LRU eviction. Layered design: - Watchdog (pkg/model): per-node correctness floor — on every Load(), evict any loaded model that shares a group with the requested one. Pinned skips surface NeedMore so the loader retries (and ultimately logs a clear warning), instead of silently allowing the rule to be violated. - Distributed scheduler (core/services/nodes): soft anti-affinity hint — scheduleNewModel prefers nodes that don't already host a same-group model, falling back to eviction only if every candidate has a conflict. Composes with NodeSelector at the same point in the candidate pipeline. Per-node, not cluster-wide: VRAM is a node-local resource, and two heavy models running on different nodes is fine. The ConfigLoader is wired into SmartRouter via a small ConcurrencyConflictResolver interface so the nodes package keeps a narrow surface on core/config. Refactors the inner LRU eviction body into a shared collectEvictionsLocked helper and the loader retry loop into retryEnforce(fn, maxRetries, interval), so both LRU and group enforcement share busy/pinned/retry semantics. Closes #9659. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(watchdog): sync pinned + concurrency_groups at startup The startup-time watchdog setup lives in initializeWatchdog (startup.go), not in startWatchdog (watchdog.go). The latter is only invoked from the runtime-settings RestartWatchdog path. As a result, neither SyncPinnedModelsToWatchdog nor SyncModelGroupsToWatchdog ran at boot, so `pinned: true` and `concurrency_groups: [...]` only became effective after a settings-driven watchdog restart. 
Fix by adding both sync calls to initializeWatchdog. Confirmed end-to-end: loading model A in group "heavy", then C with no group (coexists), then B in group "heavy" now correctly evicts A and leaves [B, C]. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(test): satisfy errcheck on new os.Remove in concurrency_groups spec CI lint runs new-from-merge-base, so the existing pre-existing `defer os.Remove(tmp.Name())` lines are baseline-grandfathered but the one introduced by the concurrency_groups YAML round-trip test is held to errcheck. Wrap the remove in a closure that discards the error. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
711 lines
22 KiB
Go
711 lines
22 KiB
Go
package model
|
|
|
|
import (
	"maps"
	"slices"
	"sync"
	"time"

	"github.com/mudler/LocalAI/pkg/xsysinfo"
	process "github.com/mudler/go-processmanager"
	"github.com/mudler/xlog"
)
|
|
|
|
// WatchDog tracks all the requests from GRPC clients.
// All GRPC Clients created by ModelLoader should have an associated injected
// watchdog that will keep track of the state of each backend (busy or not)
// and for how much time it has been busy.
// If a backend is busy for too long, the watchdog will kill the process and
// force a reload of the model.
// The watchdog also supports LRU (Least Recently Used) eviction when a maximum
// number of active backends is configured.
// The watchdog also supports memory threshold monitoring - when memory usage
// (GPU VRAM if available, otherwise system RAM) exceeds the threshold,
// it will evict backends using the LRU strategy.
// The watchdog runs as a separate go routine,
// and the GRPC client talks to it via a channel to send status updates.
//
// The embedded mutex guards every map and setting below; all exported
// methods take it, and *Locked helpers document that they expect it held.
type WatchDog struct {
	sync.Mutex
	// busyTime records when each backend address entered the busy state.
	busyTime map[string]time.Time
	// idleTime records when each backend address last became idle.
	idleTime             map[string]time.Time
	lastUsed             map[string]time.Time // LRU tracking: when each model was last used
	timeout, idletimeout time.Duration
	// addressMap maps a backend address to its OS process handle.
	addressMap map[string]*process.Process
	// addressModelMap maps a backend address to the model name it serves.
	addressModelMap map[string]string
	pm              ProcessManager
	stop            chan bool
	done            chan bool // Signals when Run() has completely shut down

	busyCheck, idleCheck bool
	lruLimit             int // Maximum number of active backends (0 = unlimited)

	// Memory reclaimer settings (works with GPU if available, otherwise RAM)
	memoryReclaimerEnabled   bool    // Enable memory threshold monitoring
	memoryReclaimerThreshold float64 // Threshold 0.0-1.0 (e.g., 0.95 = 95%)
	watchdogInterval         time.Duration

	// Eviction settings
	forceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)

	// Pinned models are excluded from idle, LRU, and memory-pressure eviction
	pinnedModels map[string]bool

	// modelGroups maps a model name to its declared concurrency groups.
	// Two loaded models that share at least one group cannot coexist on this
	// node — see EnforceGroupExclusivity.
	modelGroups map[string][]string
}
|
|
|
|
// ProcessManager is the narrow surface the watchdog needs in order to
// terminate a backend.
type ProcessManager interface {
	// ShutdownModel stops the backend serving modelName, returning an error
	// when the model is unknown or the shutdown fails.
	ShutdownModel(modelName string) error
}
|
|
|
|
// NewWatchDog creates a new WatchDog with the provided options.
|
|
// Example usage:
|
|
//
|
|
// wd := NewWatchDog(
|
|
// WithProcessManager(pm),
|
|
// WithBusyTimeout(5*time.Minute),
|
|
// WithIdleTimeout(15*time.Minute),
|
|
// WithBusyCheck(true),
|
|
// WithIdleCheck(true),
|
|
// WithLRULimit(3),
|
|
// WithMemoryReclaimer(true, 0.95),
|
|
// )
|
|
func NewWatchDog(opts ...WatchDogOption) *WatchDog {
|
|
o := NewWatchDogOptions(opts...)
|
|
|
|
return &WatchDog{
|
|
timeout: o.busyTimeout,
|
|
idletimeout: o.idleTimeout,
|
|
pm: o.processManager,
|
|
busyTime: make(map[string]time.Time),
|
|
idleTime: make(map[string]time.Time),
|
|
lastUsed: make(map[string]time.Time),
|
|
addressMap: make(map[string]*process.Process),
|
|
busyCheck: o.busyCheck,
|
|
idleCheck: o.idleCheck,
|
|
lruLimit: o.lruLimit,
|
|
addressModelMap: make(map[string]string),
|
|
pinnedModels: make(map[string]bool),
|
|
modelGroups: make(map[string][]string),
|
|
stop: make(chan bool, 1),
|
|
done: make(chan bool, 1),
|
|
memoryReclaimerEnabled: o.memoryReclaimerEnabled,
|
|
memoryReclaimerThreshold: o.memoryReclaimerThreshold,
|
|
watchdogInterval: o.watchdogInterval,
|
|
forceEvictionWhenBusy: o.forceEvictionWhenBusy,
|
|
}
|
|
}
|
|
|
|
// SetLRULimit updates the LRU limit dynamically
|
|
func (wd *WatchDog) SetLRULimit(limit int) {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
wd.lruLimit = limit
|
|
}
|
|
|
|
// GetLRULimit returns the current LRU limit
|
|
func (wd *WatchDog) GetLRULimit() int {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
return wd.lruLimit
|
|
}
|
|
|
|
// SetMemoryReclaimer updates the memory reclaimer settings dynamically
|
|
func (wd *WatchDog) SetMemoryReclaimer(enabled bool, threshold float64) {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
wd.memoryReclaimerEnabled = enabled
|
|
wd.memoryReclaimerThreshold = threshold
|
|
}
|
|
|
|
// GetMemoryReclaimerSettings returns the current memory reclaimer settings
|
|
func (wd *WatchDog) GetMemoryReclaimerSettings() (enabled bool, threshold float64) {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
return wd.memoryReclaimerEnabled, wd.memoryReclaimerThreshold
|
|
}
|
|
|
|
// SetForceEvictionWhenBusy updates the force eviction when busy setting dynamically
|
|
func (wd *WatchDog) SetForceEvictionWhenBusy(force bool) {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
wd.forceEvictionWhenBusy = force
|
|
}
|
|
|
|
// SetPinnedModels replaces the set of pinned model names.
|
|
// Pinned models are excluded from idle, LRU, and memory-pressure eviction.
|
|
func (wd *WatchDog) SetPinnedModels(models []string) {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
wd.pinnedModels = make(map[string]bool, len(models))
|
|
for _, m := range models {
|
|
wd.pinnedModels[m] = true
|
|
}
|
|
}
|
|
|
|
// IsModelPinned returns true if the given model name is pinned
|
|
func (wd *WatchDog) IsModelPinned(modelName string) bool {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
return wd.pinnedModels[modelName]
|
|
}
|
|
|
|
// ReplaceModelGroups replaces the per-model concurrency-group registry. The
|
|
// supplied map is copied; callers may mutate it after the call. Passing an
|
|
// empty or nil map clears all entries.
|
|
func (wd *WatchDog) ReplaceModelGroups(groups map[string][]string) {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
wd.modelGroups = make(map[string][]string, len(groups))
|
|
for name, gs := range groups {
|
|
if len(gs) == 0 {
|
|
continue
|
|
}
|
|
wd.modelGroups[name] = slices.Clone(gs)
|
|
}
|
|
}
|
|
|
|
// GetModelGroups returns a copy of the concurrency groups configured for
|
|
// the given model, or nil if the model has no groups. The result may be
|
|
// freely mutated by the caller.
|
|
func (wd *WatchDog) GetModelGroups(modelName string) []string {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
gs, ok := wd.modelGroups[modelName]
|
|
if !ok || len(gs) == 0 {
|
|
return nil
|
|
}
|
|
return slices.Clone(gs)
|
|
}
|
|
|
|
func (wd *WatchDog) Shutdown() {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
xlog.Info("[WatchDog] Shutting down watchdog")
|
|
wd.stop <- true
|
|
}
|
|
|
|
// WaitDone blocks until the watchdog's Run() goroutine has completely shut down.
// This should be called after Shutdown() to ensure the watchdog is fully stopped.
// NOTE(review): this blocks forever if Run() was never started — callers
// appear expected to pair it with a running watchdog; confirm at call sites.
func (wd *WatchDog) WaitDone() {
	<-wd.done
}
|
|
|
|
func (wd *WatchDog) AddAddressModelMap(address string, model string) {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
wd.addressModelMap[address] = model
|
|
|
|
}
|
|
func (wd *WatchDog) Add(address string, p *process.Process) {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
wd.addressMap[address] = p
|
|
}
|
|
|
|
func (wd *WatchDog) Mark(address string) {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
now := time.Now()
|
|
wd.busyTime[address] = now
|
|
wd.lastUsed[address] = now // Update LRU tracking
|
|
delete(wd.idleTime, address)
|
|
}
|
|
|
|
func (wd *WatchDog) UnMark(ModelAddress string) {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
now := time.Now()
|
|
delete(wd.busyTime, ModelAddress)
|
|
wd.idleTime[ModelAddress] = now
|
|
wd.lastUsed[ModelAddress] = now // Update LRU tracking
|
|
}
|
|
|
|
// UpdateLastUsed updates the last used time for a model address (for LRU tracking)
|
|
// This should be called when a model is accessed (e.g., when checking if loaded)
|
|
func (wd *WatchDog) UpdateLastUsed(address string) {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
wd.lastUsed[address] = time.Now()
|
|
}
|
|
|
|
// GetLoadedModelCount returns the number of currently loaded models tracked by the watchdog
|
|
func (wd *WatchDog) GetLoadedModelCount() int {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
return len(wd.addressModelMap)
|
|
}
|
|
|
|
// WatchDogState holds the current state of models tracked by the watchdog.
// It is a snapshot produced by GetState and consumed by RestoreState, keyed
// by backend address throughout.
type WatchDogState struct {
	// AddressModelMap maps backend address -> model name.
	AddressModelMap map[string]string
	// BusyTime maps address -> when the backend became busy.
	BusyTime map[string]time.Time
	// IdleTime maps address -> when the backend became idle.
	IdleTime map[string]time.Time
	// LastUsed maps address -> last access time (LRU ordering).
	LastUsed map[string]time.Time
	// AddressMap maps address -> OS process handle (shared, not deep-copied).
	AddressMap map[string]*process.Process
}
|
|
|
|
// GetState returns the current state of models tracked by the watchdog
|
|
// This can be used to restore state when creating a new watchdog
|
|
func (wd *WatchDog) GetState() WatchDogState {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
|
|
// Create copies to avoid race conditions
|
|
addressModelMap := make(map[string]string, len(wd.addressModelMap))
|
|
for k, v := range wd.addressModelMap {
|
|
addressModelMap[k] = v
|
|
}
|
|
|
|
busyTime := make(map[string]time.Time, len(wd.busyTime))
|
|
for k, v := range wd.busyTime {
|
|
busyTime[k] = v
|
|
}
|
|
|
|
idleTime := make(map[string]time.Time, len(wd.idleTime))
|
|
for k, v := range wd.idleTime {
|
|
idleTime[k] = v
|
|
}
|
|
|
|
lastUsed := make(map[string]time.Time, len(wd.lastUsed))
|
|
for k, v := range wd.lastUsed {
|
|
lastUsed[k] = v
|
|
}
|
|
|
|
addressMap := make(map[string]*process.Process, len(wd.addressMap))
|
|
for k, v := range wd.addressMap {
|
|
addressMap[k] = v
|
|
}
|
|
|
|
return WatchDogState{
|
|
AddressModelMap: addressModelMap,
|
|
BusyTime: busyTime,
|
|
IdleTime: idleTime,
|
|
LastUsed: lastUsed,
|
|
AddressMap: addressMap,
|
|
}
|
|
}
|
|
|
|
// RestoreState restores the model state from a previous watchdog
|
|
// This should be called after the new watchdog is created but before Run() is started
|
|
func (wd *WatchDog) RestoreState(state WatchDogState) {
|
|
wd.Lock()
|
|
defer wd.Unlock()
|
|
|
|
wd.addressModelMap = state.AddressModelMap
|
|
wd.busyTime = state.BusyTime
|
|
wd.idleTime = state.IdleTime
|
|
wd.lastUsed = state.LastUsed
|
|
wd.addressMap = state.AddressMap
|
|
|
|
xlog.Info("[WatchDog] Restored model state", "modelCount", len(wd.addressModelMap))
|
|
}
|
|
|
|
// modelUsageInfo holds information about a model's usage for LRU sorting.
type modelUsageInfo struct {
	// address is the backend address serving the model.
	address string
	// model is the model name (as tracked in addressModelMap).
	model string
	// lastUsed is the last access time; the zero value sorts first,
	// i.e. "never used" counts as oldest.
	lastUsed time.Time
}
|
|
|
|
// EnforceLRULimitResult contains the result of LRU (or group-exclusivity)
// enforcement; both enforcement paths share these retry semantics.
type EnforceLRULimitResult struct {
	EvictedCount int  // Number of models successfully evicted
	NeedMore     bool // True if more evictions are needed but couldn't be done (e.g., all models are busy)
}
|
|
|
|
// EnforceLRULimit ensures we're under the LRU limit by evicting least recently used models.
|
|
// This should be called before loading a new model.
|
|
// pendingLoads is the number of models currently being loaded (to account for concurrent loads).
|
|
// Returns the result containing evicted count and whether more evictions are needed.
|
|
func (wd *WatchDog) EnforceLRULimit(pendingLoads int) EnforceLRULimitResult {
|
|
if wd.lruLimit <= 0 {
|
|
return EnforceLRULimitResult{EvictedCount: 0, NeedMore: false} // LRU disabled
|
|
}
|
|
|
|
wd.Lock()
|
|
|
|
currentCount := len(wd.addressModelMap)
|
|
// We need to evict enough to make room for the new model AND any pending loads
|
|
// Total after loading = currentCount + pendingLoads + 1 (the new one we're about to load)
|
|
// We need: currentCount + pendingLoads + 1 <= lruLimit
|
|
// So evict: currentCount + pendingLoads + 1 - lruLimit = currentCount - lruLimit + pendingLoads + 1
|
|
modelsToEvict := currentCount - wd.lruLimit + pendingLoads + 1
|
|
forceEvictionWhenBusy := wd.forceEvictionWhenBusy
|
|
if modelsToEvict <= 0 {
|
|
wd.Unlock()
|
|
return EnforceLRULimitResult{EvictedCount: 0, NeedMore: false}
|
|
}
|
|
|
|
xlog.Debug("[WatchDog] LRU enforcement triggered", "current", currentCount, "pendingLoads", pendingLoads, "limit", wd.lruLimit, "toEvict", modelsToEvict)
|
|
|
|
// Build a list of models sorted by last used time (oldest first)
|
|
var models []modelUsageInfo
|
|
for address, model := range wd.addressModelMap {
|
|
lastUsed := wd.lastUsed[address]
|
|
if lastUsed.IsZero() {
|
|
// If no lastUsed recorded, use a very old time
|
|
lastUsed = time.Time{}
|
|
}
|
|
models = append(models, modelUsageInfo{
|
|
address: address,
|
|
model: model,
|
|
lastUsed: lastUsed,
|
|
})
|
|
}
|
|
|
|
// Sort by lastUsed time (oldest first)
|
|
slices.SortFunc(models, func(a, b modelUsageInfo) int {
|
|
return a.lastUsed.Compare(b.lastUsed)
|
|
})
|
|
|
|
// Collect models to evict (the oldest ones)
|
|
modelsToShutdown, skippedBusyCount := wd.collectEvictionsLocked(models, modelsToEvict, forceEvictionWhenBusy)
|
|
needMore := len(modelsToShutdown) < modelsToEvict && skippedBusyCount > 0
|
|
wd.Unlock()
|
|
|
|
// Now shutdown models without holding the watchdog lock to prevent deadlock
|
|
for _, model := range modelsToShutdown {
|
|
if err := wd.pm.ShutdownModel(model); err != nil {
|
|
xlog.Error("[WatchDog] error shutting down model during LRU eviction", "error", err, "model", model)
|
|
}
|
|
xlog.Debug("[WatchDog] LRU eviction complete", "model", model)
|
|
}
|
|
|
|
if needMore {
|
|
xlog.Warn("[WatchDog] LRU eviction incomplete", "evicted", len(modelsToShutdown), "needed", modelsToEvict, "skippedBusy", skippedBusyCount, "reason", "some models are busy with active API calls")
|
|
}
|
|
|
|
return EnforceLRULimitResult{
|
|
EvictedCount: len(modelsToShutdown),
|
|
NeedMore: needMore,
|
|
}
|
|
}
|
|
|
|
// collectEvictionsLocked walks `candidates` (already in eviction order) and
|
|
// untracks up to `maxToEvict` models that are eligible for eviction. Pinned
|
|
// models are always skipped; busy models are skipped unless `force` is true.
|
|
// Returns the names of evicted models and the number skipped because they
|
|
// were busy. Must be called with wd.Lock() held.
|
|
func (wd *WatchDog) collectEvictionsLocked(candidates []modelUsageInfo, maxToEvict int, force bool) (evicted []string, skippedBusy int) {
|
|
for i := 0; len(evicted) < maxToEvict && i < len(candidates); i++ {
|
|
m := candidates[i]
|
|
if wd.pinnedModels[m.model] {
|
|
xlog.Debug("[WatchDog] Skipping eviction for pinned model", "model", m.model)
|
|
continue
|
|
}
|
|
_, isBusy := wd.busyTime[m.address]
|
|
if isBusy && !force {
|
|
xlog.Warn("[WatchDog] Skipping eviction for busy model", "model", m.model, "reason", "model has active API calls")
|
|
skippedBusy++
|
|
continue
|
|
}
|
|
xlog.Info("[WatchDog] evicting model", "model", m.model, "busy", isBusy)
|
|
evicted = append(evicted, m.model)
|
|
wd.untrack(m.address)
|
|
}
|
|
return evicted, skippedBusy
|
|
}
|
|
|
|
// EnforceGroupExclusivity evicts every loaded model that shares at least one
// concurrency group with the requested model. The pinned/busy/retry semantics
// match EnforceLRULimit so the loader's retry loop can stay generic.
// Returns immediately (no-op) when the requested model has no groups or no
// loaded model conflicts with it.
func (wd *WatchDog) EnforceGroupExclusivity(requestedModel string) EnforceLRULimitResult {
	wd.Lock()

	requestedGroups := wd.modelGroups[requestedModel]
	if len(requestedGroups) == 0 {
		wd.Unlock()
		return EnforceLRULimitResult{}
	}

	forceEvictionWhenBusy := wd.forceEvictionWhenBusy

	// Build the conflict candidate list: every loaded model whose groups
	// overlap with requestedGroups. Order doesn't affect correctness, but
	// sort by lastUsed (oldest first) so logs and behaviour are deterministic.
	var conflicts []modelUsageInfo
	for address, name := range wd.addressModelMap {
		// The requested model itself is never a conflict.
		if name == requestedModel {
			continue
		}
		if !groupsOverlap(requestedGroups, wd.modelGroups[name]) {
			continue
		}
		conflicts = append(conflicts, modelUsageInfo{
			address:  address,
			model:    name,
			lastUsed: wd.lastUsed[address],
		})
	}
	if len(conflicts) == 0 {
		wd.Unlock()
		return EnforceLRULimitResult{}
	}
	slices.SortFunc(conflicts, func(a, b modelUsageInfo) int {
		return a.lastUsed.Compare(b.lastUsed)
	})

	xlog.Debug("[WatchDog] Group exclusivity triggered", "requested", requestedModel, "groups", requestedGroups, "conflicts", len(conflicts))

	modelsToShutdown, skippedBusyCount := wd.collectEvictionsLocked(conflicts, len(conflicts), forceEvictionWhenBusy)
	// For groups any unresolved conflict matters — busy *or* pinned. The loader
	// retries on NeedMore; pinned cases will eventually time out and the load
	// proceeds with a visible warning, which is the right signal for what is a
	// configuration mismatch.
	needMore := len(modelsToShutdown) < len(conflicts)
	wd.Unlock()

	// Shutdowns happen with the lock released to avoid deadlocking against
	// the ProcessManager (same pattern as EnforceLRULimit).
	for _, m := range modelsToShutdown {
		if err := wd.pm.ShutdownModel(m); err != nil {
			xlog.Error("[WatchDog] error shutting down model during group eviction", "error", err, "model", m)
		}
		xlog.Debug("[WatchDog] Group eviction complete", "model", m)
	}

	if needMore {
		xlog.Warn("[WatchDog] Group eviction incomplete", "requested", requestedModel, "evicted", len(modelsToShutdown), "needed", len(conflicts), "skippedBusy", skippedBusyCount, "reason", "some conflicts are busy or pinned")
	}

	return EnforceLRULimitResult{
		EvictedCount: len(modelsToShutdown),
		NeedMore:     needMore,
	}
}
|
|
|
|
// groupsOverlap reports whether the two group lists share any name.
// Either list being empty (or nil) means no overlap.
func groupsOverlap(a, b []string) bool {
	if len(a) == 0 || len(b) == 0 {
		return false
	}
	return slices.ContainsFunc(a, func(group string) bool {
		return slices.Contains(b, group)
	})
}
|
|
|
|
// Run is the watchdog main loop, intended to be started on its own
// goroutine. Every watchdogInterval it runs whichever checks are enabled
// (busy timeout, idle timeout, memory pressure). It exits when Shutdown()
// signals the stop channel, or when all checks are disabled; either way it
// signals the done channel so WaitDone() unblocks.
func (wd *WatchDog) Run() {
	xlog.Info("[WatchDog] starting watchdog")

	for {
		select {
		case <-wd.stop:
			xlog.Info("[WatchDog] Stopping watchdog")
			wd.done <- true
			return
		case <-time.After(wd.watchdogInterval):
			// Check if any monitoring is enabled.
			// Snapshot the flags under the lock — they can be flipped at
			// runtime via the Set* methods.
			wd.Lock()
			busyCheck := wd.busyCheck
			idleCheck := wd.idleCheck
			memoryCheck := wd.memoryReclaimerEnabled
			wd.Unlock()

			if !busyCheck && !idleCheck && !memoryCheck {
				// Nothing to monitor: treat as a shutdown.
				xlog.Info("[WatchDog] No checks enabled, stopping watchdog")
				wd.done <- true
				return
			}
			if busyCheck {
				wd.checkBusy()
			}
			if idleCheck {
				wd.checkIdle()
			}
			if memoryCheck {
				wd.checkMemory()
			}
		}
	}
}
|
|
|
|
// checkIdle scans for backends that have been idle longer than idletimeout
// and shuts them down. Pinned models are exempt (and stay tracked).
// Candidates are collected under the lock, then shut down with the lock
// released to avoid deadlocking against the ProcessManager.
func (wd *WatchDog) checkIdle() {
	wd.Lock()
	xlog.Debug("[WatchDog] Watchdog checks for idle connections")

	// Collect models to shutdown while holding the lock
	var modelsToShutdown []string
	for address, t := range wd.idleTime {
		xlog.Debug("[WatchDog] idle connection", "address", address)
		if time.Since(t) > wd.idletimeout {
			model, ok := wd.addressModelMap[address]
			if ok {
				if wd.pinnedModels[model] {
					// Pinned: skip before untracking so it stays alive.
					xlog.Debug("[WatchDog] Skipping idle eviction for pinned model", "model", model)
					continue
				}
				xlog.Warn("[WatchDog] Address is idle for too long, killing it", "address", address)
				modelsToShutdown = append(modelsToShutdown, model)
			} else {
				xlog.Warn("[WatchDog] Address unresolvable", "address", address)
			}
			// Untrack regardless of resolvability (deleting the current key
			// while ranging over the map is safe in Go).
			wd.untrack(address)
		}
	}
	wd.Unlock()

	// Now shutdown models without holding the watchdog lock to prevent deadlock
	for _, model := range modelsToShutdown {
		if err := wd.pm.ShutdownModel(model); err != nil {
			xlog.Error("[watchdog] error shutting down model", "error", err, "model", model)
		}
		xlog.Debug("[WatchDog] model shut down", "model", model)
	}
}
|
|
|
|
// checkBusy scans for backends that have been busy longer than the busy
// timeout and kills them (a backend stuck busy is assumed wedged).
// Note: unlike idle/LRU/memory eviction there is no pinnedModels check here —
// pinned models are killed too once they exceed the busy timeout.
// Candidates are collected under the lock, then shut down with the lock
// released to avoid deadlocking against the ProcessManager.
func (wd *WatchDog) checkBusy() {
	wd.Lock()
	xlog.Debug("[WatchDog] Watchdog checks for busy connections")

	// Collect models to shutdown while holding the lock
	var modelsToShutdown []string
	for address, t := range wd.busyTime {
		xlog.Debug("[WatchDog] active connection", "address", address)

		if time.Since(t) > wd.timeout {
			model, ok := wd.addressModelMap[address]
			if ok {
				xlog.Warn("[WatchDog] Model is busy for too long, killing it", "model", model)
				modelsToShutdown = append(modelsToShutdown, model)
			} else {
				xlog.Warn("[WatchDog] Address unresolvable", "address", address)
			}
			// Untrack regardless of resolvability (deleting the current key
			// while ranging over the map is safe in Go).
			wd.untrack(address)
		}
	}
	wd.Unlock()

	// Now shutdown models without holding the watchdog lock to prevent deadlock
	for _, model := range modelsToShutdown {
		if err := wd.pm.ShutdownModel(model); err != nil {
			xlog.Error("[watchdog] error shutting down model", "error", err, "model", model)
		}
		xlog.Debug("[WatchDog] model shut down", "model", model)
	}
}
|
|
|
|
// checkMemory monitors memory usage (GPU VRAM if available, otherwise RAM)
// and evicts the least recently used backend when usage exceeds the
// configured threshold. Settings are snapshotted under the lock; the
// (potentially slow) system query runs with the lock released.
func (wd *WatchDog) checkMemory() {
	wd.Lock()
	threshold := wd.memoryReclaimerThreshold
	enabled := wd.memoryReclaimerEnabled
	modelCount := len(wd.addressModelMap)
	wd.Unlock()

	// Nothing to do when disabled, misconfigured, or no models are loaded.
	if !enabled || threshold <= 0 || modelCount == 0 {
		return
	}

	// Get current memory usage (GPU if available, otherwise RAM)
	aggregate := xsysinfo.GetResourceAggregateInfo()
	if aggregate.TotalMemory == 0 {
		xlog.Debug("[WatchDog] No memory information available for memory reclaimer")
		return
	}

	// Convert threshold from 0.0-1.0 to percentage
	thresholdPercent := threshold * 100

	// memoryType only affects the log messages below.
	memoryType := "GPU"
	if aggregate.GPUCount == 0 {
		memoryType = "RAM"
	}

	//xlog.Debug("[WatchDog] Memory check", "type", memoryType, "usage_percent", aggregate.UsagePercent, "threshold_percent", thresholdPercent, "loaded_models", modelCount)

	// Check if usage exceeds threshold; evict at most one model per tick —
	// the next interval re-checks and evicts again if still over.
	if aggregate.UsagePercent > thresholdPercent {
		xlog.Warn("[WatchDog] Memory usage exceeds threshold, evicting LRU backend", "type", memoryType, "usage_percent", aggregate.UsagePercent, "threshold_percent", thresholdPercent)

		// Evict the least recently used model
		wd.evictLRUModel()
	}
}
|
|
|
|
// evictLRUModel evicts the least recently used model
|
|
func (wd *WatchDog) evictLRUModel() {
|
|
wd.Lock()
|
|
|
|
if len(wd.addressModelMap) == 0 {
|
|
wd.Unlock()
|
|
return
|
|
}
|
|
|
|
forceEvictionWhenBusy := wd.forceEvictionWhenBusy
|
|
|
|
// Build a list of models sorted by last used time (oldest first)
|
|
var models []modelUsageInfo
|
|
for address, model := range wd.addressModelMap {
|
|
lastUsed := wd.lastUsed[address]
|
|
if lastUsed.IsZero() {
|
|
lastUsed = time.Time{}
|
|
}
|
|
models = append(models, modelUsageInfo{
|
|
address: address,
|
|
model: model,
|
|
lastUsed: lastUsed,
|
|
})
|
|
}
|
|
|
|
if len(models) == 0 {
|
|
wd.Unlock()
|
|
return
|
|
}
|
|
|
|
// Sort by lastUsed time (oldest first)
|
|
slices.SortFunc(models, func(a, b modelUsageInfo) int {
|
|
return a.lastUsed.Compare(b.lastUsed)
|
|
})
|
|
|
|
// Find the first non-busy, non-pinned model (or first non-pinned model if forceEvictionWhenBusy is true)
|
|
var lruModel *modelUsageInfo
|
|
for i := range len(models) {
|
|
m := models[i]
|
|
if wd.pinnedModels[m.model] {
|
|
xlog.Debug("[WatchDog] Skipping memory reclaimer eviction for pinned model", "model", m.model)
|
|
continue
|
|
}
|
|
_, isBusy := wd.busyTime[m.address]
|
|
if isBusy && !forceEvictionWhenBusy {
|
|
// Skip busy models when forceEvictionWhenBusy is false
|
|
xlog.Warn("[WatchDog] Skipping memory reclaimer eviction for busy model", "model", m.model, "reason", "model has active API calls")
|
|
continue
|
|
}
|
|
lruModel = &m
|
|
break
|
|
}
|
|
|
|
if lruModel == nil {
|
|
// All models are busy and forceEvictionWhenBusy is false
|
|
wd.Unlock()
|
|
xlog.Warn("[WatchDog] Memory reclaimer cannot evict: all models are busy with active API calls")
|
|
return
|
|
}
|
|
|
|
xlog.Info("[WatchDog] Memory reclaimer evicting LRU model", "model", lruModel.model, "lastUsed", lruModel.lastUsed)
|
|
|
|
wd.Unlock()
|
|
|
|
// Shutdown the model
|
|
if err := wd.pm.ShutdownModel(lruModel.model); err != nil && err != modelNotFoundErr {
|
|
xlog.Error("[WatchDog] error shutting down model during memory reclamation", "error", err, "model", lruModel.model)
|
|
} else {
|
|
// Untrack the model
|
|
wd.Lock()
|
|
wd.untrack(lruModel.address)
|
|
wd.Unlock()
|
|
xlog.Info("[WatchDog] Memory reclaimer eviction complete", "model", lruModel.model)
|
|
}
|
|
}
|
|
|
|
// untrack removes every record of the backend at the given address from all
// tracking maps. It does not stop the process — callers shut the backend
// down separately via the ProcessManager. Must be called with wd.Lock() held.
func (wd *WatchDog) untrack(address string) {
	delete(wd.busyTime, address)
	delete(wd.idleTime, address)
	delete(wd.lastUsed, address)
	delete(wd.addressModelMap, address)
	delete(wd.addressMap, address)
}
|