mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-18 13:38:49 -04:00
feat: disable force eviction (#7725)
* feat: allow to set forcing backends eviction while requests are in flight Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat: try to make the request sit and retry if eviction couldn't be done Otherwise calls that in order to pass would need to shutdown other backends would just fail. In this way instead we make the request sit and retry eviction until it succeeds. The thresholds can be configured by the user. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * add tests Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * expose settings to CLI Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Update docs Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
committed by
GitHub
parent
bb459e671f
commit
c844b7ac58
@@ -9,8 +9,8 @@ import (
|
||||
"time"
|
||||
|
||||
grpc "github.com/mudler/LocalAI/pkg/grpc"
|
||||
"github.com/phayes/freeport"
|
||||
"github.com/mudler/xlog"
|
||||
"github.com/phayes/freeport"
|
||||
)
|
||||
|
||||
const (
|
||||
@@ -173,7 +173,7 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
|
||||
|
||||
model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backend, o))
|
||||
if err != nil {
|
||||
if stopErr := ml.StopGRPC(only(o.modelID));stopErr != nil {
|
||||
if stopErr := ml.StopGRPC(only(o.modelID)); stopErr != nil {
|
||||
xlog.Error("error stopping model", "error", stopErr, "model", o.modelID)
|
||||
}
|
||||
xlog.Error("Failed to load model", "modelID", o.modelID, "error", err, "backend", o.backendString)
|
||||
@@ -186,13 +186,47 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
|
||||
// enforceLRULimit enforces the LRU limit before loading a new model.
|
||||
// This is called before loading a model to ensure we don't exceed the limit.
|
||||
// It accounts for models that are currently being loaded by other goroutines.
|
||||
// If models are busy and can't be evicted, it will wait and retry until space is available.
|
||||
func (ml *ModelLoader) enforceLRULimit() {
|
||||
if ml.wd == nil {
|
||||
return
|
||||
}
|
||||
|
||||
// Get the count of models currently being loaded to account for concurrent requests
|
||||
pendingLoads := ml.GetLoadingCount()
|
||||
ml.wd.EnforceLRULimit(pendingLoads)
|
||||
|
||||
// Get retry settings from ModelLoader
|
||||
ml.mu.Lock()
|
||||
maxRetries := ml.lruEvictionMaxRetries
|
||||
retryInterval := ml.lruEvictionRetryInterval
|
||||
ml.mu.Unlock()
|
||||
|
||||
for attempt := 0; attempt < maxRetries; attempt++ {
|
||||
result := ml.wd.EnforceLRULimit(pendingLoads)
|
||||
|
||||
if !result.NeedMore {
|
||||
// Successfully evicted enough models (or no eviction needed)
|
||||
if result.EvictedCount > 0 {
|
||||
xlog.Info("[ModelLoader] LRU enforcement complete", "evicted", result.EvictedCount)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Need more evictions but models are busy - wait and retry
|
||||
if attempt < maxRetries-1 {
|
||||
xlog.Info("[ModelLoader] Waiting for busy models to become idle before eviction",
|
||||
"evicted", result.EvictedCount,
|
||||
"attempt", attempt+1,
|
||||
"maxRetries", maxRetries,
|
||||
"retryIn", retryInterval)
|
||||
time.Sleep(retryInterval)
|
||||
} else {
|
||||
// Last attempt - log warning but proceed (might fail to load, but at least we tried)
|
||||
xlog.Warn("[ModelLoader] LRU enforcement incomplete after max retries",
|
||||
"evicted", result.EvictedCount,
|
||||
"reason", "models are still busy with active API calls")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// updateModelLastUsed updates the last used time for a model (for LRU tracking)
|
||||
|
||||
@@ -20,22 +20,26 @@ import (
|
||||
|
||||
// TODO: Split ModelLoader and TemplateLoader? Just to keep things more organized. Left together to share a mutex until I look into that. Would split if we separate directories for .bin/.yaml and .tmpl
|
||||
type ModelLoader struct {
|
||||
ModelPath string
|
||||
mu sync.Mutex
|
||||
models map[string]*Model
|
||||
loading map[string]chan struct{} // tracks models currently being loaded
|
||||
wd *WatchDog
|
||||
externalBackends map[string]string
|
||||
ModelPath string
|
||||
mu sync.Mutex
|
||||
models map[string]*Model
|
||||
loading map[string]chan struct{} // tracks models currently being loaded
|
||||
wd *WatchDog
|
||||
externalBackends map[string]string
|
||||
lruEvictionMaxRetries int // Maximum number of retries when waiting for busy models
|
||||
lruEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models
|
||||
}
|
||||
|
||||
// NewModelLoader creates a new ModelLoader instance.
|
||||
// LRU eviction is now managed through the WatchDog component.
|
||||
func NewModelLoader(system *system.SystemState) *ModelLoader {
|
||||
nml := &ModelLoader{
|
||||
ModelPath: system.Model.ModelsPath,
|
||||
models: make(map[string]*Model),
|
||||
loading: make(map[string]chan struct{}),
|
||||
externalBackends: make(map[string]string),
|
||||
ModelPath: system.Model.ModelsPath,
|
||||
models: make(map[string]*Model),
|
||||
loading: make(map[string]chan struct{}),
|
||||
externalBackends: make(map[string]string),
|
||||
lruEvictionMaxRetries: 30, // Default: 30 retries
|
||||
lruEvictionRetryInterval: 1 * time.Second, // Default: 1 second
|
||||
}
|
||||
|
||||
return nml
|
||||
@@ -56,6 +60,14 @@ func (ml *ModelLoader) GetWatchDog() *WatchDog {
|
||||
return ml.wd
|
||||
}
|
||||
|
||||
// SetLRUEvictionRetrySettings updates the LRU eviction retry settings
|
||||
func (ml *ModelLoader) SetLRUEvictionRetrySettings(maxRetries int, retryInterval time.Duration) {
|
||||
ml.mu.Lock()
|
||||
defer ml.mu.Unlock()
|
||||
ml.lruEvictionMaxRetries = maxRetries
|
||||
ml.lruEvictionRetryInterval = retryInterval
|
||||
}
|
||||
|
||||
func (ml *ModelLoader) ExistsInModelPath(s string) bool {
|
||||
return utils.ExistsInPath(ml.ModelPath, s)
|
||||
}
|
||||
|
||||
@@ -262,4 +262,13 @@ var _ = Describe("ModelLoader", func() {
|
||||
Expect(modelLoader.GetLoadingCount()).To(Equal(0))
|
||||
})
|
||||
})
|
||||
|
||||
Context("LRU Eviction Retry Settings", func() {
|
||||
It("should allow updating retry settings", func() {
|
||||
modelLoader.SetLRUEvictionRetrySettings(50, 2*time.Second)
|
||||
// Settings are updated - we can verify through behavior if needed
|
||||
// For now, just verify the call doesn't panic
|
||||
Expect(modelLoader).ToNot(BeNil())
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
@@ -41,6 +41,9 @@ type WatchDog struct {
|
||||
memoryReclaimerEnabled bool // Enable memory threshold monitoring
|
||||
memoryReclaimerThreshold float64 // Threshold 0.0-1.0 (e.g., 0.95 = 95%)
|
||||
watchdogInterval time.Duration
|
||||
|
||||
// Eviction settings
|
||||
forceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
|
||||
}
|
||||
|
||||
type ProcessManager interface {
|
||||
@@ -78,6 +81,7 @@ func NewWatchDog(opts ...WatchDogOption) *WatchDog {
|
||||
memoryReclaimerEnabled: o.memoryReclaimerEnabled,
|
||||
memoryReclaimerThreshold: o.memoryReclaimerThreshold,
|
||||
watchdogInterval: o.watchdogInterval,
|
||||
forceEvictionWhenBusy: o.forceEvictionWhenBusy,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -110,6 +114,13 @@ func (wd *WatchDog) GetMemoryReclaimerSettings() (enabled bool, threshold float6
|
||||
return wd.memoryReclaimerEnabled, wd.memoryReclaimerThreshold
|
||||
}
|
||||
|
||||
// SetForceEvictionWhenBusy updates the force eviction when busy setting dynamically
|
||||
func (wd *WatchDog) SetForceEvictionWhenBusy(force bool) {
|
||||
wd.Lock()
|
||||
defer wd.Unlock()
|
||||
wd.forceEvictionWhenBusy = force
|
||||
}
|
||||
|
||||
func (wd *WatchDog) Shutdown() {
|
||||
wd.Lock()
|
||||
defer wd.Unlock()
|
||||
@@ -169,13 +180,19 @@ type modelUsageInfo struct {
|
||||
lastUsed time.Time
|
||||
}
|
||||
|
||||
// EnforceLRULimitResult is the outcome of one LRU enforcement pass.
type EnforceLRULimitResult struct {
	// EvictedCount is the number of models successfully evicted.
	EvictedCount int

	// NeedMore is true if more evictions are needed but couldn't be done
	// (e.g., all models are busy).
	NeedMore bool
}
|
||||
|
||||
// EnforceLRULimit ensures we're under the LRU limit by evicting least recently used models.
|
||||
// This should be called before loading a new model.
|
||||
// pendingLoads is the number of models currently being loaded (to account for concurrent loads).
|
||||
// Returns the number of models evicted.
|
||||
func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {
|
||||
// Returns the result containing evicted count and whether more evictions are needed.
|
||||
func (wd *WatchDog) EnforceLRULimit(pendingLoads int) EnforceLRULimitResult {
|
||||
if wd.lruLimit <= 0 {
|
||||
return 0 // LRU disabled
|
||||
return EnforceLRULimitResult{EvictedCount: 0, NeedMore: false} // LRU disabled
|
||||
}
|
||||
|
||||
wd.Lock()
|
||||
@@ -186,9 +203,10 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {
|
||||
// We need: currentCount + pendingLoads + 1 <= lruLimit
|
||||
// So evict: currentCount + pendingLoads + 1 - lruLimit = currentCount - lruLimit + pendingLoads + 1
|
||||
modelsToEvict := currentCount - wd.lruLimit + pendingLoads + 1
|
||||
forceEvictionWhenBusy := wd.forceEvictionWhenBusy
|
||||
if modelsToEvict <= 0 {
|
||||
wd.Unlock()
|
||||
return 0
|
||||
return EnforceLRULimitResult{EvictedCount: 0, NeedMore: false}
|
||||
}
|
||||
|
||||
xlog.Debug("[WatchDog] LRU enforcement triggered", "current", currentCount, "pendingLoads", pendingLoads, "limit", wd.lruLimit, "toEvict", modelsToEvict)
|
||||
@@ -215,13 +233,25 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {
|
||||
|
||||
// Collect models to evict (the oldest ones)
|
||||
var modelsToShutdown []string
|
||||
for i := 0; i < modelsToEvict && i < len(models); i++ {
|
||||
evictedCount := 0
|
||||
skippedBusyCount := 0
|
||||
for i := 0; evictedCount < modelsToEvict && i < len(models); i++ {
|
||||
m := models[i]
|
||||
xlog.Info("[WatchDog] LRU evicting model", "model", m.model, "lastUsed", m.lastUsed)
|
||||
// Check if model is busy
|
||||
_, isBusy := wd.busyTime[m.address]
|
||||
if isBusy && !forceEvictionWhenBusy {
|
||||
// Skip eviction for busy models when forceEvictionWhenBusy is false
|
||||
xlog.Warn("[WatchDog] Skipping LRU eviction for busy model", "model", m.model, "reason", "model has active API calls")
|
||||
skippedBusyCount++
|
||||
continue
|
||||
}
|
||||
xlog.Info("[WatchDog] LRU evicting model", "model", m.model, "lastUsed", m.lastUsed, "busy", isBusy)
|
||||
modelsToShutdown = append(modelsToShutdown, m.model)
|
||||
// Clean up the maps while we have the lock
|
||||
wd.untrack(m.address)
|
||||
evictedCount++
|
||||
}
|
||||
needMore := evictedCount < modelsToEvict && skippedBusyCount > 0
|
||||
wd.Unlock()
|
||||
|
||||
// Now shutdown models without holding the watchdog lock to prevent deadlock
|
||||
@@ -232,7 +262,14 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {
|
||||
xlog.Debug("[WatchDog] LRU eviction complete", "model", model)
|
||||
}
|
||||
|
||||
return len(modelsToShutdown)
|
||||
if needMore {
|
||||
xlog.Warn("[WatchDog] LRU eviction incomplete", "evicted", evictedCount, "needed", modelsToEvict, "skippedBusy", skippedBusyCount, "reason", "some models are busy with active API calls")
|
||||
}
|
||||
|
||||
return EnforceLRULimitResult{
|
||||
EvictedCount: len(modelsToShutdown),
|
||||
NeedMore: needMore,
|
||||
}
|
||||
}
|
||||
|
||||
func (wd *WatchDog) Run() {
|
||||
@@ -376,6 +413,8 @@ func (wd *WatchDog) evictLRUModel() {
|
||||
return
|
||||
}
|
||||
|
||||
forceEvictionWhenBusy := wd.forceEvictionWhenBusy
|
||||
|
||||
// Build a list of models sorted by last used time (oldest first)
|
||||
var models []modelUsageInfo
|
||||
for address, model := range wd.addressModelMap {
|
||||
@@ -400,8 +439,27 @@ func (wd *WatchDog) evictLRUModel() {
|
||||
return models[i].lastUsed.Before(models[j].lastUsed)
|
||||
})
|
||||
|
||||
// Get the LRU model
|
||||
lruModel := models[0]
|
||||
// Find the first non-busy model (or first model if forceEvictionWhenBusy is true)
|
||||
var lruModel *modelUsageInfo
|
||||
for i := 0; i < len(models); i++ {
|
||||
m := models[i]
|
||||
_, isBusy := wd.busyTime[m.address]
|
||||
if isBusy && !forceEvictionWhenBusy {
|
||||
// Skip busy models when forceEvictionWhenBusy is false
|
||||
xlog.Warn("[WatchDog] Skipping memory reclaimer eviction for busy model", "model", m.model, "reason", "model has active API calls")
|
||||
continue
|
||||
}
|
||||
lruModel = &m
|
||||
break
|
||||
}
|
||||
|
||||
if lruModel == nil {
|
||||
// All models are busy and forceEvictionWhenBusy is false
|
||||
wd.Unlock()
|
||||
xlog.Warn("[WatchDog] Memory reclaimer cannot evict: all models are busy with active API calls")
|
||||
return
|
||||
}
|
||||
|
||||
xlog.Info("[WatchDog] Memory reclaimer evicting LRU model", "model", lruModel.model, "lastUsed", lruModel.lastUsed)
|
||||
|
||||
// Untrack the model
|
||||
|
||||
@@ -28,6 +28,9 @@ type WatchDogOptions struct {
|
||||
// Memory reclaimer settings (works with GPU if available, otherwise RAM)
|
||||
memoryReclaimerEnabled bool // Enable memory threshold monitoring
|
||||
memoryReclaimerThreshold float64 // Threshold 0.0-1.0 (e.g., 0.95 = 95%)
|
||||
|
||||
// Eviction settings
|
||||
forceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
|
||||
}
|
||||
|
||||
// WatchDogOption is a function that configures WatchDogOptions
|
||||
@@ -105,6 +108,14 @@ func WithMemoryReclaimerThreshold(threshold float64) WatchDogOption {
|
||||
}
|
||||
}
|
||||
|
||||
// WithForceEvictionWhenBusy sets whether to force eviction even when models have active API calls
|
||||
// Default: false (skip eviction when busy for safety)
|
||||
func WithForceEvictionWhenBusy(force bool) WatchDogOption {
|
||||
return func(o *WatchDogOptions) {
|
||||
o.forceEvictionWhenBusy = force
|
||||
}
|
||||
}
|
||||
|
||||
// DefaultWatchDogOptions returns default options for the watchdog
|
||||
func DefaultWatchDogOptions() *WatchDogOptions {
|
||||
return &WatchDogOptions{
|
||||
@@ -116,6 +127,7 @@ func DefaultWatchDogOptions() *WatchDogOptions {
|
||||
lruLimit: 0,
|
||||
memoryReclaimerEnabled: false,
|
||||
memoryReclaimerThreshold: DefaultMemoryReclaimerThreshold,
|
||||
forceEvictionWhenBusy: false, // Default: skip eviction when busy for safety
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -170,15 +170,18 @@ var _ = Describe("WatchDog", func() {
|
||||
model.WithBusyTimeout(5*time.Minute),
|
||||
model.WithIdleTimeout(15*time.Minute),
|
||||
model.WithLRULimit(2),
|
||||
model.WithForceEvictionWhenBusy(true), // Enable force eviction for these tests to match old behavior
|
||||
)
|
||||
})
|
||||
|
||||
It("should not evict when under limit", func() {
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
wd.UnMark("addr1") // Unmark to make it idle (not busy)
|
||||
|
||||
evicted := wd.EnforceLRULimit(0)
|
||||
Expect(evicted).To(Equal(0))
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.EvictedCount).To(Equal(0))
|
||||
Expect(result.NeedMore).To(BeFalse())
|
||||
Expect(pm.getShutdownCalls()).To(BeEmpty())
|
||||
})
|
||||
|
||||
@@ -186,14 +189,17 @@ var _ = Describe("WatchDog", func() {
|
||||
// Add two models
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
wd.UnMark("addr1") // Unmark to make it idle
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
wd.UnMark("addr2") // Unmark to make it idle
|
||||
|
||||
// Enforce LRU with limit of 2 (need to make room for 1 new model)
|
||||
evicted := wd.EnforceLRULimit(0)
|
||||
Expect(evicted).To(Equal(1))
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.EvictedCount).To(Equal(1))
|
||||
Expect(result.NeedMore).To(BeFalse())
|
||||
Expect(pm.getShutdownCalls()).To(ContainElement("model1")) // oldest should be evicted
|
||||
})
|
||||
|
||||
@@ -201,19 +207,23 @@ var _ = Describe("WatchDog", func() {
|
||||
// Add three models
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
wd.UnMark("addr1") // Unmark to make it idle
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
wd.UnMark("addr2") // Unmark to make it idle
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr3", "model3")
|
||||
wd.Mark("addr3")
|
||||
wd.UnMark("addr3") // Unmark to make it idle
|
||||
|
||||
// Set limit to 1, should evict 2 oldest + 1 for new = 3 evictions
|
||||
wd.SetLRULimit(1)
|
||||
evicted := wd.EnforceLRULimit(0)
|
||||
Expect(evicted).To(Equal(3))
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.EvictedCount).To(Equal(3))
|
||||
Expect(result.NeedMore).To(BeFalse())
|
||||
shutdowns := pm.getShutdownCalls()
|
||||
Expect(shutdowns).To(ContainElement("model1"))
|
||||
Expect(shutdowns).To(ContainElement("model2"))
|
||||
@@ -224,15 +234,18 @@ var _ = Describe("WatchDog", func() {
|
||||
// Add two models (at limit)
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
wd.UnMark("addr1") // Unmark to make it idle
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
wd.UnMark("addr2") // Unmark to make it idle
|
||||
|
||||
// With 1 pending load, we need to evict 2 (current=2, pending=1, new=1, limit=2)
|
||||
// total after = 2 + 1 + 1 = 4, need to evict 4 - 2 = 2
|
||||
evicted := wd.EnforceLRULimit(1)
|
||||
Expect(evicted).To(Equal(2))
|
||||
result := wd.EnforceLRULimit(1)
|
||||
Expect(result.EvictedCount).To(Equal(2))
|
||||
Expect(result.NeedMore).To(BeFalse())
|
||||
})
|
||||
|
||||
It("should not evict when LRU is disabled", func() {
|
||||
@@ -242,8 +255,9 @@ var _ = Describe("WatchDog", func() {
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.AddAddressModelMap("addr3", "model3")
|
||||
|
||||
evicted := wd.EnforceLRULimit(0)
|
||||
Expect(evicted).To(Equal(0))
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.EvictedCount).To(Equal(0))
|
||||
Expect(result.NeedMore).To(BeFalse())
|
||||
Expect(pm.getShutdownCalls()).To(BeEmpty())
|
||||
})
|
||||
|
||||
@@ -253,10 +267,12 @@ var _ = Describe("WatchDog", func() {
|
||||
// Add models with different lastUsed times
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
wd.UnMark("addr1") // Unmark to make it idle
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
wd.UnMark("addr2") // Unmark to make it idle
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
|
||||
// Touch model1 again to make it more recent
|
||||
@@ -265,10 +281,12 @@ var _ = Describe("WatchDog", func() {
|
||||
|
||||
wd.AddAddressModelMap("addr3", "model3")
|
||||
wd.Mark("addr3")
|
||||
wd.UnMark("addr3") // Unmark to make it idle
|
||||
|
||||
// Now model2 is the oldest, should be evicted first
|
||||
evicted := wd.EnforceLRULimit(0)
|
||||
Expect(evicted).To(BeNumerically(">=", 1))
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.EvictedCount).To(BeNumerically(">=", 1))
|
||||
Expect(result.NeedMore).To(BeFalse())
|
||||
|
||||
shutdowns := pm.getShutdownCalls()
|
||||
// model2 should be evicted first (it's the oldest)
|
||||
@@ -285,16 +303,19 @@ var _ = Describe("WatchDog", func() {
|
||||
model.WithBusyTimeout(5*time.Minute),
|
||||
model.WithIdleTimeout(15*time.Minute),
|
||||
model.WithLRULimit(1),
|
||||
model.WithForceEvictionWhenBusy(true), // Enable force eviction for these tests
|
||||
)
|
||||
})
|
||||
|
||||
It("should evict existing model when loading new one", func() {
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
wd.UnMark("addr1") // Unmark to make it idle
|
||||
|
||||
// With limit=1, loading a new model should evict the existing one
|
||||
evicted := wd.EnforceLRULimit(0)
|
||||
Expect(evicted).To(Equal(1))
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.EvictedCount).To(Equal(1))
|
||||
Expect(result.NeedMore).To(BeFalse())
|
||||
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
|
||||
})
|
||||
|
||||
@@ -302,6 +323,7 @@ var _ = Describe("WatchDog", func() {
|
||||
for i := 0; i < 5; i++ {
|
||||
wd.AddAddressModelMap("addr", "model")
|
||||
wd.Mark("addr")
|
||||
wd.UnMark("addr") // Unmark to make it idle
|
||||
wd.EnforceLRULimit(0)
|
||||
}
|
||||
// All previous models should have been evicted
|
||||
@@ -309,6 +331,233 @@ var _ = Describe("WatchDog", func() {
|
||||
})
|
||||
})
|
||||
|
||||
Context("Force Eviction When Busy", func() {
|
||||
BeforeEach(func() {
|
||||
wd = model.NewWatchDog(
|
||||
model.WithProcessManager(pm),
|
||||
model.WithLRULimit(2),
|
||||
model.WithForceEvictionWhenBusy(false), // Default: skip eviction when busy
|
||||
)
|
||||
})
|
||||
|
||||
It("should skip eviction for busy models when forceEvictionWhenBusy is false", func() {
|
||||
// Add two models (at limit of 2, need to evict 1 for new model)
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
wd.UnMark("addr2") // Make model2 idle
|
||||
|
||||
// Keep model1 as busy (simulating active API call)
|
||||
// model1 is already marked as busy from the first Mark call
|
||||
|
||||
// Try to enforce LRU - should skip busy model1, evict model2
|
||||
result := wd.EnforceLRULimit(0)
|
||||
// Should evict model2 (not busy) but skip model1 (busy)
|
||||
// Since we evicted 1 (which is what we needed), NeedMore should be false
|
||||
Expect(result.EvictedCount).To(Equal(1))
|
||||
Expect(result.NeedMore).To(BeFalse()) // We evicted enough, even though we skipped model1
|
||||
Expect(pm.getShutdownCalls()).To(ContainElement("model2"))
|
||||
Expect(pm.getShutdownCalls()).ToNot(ContainElement("model1"))
|
||||
})
|
||||
|
||||
It("should evict busy models when forceEvictionWhenBusy is true", func() {
|
||||
wd.SetForceEvictionWhenBusy(true)
|
||||
|
||||
// Add two models
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
|
||||
// Keep model1 as busy (already marked from first Mark call)
|
||||
|
||||
// Try to enforce LRU - should evict model1 even though busy
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.EvictedCount).To(Equal(1))
|
||||
Expect(result.NeedMore).To(BeFalse())
|
||||
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
|
||||
})
|
||||
|
||||
It("should set NeedMore when all models are busy and forceEvictionWhenBusy is false", func() {
|
||||
// Add two models
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
|
||||
// Mark both as busy
|
||||
wd.Mark("addr1")
|
||||
wd.Mark("addr2")
|
||||
|
||||
// Try to enforce LRU - should skip both busy models
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.EvictedCount).To(Equal(0))
|
||||
Expect(result.NeedMore).To(BeTrue())
|
||||
Expect(pm.getShutdownCalls()).To(BeEmpty())
|
||||
})
|
||||
|
||||
It("should allow updating forceEvictionWhenBusy dynamically", func() {
|
||||
// Start with false
|
||||
Expect(wd).ToNot(BeNil())
|
||||
|
||||
// Add models
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
wd.UnMark("addr2") // Make model2 idle
|
||||
// Keep model1 busy (already marked)
|
||||
|
||||
// With forceEvictionWhenBusy=false, should skip busy model1, evict model2
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.NeedMore).To(BeFalse()) // We evicted enough (1 model)
|
||||
Expect(result.EvictedCount).To(Equal(1)) // Should evict model2 (not busy)
|
||||
|
||||
// Now enable force eviction
|
||||
wd.SetForceEvictionWhenBusy(true)
|
||||
|
||||
// Add models again
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
// Keep model1 busy (already marked)
|
||||
|
||||
// With forceEvictionWhenBusy=true, should evict busy model1
|
||||
result = wd.EnforceLRULimit(0)
|
||||
Expect(result.NeedMore).To(BeFalse())
|
||||
Expect(result.EvictedCount).To(Equal(1))
|
||||
})
|
||||
|
||||
It("should continue to next LRU model when busy model is skipped", func() {
|
||||
// Add three models
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
wd.UnMark("addr2") // Make model2 idle
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr3", "model3")
|
||||
wd.Mark("addr3")
|
||||
wd.UnMark("addr3") // Make model3 idle
|
||||
|
||||
// Keep model1 as busy (oldest, already marked)
|
||||
|
||||
// Need to evict 2 models (limit=2, current=3, need room for 1 new)
|
||||
// Should skip model1 (busy), evict model2 and model3 (not busy)
|
||||
result := wd.EnforceLRULimit(0)
|
||||
// Should evict model2 and model3 (2 models, which is what we needed)
|
||||
Expect(result.EvictedCount).To(Equal(2))
|
||||
Expect(result.NeedMore).To(BeFalse()) // We evicted enough (2 models)
|
||||
Expect(pm.getShutdownCalls()).To(ContainElement("model2"))
|
||||
Expect(pm.getShutdownCalls()).To(ContainElement("model3"))
|
||||
})
|
||||
})
|
||||
|
||||
Context("EnforceLRULimitResult", func() {
|
||||
BeforeEach(func() {
|
||||
wd = model.NewWatchDog(
|
||||
model.WithProcessManager(pm),
|
||||
model.WithLRULimit(2),
|
||||
model.WithForceEvictionWhenBusy(false),
|
||||
)
|
||||
})
|
||||
|
||||
It("should return NeedMore=false when eviction is successful", func() {
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
wd.UnMark("addr1") // Make idle
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
wd.UnMark("addr2") // Make idle
|
||||
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.NeedMore).To(BeFalse())
|
||||
Expect(result.EvictedCount).To(Equal(1))
|
||||
})
|
||||
|
||||
It("should return NeedMore=true when not enough models can be evicted", func() {
|
||||
// Add two models (at limit of 2, need to evict 1 for new model)
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
|
||||
// Mark both as busy (keep them busy)
|
||||
// Both are already marked as busy from the Mark calls above
|
||||
|
||||
// Need to evict 1, but both are busy
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.NeedMore).To(BeTrue())
|
||||
Expect(result.EvictedCount).To(Equal(0))
|
||||
})
|
||||
|
||||
It("should return NeedMore=true when need to evict multiple but some are busy", func() {
|
||||
// Set limit to 1, add 3 models (need to evict 3: current 3 - limit 1 + 1 for the new model)
|
||||
wd.SetLRULimit(1)
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
wd.UnMark("addr2") // Make model2 idle
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr3", "model3")
|
||||
wd.Mark("addr3")
|
||||
// Keep model1 and model3 busy
|
||||
|
||||
// Need to evict 3 models, but model1 and model3 are busy, only model2 is idle
|
||||
// Should evict model2 (1 model), but NeedMore=true because we needed 3
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.EvictedCount).To(Equal(1))
|
||||
Expect(result.NeedMore).To(BeTrue())
|
||||
})
|
||||
|
||||
It("should return correct EvictedCount when some models are evicted", func() {
|
||||
// Add three models
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
wd.UnMark("addr2") // Make model2 idle
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr3", "model3")
|
||||
wd.Mark("addr3")
|
||||
wd.UnMark("addr3") // Make model3 idle
|
||||
|
||||
// Keep model1 as busy (already marked)
|
||||
|
||||
// Need to evict 2 models, but model1 is busy
|
||||
// Should evict model2 and model3 (2 models, which is what we needed)
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.EvictedCount).To(Equal(2))
|
||||
Expect(result.NeedMore).To(BeFalse()) // We evicted enough (2 models)
|
||||
})
|
||||
})
|
||||
|
||||
Context("Functional Options", func() {
|
||||
It("should use default options when none provided", func() {
|
||||
wd = model.NewWatchDog(
|
||||
@@ -331,6 +580,7 @@ var _ = Describe("WatchDog", func() {
|
||||
model.WithLRULimit(5),
|
||||
model.WithMemoryReclaimerEnabled(true),
|
||||
model.WithMemoryReclaimerThreshold(0.80),
|
||||
model.WithForceEvictionWhenBusy(true),
|
||||
)
|
||||
|
||||
Expect(wd.GetLRULimit()).To(Equal(5))
|
||||
@@ -339,5 +589,48 @@ var _ = Describe("WatchDog", func() {
|
||||
Expect(enabled).To(BeTrue())
|
||||
Expect(threshold).To(Equal(0.80))
|
||||
})
|
||||
|
||||
It("should use default forceEvictionWhenBusy (false) when not specified", func() {
|
||||
wd = model.NewWatchDog(
|
||||
model.WithProcessManager(pm),
|
||||
)
|
||||
// Default should be false - we can test this by checking behavior
|
||||
// Add a busy model and verify it's skipped
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
wd.Mark("addr1") // Keep model1 busy
|
||||
|
||||
wd.SetLRULimit(1)
|
||||
result := wd.EnforceLRULimit(0)
|
||||
// Both models are still marked busy, so neither can be evicted and NeedMore=true
|
||||
Expect(result.NeedMore).To(BeTrue())
|
||||
})
|
||||
|
||||
It("should allow setting forceEvictionWhenBusy via option", func() {
|
||||
wd = model.NewWatchDog(
|
||||
model.WithProcessManager(pm),
|
||||
model.WithLRULimit(2),
|
||||
model.WithForceEvictionWhenBusy(true),
|
||||
)
|
||||
|
||||
// Add models
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
// Keep model1 busy (already marked from first Mark call)
|
||||
|
||||
// Should evict busy model1
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.NeedMore).To(BeFalse())
|
||||
Expect(result.EvictedCount).To(Equal(1))
|
||||
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user