mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-22 07:39:02 -04:00
* feat(watchdog): add size-aware LRU eviction mode When the model count hits the LRU limit or the memory reclaimer fires, evict the largest model by on-disk file size first rather than the least-recently-used one. For GGUF models the file size is a reliable proxy for GPU/RAM footprint, so evicting the largest candidate maximises freed memory per eviction round while keeping small utility models (embeddings, classifiers, rerankers) resident. Changes: - `pkg/model/watchdog.go`: add `sizeAwareEviction` flag and `modelSizes map[string]int64` to `WatchDog`; sort candidates by `sizeBytes` desc (LRU time as tiebreaker) when the flag is set; add `RegisterModelSize`, `SetSizeAwareEviction`, `GetSizeAwareEviction` - `pkg/model/watchdog_options.go`: add `WithSizeAwareEviction` option - `pkg/model/initializers.go`: stat model file after load and call `RegisterModelSize` so size data is available before the first eviction - `core/config/application_config.go`, `runtime_settings.go`: add `SizeAwareEviction` field and `WithSizeAwareEviction` app option; expose via `ToRuntimeSettings` / `ApplyRuntimeSettings` for the `POST /api/settings` live-reload path - `core/cli/run.go`: add `--size-aware-eviction` flag / `LOCALAI_SIZE_AWARE_EVICTION` env var - `core/application/startup.go`, `watchdog.go`: wire the new option through to `NewWatchDog` - `pkg/model/watchdog_test.go`: 5 new specs — option enable, dynamic toggle, largest-first ordering, equal-size LRU tiebreaker, no-size fallback to LRU, and size-map cleanup on eviction Closes #9375 Signed-off-by: supermario_leo <leo.stack@outlook.com> * refactor(watchdog): use vram estimation scaffolding for model size Replace the brittle os.Stat(modelFile) approach with a proper call to pkg/vram, which handles multi-file models (DownloadFiles, MMProj) and all weight file types, not just single GGUF files. 
- Add estimateModelSizeBytes() in core/backend/options.go that collects all weight file URIs from the model config, resolves them to file:// URIs, and calls vram.Estimate() with the shared DefaultCachedSizeResolver (15-min TTL cache avoids redundant stat calls on repeated loads) - Thread the result through via a new WithModelSizeBytes() loader option - In initializers.go, consume the pre-computed size instead of calling os.Stat; if no size was supplied (e.g. for external/router-dispatched models) the registration is simply skipped Signed-off-by: supermario_leo <leo.stack@outlook.com> * refactor(watchdog): use EstimateModel with HF fallback for size estimation Switch estimateModelSizeBytes from calling vram.Estimate directly to the unified vram.EstimateModel entry point, which adds automatic fallbacks: file-based GGUF metadata → HF API → size string. Also extract the HuggingFace repo ID from model URIs (huggingface://, hf://, https://huggingface.co/ and org/model short-form) and pass it as ModelEstimateInput.HFRepo, so models not yet downloaded locally can still get a size estimate via the HF API. Addresses @mudler's review feedback: "better to rely on EstimateModel and pass by the HF URL of the model extracted from the URI". Signed-off-by: supermario_leo <leo.stack@outlook.com> * feat(webui): add Size-Aware Eviction toggle to settings page The size-aware eviction setting was wired through the CLI flag and the RuntimeSettings live-reload path (POST /api/settings) but had no handle on the React settings page, so it could not be toggled from the UI. Add a Size-Aware Eviction toggle to the Watchdog section, next to the existing Force Eviction When Busy / LRU eviction handles. The settings page loads and saves the whole RuntimeSettings object, so the new size_aware_eviction key is picked up with no extra plumbing. Addresses @mudler's review feedback: the application config setting should land on the same UI settings page as the other handles. 
Signed-off-by: supermario_leo <leo.stack@outlook.com> --------- Signed-off-by: supermario_leo <leo.stack@outlook.com>
1027 lines
31 KiB
Go
1027 lines
31 KiB
Go
package model_test
|
|
|
|
import (
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/mudler/LocalAI/pkg/model"
|
|
. "github.com/onsi/ginkgo/v2"
|
|
. "github.com/onsi/gomega"
|
|
)
|
|
|
|
// mockProcessManager is the test double handed to model.WithProcessManager.
// It records every ShutdownModel call in order and can be primed, per model
// name, with an error to return from that call.
type mockProcessManager struct {
	mu             sync.Mutex       // guards shutdownCalls
	shutdownCalls  []string         // model names in the order ShutdownModel received them
	shutdownErrors map[string]error // optional canned error per model name
}

// newMockProcessManager returns a mock with an empty call log and an empty
// (but writable) error table.
func newMockProcessManager() *mockProcessManager {
	p := new(mockProcessManager)
	p.shutdownCalls = make([]string, 0)
	p.shutdownErrors = map[string]error{}
	return p
}

// ShutdownModel appends modelName to the call log and returns the error
// configured for that name in shutdownErrors, or nil when none is configured.
func (p *mockProcessManager) ShutdownModel(modelName string) error {
	p.mu.Lock()
	defer p.mu.Unlock()

	p.shutdownCalls = append(p.shutdownCalls, modelName)
	// Indexing a map with a missing key yields the zero value (a nil error),
	// which is exactly the "no error configured" result.
	return p.shutdownErrors[modelName]
}

// getShutdownCalls returns a snapshot of the recorded model names. The result
// is a copy, so callers may inspect or modify it without holding the lock.
func (p *mockProcessManager) getShutdownCalls() []string {
	p.mu.Lock()
	defer p.mu.Unlock()

	snapshot := make([]string, 0, len(p.shutdownCalls))
	return append(snapshot, p.shutdownCalls...)
}
|
|
|
|
var _ = Describe("WatchDog", func() {
|
|
var (
|
|
wd *model.WatchDog
|
|
pm *mockProcessManager
|
|
)
|
|
|
|
BeforeEach(func() {
|
|
pm = newMockProcessManager()
|
|
})
|
|
|
|
Context("LRU Limit", func() {
|
|
It("should create watchdog with LRU limit", func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithBusyTimeout(5*time.Minute),
|
|
model.WithIdleTimeout(15*time.Minute),
|
|
model.WithLRULimit(2),
|
|
)
|
|
Expect(wd.GetLRULimit()).To(Equal(2))
|
|
})
|
|
|
|
It("should allow updating LRU limit dynamically", func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithLRULimit(2),
|
|
)
|
|
wd.SetLRULimit(5)
|
|
Expect(wd.GetLRULimit()).To(Equal(5))
|
|
})
|
|
|
|
It("should return 0 for disabled LRU", func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithLRULimit(0),
|
|
)
|
|
Expect(wd.GetLRULimit()).To(Equal(0))
|
|
})
|
|
})
|
|
|
|
Context("Memory Reclaimer Options", func() {
|
|
It("should create watchdog with memory reclaimer settings", func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithMemoryReclaimer(true, 0.85),
|
|
)
|
|
enabled, threshold := wd.GetMemoryReclaimerSettings()
|
|
Expect(enabled).To(BeTrue())
|
|
Expect(threshold).To(Equal(0.85))
|
|
})
|
|
|
|
It("should allow setting memory reclaimer via separate options", func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithMemoryReclaimerEnabled(true),
|
|
model.WithMemoryReclaimerThreshold(0.90),
|
|
)
|
|
enabled, threshold := wd.GetMemoryReclaimerSettings()
|
|
Expect(enabled).To(BeTrue())
|
|
Expect(threshold).To(Equal(0.90))
|
|
})
|
|
|
|
It("should use default threshold when not specified", func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
)
|
|
_, threshold := wd.GetMemoryReclaimerSettings()
|
|
Expect(threshold).To(Equal(model.DefaultMemoryReclaimerThreshold))
|
|
})
|
|
|
|
It("should allow updating memory reclaimer settings dynamically", func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
)
|
|
wd.SetMemoryReclaimer(true, 0.80)
|
|
enabled, threshold := wd.GetMemoryReclaimerSettings()
|
|
Expect(enabled).To(BeTrue())
|
|
Expect(threshold).To(Equal(0.80))
|
|
})
|
|
})
|
|
|
|
Context("Model Tracking", func() {
|
|
BeforeEach(func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithBusyTimeout(5*time.Minute),
|
|
model.WithIdleTimeout(15*time.Minute),
|
|
model.WithLRULimit(3),
|
|
)
|
|
})
|
|
|
|
It("should track loaded models count", func() {
|
|
Expect(wd.GetLoadedModelCount()).To(Equal(0))
|
|
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
Expect(wd.GetLoadedModelCount()).To(Equal(1))
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
Expect(wd.GetLoadedModelCount()).To(Equal(2))
|
|
})
|
|
|
|
It("should update lastUsed time on Mark", func() {
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
// The model should now have a lastUsed time set
|
|
// We can verify this indirectly through LRU eviction behavior
|
|
})
|
|
|
|
It("should update lastUsed time on UnMark", func() {
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
time.Sleep(10 * time.Millisecond)
|
|
wd.UnMark("addr1")
|
|
// The model should now have an updated lastUsed time
|
|
})
|
|
|
|
It("should update lastUsed time via UpdateLastUsed", func() {
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.UpdateLastUsed("addr1")
|
|
// Verify the time was updated
|
|
})
|
|
})
|
|
|
|
Context("EnforceLRULimit", func() {
|
|
BeforeEach(func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithBusyTimeout(5*time.Minute),
|
|
model.WithIdleTimeout(15*time.Minute),
|
|
model.WithLRULimit(2),
|
|
model.WithForceEvictionWhenBusy(true), // Enable force eviction for these tests to match old behavior
|
|
)
|
|
})
|
|
|
|
It("should not evict when under limit", func() {
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
wd.UnMark("addr1") // Unmark to make it idle (not busy)
|
|
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.EvictedCount).To(Equal(0))
|
|
Expect(result.NeedMore).To(BeFalse())
|
|
Expect(pm.getShutdownCalls()).To(BeEmpty())
|
|
})
|
|
|
|
It("should evict oldest model when at limit", func() {
|
|
// Add two models
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
wd.UnMark("addr1") // Unmark to make it idle
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2") // Unmark to make it idle
|
|
|
|
// Enforce LRU with limit of 2 (need to make room for 1 new model)
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.EvictedCount).To(Equal(1))
|
|
Expect(result.NeedMore).To(BeFalse())
|
|
Expect(pm.getShutdownCalls()).To(ContainElement("model1")) // oldest should be evicted
|
|
})
|
|
|
|
It("should evict multiple models when needed", func() {
|
|
// Add three models
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
wd.UnMark("addr1") // Unmark to make it idle
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2") // Unmark to make it idle
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr3", "model3")
|
|
wd.Mark("addr3")
|
|
wd.UnMark("addr3") // Unmark to make it idle
|
|
|
|
// Set limit to 1, should evict 2 oldest + 1 for new = 3 evictions
|
|
wd.SetLRULimit(1)
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.EvictedCount).To(Equal(3))
|
|
Expect(result.NeedMore).To(BeFalse())
|
|
shutdowns := pm.getShutdownCalls()
|
|
Expect(shutdowns).To(ContainElement("model1"))
|
|
Expect(shutdowns).To(ContainElement("model2"))
|
|
Expect(shutdowns).To(ContainElement("model3"))
|
|
})
|
|
|
|
It("should account for pending loads", func() {
|
|
// Add two models (at limit)
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
wd.UnMark("addr1") // Unmark to make it idle
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2") // Unmark to make it idle
|
|
|
|
// With 1 pending load, we need to evict 2 (current=2, pending=1, new=1, limit=2)
|
|
// total after = 2 + 1 + 1 = 4, need to evict 4 - 2 = 2
|
|
result := wd.EnforceLRULimit(1)
|
|
Expect(result.EvictedCount).To(Equal(2))
|
|
Expect(result.NeedMore).To(BeFalse())
|
|
})
|
|
|
|
It("should not evict when LRU is disabled", func() {
|
|
wd.SetLRULimit(0)
|
|
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.AddAddressModelMap("addr3", "model3")
|
|
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.EvictedCount).To(Equal(0))
|
|
Expect(result.NeedMore).To(BeFalse())
|
|
Expect(pm.getShutdownCalls()).To(BeEmpty())
|
|
})
|
|
|
|
It("should evict least recently used first", func() {
|
|
wd.SetLRULimit(2)
|
|
|
|
// Add models with different lastUsed times
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
wd.UnMark("addr1") // Unmark to make it idle
|
|
time.Sleep(20 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2") // Unmark to make it idle
|
|
time.Sleep(20 * time.Millisecond)
|
|
|
|
// Touch model1 again to make it more recent
|
|
wd.UpdateLastUsed("addr1")
|
|
time.Sleep(20 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr3", "model3")
|
|
wd.Mark("addr3")
|
|
wd.UnMark("addr3") // Unmark to make it idle
|
|
|
|
// Now model2 is the oldest, should be evicted first
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.EvictedCount).To(BeNumerically(">=", 1))
|
|
Expect(result.NeedMore).To(BeFalse())
|
|
|
|
shutdowns := pm.getShutdownCalls()
|
|
// model2 should be evicted first (it's the oldest)
|
|
if len(shutdowns) >= 1 {
|
|
Expect(shutdowns[0]).To(Equal("model2"))
|
|
}
|
|
})
|
|
})
|
|
|
|
Context("Single Backend Mode (LRU=1)", func() {
|
|
BeforeEach(func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithBusyTimeout(5*time.Minute),
|
|
model.WithIdleTimeout(15*time.Minute),
|
|
model.WithLRULimit(1),
|
|
model.WithForceEvictionWhenBusy(true), // Enable force eviction for these tests
|
|
)
|
|
})
|
|
|
|
It("should evict existing model when loading new one", func() {
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
wd.UnMark("addr1") // Unmark to make it idle
|
|
|
|
// With limit=1, loading a new model should evict the existing one
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.EvictedCount).To(Equal(1))
|
|
Expect(result.NeedMore).To(BeFalse())
|
|
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
|
|
})
|
|
|
|
It("should handle rapid model switches", func() {
|
|
for range 5 {
|
|
wd.AddAddressModelMap("addr", "model")
|
|
wd.Mark("addr")
|
|
wd.UnMark("addr") // Unmark to make it idle
|
|
wd.EnforceLRULimit(0)
|
|
}
|
|
// All previous models should have been evicted
|
|
Expect(len(pm.getShutdownCalls())).To(Equal(5))
|
|
})
|
|
})
|
|
|
|
Context("Force Eviction When Busy", func() {
|
|
BeforeEach(func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithLRULimit(2),
|
|
model.WithForceEvictionWhenBusy(false), // Default: skip eviction when busy
|
|
)
|
|
})
|
|
|
|
It("should skip eviction for busy models when forceEvictionWhenBusy is false", func() {
|
|
// Add two models (at limit of 2, need to evict 1 for new model)
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2") // Make model2 idle
|
|
|
|
// Keep model1 as busy (simulating active API call)
|
|
// model1 is already marked as busy from the first Mark call
|
|
|
|
// Try to enforce LRU - should skip busy model1, evict model2
|
|
result := wd.EnforceLRULimit(0)
|
|
// Should evict model2 (not busy) but skip model1 (busy)
|
|
// Since we evicted 1 (which is what we needed), NeedMore should be false
|
|
Expect(result.EvictedCount).To(Equal(1))
|
|
Expect(result.NeedMore).To(BeFalse()) // We evicted enough, even though we skipped model1
|
|
Expect(pm.getShutdownCalls()).To(ContainElement("model2"))
|
|
Expect(pm.getShutdownCalls()).ToNot(ContainElement("model1"))
|
|
})
|
|
|
|
It("should evict busy models when forceEvictionWhenBusy is true", func() {
|
|
wd.SetForceEvictionWhenBusy(true)
|
|
|
|
// Add two models
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
|
|
// Keep model1 as busy (already marked from first Mark call)
|
|
|
|
// Try to enforce LRU - should evict model1 even though busy
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.EvictedCount).To(Equal(1))
|
|
Expect(result.NeedMore).To(BeFalse())
|
|
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
|
|
})
|
|
|
|
It("should set NeedMore when all models are busy and forceEvictionWhenBusy is false", func() {
|
|
// Add two models
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
|
|
// Mark both as busy
|
|
wd.Mark("addr1")
|
|
wd.Mark("addr2")
|
|
|
|
// Try to enforce LRU - should skip both busy models
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.EvictedCount).To(Equal(0))
|
|
Expect(result.NeedMore).To(BeTrue())
|
|
Expect(pm.getShutdownCalls()).To(BeEmpty())
|
|
})
|
|
|
|
It("should allow updating forceEvictionWhenBusy dynamically", func() {
|
|
// Start with false
|
|
Expect(wd).ToNot(BeNil())
|
|
|
|
// Add models
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2") // Make model2 idle
|
|
// Keep model1 busy (already marked)
|
|
|
|
// With forceEvictionWhenBusy=false, should skip busy model1, evict model2
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.NeedMore).To(BeFalse()) // We evicted enough (1 model)
|
|
Expect(result.EvictedCount).To(Equal(1)) // Should evict model2 (not busy)
|
|
|
|
// Now enable force eviction
|
|
wd.SetForceEvictionWhenBusy(true)
|
|
|
|
// Add models again
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
// Keep model1 busy (already marked)
|
|
|
|
// With forceEvictionWhenBusy=true, should evict busy model1
|
|
result = wd.EnforceLRULimit(0)
|
|
Expect(result.NeedMore).To(BeFalse())
|
|
Expect(result.EvictedCount).To(Equal(1))
|
|
})
|
|
|
|
It("should continue to next LRU model when busy model is skipped", func() {
|
|
// Add three models
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2") // Make model2 idle
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr3", "model3")
|
|
wd.Mark("addr3")
|
|
wd.UnMark("addr3") // Make model3 idle
|
|
|
|
// Keep model1 as busy (oldest, already marked)
|
|
|
|
// Need to evict 2 models (limit=2, current=3, need room for 1 new)
|
|
// Should skip model1 (busy), evict model2 and model3 (not busy)
|
|
result := wd.EnforceLRULimit(0)
|
|
// Should evict model2 and model3 (2 models, which is what we needed)
|
|
Expect(result.EvictedCount).To(Equal(2))
|
|
Expect(result.NeedMore).To(BeFalse()) // We evicted enough (2 models)
|
|
Expect(pm.getShutdownCalls()).To(ContainElement("model2"))
|
|
Expect(pm.getShutdownCalls()).To(ContainElement("model3"))
|
|
})
|
|
})
|
|
|
|
Context("EnforceLRULimitResult", func() {
|
|
BeforeEach(func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithLRULimit(2),
|
|
model.WithForceEvictionWhenBusy(false),
|
|
)
|
|
})
|
|
|
|
It("should return NeedMore=false when eviction is successful", func() {
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
wd.UnMark("addr1") // Make idle
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2") // Make idle
|
|
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.NeedMore).To(BeFalse())
|
|
Expect(result.EvictedCount).To(Equal(1))
|
|
})
|
|
|
|
It("should return NeedMore=true when not enough models can be evicted", func() {
|
|
// Add two models (at limit of 2, need to evict 1 for new model)
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
|
|
// Mark both as busy (keep them busy)
|
|
// Both are already marked as busy from the Mark calls above
|
|
|
|
// Need to evict 1, but both are busy
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.NeedMore).To(BeTrue())
|
|
Expect(result.EvictedCount).To(Equal(0))
|
|
})
|
|
|
|
It("should return NeedMore=true when need to evict multiple but some are busy", func() {
|
|
// Set limit to 1, add 3 models (need to evict 3: 3 loaded + 1 new - limit of 1)
|
|
wd.SetLRULimit(1)
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2") // Make model2 idle
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr3", "model3")
|
|
wd.Mark("addr3")
|
|
// Keep model1 and model3 busy
|
|
|
|
// Need to evict 3 models, but model1 and model3 are busy, only model2 is idle
|
|
// Should evict model2 (1 model), but NeedMore=true because we needed 3
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.EvictedCount).To(Equal(1))
|
|
Expect(result.NeedMore).To(BeTrue())
|
|
})
|
|
|
|
It("should return correct EvictedCount when some models are evicted", func() {
|
|
// Add three models
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2") // Make model2 idle
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr3", "model3")
|
|
wd.Mark("addr3")
|
|
wd.UnMark("addr3") // Make model3 idle
|
|
|
|
// Keep model1 as busy (already marked)
|
|
|
|
// Need to evict 2 models, but model1 is busy
|
|
// Should evict model2 and model3 (2 models, which is what we needed)
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.EvictedCount).To(Equal(2))
|
|
Expect(result.NeedMore).To(BeFalse()) // We evicted enough (2 models)
|
|
})
|
|
})
|
|
|
|
Context("Pinned Models", func() {
|
|
It("should set and get pinned models", func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
)
|
|
Expect(wd.IsModelPinned("model1")).To(BeFalse())
|
|
|
|
wd.SetPinnedModels([]string{"model1", "model2"})
|
|
Expect(wd.IsModelPinned("model1")).To(BeTrue())
|
|
Expect(wd.IsModelPinned("model2")).To(BeTrue())
|
|
Expect(wd.IsModelPinned("model3")).To(BeFalse())
|
|
})
|
|
|
|
It("should replace pinned models on subsequent calls", func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
)
|
|
wd.SetPinnedModels([]string{"model1"})
|
|
Expect(wd.IsModelPinned("model1")).To(BeTrue())
|
|
|
|
wd.SetPinnedModels([]string{"model2"})
|
|
Expect(wd.IsModelPinned("model1")).To(BeFalse())
|
|
Expect(wd.IsModelPinned("model2")).To(BeTrue())
|
|
})
|
|
|
|
It("should skip pinned models during LRU eviction", func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithLRULimit(2),
|
|
model.WithForceEvictionWhenBusy(true),
|
|
)
|
|
|
|
// Add two models, pin the older one
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
wd.UnMark("addr1")
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2")
|
|
|
|
wd.SetPinnedModels([]string{"model1"})
|
|
|
|
// Enforce LRU - model1 is oldest but pinned, model2 should be evicted
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.EvictedCount).To(Equal(1))
|
|
Expect(pm.getShutdownCalls()).To(ContainElement("model2"))
|
|
Expect(pm.getShutdownCalls()).ToNot(ContainElement("model1"))
|
|
})
|
|
|
|
It("should not evict any model when all are pinned and LRU limit reached", func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithLRULimit(1),
|
|
model.WithForceEvictionWhenBusy(true),
|
|
)
|
|
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
wd.UnMark("addr1")
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2")
|
|
|
|
wd.SetPinnedModels([]string{"model1", "model2"})
|
|
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.EvictedCount).To(Equal(0))
|
|
Expect(pm.getShutdownCalls()).To(BeEmpty())
|
|
})
|
|
|
|
It("should skip pinned models during idle check", func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithIdleTimeout(10*time.Millisecond),
|
|
model.WithIdleCheck(true),
|
|
model.WithWatchdogInterval(50*time.Millisecond),
|
|
)
|
|
|
|
// Add two models and make them idle
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
wd.UnMark("addr1")
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2")
|
|
|
|
// Pin model1
|
|
wd.SetPinnedModels([]string{"model1"})
|
|
|
|
// Start watchdog and wait for idle check
|
|
go wd.Run()
|
|
defer wd.Shutdown()
|
|
|
|
// Wait for the idle timeout + watchdog interval to pass
|
|
time.Sleep(200 * time.Millisecond)
|
|
|
|
// Only model2 should be shut down
|
|
shutdowns := pm.getShutdownCalls()
|
|
Expect(shutdowns).To(ContainElement("model2"))
|
|
Expect(shutdowns).ToNot(ContainElement("model1"))
|
|
})
|
|
})
|
|
|
|
Context("Concurrency Groups", func() {
|
|
Describe("ReplaceModelGroups / GetModelGroups", func() {
|
|
It("returns nil for unknown models", func() {
|
|
wd = model.NewWatchDog(model.WithProcessManager(pm))
|
|
Expect(wd.GetModelGroups("nope")).To(BeNil())
|
|
})
|
|
|
|
It("stores and retrieves groups", func() {
|
|
wd = model.NewWatchDog(model.WithProcessManager(pm))
|
|
wd.ReplaceModelGroups(map[string][]string{
|
|
"a": {"heavy", "vision"},
|
|
"b": {"heavy"},
|
|
})
|
|
Expect(wd.GetModelGroups("a")).To(Equal([]string{"heavy", "vision"}))
|
|
Expect(wd.GetModelGroups("b")).To(Equal([]string{"heavy"}))
|
|
Expect(wd.GetModelGroups("c")).To(BeNil())
|
|
})
|
|
|
|
It("replaces previous state on subsequent calls", func() {
|
|
wd = model.NewWatchDog(model.WithProcessManager(pm))
|
|
wd.ReplaceModelGroups(map[string][]string{"a": {"heavy"}})
|
|
wd.ReplaceModelGroups(map[string][]string{"b": {"vision"}})
|
|
Expect(wd.GetModelGroups("a")).To(BeNil())
|
|
Expect(wd.GetModelGroups("b")).To(Equal([]string{"vision"}))
|
|
})
|
|
|
|
It("clears state when called with an empty map", func() {
|
|
wd = model.NewWatchDog(model.WithProcessManager(pm))
|
|
wd.ReplaceModelGroups(map[string][]string{"a": {"heavy"}})
|
|
wd.ReplaceModelGroups(nil)
|
|
Expect(wd.GetModelGroups("a")).To(BeNil())
|
|
})
|
|
|
|
It("returns a defensive copy", func() {
|
|
wd = model.NewWatchDog(model.WithProcessManager(pm))
|
|
wd.ReplaceModelGroups(map[string][]string{"a": {"heavy"}})
|
|
got := wd.GetModelGroups("a")
|
|
got[0] = "tampered"
|
|
Expect(wd.GetModelGroups("a")).To(Equal([]string{"heavy"}))
|
|
})
|
|
})
|
|
|
|
Describe("EnforceGroupExclusivity", func() {
|
|
It("is a no-op when the requested model has no groups", func() {
|
|
wd = model.NewWatchDog(model.WithProcessManager(pm))
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
|
|
result := wd.EnforceGroupExclusivity("requested")
|
|
Expect(result.EvictedCount).To(Equal(0))
|
|
Expect(result.NeedMore).To(BeFalse())
|
|
Expect(pm.getShutdownCalls()).To(BeEmpty())
|
|
})
|
|
|
|
It("is a no-op when no loaded model shares a group", func() {
|
|
wd = model.NewWatchDog(model.WithProcessManager(pm))
|
|
wd.ReplaceModelGroups(map[string][]string{
|
|
"loaded": {"vision"},
|
|
"requested": {"heavy"},
|
|
})
|
|
wd.AddAddressModelMap("addr1", "loaded")
|
|
|
|
result := wd.EnforceGroupExclusivity("requested")
|
|
Expect(result.EvictedCount).To(Equal(0))
|
|
Expect(result.NeedMore).To(BeFalse())
|
|
Expect(pm.getShutdownCalls()).To(BeEmpty())
|
|
})
|
|
|
|
It("evicts a loaded model that shares a single group", func() {
|
|
wd = model.NewWatchDog(model.WithProcessManager(pm))
|
|
wd.ReplaceModelGroups(map[string][]string{
|
|
"a": {"heavy"},
|
|
"b": {"heavy"},
|
|
})
|
|
wd.AddAddressModelMap("addrA", "a")
|
|
wd.Mark("addrA")
|
|
wd.UnMark("addrA")
|
|
|
|
result := wd.EnforceGroupExclusivity("b")
|
|
Expect(result.EvictedCount).To(Equal(1))
|
|
Expect(result.NeedMore).To(BeFalse())
|
|
Expect(pm.getShutdownCalls()).To(ConsistOf("a"))
|
|
})
|
|
|
|
It("evicts when groups overlap on any single name", func() {
|
|
wd = model.NewWatchDog(model.WithProcessManager(pm))
|
|
wd.ReplaceModelGroups(map[string][]string{
|
|
"a": {"x", "y"},
|
|
"b": {"y", "z"},
|
|
})
|
|
wd.AddAddressModelMap("addrA", "a")
|
|
wd.Mark("addrA")
|
|
wd.UnMark("addrA")
|
|
|
|
result := wd.EnforceGroupExclusivity("b")
|
|
Expect(result.EvictedCount).To(Equal(1))
|
|
Expect(pm.getShutdownCalls()).To(ConsistOf("a"))
|
|
})
|
|
|
|
It("evicts every conflicting loaded model", func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithForceEvictionWhenBusy(true),
|
|
)
|
|
wd.ReplaceModelGroups(map[string][]string{
|
|
"a": {"heavy"},
|
|
"b": {"heavy"},
|
|
"c": {"heavy"},
|
|
})
|
|
wd.AddAddressModelMap("addrA", "a")
|
|
wd.Mark("addrA")
|
|
wd.UnMark("addrA")
|
|
wd.AddAddressModelMap("addrB", "b")
|
|
wd.Mark("addrB")
|
|
wd.UnMark("addrB")
|
|
|
|
result := wd.EnforceGroupExclusivity("c")
|
|
Expect(result.EvictedCount).To(Equal(2))
|
|
Expect(pm.getShutdownCalls()).To(ConsistOf("a", "b"))
|
|
})
|
|
|
|
It("skips a pinned conflicting model and reports NeedMore", func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithForceEvictionWhenBusy(true),
|
|
)
|
|
wd.ReplaceModelGroups(map[string][]string{
|
|
"a": {"heavy"},
|
|
"b": {"heavy"},
|
|
})
|
|
wd.SetPinnedModels([]string{"a"})
|
|
wd.AddAddressModelMap("addrA", "a")
|
|
wd.Mark("addrA")
|
|
wd.UnMark("addrA")
|
|
|
|
result := wd.EnforceGroupExclusivity("b")
|
|
Expect(result.EvictedCount).To(Equal(0))
|
|
Expect(result.NeedMore).To(BeTrue())
|
|
Expect(pm.getShutdownCalls()).To(BeEmpty())
|
|
})
|
|
|
|
It("skips a busy conflict when forceEvictionWhenBusy is false", func() {
|
|
wd = model.NewWatchDog(model.WithProcessManager(pm))
|
|
wd.ReplaceModelGroups(map[string][]string{
|
|
"a": {"heavy"},
|
|
"b": {"heavy"},
|
|
})
|
|
wd.AddAddressModelMap("addrA", "a")
|
|
wd.Mark("addrA") // leave busy
|
|
|
|
result := wd.EnforceGroupExclusivity("b")
|
|
Expect(result.EvictedCount).To(Equal(0))
|
|
Expect(result.NeedMore).To(BeTrue())
|
|
Expect(pm.getShutdownCalls()).To(BeEmpty())
|
|
})
|
|
|
|
It("evicts a busy conflict when forceEvictionWhenBusy is true", func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithForceEvictionWhenBusy(true),
|
|
)
|
|
wd.ReplaceModelGroups(map[string][]string{
|
|
"a": {"heavy"},
|
|
"b": {"heavy"},
|
|
})
|
|
wd.AddAddressModelMap("addrA", "a")
|
|
wd.Mark("addrA") // leave busy
|
|
|
|
result := wd.EnforceGroupExclusivity("b")
|
|
Expect(result.EvictedCount).To(Equal(1))
|
|
Expect(result.NeedMore).To(BeFalse())
|
|
Expect(pm.getShutdownCalls()).To(ConsistOf("a"))
|
|
})
|
|
})
|
|
})
|
|
|
|
// Specs covering the functional options accepted by model.NewWatchDog.
Context("Functional Options", func() {
	// loadBusy registers name at addr and marks the address without a
	// matching UnMark, so the model stays marked for the rest of the spec.
	loadBusy := func(addr, name string) {
		wd.AddAddressModelMap(addr, name)
		wd.Mark(addr)
	}

	// A watchdog built with only a process manager reports an LRU limit of 0,
	// a disabled memory reclaimer, and the package default threshold.
	It("should use default options when none provided", func() {
		wd = model.NewWatchDog(model.WithProcessManager(pm))

		Expect(wd.GetLRULimit()).To(Equal(0))

		reclaimerOn, reclaimerThreshold := wd.GetMemoryReclaimerSettings()
		Expect(reclaimerOn).To(BeFalse())
		Expect(reclaimerThreshold).To(Equal(model.DefaultMemoryReclaimerThreshold))
	})

	// Several options passed together are all applied; the LRU limit and the
	// memory reclaimer settings are read back to confirm it.
	It("should allow combining multiple options", func() {
		wd = model.NewWatchDog(
			model.WithProcessManager(pm),
			model.WithBusyTimeout(10*time.Minute),
			model.WithIdleTimeout(30*time.Minute),
			model.WithBusyCheck(true),
			model.WithIdleCheck(true),
			model.WithLRULimit(5),
			model.WithMemoryReclaimerEnabled(true),
			model.WithMemoryReclaimerThreshold(0.80),
			model.WithForceEvictionWhenBusy(true),
		)

		Expect(wd.GetLRULimit()).To(Equal(5))

		reclaimerOn, reclaimerThreshold := wd.GetMemoryReclaimerSettings()
		Expect(reclaimerOn).To(BeTrue())
		Expect(reclaimerThreshold).To(Equal(0.80))
	})

	// The default for forceEvictionWhenBusy is checked indirectly through
	// eviction behavior rather than through a getter.
	It("should use default forceEvictionWhenBusy (false) when not specified", func() {
		wd = model.NewWatchDog(model.WithProcessManager(pm))

		loadBusy("addr1", "model1")
		time.Sleep(10 * time.Millisecond)

		loadBusy("addr2", "model2")
		wd.Mark("addr1") // mark model1 again so it remains busy

		// Two models are registered and neither address has been unmarked.
		// Lowering the limit to 1 must leave the watchdog reporting that
		// more evictions are still needed.
		wd.SetLRULimit(1)
		outcome := wd.EnforceLRULimit(0)
		Expect(outcome.NeedMore).To(BeTrue())
	})

	// With the option enabled, a model that is still marked can be evicted.
	It("should allow setting forceEvictionWhenBusy via option", func() {
		wd = model.NewWatchDog(
			model.WithProcessManager(pm),
			model.WithLRULimit(2),
			model.WithForceEvictionWhenBusy(true),
		)

		// model1 is registered first and is never unmarked.
		loadBusy("addr1", "model1")
		time.Sleep(10 * time.Millisecond)

		loadBusy("addr2", "model2")

		// The still-marked model1 is expected to be shut down.
		outcome := wd.EnforceLRULimit(0)
		Expect(outcome.NeedMore).To(BeFalse())
		Expect(outcome.EvictedCount).To(Equal(1))
		Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
	})
})
|
|
|
|
Context("Size-Aware Eviction", func() {
|
|
BeforeEach(func() {
|
|
wd = model.NewWatchDog(
|
|
model.WithProcessManager(pm),
|
|
model.WithLRULimit(2),
|
|
model.WithForceEvictionWhenBusy(true),
|
|
model.WithSizeAwareEviction(true),
|
|
)
|
|
})
|
|
|
|
It("should enable size-aware eviction via option", func() {
|
|
Expect(wd.GetSizeAwareEviction()).To(BeTrue())
|
|
})
|
|
|
|
It("should allow toggling size-aware eviction dynamically", func() {
|
|
wd.SetSizeAwareEviction(false)
|
|
Expect(wd.GetSizeAwareEviction()).To(BeFalse())
|
|
wd.SetSizeAwareEviction(true)
|
|
Expect(wd.GetSizeAwareEviction()).To(BeTrue())
|
|
})
|
|
|
|
It("should evict the largest model first when size-aware eviction is enabled", func() {
|
|
// Register sizes: model1=100MB, model2=400MB
|
|
wd.RegisterModelSize("model1", 100*1024*1024)
|
|
wd.RegisterModelSize("model2", 400*1024*1024)
|
|
|
|
// Add models — model1 older, model2 newer
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
wd.UnMark("addr1")
|
|
time.Sleep(10 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2")
|
|
|
|
// With limit=2 and 2 loaded, adding a 3rd triggers eviction.
|
|
// LRU order: model1 (oldest) would be evicted first.
|
|
// Size order: model2 (400MB) should be evicted first.
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.EvictedCount).To(Equal(1))
|
|
Expect(result.NeedMore).To(BeFalse())
|
|
Expect(pm.getShutdownCalls()).To(ContainElement("model2")) // largest first
|
|
Expect(pm.getShutdownCalls()).ToNot(ContainElement("model1"))
|
|
})
|
|
|
|
It("should use LRU time as tiebreaker for equal-size models", func() {
|
|
// Register equal sizes for both models
|
|
wd.RegisterModelSize("model1", 200*1024*1024)
|
|
wd.RegisterModelSize("model2", 200*1024*1024)
|
|
|
|
// Add model1 first (older)
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
wd.UnMark("addr1")
|
|
time.Sleep(20 * time.Millisecond)
|
|
|
|
// Add model2 (newer)
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2")
|
|
|
|
// Equal size → LRU tiebreaker: model1 (older) should be evicted
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.EvictedCount).To(Equal(1))
|
|
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
|
|
Expect(pm.getShutdownCalls()).ToNot(ContainElement("model2"))
|
|
})
|
|
|
|
It("should fall back to LRU when no size is registered", func() {
|
|
// No sizes registered — should behave like standard LRU
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
wd.UnMark("addr1")
|
|
time.Sleep(20 * time.Millisecond)
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2")
|
|
|
|
// Both have size 0 → LRU tiebreaker: model1 (older) evicted
|
|
result := wd.EnforceLRULimit(0)
|
|
Expect(result.EvictedCount).To(Equal(1))
|
|
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
|
|
})
|
|
|
|
It("should clean up model size on eviction", func() {
|
|
wd.RegisterModelSize("model1", 200*1024*1024)
|
|
|
|
wd.AddAddressModelMap("addr1", "model1")
|
|
wd.Mark("addr1")
|
|
wd.UnMark("addr1")
|
|
|
|
wd.AddAddressModelMap("addr2", "model2")
|
|
wd.Mark("addr2")
|
|
wd.UnMark("addr2")
|
|
|
|
wd.EnforceLRULimit(0)
|
|
|
|
// model1 was evicted; registering a new model with the same name
|
|
// should start from a clean state (size not inherited)
|
|
wd.RegisterModelSize("model1", 50*1024*1024)
|
|
// Just verifying no panic and size can be re-registered
|
|
})
|
|
})
|
|
})
|