mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-21 23:29:04 -04:00
feat(watchdog): add size-aware LRU eviction mode (#9527)
* feat(watchdog): add size-aware LRU eviction mode When the model count hits the LRU limit or the memory reclaimer fires, evict the largest model by on-disk file size first rather than the least-recently-used one. For GGUF models the file size is a reliable proxy for GPU/RAM footprint, so evicting the largest candidate maximises freed memory per eviction round while keeping small utility models (embeddings, classifiers, rerankers) resident. Changes: - `pkg/model/watchdog.go`: add `sizeAwareEviction` flag and `modelSizes map[string]int64` to `WatchDog`; sort candidates by `sizeBytes` desc (LRU time as tiebreaker) when the flag is set; add `RegisterModelSize`, `SetSizeAwareEviction`, `GetSizeAwareEviction` - `pkg/model/watchdog_options.go`: add `WithSizeAwareEviction` option - `pkg/model/initializers.go`: stat model file after load and call `RegisterModelSize` so size data is available before the first eviction - `core/config/application_config.go`, `runtime_settings.go`: add `SizeAwareEviction` field and `WithSizeAwareEviction` app option; expose via `ToRuntimeSettings` / `ApplyRuntimeSettings` for the `POST /api/settings` live-reload path - `core/cli/run.go`: add `--size-aware-eviction` flag / `LOCALAI_SIZE_AWARE_EVICTION` env var - `core/application/startup.go`, `watchdog.go`: wire the new option through to `NewWatchDog` - `pkg/model/watchdog_test.go`: 6 new specs — option enable, dynamic toggle, largest-first ordering, equal-size LRU tiebreaker, no-size fallback to LRU, and size-map cleanup on eviction Closes #9375 Signed-off-by: supermario_leo <leo.stack@outlook.com> * refactor(watchdog): use vram estimation scaffolding for model size Replace the brittle os.Stat(modelFile) approach with a proper call to pkg/vram, which handles multi-file models (DownloadFiles, MMProj) and all weight file types, not just single GGUF files. 
- Add estimateModelSizeBytes() in core/backend/options.go that collects all weight file URIs from the model config, resolves them to file:// URIs, and calls vram.Estimate() with the shared DefaultCachedSizeResolver (15-min TTL cache avoids redundant stat calls on repeated loads) - Thread the result through via a new WithModelSizeBytes() loader option - In initializers.go, consume the pre-computed size instead of calling os.Stat; if no size was supplied (e.g. for external/router-dispatched models) the registration is simply skipped Signed-off-by: supermario_leo <leo.stack@outlook.com> * refactor(watchdog): use EstimateModel with HF fallback for size estimation Switch estimateModelSizeBytes from calling vram.Estimate directly to the unified vram.EstimateModel entry point, which adds automatic fallbacks: file-based GGUF metadata → HF API → size string. Also extract the HuggingFace repo ID from model URIs (huggingface://, hf://, https://huggingface.co/ and org/model short-form) and pass it as ModelEstimateInput.HFRepo, so models not yet downloaded locally can still get a size estimate via the HF API. Addresses @mudler's review feedback: "better to rely on EstimateModel and pass by the HF URL of the model extracted from the URI". Signed-off-by: supermario_leo <leo.stack@outlook.com> * feat(webui): add Size-Aware Eviction toggle to settings page The size-aware eviction setting was wired through the CLI flag and the RuntimeSettings live-reload path (POST /api/settings) but had no handle on the React settings page, so it could not be toggled from the UI. Add a Size-Aware Eviction toggle to the Watchdog section, next to the existing Force Eviction When Busy / LRU eviction handles. The settings page loads and saves the whole RuntimeSettings object, so the new size_aware_eviction key is picked up with no extra plumbing. Addresses @mudler's review feedback: the application config setting should land on the same UI settings page as the other handles. 
Signed-off-by: supermario_leo <leo.stack@outlook.com> --------- Signed-off-by: supermario_leo <leo.stack@outlook.com>
This commit is contained in:
@@ -644,6 +644,12 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
|
||||
options.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
|
||||
}
|
||||
}
|
||||
if settings.SizeAwareEviction != nil {
|
||||
// Only apply if current value is default (false), suggesting it wasn't set from env var
|
||||
if !options.SizeAwareEviction {
|
||||
options.SizeAwareEviction = *settings.SizeAwareEviction
|
||||
}
|
||||
}
|
||||
if settings.LRUEvictionMaxRetries != nil {
|
||||
// Only apply if current value is default (30), suggesting it wasn't set from env var
|
||||
if options.LRUEvictionMaxRetries == 0 {
|
||||
@@ -847,6 +853,7 @@ func initializeWatchdog(application *Application, options *config.ApplicationCon
|
||||
model.WithLRULimit(lruLimit),
|
||||
model.WithMemoryReclaimer(options.MemoryReclaimerEnabled, options.MemoryReclaimerThreshold),
|
||||
model.WithForceEvictionWhenBusy(options.ForceEvictionWhenBusy),
|
||||
model.WithSizeAwareEviction(options.SizeAwareEviction),
|
||||
)
|
||||
application.ModelLoader().SetWatchDog(wd)
|
||||
|
||||
|
||||
@@ -90,6 +90,7 @@ func (a *Application) startWatchdog() error {
|
||||
model.WithLRULimit(lruLimit),
|
||||
model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
|
||||
model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
|
||||
model.WithSizeAwareEviction(appConfig.SizeAwareEviction),
|
||||
)
|
||||
|
||||
// Create new stop channel BEFORE setting up any goroutines
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package backend
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"math/rand/v2"
|
||||
@@ -12,7 +13,9 @@ import (
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/trace"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
"github.com/mudler/LocalAI/pkg/downloader"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/LocalAI/pkg/vram"
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
@@ -33,6 +36,67 @@ func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, back
|
||||
})
|
||||
}
|
||||
|
||||
// estimateModelSizeBytes uses the unified EstimateModel entry point to compute
|
||||
// the total weight-file size for a model config. It collects all weight files
|
||||
// from DownloadFiles, Model, and MMProj, and also extracts the HuggingFace
|
||||
// repo ID so EstimateModel can fall back to the HF API when local file
|
||||
// metadata is unavailable (e.g. not-yet-downloaded models).
|
||||
func estimateModelSizeBytes(c config.ModelConfig, modelsPath string) int64 {
|
||||
seen := make(map[string]bool)
|
||||
input := vram.ModelEstimateInput{}
|
||||
|
||||
addFile := func(uri string) {
|
||||
if !vram.IsWeightFile(uri) {
|
||||
return
|
||||
}
|
||||
resolved := uri
|
||||
if !strings.Contains(uri, "://") {
|
||||
resolved = "file://" + filepath.Join(modelsPath, uri)
|
||||
}
|
||||
if seen[resolved] {
|
||||
return
|
||||
}
|
||||
seen[resolved] = true
|
||||
input.Files = append(input.Files, vram.FileInput{URI: resolved})
|
||||
}
|
||||
|
||||
// tryHFRepo resolves any huggingface:// or hf:// URI to an HTTPS URL and
|
||||
// then extracts the org/model repo ID for use as the HF fallback path.
|
||||
tryHFRepo := func(uri string) {
|
||||
if input.HFRepo != "" {
|
||||
return
|
||||
}
|
||||
resolved := downloader.URI(uri).ResolveURL()
|
||||
if repoID, ok := vram.ExtractHFRepoID(resolved); ok {
|
||||
input.HFRepo = repoID
|
||||
}
|
||||
}
|
||||
|
||||
for _, f := range c.DownloadFiles {
|
||||
uriStr := string(f.URI)
|
||||
addFile(uriStr)
|
||||
tryHFRepo(uriStr)
|
||||
}
|
||||
addFile(c.Model)
|
||||
tryHFRepo(c.Model)
|
||||
if c.MMProj != "" {
|
||||
addFile(c.MMProj)
|
||||
}
|
||||
|
||||
if len(input.Files) == 0 && input.HFRepo == "" {
|
||||
return 0
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
|
||||
result, err := vram.EstimateModel(ctx, input)
|
||||
if err != nil || result.SizeBytes == 0 {
|
||||
return 0
|
||||
}
|
||||
return int64(result.SizeBytes)
|
||||
}
|
||||
|
||||
func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...model.Option) []model.Option {
|
||||
defOpts := []model.Option{
|
||||
model.WithBackendString(c.Backend),
|
||||
@@ -70,6 +134,10 @@ func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...mo
|
||||
defOpts = append(defOpts, model.WithExternalBackend(k, v))
|
||||
}
|
||||
|
||||
if sizeBytes := estimateModelSizeBytes(c, so.SystemState.Model.ModelsPath); sizeBytes > 0 {
|
||||
defOpts = append(defOpts, model.WithModelSizeBytes(sizeBytes))
|
||||
}
|
||||
|
||||
return append(defOpts, opts...)
|
||||
}
|
||||
|
||||
|
||||
@@ -93,6 +93,7 @@ type RunCMD struct {
|
||||
EnableMemoryReclaimer bool `env:"LOCALAI_MEMORY_RECLAIMER,MEMORY_RECLAIMER,LOCALAI_GPU_RECLAIMER,GPU_RECLAIMER" default:"false" help:"Enable memory threshold monitoring to auto-evict backends when memory usage exceeds threshold (uses GPU VRAM if available, otherwise RAM)" group:"backends"`
|
||||
MemoryReclaimerThreshold float64 `env:"LOCALAI_MEMORY_RECLAIMER_THRESHOLD,MEMORY_RECLAIMER_THRESHOLD,LOCALAI_GPU_RECLAIMER_THRESHOLD,GPU_RECLAIMER_THRESHOLD" default:"0.95" help:"Memory usage threshold (0.0-1.0) that triggers backend eviction (default 0.95 = 95%%)" group:"backends"`
|
||||
ForceEvictionWhenBusy bool `env:"LOCALAI_FORCE_EVICTION_WHEN_BUSY,FORCE_EVICTION_WHEN_BUSY" default:"false" help:"Force eviction even when models have active API calls (default: false for safety)" group:"backends"`
|
||||
SizeAwareEviction bool `env:"LOCALAI_SIZE_AWARE_EVICTION,SIZE_AWARE_EVICTION" default:"false" help:"Evict the largest loaded model first rather than the least-recently-used one, keeping small utility models resident and maximizing freed memory per eviction" group:"backends"`
|
||||
LRUEvictionMaxRetries int `env:"LOCALAI_LRU_EVICTION_MAX_RETRIES,LRU_EVICTION_MAX_RETRIES" default:"30" help:"Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)" group:"backends"`
|
||||
LRUEvictionRetryInterval string `env:"LOCALAI_LRU_EVICTION_RETRY_INTERVAL,LRU_EVICTION_RETRY_INTERVAL" default:"1s" help:"Interval between retries when waiting for busy models to become idle (e.g., 1s, 2s) (default: 1s)" group:"backends"`
|
||||
Federated bool `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
|
||||
@@ -564,6 +565,9 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
|
||||
if r.ForceEvictionWhenBusy {
|
||||
opts = append(opts, config.WithForceEvictionWhenBusy(true))
|
||||
}
|
||||
if r.SizeAwareEviction {
|
||||
opts = append(opts, config.WithSizeAwareEviction(true))
|
||||
}
|
||||
if r.LRUEvictionMaxRetries > 0 {
|
||||
opts = append(opts, config.WithLRUEvictionMaxRetries(r.LRUEvictionMaxRetries))
|
||||
}
|
||||
|
||||
@@ -119,6 +119,7 @@ type ApplicationConfig struct {
|
||||
|
||||
// Eviction settings
|
||||
ForceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
|
||||
SizeAwareEviction bool // Evict largest models first rather than least-recently-used (default: false)
|
||||
LRUEvictionMaxRetries int // Maximum number of retries when waiting for busy models to become idle (default: 30)
|
||||
LRUEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models (default: 1s)
|
||||
|
||||
@@ -488,6 +489,16 @@ func WithForceEvictionWhenBusy(enabled bool) AppOption {
|
||||
}
|
||||
}
|
||||
|
||||
// WithSizeAwareEviction enables size-aware eviction ordering.
|
||||
// When true, the watchdog evicts the largest loaded model first rather than the
|
||||
// least-recently-used one, keeping small utility models resident and maximizing
|
||||
// memory freed per eviction.
|
||||
func WithSizeAwareEviction(enabled bool) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
o.SizeAwareEviction = enabled
|
||||
}
|
||||
}
|
||||
|
||||
// WithLRUEvictionMaxRetries sets the maximum number of retries when waiting for busy models to become idle
|
||||
func WithLRUEvictionMaxRetries(maxRetries int) AppOption {
|
||||
return func(o *ApplicationConfig) {
|
||||
@@ -1028,6 +1039,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
|
||||
memoryReclaimerEnabled := o.MemoryReclaimerEnabled
|
||||
memoryReclaimerThreshold := o.MemoryReclaimerThreshold
|
||||
forceEvictionWhenBusy := o.ForceEvictionWhenBusy
|
||||
sizeAwareEviction := o.SizeAwareEviction
|
||||
lruEvictionMaxRetries := o.LRUEvictionMaxRetries
|
||||
threads := o.Threads
|
||||
contextSize := o.ContextSize
|
||||
@@ -1120,6 +1132,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
|
||||
MemoryReclaimerEnabled: &memoryReclaimerEnabled,
|
||||
MemoryReclaimerThreshold: &memoryReclaimerThreshold,
|
||||
ForceEvictionWhenBusy: &forceEvictionWhenBusy,
|
||||
SizeAwareEviction: &sizeAwareEviction,
|
||||
LRUEvictionMaxRetries: &lruEvictionMaxRetries,
|
||||
LRUEvictionRetryInterval: &lruEvictionRetryInterval,
|
||||
Threads: &threads,
|
||||
@@ -1244,6 +1257,10 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
|
||||
o.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
|
||||
// This setting doesn't require restart, can be updated dynamically
|
||||
}
|
||||
if settings.SizeAwareEviction != nil {
|
||||
o.SizeAwareEviction = *settings.SizeAwareEviction
|
||||
// This setting doesn't require restart, can be updated dynamically
|
||||
}
|
||||
if settings.LRUEvictionMaxRetries != nil {
|
||||
o.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
|
||||
// This setting doesn't require restart, can be updated dynamically
|
||||
|
||||
@@ -28,6 +28,7 @@ type RuntimeSettings struct {
|
||||
|
||||
// Eviction settings
|
||||
ForceEvictionWhenBusy *bool `json:"force_eviction_when_busy,omitempty"` // Force eviction even when models have active API calls (default: false for safety)
|
||||
SizeAwareEviction *bool `json:"size_aware_eviction,omitempty"` // Evict largest models first rather than least-recently-used (default: false)
|
||||
LRUEvictionMaxRetries *int `json:"lru_eviction_max_retries,omitempty"` // Maximum number of retries when waiting for busy models to become idle (default: 30)
|
||||
LRUEvictionRetryInterval *string `json:"lru_eviction_retry_interval,omitempty"` // Interval between retries when waiting for busy models (e.g., 1s, 2s) (default: 1s)
|
||||
|
||||
|
||||
@@ -316,6 +316,9 @@ export default function Settings() {
|
||||
<SettingRow label="Force Eviction When Busy" description="Allow model eviction even during active API calls">
|
||||
<Toggle checked={settings.force_eviction_when_busy} onChange={(v) => update('force_eviction_when_busy', v)} />
|
||||
</SettingRow>
|
||||
<SettingRow label="Size-Aware Eviction" description="Evict the largest loaded model first instead of the least-recently-used one">
|
||||
<Toggle checked={settings.size_aware_eviction} onChange={(v) => update('size_aware_eviction', v)} />
|
||||
</SettingRow>
|
||||
<SettingRow label="LRU Eviction Max Retries" description="Maximum retries waiting for busy models before eviction">
|
||||
<input className="input" type="number" style={{ width: 120 }} value={settings.lru_eviction_max_retries ?? ''} onChange={(e) => update('lru_eviction_max_retries', parseInt(e.target.value) || 0)} placeholder="30" />
|
||||
</SettingRow>
|
||||
|
||||
@@ -159,6 +159,12 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
|
||||
return nil, fmt.Errorf("could not load model (no success): %s", res.Message)
|
||||
}
|
||||
|
||||
// Register size for size-aware eviction using the caller-supplied estimate
|
||||
// (computed via pkg/vram, which handles multi-file and non-GGUF models).
|
||||
if ml.wd != nil && o.modelSizeBytes > 0 {
|
||||
ml.wd.RegisterModelSize(modelID, o.modelSizeBytes)
|
||||
}
|
||||
|
||||
return client, nil
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,6 +19,11 @@ type Options struct {
|
||||
grpcAttempts int
|
||||
grpcAttemptsDelay int
|
||||
parallelRequests bool
|
||||
|
||||
// modelSizeBytes is the estimated total weight size in bytes, pre-computed
|
||||
// by the caller using the vram estimation scaffolding. When non-zero it is
|
||||
// registered with the watchdog so size-aware eviction can rank models.
|
||||
modelSizeBytes int64
|
||||
}
|
||||
|
||||
type Option func(*Options)
|
||||
@@ -86,6 +91,12 @@ func WithModelID(id string) Option {
|
||||
}
|
||||
}
|
||||
|
||||
func WithModelSizeBytes(bytes int64) Option {
|
||||
return func(o *Options) {
|
||||
o.modelSizeBytes = bytes
|
||||
}
|
||||
}
|
||||
|
||||
func NewOptions(opts ...Option) *Options {
|
||||
o := &Options{
|
||||
gRPCOptions: &pb.ModelOptions{},
|
||||
|
||||
@@ -46,6 +46,11 @@ type WatchDog struct {
|
||||
// Eviction settings
|
||||
forceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
|
||||
|
||||
// Size-aware eviction: sort candidates by model file size (largest first) to maximize freed memory.
|
||||
// When enabled, bigger models are evicted before smaller ones regardless of recency.
|
||||
sizeAwareEviction bool
|
||||
modelSizes map[string]int64 // modelID → file size in bytes
|
||||
|
||||
// Pinned models are excluded from idle, LRU, and memory-pressure eviction
|
||||
pinnedModels map[string]bool
|
||||
|
||||
@@ -94,6 +99,8 @@ func NewWatchDog(opts ...WatchDogOption) *WatchDog {
|
||||
memoryReclaimerThreshold: o.memoryReclaimerThreshold,
|
||||
watchdogInterval: o.watchdogInterval,
|
||||
forceEvictionWhenBusy: o.forceEvictionWhenBusy,
|
||||
sizeAwareEviction: o.sizeAwareEviction,
|
||||
modelSizes: make(map[string]int64),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -133,6 +140,31 @@ func (wd *WatchDog) SetForceEvictionWhenBusy(force bool) {
|
||||
wd.forceEvictionWhenBusy = force
|
||||
}
|
||||
|
||||
// RegisterModelSize records the on-disk file size for a model.
|
||||
// This is used by size-aware eviction to prefer evicting larger models first.
|
||||
// Call this after a model has been successfully loaded.
|
||||
func (wd *WatchDog) RegisterModelSize(modelID string, bytes int64) {
|
||||
wd.Lock()
|
||||
defer wd.Unlock()
|
||||
wd.modelSizes[modelID] = bytes
|
||||
}
|
||||
|
||||
// SetSizeAwareEviction enables or disables size-aware eviction ordering.
|
||||
// When enabled, eviction candidates are sorted by file size (largest first)
|
||||
// rather than by recency, maximizing freed memory per eviction.
|
||||
func (wd *WatchDog) SetSizeAwareEviction(enabled bool) {
|
||||
wd.Lock()
|
||||
defer wd.Unlock()
|
||||
wd.sizeAwareEviction = enabled
|
||||
}
|
||||
|
||||
// GetSizeAwareEviction returns whether size-aware eviction is enabled.
|
||||
func (wd *WatchDog) GetSizeAwareEviction() bool {
|
||||
wd.Lock()
|
||||
defer wd.Unlock()
|
||||
return wd.sizeAwareEviction
|
||||
}
|
||||
|
||||
// SetPinnedModels replaces the set of pinned model names.
|
||||
// Pinned models are excluded from idle, LRU, and memory-pressure eviction.
|
||||
func (wd *WatchDog) SetPinnedModels(models []string) {
|
||||
@@ -302,11 +334,12 @@ func (wd *WatchDog) RestoreState(state WatchDogState) {
|
||||
xlog.Info("[WatchDog] Restored model state", "modelCount", len(wd.addressModelMap))
|
||||
}
|
||||
|
||||
// modelUsageInfo holds the per-model data used to order eviction candidates.
type modelUsageInfo struct {
	address   string    // backend address the model is tracked under
	model     string    // model ID mapped to that address
	lastUsed  time.Time // last recorded use; zero value when never recorded
	sizeBytes int64     // size registered for the model; 0 if unknown
}
|
||||
|
||||
// EnforceLRULimitResult contains the result of LRU enforcement
|
||||
@@ -338,27 +371,39 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) EnforceLRULimitResult {
|
||||
return EnforceLRULimitResult{EvictedCount: 0, NeedMore: false}
|
||||
}
|
||||
|
||||
xlog.Debug("[WatchDog] LRU enforcement triggered", "current", currentCount, "pendingLoads", pendingLoads, "limit", wd.lruLimit, "toEvict", modelsToEvict)
|
||||
sizeAwareEviction := wd.sizeAwareEviction
|
||||
xlog.Debug("[WatchDog] LRU enforcement triggered", "current", currentCount, "pendingLoads", pendingLoads, "limit", wd.lruLimit, "toEvict", modelsToEvict, "sizeAware", sizeAwareEviction)
|
||||
|
||||
// Build a list of models sorted by last used time (oldest first)
|
||||
// Build a list of models to sort for eviction candidates
|
||||
var models []modelUsageInfo
|
||||
for address, model := range wd.addressModelMap {
|
||||
lastUsed := wd.lastUsed[address]
|
||||
if lastUsed.IsZero() {
|
||||
// If no lastUsed recorded, use a very old time
|
||||
lastUsed = time.Time{}
|
||||
}
|
||||
models = append(models, modelUsageInfo{
|
||||
address: address,
|
||||
model: model,
|
||||
lastUsed: lastUsed,
|
||||
address: address,
|
||||
model: model,
|
||||
lastUsed: lastUsed,
|
||||
sizeBytes: wd.modelSizes[model],
|
||||
})
|
||||
}
|
||||
|
||||
// Sort by lastUsed time (oldest first)
|
||||
slices.SortFunc(models, func(a, b modelUsageInfo) int {
|
||||
return a.lastUsed.Compare(b.lastUsed)
|
||||
})
|
||||
// Sort eviction candidates: largest-first when size-aware, oldest-first otherwise.
|
||||
// Tiebreaker in size-aware mode: oldest last-used (LRU) to break ties between
|
||||
// models of the same size.
|
||||
if sizeAwareEviction {
|
||||
slices.SortFunc(models, func(a, b modelUsageInfo) int {
|
||||
if a.sizeBytes != b.sizeBytes {
|
||||
return int(b.sizeBytes - a.sizeBytes) // largest first
|
||||
}
|
||||
return a.lastUsed.Compare(b.lastUsed) // oldest first as tiebreaker
|
||||
})
|
||||
} else {
|
||||
slices.SortFunc(models, func(a, b modelUsageInfo) int {
|
||||
return a.lastUsed.Compare(b.lastUsed)
|
||||
})
|
||||
}
|
||||
|
||||
// Collect models to evict from the front of the sorted list (largest first in size-aware mode, oldest first otherwise)
|
||||
modelsToShutdown, skippedBusyCount := wd.collectEvictionsLocked(models, modelsToEvict, forceEvictionWhenBusy)
|
||||
@@ -635,8 +680,9 @@ func (wd *WatchDog) evictLRUModel() {
|
||||
}
|
||||
|
||||
forceEvictionWhenBusy := wd.forceEvictionWhenBusy
|
||||
sizeAwareEviction := wd.sizeAwareEviction
|
||||
|
||||
// Build a list of models sorted by last used time (oldest first)
|
||||
// Build a list of models to sort for eviction candidates
|
||||
var models []modelUsageInfo
|
||||
for address, model := range wd.addressModelMap {
|
||||
lastUsed := wd.lastUsed[address]
|
||||
@@ -644,9 +690,10 @@ func (wd *WatchDog) evictLRUModel() {
|
||||
lastUsed = time.Time{}
|
||||
}
|
||||
models = append(models, modelUsageInfo{
|
||||
address: address,
|
||||
model: model,
|
||||
lastUsed: lastUsed,
|
||||
address: address,
|
||||
model: model,
|
||||
lastUsed: lastUsed,
|
||||
sizeBytes: wd.modelSizes[model],
|
||||
})
|
||||
}
|
||||
|
||||
@@ -655,10 +702,19 @@ func (wd *WatchDog) evictLRUModel() {
|
||||
return
|
||||
}
|
||||
|
||||
// Sort by lastUsed time (oldest first)
|
||||
slices.SortFunc(models, func(a, b modelUsageInfo) int {
|
||||
return a.lastUsed.Compare(b.lastUsed)
|
||||
})
|
||||
// Sort eviction candidates: largest-first when size-aware, oldest-first otherwise.
|
||||
if sizeAwareEviction {
|
||||
slices.SortFunc(models, func(a, b modelUsageInfo) int {
|
||||
if a.sizeBytes != b.sizeBytes {
|
||||
return int(b.sizeBytes - a.sizeBytes) // largest first
|
||||
}
|
||||
return a.lastUsed.Compare(b.lastUsed)
|
||||
})
|
||||
} else {
|
||||
slices.SortFunc(models, func(a, b modelUsageInfo) int {
|
||||
return a.lastUsed.Compare(b.lastUsed)
|
||||
})
|
||||
}
|
||||
|
||||
// Find the first non-busy, non-pinned model (or first non-pinned model if forceEvictionWhenBusy is true)
|
||||
var lruModel *modelUsageInfo
|
||||
@@ -702,6 +758,9 @@ func (wd *WatchDog) evictLRUModel() {
|
||||
}
|
||||
|
||||
func (wd *WatchDog) untrack(address string) {
|
||||
if modelID, ok := wd.addressModelMap[address]; ok {
|
||||
delete(wd.modelSizes, modelID)
|
||||
}
|
||||
delete(wd.busyTime, address)
|
||||
delete(wd.idleTime, address)
|
||||
delete(wd.lastUsed, address)
|
||||
|
||||
@@ -31,6 +31,9 @@ type WatchDogOptions struct {
|
||||
|
||||
// Eviction settings
|
||||
forceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
|
||||
|
||||
// Size-aware eviction: sort candidates by model file size (largest first)
|
||||
sizeAwareEviction bool
|
||||
}
|
||||
|
||||
// WatchDogOption is a function that configures WatchDogOptions
|
||||
@@ -116,6 +119,17 @@ func WithForceEvictionWhenBusy(force bool) WatchDogOption {
|
||||
}
|
||||
}
|
||||
|
||||
// WithSizeAwareEviction enables size-aware eviction ordering.
|
||||
// When true, eviction candidates are sorted by on-disk file size (largest first)
|
||||
// so that bigger models are freed before smaller ones, keeping small utility models
|
||||
// resident and maximizing the memory freed per eviction round.
|
||||
// Default: false (LRU time ordering).
|
||||
func WithSizeAwareEviction(enabled bool) WatchDogOption {
|
||||
return func(o *WatchDogOptions) {
|
||||
o.sizeAwareEviction = enabled
|
||||
}
|
||||
}
|
||||
|
||||
// DefaultWatchDogOptions returns default options for the watchdog
|
||||
func DefaultWatchDogOptions() *WatchDogOptions {
|
||||
return &WatchDogOptions{
|
||||
|
||||
@@ -917,4 +917,110 @@ var _ = Describe("WatchDog", func() {
|
||||
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
|
||||
})
|
||||
})
|
||||
|
||||
Context("Size-Aware Eviction", func() {
|
||||
BeforeEach(func() {
|
||||
wd = model.NewWatchDog(
|
||||
model.WithProcessManager(pm),
|
||||
model.WithLRULimit(2),
|
||||
model.WithForceEvictionWhenBusy(true),
|
||||
model.WithSizeAwareEviction(true),
|
||||
)
|
||||
})
|
||||
|
||||
It("should enable size-aware eviction via option", func() {
|
||||
Expect(wd.GetSizeAwareEviction()).To(BeTrue())
|
||||
})
|
||||
|
||||
It("should allow toggling size-aware eviction dynamically", func() {
|
||||
wd.SetSizeAwareEviction(false)
|
||||
Expect(wd.GetSizeAwareEviction()).To(BeFalse())
|
||||
wd.SetSizeAwareEviction(true)
|
||||
Expect(wd.GetSizeAwareEviction()).To(BeTrue())
|
||||
})
|
||||
|
||||
It("should evict the largest model first when size-aware eviction is enabled", func() {
|
||||
// Register sizes: model1=100MB, model2=400MB
|
||||
wd.RegisterModelSize("model1", 100*1024*1024)
|
||||
wd.RegisterModelSize("model2", 400*1024*1024)
|
||||
|
||||
// Add models — model1 older, model2 newer
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
wd.UnMark("addr1")
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
wd.UnMark("addr2")
|
||||
|
||||
// With limit=2 and 2 loaded, adding a 3rd triggers eviction.
|
||||
// LRU order: model1 (oldest) would be evicted first.
|
||||
// Size order: model2 (400MB) should be evicted first.
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.EvictedCount).To(Equal(1))
|
||||
Expect(result.NeedMore).To(BeFalse())
|
||||
Expect(pm.getShutdownCalls()).To(ContainElement("model2")) // largest first
|
||||
Expect(pm.getShutdownCalls()).ToNot(ContainElement("model1"))
|
||||
})
|
||||
|
||||
It("should use LRU time as tiebreaker for equal-size models", func() {
|
||||
// Register equal sizes for both models
|
||||
wd.RegisterModelSize("model1", 200*1024*1024)
|
||||
wd.RegisterModelSize("model2", 200*1024*1024)
|
||||
|
||||
// Add model1 first (older)
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
wd.UnMark("addr1")
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
|
||||
// Add model2 (newer)
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
wd.UnMark("addr2")
|
||||
|
||||
// Equal size → LRU tiebreaker: model1 (older) should be evicted
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.EvictedCount).To(Equal(1))
|
||||
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
|
||||
Expect(pm.getShutdownCalls()).ToNot(ContainElement("model2"))
|
||||
})
|
||||
|
||||
It("should fall back to LRU when no size is registered", func() {
|
||||
// No sizes registered — should behave like standard LRU
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
wd.UnMark("addr1")
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
wd.UnMark("addr2")
|
||||
|
||||
// Both have size 0 → LRU tiebreaker: model1 (older) evicted
|
||||
result := wd.EnforceLRULimit(0)
|
||||
Expect(result.EvictedCount).To(Equal(1))
|
||||
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
|
||||
})
|
||||
|
||||
It("should clean up model size on eviction", func() {
|
||||
wd.RegisterModelSize("model1", 200*1024*1024)
|
||||
|
||||
wd.AddAddressModelMap("addr1", "model1")
|
||||
wd.Mark("addr1")
|
||||
wd.UnMark("addr1")
|
||||
|
||||
wd.AddAddressModelMap("addr2", "model2")
|
||||
wd.Mark("addr2")
|
||||
wd.UnMark("addr2")
|
||||
|
||||
wd.EnforceLRULimit(0)
|
||||
|
||||
// model1 was evicted; registering a new model with the same name
|
||||
// should start from a clean state (size not inherited)
|
||||
wd.RegisterModelSize("model1", 50*1024*1024)
|
||||
// Just verifying no panic and size can be re-registered
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user