mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-22 07:39:02 -04:00
* feat(watchdog): add size-aware LRU eviction mode When the model count hits the LRU limit or the memory reclaimer fires, evict the largest model by on-disk file size first rather than the least-recently-used one. For GGUF models the file size is a reliable proxy for GPU/RAM footprint, so evicting the largest candidate maximises freed memory per eviction round while keeping small utility models (embeddings, classifiers, rerankers) resident. Changes: - `pkg/model/watchdog.go`: add `sizeAwareEviction` flag and `modelSizes map[string]int64` to `WatchDog`; sort candidates by `sizeBytes` desc (LRU time as tiebreaker) when the flag is set; add `RegisterModelSize`, `SetSizeAwareEviction`, `GetSizeAwareEviction` - `pkg/model/watchdog_options.go`: add `WithSizeAwareEviction` option - `pkg/model/initializers.go`: stat model file after load and call `RegisterModelSize` so size data is available before the first eviction - `core/config/application_config.go`, `runtime_settings.go`: add `SizeAwareEviction` field and `WithSizeAwareEviction` app option; expose via `ToRuntimeSettings` / `ApplyRuntimeSettings` for the `POST /api/settings` live-reload path - `core/cli/run.go`: add `--size-aware-eviction` flag / `LOCALAI_SIZE_AWARE_EVICTION` env var - `core/application/startup.go`, `watchdog.go`: wire the new option through to `NewWatchDog` - `pkg/model/watchdog_test.go`: 5 new specs — option enable, dynamic toggle, largest-first ordering, equal-size LRU tiebreaker, no-size fallback to LRU, and size-map cleanup on eviction Closes #9375 Signed-off-by: supermario_leo <leo.stack@outlook.com> * refactor(watchdog): use vram estimation scaffolding for model size Replace the brittle os.Stat(modelFile) approach with a proper call to pkg/vram, which handles multi-file models (DownloadFiles, MMProj) and all weight file types, not just single GGUF files. - Add estimateModelSizeBytes() in core/backend/options.go that collects all weight file URIs from the model config, resolves them to file:// URIs, and calls vram.Estimate() with the shared DefaultCachedSizeResolver (15-min TTL cache avoids redundant stat calls on repeated loads) - Thread the result through via a new WithModelSizeBytes() loader option - In initializers.go, consume the pre-computed size instead of calling os.Stat; if no size was supplied (e.g. for external/router-dispatched models) the registration is simply skipped Signed-off-by: supermario_leo <leo.stack@outlook.com> * refactor(watchdog): use EstimateModel with HF fallback for size estimation Switch estimateModelSizeBytes from calling vram.Estimate directly to the unified vram.EstimateModel entry point, which adds automatic fallbacks: file-based GGUF metadata → HF API → size string. Also extract the HuggingFace repo ID from model URIs (huggingface://, hf://, https://huggingface.co/ and org/model short-form) and pass it as ModelEstimateInput.HFRepo, so models not yet downloaded locally can still get a size estimate via the HF API. Addresses @mudler's review feedback: "better to rely on EstimateModel and pass by the HF URL of the model extracted from the URI". Signed-off-by: supermario_leo <leo.stack@outlook.com> * feat(webui): add Size-Aware Eviction toggle to settings page The size-aware eviction setting was wired through the CLI flag and the RuntimeSettings live-reload path (POST /api/settings) but had no handle on the React settings page, so it could not be toggled from the UI. Add a Size-Aware Eviction toggle to the Watchdog section, next to the existing Force Eviction When Busy / LRU eviction handles. The settings page loads and saves the whole RuntimeSettings object, so the new size_aware_eviction key is picked up with no extra plumbing. Addresses @mudler's review feedback: the application config setting should land on the same UI settings page as the other handles. Signed-off-by: supermario_leo <leo.stack@outlook.com> --------- Signed-off-by: supermario_leo <leo.stack@outlook.com>
194 lines
6.2 KiB
Go
194 lines
6.2 KiB
Go
package application
|
|
|
|
import (
|
|
"github.com/mudler/LocalAI/core/config"
|
|
"github.com/mudler/LocalAI/pkg/model"
|
|
"github.com/mudler/xlog"
|
|
)
|
|
|
|
// SyncPinnedModelsToWatchdog reads pinned status from all model configs and updates the watchdog
|
|
func (a *Application) SyncPinnedModelsToWatchdog() {
|
|
cl := a.ModelConfigLoader()
|
|
if cl == nil {
|
|
return
|
|
}
|
|
wd := a.modelLoader.GetWatchDog()
|
|
if wd == nil {
|
|
return
|
|
}
|
|
configs := cl.GetAllModelsConfigs()
|
|
var pinned []string
|
|
for _, cfg := range configs {
|
|
if cfg.IsPinned() {
|
|
pinned = append(pinned, cfg.Name)
|
|
}
|
|
}
|
|
wd.SetPinnedModels(pinned)
|
|
xlog.Debug("Synced pinned models to watchdog", "count", len(pinned))
|
|
}
|
|
|
|
// SyncModelGroupsToWatchdog reads concurrency_groups from all model configs and
|
|
// updates the watchdog so EnforceGroupExclusivity has the current view.
|
|
func (a *Application) SyncModelGroupsToWatchdog() {
|
|
cl := a.ModelConfigLoader()
|
|
if cl == nil {
|
|
return
|
|
}
|
|
wd := a.modelLoader.GetWatchDog()
|
|
if wd == nil {
|
|
return
|
|
}
|
|
groups := extractModelGroupsFromConfigs(cl.GetAllModelsConfigs())
|
|
wd.ReplaceModelGroups(groups)
|
|
xlog.Debug("Synced concurrency groups to watchdog", "count", len(groups))
|
|
}
|
|
|
|
// extractModelGroupsFromConfigs builds the model→groups map the watchdog
|
|
// expects. Disabled models are skipped — their declared groups should not
|
|
// block other models from loading.
|
|
func extractModelGroupsFromConfigs(configs []config.ModelConfig) map[string][]string {
|
|
out := make(map[string][]string)
|
|
for _, cfg := range configs {
|
|
if cfg.IsDisabled() {
|
|
continue
|
|
}
|
|
gs := cfg.GetConcurrencyGroups()
|
|
if len(gs) == 0 {
|
|
continue
|
|
}
|
|
out[cfg.Name] = gs
|
|
}
|
|
return out
|
|
}
|
|
|
|
func (a *Application) StopWatchdog() error {
|
|
if a.watchdogStop != nil {
|
|
close(a.watchdogStop)
|
|
a.watchdogStop = nil
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// startWatchdog starts the watchdog with current ApplicationConfig settings
|
|
// This is an internal method that assumes the caller holds the watchdogMutex
|
|
func (a *Application) startWatchdog() error {
|
|
appConfig := a.ApplicationConfig()
|
|
|
|
// Get effective max active backends (considers both MaxActiveBackends and deprecated SingleBackend)
|
|
lruLimit := appConfig.GetEffectiveMaxActiveBackends()
|
|
|
|
// Create watchdog if enabled OR if LRU limit is set OR if memory reclaimer is enabled
|
|
// LRU eviction requires watchdog infrastructure even without busy/idle checks
|
|
if appConfig.WatchDog || lruLimit > 0 || appConfig.MemoryReclaimerEnabled {
|
|
wd := model.NewWatchDog(
|
|
model.WithProcessManager(a.modelLoader),
|
|
model.WithBusyTimeout(appConfig.WatchDogBusyTimeout),
|
|
model.WithIdleTimeout(appConfig.WatchDogIdleTimeout),
|
|
model.WithWatchdogInterval(appConfig.WatchDogInterval),
|
|
model.WithBusyCheck(appConfig.WatchDogBusy),
|
|
model.WithIdleCheck(appConfig.WatchDogIdle),
|
|
model.WithLRULimit(lruLimit),
|
|
model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
|
|
model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
|
|
model.WithSizeAwareEviction(appConfig.SizeAwareEviction),
|
|
)
|
|
|
|
// Create new stop channel BEFORE setting up any goroutines
|
|
// This prevents race conditions where the old shutdown handler might
|
|
// receive the closed channel and try to shut down the new watchdog
|
|
a.watchdogStop = make(chan bool, 1)
|
|
|
|
// Set the watchdog on the model loader
|
|
a.modelLoader.SetWatchDog(wd)
|
|
|
|
// Sync pinned models and concurrency groups from config to the watchdog
|
|
a.SyncPinnedModelsToWatchdog()
|
|
a.SyncModelGroupsToWatchdog()
|
|
|
|
// Start watchdog goroutine if any periodic checks are enabled
|
|
// LRU eviction doesn't need the Run() loop - it's triggered on model load
|
|
// But memory reclaimer needs the Run() loop for periodic checking
|
|
if appConfig.WatchDogBusy || appConfig.WatchDogIdle || appConfig.MemoryReclaimerEnabled {
|
|
go wd.Run()
|
|
}
|
|
|
|
// Setup shutdown handler - this goroutine will wait on a.watchdogStop
|
|
// which is now a fresh channel, so it won't receive any stale signals
|
|
// Note: We capture wd in a local variable to ensure this handler operates
|
|
// on the correct watchdog instance (not a later one that gets assigned to wd)
|
|
wdForShutdown := wd
|
|
go func() {
|
|
select {
|
|
case <-a.watchdogStop:
|
|
xlog.Debug("Watchdog stop signal received")
|
|
wdForShutdown.Shutdown()
|
|
case <-appConfig.Context.Done():
|
|
xlog.Debug("Context canceled, shutting down watchdog")
|
|
wdForShutdown.Shutdown()
|
|
}
|
|
}()
|
|
|
|
xlog.Info("Watchdog started with new settings", "lruLimit", lruLimit, "busyCheck", appConfig.WatchDogBusy, "idleCheck", appConfig.WatchDogIdle, "memoryReclaimer", appConfig.MemoryReclaimerEnabled, "memoryThreshold", appConfig.MemoryReclaimerThreshold, "interval", appConfig.WatchDogInterval)
|
|
} else {
|
|
xlog.Info("Watchdog disabled")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// StartWatchdog starts the watchdog with current ApplicationConfig settings
|
|
func (a *Application) StartWatchdog() error {
|
|
a.watchdogMutex.Lock()
|
|
defer a.watchdogMutex.Unlock()
|
|
|
|
return a.startWatchdog()
|
|
}
|
|
|
|
// RestartWatchdog restarts the watchdog with current ApplicationConfig settings
|
|
func (a *Application) RestartWatchdog() error {
|
|
a.watchdogMutex.Lock()
|
|
defer a.watchdogMutex.Unlock()
|
|
|
|
// Get the old watchdog before we shut it down
|
|
oldWD := a.modelLoader.GetWatchDog()
|
|
|
|
// Get the state from the old watchdog before shutting it down
|
|
// This preserves information about loaded models
|
|
var oldState model.WatchDogState
|
|
if oldWD != nil {
|
|
oldState = oldWD.GetState()
|
|
}
|
|
|
|
// Signal all handlers to stop by closing the stop channel
|
|
// This will cause any goroutine waiting on <-a.watchdogStop to unblock
|
|
if a.watchdogStop != nil {
|
|
close(a.watchdogStop)
|
|
a.watchdogStop = nil
|
|
}
|
|
|
|
// Shutdown existing watchdog - this triggers the stop signal
|
|
if oldWD != nil {
|
|
oldWD.Shutdown()
|
|
// Wait for the old watchdog's Run() goroutine to fully shut down
|
|
oldWD.WaitDone()
|
|
}
|
|
|
|
// Start watchdog with new settings
|
|
if err := a.startWatchdog(); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Restore the model state from the old watchdog to the new one
|
|
// This ensures the new watchdog knows about already-loaded models
|
|
newWD := a.modelLoader.GetWatchDog()
|
|
if newWD != nil && len(oldState.AddressModelMap) > 0 {
|
|
newWD.RestoreState(oldState)
|
|
}
|
|
|
|
// Re-sync pinned models and concurrency groups after restart
|
|
a.SyncPinnedModelsToWatchdog()
|
|
a.SyncModelGroupsToWatchdog()
|
|
|
|
return nil
|
|
}
|