fix: improve watchdog logic (#8591)

* fix: ensure proper watchdog shutdown and state passing between restarts

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: add missing watchdog settings

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: untrack model if we shut it down successfully

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Author: Ettore Di Giacinto
Date: 2026-02-17 18:49:22 +01:00
Committed by: GitHub
Parent: 067a255435
Commit: ecba23d44e
3 changed files with 144 additions and 17 deletions
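The third bullet in the commit message ("untrack model if we shut it down successfully") lands in a file whose hunk is not shown below. Purely as an illustration of that behavior, here is a small self-contained Go sketch with made-up names (demoWatchdog and its fields are not the real LocalAI watchdog API): a model is removed from the tracking map only when its backend shut down cleanly, so a failed shutdown stays visible for a later eviction pass.

package main

import (
	"errors"
	"fmt"
)

// demoWatchdog is a made-up type for illustration; it is not the LocalAI watchdog.
type demoWatchdog struct {
	tracked map[string]string          // address -> model name
	stop    func(address string) error // pluggable shutdown for the demo
}

// evict shuts a model down and untracks it only when the shutdown succeeded,
// so failed shutdowns remain tracked and can be retried later.
func (wd *demoWatchdog) evict(address string) {
	if err := wd.stop(address); err != nil {
		fmt.Println("shutdown failed, keeping", wd.tracked[address], "tracked:", err)
		return
	}
	delete(wd.tracked, address)
	fmt.Println("shutdown ok, untracked", address)
}

func main() {
	wd := &demoWatchdog{
		tracked: map[string]string{"addr-1": "model-a", "addr-2": "model-b"},
		stop: func(address string) error {
			if address == "addr-2" {
				return errors.New("backend busy")
			}
			return nil
		},
	}
	wd.evict("addr-1") // removed from tracking
	wd.evict("addr-2") // stays tracked for a retry
	fmt.Println(len(wd.tracked), "model(s) still tracked")
}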


@@ -319,6 +319,29 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
			options.MemoryReclaimerThreshold = *settings.MemoryReclaimerThreshold
		}
	}
	if settings.ForceEvictionWhenBusy != nil {
		// Only apply if current value is default (false), suggesting it wasn't set from env var
		if !options.ForceEvictionWhenBusy {
			options.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
		}
	}
	if settings.LRUEvictionMaxRetries != nil {
		// Only apply if current value is default (30), suggesting it wasn't set from env var
		if options.LRUEvictionMaxRetries == 0 {
			options.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
		}
	}
	if settings.LRUEvictionRetryInterval != nil {
		// Only apply if current value is default (1s), suggesting it wasn't set from env var
		if options.LRUEvictionRetryInterval == 0 {
			dur, err := time.ParseDuration(*settings.LRUEvictionRetryInterval)
			if err == nil {
				options.LRUEvictionRetryInterval = dur
			} else {
				xlog.Warn("invalid LRU eviction retry interval in runtime_settings.json", "error", err, "interval", *settings.LRUEvictionRetryInterval)
			}
		}
	}
	if settings.AgentJobRetentionDays != nil {
		// Only apply if current value is default (0), suggesting it wasn't set from env var
		if options.AgentJobRetentionDays == 0 {
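The settings added in this hunk follow the same rule as the surrounding ones: a value from runtime_settings.json is applied only while the option still holds its zero value, so environment variables and CLI flags keep precedence, and the retry interval is parsed from its string form with time.ParseDuration. A minimal, self-contained sketch of that precedence rule, using simplified stand-in types rather than the real LocalAI config structs:

package main

import (
	"fmt"
	"time"
)

// Simplified stand-ins for the persisted settings and the live config.
type runtimeSettings struct {
	LRUEvictionMaxRetries    *int
	LRUEvictionRetryInterval *string // stored as a string such as "2s"
}

type appConfig struct {
	LRUEvictionMaxRetries    int
	LRUEvictionRetryInterval time.Duration
}

// applySettings copies a persisted value only if the option is still at its
// zero value, i.e. it was not already set from an env var or CLI flag.
func applySettings(s runtimeSettings, o *appConfig) {
	if s.LRUEvictionMaxRetries != nil && o.LRUEvictionMaxRetries == 0 {
		o.LRUEvictionMaxRetries = *s.LRUEvictionMaxRetries
	}
	if s.LRUEvictionRetryInterval != nil && o.LRUEvictionRetryInterval == 0 {
		if dur, err := time.ParseDuration(*s.LRUEvictionRetryInterval); err == nil {
			o.LRUEvictionRetryInterval = dur
		} else {
			fmt.Println("invalid retry interval:", err) // the real code logs via xlog.Warn
		}
	}
}

func main() {
	retries, interval := 10, "2s"
	cfg := appConfig{LRUEvictionMaxRetries: 5} // pretend this came from an env var
	applySettings(runtimeSettings{&retries, &interval}, &cfg)
	fmt.Println(cfg.LRUEvictionMaxRetries, cfg.LRUEvictionRetryInterval) // 5 2s
}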


@@ -1,8 +1,6 @@
package application

import (
	"time"

	"github.com/mudler/LocalAI/pkg/model"
	"github.com/mudler/xlog"
)
@@ -37,11 +35,15 @@ func (a *Application) startWatchdog() error {
		model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
		model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
	)
	a.modelLoader.SetWatchDog(wd)
	// Create new stop channel
	// Create new stop channel BEFORE setting up any goroutines
	// This prevents race conditions where the old shutdown handler might
	// receive the closed channel and try to shut down the new watchdog
	a.watchdogStop = make(chan bool, 1)
	// Set the watchdog on the model loader
	a.modelLoader.SetWatchDog(wd)
	// Start watchdog goroutine if any periodic checks are enabled
	// LRU eviction doesn't need the Run() loop - it's triggered on model load
	// But memory reclaimer needs the Run() loop for periodic checking
@@ -49,15 +51,19 @@ func (a *Application) startWatchdog() error {
		go wd.Run()
	}
	// Setup shutdown handler
	// Setup shutdown handler - this goroutine will wait on a.watchdogStop
	// which is now a fresh channel, so it won't receive any stale signals
	// Note: We capture wd in a local variable to ensure this handler operates
	// on the correct watchdog instance (not a later one that gets assigned to wd)
	wdForShutdown := wd
	go func() {
		select {
		case <-a.watchdogStop:
			xlog.Debug("Watchdog stop signal received")
			wd.Shutdown()
			wdForShutdown.Shutdown()
		case <-appConfig.Context.Done():
			xlog.Debug("Context canceled, shutting down watchdog")
			wd.Shutdown()
			wdForShutdown.Shutdown()
		}
	}()
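Two details in this hunk matter for restarts: the stop channel is recreated before any goroutine is launched, and the shutdown handler closes over wdForShutdown, the exact watchdog instance it was created for, instead of a shared variable that a later restart might reassign. A minimal, self-contained sketch of that pattern with illustrative types (here the stop channel is also captured locally for clarity, whereas the real handler reads it from the struct field):

package main

import (
	"fmt"
	"time"
)

// watchdog is a stand-in for model.WatchDog; only Shutdown matters here.
type watchdog struct{ name string }

func (w *watchdog) Shutdown() { fmt.Println("shutting down", w.name) }

type app struct {
	wd   *watchdog
	stop chan bool
}

// start mirrors the pattern above: make a fresh stop channel for every
// (re)start and capture the current watchdog in a local variable so the
// handler can never act on a later instance.
func (a *app) start(name string) {
	a.stop = make(chan bool, 1) // fresh channel: no stale close() from a previous cycle
	a.wd = &watchdog{name: name}

	wdForShutdown := a.wd // capture the instance this handler is responsible for
	stop := a.stop        // capture the channel for the same reason
	go func() {
		<-stop
		wdForShutdown.Shutdown()
	}()
}

func (a *app) restart(name string) {
	if a.stop != nil {
		close(a.stop) // unblocks the old handler exactly once
		a.stop = nil
	}
	a.start(name)
}

func main() {
	a := &app{}
	a.start("wd-1")
	a.restart("wd-2") // prints "shutting down wd-1", never wd-2
	time.Sleep(100 * time.Millisecond)
}

Closing the channel wakes the old handler exactly once, and because that handler only ever touches the instance it captured, a watchdog created by a later restart is never shut down by a stale goroutine.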
@@ -82,20 +88,41 @@ func (a *Application) RestartWatchdog() error {
	a.watchdogMutex.Lock()
	defer a.watchdogMutex.Unlock()
	// Shutdown existing watchdog if running
	// Get the old watchdog before we shut it down
	oldWD := a.modelLoader.GetWatchDog()
	// Get the state from the old watchdog before shutting it down
	// This preserves information about loaded models
	var oldState model.WatchDogState
	if oldWD != nil {
		oldState = oldWD.GetState()
	}
	// Signal all handlers to stop by closing the stop channel
	// This will cause any goroutine waiting on <-a.watchdogStop to unblock
	if a.watchdogStop != nil {
		close(a.watchdogStop)
		a.watchdogStop = nil
	}
	// Shutdown existing watchdog if running
	currentWD := a.modelLoader.GetWatchDog()
	if currentWD != nil {
		currentWD.Shutdown()
		// Wait a bit for shutdown to complete
		time.Sleep(100 * time.Millisecond)
	// Shutdown existing watchdog - this triggers the stop signal
	if oldWD != nil {
		oldWD.Shutdown()
		// Wait for the old watchdog's Run() goroutine to fully shut down
		oldWD.WaitDone()
	}
	// Start watchdog with new settings
	return a.startWatchdog()
	if err := a.startWatchdog(); err != nil {
		return err
	}
	// Restore the model state from the old watchdog to the new one
	// This ensures the new watchdog knows about already-loaded models
	newWD := a.modelLoader.GetWatchDog()
	if newWD != nil && len(oldState.AddressModelMap) > 0 {
		newWD.RestoreState(oldState)
	}
	return nil
}
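A freshly started watchdog begins with empty bookkeeping, so without the GetState/RestoreState handoff above, models loaded before the restart would be invisible to the new instance. A compact, self-contained sketch of that snapshot-and-restore flow, using stand-in types rather than the real model.WatchDog API:

package main

import "fmt"

// Simplified stand-ins for model.WatchDog and model.WatchDogState; the real
// types track per-model timestamps and process addresses as well.
type watchDogState struct {
	AddressModelMap map[string]string // backend address -> model name
}

type watchDog struct {
	state watchDogState
}

func newWatchDog() *watchDog {
	return &watchDog{state: watchDogState{AddressModelMap: map[string]string{}}}
}

func (w *watchDog) Track(addr, name string) { w.state.AddressModelMap[addr] = name }

func (w *watchDog) GetState() watchDogState { return w.state }

func (w *watchDog) RestoreState(s watchDogState) { w.state = s }

// Shutdown would stop timers and signal the Run() loop to exit in the real code.
func (w *watchDog) Shutdown() {}

// restart mimics RestartWatchdog above: snapshot the state, shut the old
// instance down, start a new one, then hand the snapshot over so models that
// were already loaded stay tracked.
func restart(old *watchDog) *watchDog {
	state := old.GetState()
	old.Shutdown()

	fresh := newWatchDog()
	if len(state.AddressModelMap) > 0 {
		fresh.RestoreState(state)
	}
	return fresh
}

func main() {
	wd := newWatchDog()
	wd.Track("127.0.0.1:43219", "llama-3.2-1b")
	wd = restart(wd)
	fmt.Println(wd.GetState().AddressModelMap) // map[127.0.0.1:43219:llama-3.2-1b]
}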