fix: improve watchdog logic (#8591)

* fix: ensure proper watchdog shutdown and state passing between restarts

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: add missing watchdog settings

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: untrack model if we shut it down successfully

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Author: Ettore Di Giacinto
Date: 2026-02-17 18:49:22 +01:00
Committed by: GitHub
Parent: 067a255435
Commit: ecba23d44e
3 changed files with 144 additions and 17 deletions
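The third bullet in the commit message ("untrack model if we shut it down successfully") lands in a file whose hunk is not shown below. Purely as an illustration of that behavior, here is a small self-contained Go sketch with made-up names (demoWatchdog and its fields are not the real LocalAI watchdog API): a model is removed from the tracking map only when its backend shut down cleanly, so a failed shutdown stays visible for a later eviction pass.

package main

import (
	"errors"
	"fmt"
)

// demoWatchdog is a made-up type for illustration; it is not the LocalAI watchdog.
type demoWatchdog struct {
	tracked map[string]string          // address -> model name
	stop    func(address string) error // pluggable shutdown for the demo
}

// evict shuts a model down and untracks it only when the shutdown succeeded,
// so failed shutdowns remain tracked and can be retried later.
func (wd *demoWatchdog) evict(address string) {
	if err := wd.stop(address); err != nil {
		fmt.Println("shutdown failed, keeping", wd.tracked[address], "tracked:", err)
		return
	}
	delete(wd.tracked, address)
	fmt.Println("shutdown ok, untracked", address)
}

func main() {
	wd := &demoWatchdog{
		tracked: map[string]string{"addr-1": "model-a", "addr-2": "model-b"},
		stop: func(address string) error {
			if address == "addr-2" {
				return errors.New("backend busy")
			}
			return nil
		},
	}
	wd.evict("addr-1") // removed from tracking
	wd.evict("addr-2") // stays tracked for a retry
	fmt.Println(len(wd.tracked), "model(s) still tracked")
}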


@@ -319,6 +319,29 @@ func loadRuntimeSettingsFromFile(options *config.ApplicationConfig) {
			options.MemoryReclaimerThreshold = *settings.MemoryReclaimerThreshold
		}
	}
	if settings.ForceEvictionWhenBusy != nil {
		// Only apply if current value is default (false), suggesting it wasn't set from env var
		if !options.ForceEvictionWhenBusy {
			options.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
		}
	}
	if settings.LRUEvictionMaxRetries != nil {
		// Only apply if current value is default (30), suggesting it wasn't set from env var
		if options.LRUEvictionMaxRetries == 0 {
			options.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
		}
	}
	if settings.LRUEvictionRetryInterval != nil {
		// Only apply if current value is default (1s), suggesting it wasn't set from env var
		if options.LRUEvictionRetryInterval == 0 {
			dur, err := time.ParseDuration(*settings.LRUEvictionRetryInterval)
			if err == nil {
				options.LRUEvictionRetryInterval = dur
			} else {
				xlog.Warn("invalid LRU eviction retry interval in runtime_settings.json", "error", err, "interval", *settings.LRUEvictionRetryInterval)
			}
		}
	}
	if settings.AgentJobRetentionDays != nil {
		// Only apply if current value is default (0), suggesting it wasn't set from env var
		if options.AgentJobRetentionDays == 0 {
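The settings added in this hunk follow the same rule as the surrounding ones: a value from runtime_settings.json is applied only while the option still holds its zero value, so environment variables and CLI flags keep precedence, and the retry interval is parsed from its string form with time.ParseDuration. A minimal, self-contained sketch of that precedence rule, using simplified stand-in types rather than the real LocalAI config structs:

package main

import (
	"fmt"
	"time"
)

// Simplified stand-ins for the persisted settings and the live config.
type runtimeSettings struct {
	LRUEvictionMaxRetries    *int
	LRUEvictionRetryInterval *string // stored as a string such as "2s"
}

type appConfig struct {
	LRUEvictionMaxRetries    int
	LRUEvictionRetryInterval time.Duration
}

// applySettings copies a persisted value only if the option is still at its
// zero value, i.e. it was not already set from an env var or CLI flag.
func applySettings(s runtimeSettings, o *appConfig) {
	if s.LRUEvictionMaxRetries != nil && o.LRUEvictionMaxRetries == 0 {
		o.LRUEvictionMaxRetries = *s.LRUEvictionMaxRetries
	}
	if s.LRUEvictionRetryInterval != nil && o.LRUEvictionRetryInterval == 0 {
		if dur, err := time.ParseDuration(*s.LRUEvictionRetryInterval); err == nil {
			o.LRUEvictionRetryInterval = dur
		} else {
			fmt.Println("invalid retry interval:", err) // the real code logs via xlog.Warn
		}
	}
}

func main() {
	retries, interval := 10, "2s"
	cfg := appConfig{LRUEvictionMaxRetries: 5} // pretend this came from an env var
	applySettings(runtimeSettings{&retries, &interval}, &cfg)
	fmt.Println(cfg.LRUEvictionMaxRetries, cfg.LRUEvictionRetryInterval) // 5 2s
}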


@@ -1,8 +1,6 @@
package application

import (
	"time"

	"github.com/mudler/LocalAI/pkg/model"
	"github.com/mudler/xlog"
)
@@ -37,11 +35,15 @@ func (a *Application) startWatchdog() error {
		model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
		model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
	)
	a.modelLoader.SetWatchDog(wd)
	// Create new stop channel
	// Create new stop channel BEFORE setting up any goroutines
	// This prevents race conditions where the old shutdown handler might
	// receive the closed channel and try to shut down the new watchdog
	a.watchdogStop = make(chan bool, 1)
	// Set the watchdog on the model loader
	a.modelLoader.SetWatchDog(wd)
	// Start watchdog goroutine if any periodic checks are enabled
	// LRU eviction doesn't need the Run() loop - it's triggered on model load
	// But memory reclaimer needs the Run() loop for periodic checking
@@ -49,15 +51,19 @@ func (a *Application) startWatchdog() error {
		go wd.Run()
	}
	// Setup shutdown handler
	// Setup shutdown handler - this goroutine will wait on a.watchdogStop
	// which is now a fresh channel, so it won't receive any stale signals
	// Note: We capture wd in a local variable to ensure this handler operates
	// on the correct watchdog instance (not a later one that gets assigned to wd)
	wdForShutdown := wd
	go func() {
		select {
		case <-a.watchdogStop:
			xlog.Debug("Watchdog stop signal received")
			wd.Shutdown()
			wdForShutdown.Shutdown()
		case <-appConfig.Context.Done():
			xlog.Debug("Context canceled, shutting down watchdog")
			wd.Shutdown()
			wdForShutdown.Shutdown()
		}
	}()
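Two details in this hunk matter for restarts: the stop channel is recreated before any goroutine is launched, and the shutdown handler closes over wdForShutdown, the exact watchdog instance it was created for, instead of a shared variable that a later restart might reassign. A minimal, self-contained sketch of that pattern with illustrative types (here the stop channel is also captured locally for clarity, whereas the real handler reads it from the struct field):

package main

import (
	"fmt"
	"time"
)

// watchdog is a stand-in for model.WatchDog; only Shutdown matters here.
type watchdog struct{ name string }

func (w *watchdog) Shutdown() { fmt.Println("shutting down", w.name) }

type app struct {
	wd   *watchdog
	stop chan bool
}

// start mirrors the pattern above: make a fresh stop channel for every
// (re)start and capture the current watchdog in a local variable so the
// handler can never act on a later instance.
func (a *app) start(name string) {
	a.stop = make(chan bool, 1) // fresh channel: no stale close() from a previous cycle
	a.wd = &watchdog{name: name}

	wdForShutdown := a.wd // capture the instance this handler is responsible for
	stop := a.stop        // capture the channel for the same reason
	go func() {
		<-stop
		wdForShutdown.Shutdown()
	}()
}

func (a *app) restart(name string) {
	if a.stop != nil {
		close(a.stop) // unblocks the old handler exactly once
		a.stop = nil
	}
	a.start(name)
}

func main() {
	a := &app{}
	a.start("wd-1")
	a.restart("wd-2") // prints "shutting down wd-1", never wd-2
	time.Sleep(100 * time.Millisecond)
}

Closing the channel wakes the old handler exactly once, and because that handler only ever touches the instance it captured, a watchdog created by a later restart is never shut down by a stale goroutine.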
@@ -82,20 +88,41 @@ func (a *Application) RestartWatchdog() error {
	a.watchdogMutex.Lock()
	defer a.watchdogMutex.Unlock()
	// Shutdown existing watchdog if running
	// Get the old watchdog before we shut it down
	oldWD := a.modelLoader.GetWatchDog()
	// Get the state from the old watchdog before shutting it down
	// This preserves information about loaded models
	var oldState model.WatchDogState
	if oldWD != nil {
		oldState = oldWD.GetState()
	}
	// Signal all handlers to stop by closing the stop channel
	// This will cause any goroutine waiting on <-a.watchdogStop to unblock
	if a.watchdogStop != nil {
		close(a.watchdogStop)
		a.watchdogStop = nil
	}
	// Shutdown existing watchdog if running
	currentWD := a.modelLoader.GetWatchDog()
	if currentWD != nil {
		currentWD.Shutdown()
		// Wait a bit for shutdown to complete
		time.Sleep(100 * time.Millisecond)
	// Shutdown existing watchdog - this triggers the stop signal
	if oldWD != nil {
		oldWD.Shutdown()
		// Wait for the old watchdog's Run() goroutine to fully shut down
		oldWD.WaitDone()
	}
	// Start watchdog with new settings
	return a.startWatchdog()
	if err := a.startWatchdog(); err != nil {
		return err
	}
	// Restore the model state from the old watchdog to the new one
	// This ensures the new watchdog knows about already-loaded models
	newWD := a.modelLoader.GetWatchDog()
	if newWD != nil && len(oldState.AddressModelMap) > 0 {
		newWD.RestoreState(oldState)
	}
	return nil
}
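A freshly started watchdog begins with empty bookkeeping, so without the GetState/RestoreState handoff above, models loaded before the restart would be invisible to the new instance. A compact, self-contained sketch of that snapshot-and-restore flow, using stand-in types rather than the real model.WatchDog API:

package main

import "fmt"

// Simplified stand-ins for model.WatchDog and model.WatchDogState; the real
// types track per-model timestamps and process addresses as well.
type watchDogState struct {
	AddressModelMap map[string]string // backend address -> model name
}

type watchDog struct {
	state watchDogState
}

func newWatchDog() *watchDog {
	return &watchDog{state: watchDogState{AddressModelMap: map[string]string{}}}
}

func (w *watchDog) Track(addr, name string) { w.state.AddressModelMap[addr] = name }

func (w *watchDog) GetState() watchDogState { return w.state }

func (w *watchDog) RestoreState(s watchDogState) { w.state = s }

// Shutdown would stop timers and signal the Run() loop to exit in the real code.
func (w *watchDog) Shutdown() {}

// restart mimics RestartWatchdog above: snapshot the state, shut the old
// instance down, start a new one, then hand the snapshot over so models that
// were already loaded stay tracked.
func restart(old *watchDog) *watchDog {
	state := old.GetState()
	old.Shutdown()

	fresh := newWatchDog()
	if len(state.AddressModelMap) > 0 {
		fresh.RestoreState(state)
	}
	return fresh
}

func main() {
	wd := newWatchDog()
	wd.Track("127.0.0.1:43219", "llama-3.2-1b")
	wd = restart(wd)
	fmt.Println(wd.GetState().AddressModelMap) // map[127.0.0.1:43219:llama-3.2-1b]
}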