fix(cli): Fix watchdog running constantly and spamming logs (#8624)

* Fix watchdog running constantly and spamming logs Signed-off-by: Andres Smith <andressmithdev@pm.me> * Update docs Signed-off-by: Andres Smith <andressmithdev@pm.me> --------- Signed-off-by: Andres Smith <andressmithdev@pm.me>
2026-06-02 13:22:34 -04:00 · 2026-02-23 11:57:28 +01:00
parent f40c8dd0ce
commit e45d63c86e
3 changed files with 21 additions and 5 deletions
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -71,6 +71,7 @@ type RunCMD struct {
 	WatchdogIdleTimeout                string   `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"`
 	EnableWatchdogBusy                 bool     `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"`
 	WatchdogBusyTimeout                string   `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
+	WatchdogInterval                   string   `env:"LOCALAI_WATCHDOG_INTERVAL,WATCHDOG_INTERVAL" default:"500ms" help:"Interval between watchdog checks (e.g., 500ms, 5s, 1m) (default: 500ms)" group:"backends"`
 	EnableMemoryReclaimer              bool     `env:"LOCALAI_MEMORY_RECLAIMER,MEMORY_RECLAIMER,LOCALAI_GPU_RECLAIMER,GPU_RECLAIMER" default:"false" help:"Enable memory threshold monitoring to auto-evict backends when memory usage exceeds threshold (uses GPU VRAM if available, otherwise RAM)" group:"backends"`
 	MemoryReclaimerThreshold           float64  `env:"LOCALAI_MEMORY_RECLAIMER_THRESHOLD,MEMORY_RECLAIMER_THRESHOLD,LOCALAI_GPU_RECLAIMER_THRESHOLD,GPU_RECLAIMER_THRESHOLD" default:"0.95" help:"Memory usage threshold (0.0-1.0) that triggers backend eviction (default 0.95 = 95%%)" group:"backends"`
 	ForceEvictionWhenBusy              bool     `env:"LOCALAI_FORCE_EVICTION_WHEN_BUSY,FORCE_EVICTION_WHEN_BUSY" default:"false" help:"Force eviction even when models have active API calls (default: false for safety)" group:"backends"`
@@ -215,6 +216,13 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 			}
 			opts = append(opts, config.SetWatchDogBusyTimeout(dur))
 		}
+		if r.WatchdogInterval != "" {
+			dur, err := time.ParseDuration(r.WatchdogInterval)
+			if err != nil {
+				return err
+			}
+			opts = append(opts, config.SetWatchDogInterval(dur))
+		}
 	}

 	// Handle memory reclaimer (uses GPU VRAM if available, otherwise RAM)
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -98,10 +98,11 @@ func NewApplicationConfig(o ...AppOption) *ApplicationConfig {
 		Context:                  context.Background(),
 		UploadLimitMB:            15,
 		Debug:                    true,
-		AgentJobRetentionDays:    30,              // Default: 30 days
-		LRUEvictionMaxRetries:    30,              // Default: 30 retries
-		LRUEvictionRetryInterval: 1 * time.Second, // Default: 1 second
-		TracingMaxItems:       1024,
+		AgentJobRetentionDays:    30,                     // Default: 30 days
+		LRUEvictionMaxRetries:    30,                     // Default: 30 retries
+		LRUEvictionRetryInterval: 1 * time.Second,        // Default: 1 second
+		WatchDogInterval:         500 * time.Millisecond, // Default: 500ms
+		TracingMaxItems:          1024,
 		PathWithoutAuth: []string{
 			"/static/",
 			"/generated-audio/",
@@ -208,6 +209,12 @@ func SetWatchDogIdleTimeout(t time.Duration) AppOption {
 	}
 }

+func SetWatchDogInterval(t time.Duration) AppOption {
+	return func(o *ApplicationConfig) {
+		o.WatchDogInterval = t
+	}
+}
+
 // EnableMemoryReclaimer enables memory threshold monitoring.
 // When enabled, the watchdog will evict backends if memory usage exceeds the threshold.
 // Works with GPU VRAM if available, otherwise uses system RAM.
@@ -642,7 +649,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
 		AutoloadBackendGalleries: &autoloadBackendGalleries,
 		ApiKeys:                  &apiKeys,
 		AgentJobRetentionDays:    &agentJobRetentionDays,
-		OpenResponsesStoreTTL:     &openResponsesStoreTTL,
+		OpenResponsesStoreTTL:    &openResponsesStoreTTL,
 	}
 }

--- a/docs/content/reference/cli-reference.md
+++ b/docs/content/reference/cli-reference.md
@@ -46,6 +46,7 @@ Complete reference for all LocalAI command-line interface (CLI) parameters and e
 | `--watchdog-idle-timeout` | `15m` | Threshold beyond which an idle backend should be stopped | `$LOCALAI_WATCHDOG_IDLE_TIMEOUT`, `$WATCHDOG_IDLE_TIMEOUT` |
 | `--enable-watchdog-busy` | `false` | Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout | `$LOCALAI_WATCHDOG_BUSY`, `$WATCHDOG_BUSY` |
 | `--watchdog-busy-timeout` | `5m` | Threshold beyond which a busy backend should be stopped | `$LOCALAI_WATCHDOG_BUSY_TIMEOUT`, `$WATCHDOG_BUSY_TIMEOUT` |
+| `--watchdog-interval` | `500ms` | Interval between watchdog checks (e.g., `500ms`, `5s`, `1m`) | `$LOCALAI_WATCHDOG_INTERVAL`, `$WATCHDOG_INTERVAL` |
 | `--force-eviction-when-busy` | `false` | Force eviction even when models have active API calls (default: false for safety). **Warning:** Enabling this can interrupt active requests | `$LOCALAI_FORCE_EVICTION_WHEN_BUSY`, `$FORCE_EVICTION_WHEN_BUSY` |
 | `--lru-eviction-max-retries` | `30` | Maximum number of retries when waiting for busy models to become idle before eviction | `$LOCALAI_LRU_EVICTION_MAX_RETRIES`, `$LRU_EVICTION_MAX_RETRIES` |
 | `--lru-eviction-retry-interval` | `1s` | Interval between retries when waiting for busy models to become idle (e.g., `1s`, `2s`) | `$LOCALAI_LRU_EVICTION_RETRY_INTERVAL`, `$LRU_EVICTION_RETRY_INTERVAL` |