From e45d63c86ec9f960dc5193399e6d78e9e8d954d6 Mon Sep 17 00:00:00 2001 From: Andres Date: Mon, 23 Feb 2026 11:57:28 +0100 Subject: [PATCH] fix(cli): Fix watchdog running constantly and spamming logs (#8624) * Fix watchdog running constantly and spamming logs Signed-off-by: Andres Smith * Update docs Signed-off-by: Andres Smith --------- Signed-off-by: Andres Smith --- core/cli/run.go | 8 ++++++++ core/config/application_config.go | 17 ++++++++++++----- docs/content/reference/cli-reference.md | 1 + 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/core/cli/run.go b/core/cli/run.go index 4b845f9a4..7410c8def 100644 --- a/core/cli/run.go +++ b/core/cli/run.go @@ -71,6 +71,7 @@ type RunCMD struct { WatchdogIdleTimeout string `env:"LOCALAI_WATCHDOG_IDLE_TIMEOUT,WATCHDOG_IDLE_TIMEOUT" default:"15m" help:"Threshold beyond which an idle backend should be stopped" group:"backends"` EnableWatchdogBusy bool `env:"LOCALAI_WATCHDOG_BUSY,WATCHDOG_BUSY" default:"false" help:"Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout" group:"backends"` WatchdogBusyTimeout string `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"` + WatchdogInterval string `env:"LOCALAI_WATCHDOG_INTERVAL,WATCHDOG_INTERVAL" default:"500ms" help:"Interval between watchdog checks (e.g., 500ms, 5s, 1m) (default: 500ms)" group:"backends"` EnableMemoryReclaimer bool `env:"LOCALAI_MEMORY_RECLAIMER,MEMORY_RECLAIMER,LOCALAI_GPU_RECLAIMER,GPU_RECLAIMER" default:"false" help:"Enable memory threshold monitoring to auto-evict backends when memory usage exceeds threshold (uses GPU VRAM if available, otherwise RAM)" group:"backends"` MemoryReclaimerThreshold float64 `env:"LOCALAI_MEMORY_RECLAIMER_THRESHOLD,MEMORY_RECLAIMER_THRESHOLD,LOCALAI_GPU_RECLAIMER_THRESHOLD,GPU_RECLAIMER_THRESHOLD" default:"0.95" help:"Memory usage threshold (0.0-1.0) that triggers backend eviction (default 0.95 = 95%%)" group:"backends"` ForceEvictionWhenBusy bool `env:"LOCALAI_FORCE_EVICTION_WHEN_BUSY,FORCE_EVICTION_WHEN_BUSY" default:"false" help:"Force eviction even when models have active API calls (default: false for safety)" group:"backends"` @@ -215,6 +216,13 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error { } opts = append(opts, config.SetWatchDogBusyTimeout(dur)) } + if r.WatchdogInterval != "" { + dur, err := time.ParseDuration(r.WatchdogInterval) + if err != nil { + return err + } + opts = append(opts, config.SetWatchDogInterval(dur)) + } } // Handle memory reclaimer (uses GPU VRAM if available, otherwise RAM) diff --git a/core/config/application_config.go b/core/config/application_config.go index e96e8ac58..79276e49f 100644 --- a/core/config/application_config.go +++ b/core/config/application_config.go @@ -98,10 +98,11 @@ func NewApplicationConfig(o ...AppOption) *ApplicationConfig { Context: context.Background(), UploadLimitMB: 15, Debug: true, - AgentJobRetentionDays: 30, // Default: 30 days - LRUEvictionMaxRetries: 30, // Default: 30 retries - LRUEvictionRetryInterval: 1 * time.Second, // Default: 1 second - TracingMaxItems: 1024, + AgentJobRetentionDays: 30, // Default: 30 days + LRUEvictionMaxRetries: 30, // Default: 30 retries + LRUEvictionRetryInterval: 1 * time.Second, // Default: 1 second + WatchDogInterval: 500 * time.Millisecond, // Default: 500ms + TracingMaxItems: 1024, PathWithoutAuth: []string{ "/static/", "/generated-audio/", @@ -208,6 +209,12 @@ func SetWatchDogIdleTimeout(t time.Duration) AppOption { } } +func SetWatchDogInterval(t time.Duration) AppOption { + return func(o *ApplicationConfig) { + o.WatchDogInterval = t + } +} + // EnableMemoryReclaimer enables memory threshold monitoring. // When enabled, the watchdog will evict backends if memory usage exceeds the threshold. // Works with GPU VRAM if available, otherwise uses system RAM. @@ -642,7 +649,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings { AutoloadBackendGalleries: &autoloadBackendGalleries, ApiKeys: &apiKeys, AgentJobRetentionDays: &agentJobRetentionDays, - OpenResponsesStoreTTL: &openResponsesStoreTTL, + OpenResponsesStoreTTL: &openResponsesStoreTTL, } } diff --git a/docs/content/reference/cli-reference.md b/docs/content/reference/cli-reference.md index f8349ef7a..2b24020a8 100644 --- a/docs/content/reference/cli-reference.md +++ b/docs/content/reference/cli-reference.md @@ -46,6 +46,7 @@ Complete reference for all LocalAI command-line interface (CLI) parameters and e | `--watchdog-idle-timeout` | `15m` | Threshold beyond which an idle backend should be stopped | `$LOCALAI_WATCHDOG_IDLE_TIMEOUT`, `$WATCHDOG_IDLE_TIMEOUT` | | `--enable-watchdog-busy` | `false` | Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout | `$LOCALAI_WATCHDOG_BUSY`, `$WATCHDOG_BUSY` | | `--watchdog-busy-timeout` | `5m` | Threshold beyond which a busy backend should be stopped | `$LOCALAI_WATCHDOG_BUSY_TIMEOUT`, `$WATCHDOG_BUSY_TIMEOUT` | +| `--watchdog-interval` | `500ms` | Interval between watchdog checks (e.g., `500ms`, `5s`, `1m`) | `$LOCALAI_WATCHDOG_INTERVAL`, `$WATCHDOG_INTERVAL` | | `--force-eviction-when-busy` | `false` | Force eviction even when models have active API calls (default: false for safety). **Warning:** Enabling this can interrupt active requests | `$LOCALAI_FORCE_EVICTION_WHEN_BUSY`, `$FORCE_EVICTION_WHEN_BUSY` | | `--lru-eviction-max-retries` | `30` | Maximum number of retries when waiting for busy models to become idle before eviction | `$LOCALAI_LRU_EVICTION_MAX_RETRIES`, `$LRU_EVICTION_MAX_RETRIES` | | `--lru-eviction-retry-interval` | `1s` | Interval between retries when waiting for busy models to become idle (e.g., `1s`, `2s`) | `$LOCALAI_LRU_EVICTION_RETRY_INTERVAL`, `$LRU_EVICTION_RETRY_INTERVAL` |