From 347cdcf54532f51b93b527c41d9014912b2cfd0a Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 30 Jun 2026 08:04:12 +0000 Subject: [PATCH] fix(watchdog): persist a UI-saved Check Interval across restarts (#10601) The watchdog Check Interval saved via /api/settings reverted to 500ms on every restart, while the idle/busy timeouts persisted correctly. Root cause: NewApplicationConfig baseline-defaulted WatchDogInterval to 500ms, whereas the idle/busy timeouts default to 0. The startup loader (loadRuntimeSettingsFromFile) applies a persisted runtime_settings.json value only when the field is still at its zero default - its heuristic for "this wasn't set by an env var". Because the interval was always 500ms at that point, the loader never read the persisted value back, so the saved interval was silently discarded on each boot. Fix: drop the non-zero baseline default so the interval behaves like the sibling timeouts (0 = unset). The effective 500ms default is now supplied at the watchdog layer: WithWatchdogInterval ignores a non-positive value so DefaultWatchDogOptions' 500ms is preserved (and a 0 interval can never turn the watchdog loop into a busy spin). Also mirror the interval in the live config file watcher alongside idle/busy, and report the real 500ms default (not the stale "2s") from ToRuntimeSettings. 
Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 [Claude Code] --- core/application/config_file_watcher.go | 9 +++++++ .../runtime_settings_branding_test.go | 25 +++++++++++++++++++ core/config/application_config.go | 22 ++++++++++------ pkg/model/watchdog_options.go | 11 ++++++-- 4 files changed, 58 insertions(+), 9 deletions(-) diff --git a/core/application/config_file_watcher.go b/core/application/config_file_watcher.go index a5f7d5f48..530acb4d4 100644 --- a/core/application/config_file_watcher.go +++ b/core/application/config_file_watcher.go @@ -197,6 +197,7 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand envWatchdogBusy := appConfig.WatchDogBusy == startupAppConfig.WatchDogBusy envWatchdogIdleTimeout := appConfig.WatchDogIdleTimeout == startupAppConfig.WatchDogIdleTimeout envWatchdogBusyTimeout := appConfig.WatchDogBusyTimeout == startupAppConfig.WatchDogBusyTimeout + envWatchdogInterval := appConfig.WatchDogInterval == startupAppConfig.WatchDogInterval envSingleBackend := appConfig.SingleBackend == startupAppConfig.SingleBackend envMaxActiveBackends := appConfig.MaxActiveBackends == startupAppConfig.MaxActiveBackends envMemoryReclaimerEnabled := appConfig.MemoryReclaimerEnabled == startupAppConfig.MemoryReclaimerEnabled @@ -257,6 +258,14 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand xlog.Warn("invalid watchdog busy timeout in runtime_settings.json", "error", err, "timeout", *settings.WatchdogBusyTimeout) } } + if settings.WatchdogInterval != nil && !envWatchdogInterval { + dur, err := time.ParseDuration(*settings.WatchdogInterval) + if err == nil { + appConfig.WatchDogInterval = dur + } else { + xlog.Warn("invalid watchdog interval in runtime_settings.json", "error", err, "interval", *settings.WatchdogInterval) + } + } // Handle MaxActiveBackends (new) and SingleBackend (deprecated) if settings.MaxActiveBackends != nil && !envMaxActiveBackends { 
appConfig.MaxActiveBackends = *settings.MaxActiveBackends diff --git a/core/application/runtime_settings_branding_test.go b/core/application/runtime_settings_branding_test.go index 763ede4b1..3e19901bd 100644 --- a/core/application/runtime_settings_branding_test.go +++ b/core/application/runtime_settings_branding_test.go @@ -87,6 +87,31 @@ var _ = Describe("loadRuntimeSettingsFromFile", func() { }) }) + // Watchdog check interval (issue #10601). Unlike the idle/busy timeouts + // (which default to 0), NewApplicationConfig used to baseline-default the + // interval to 500ms. The loader's "apply file value only if still at the + // zero default" env-detection therefore never fired for the interval, so + // a UI-saved Check Interval silently reverted to 500ms on every restart + // while the idle/busy timeouts persisted. These specs construct the + // config the same way boot does (NewApplicationConfig) so they observe + // the real default the loader sees. + Describe("watchdog interval", func() { + It("loads a UI-saved watchdog_interval on the next startup", func() { + cfg := config.NewApplicationConfig() + cfg.DynamicConfigsDir = seedSettings(`{"watchdog_interval": "2s"}`) + loadRuntimeSettingsFromFile(cfg) + Expect(cfg.WatchDogInterval).To(Equal(2 * time.Second)) + }) + + It("does not override an explicit env/CLI interval", func() { + cfg := config.NewApplicationConfig() + cfg.DynamicConfigsDir = seedSettings(`{"watchdog_interval": "2s"}`) + cfg.WatchDogInterval = 1 * time.Second // simulate SetWatchDogInterval from env + loadRuntimeSettingsFromFile(cfg) + Expect(cfg.WatchDogInterval).To(Equal(1*time.Second), "env/CLI interval must win over the persisted file value") + }) + }) + // MITM listener address. 
The file is the only source — no env var // exists — so a regression here means an admin who configured the // listener via /api/settings loses it after a reboot, even though diff --git a/core/config/application_config.go b/core/config/application_config.go index 1821a8441..8e4bb08f5 100644 --- a/core/config/application_config.go +++ b/core/config/application_config.go @@ -6,6 +6,7 @@ import ( "regexp" "time" + "github.com/mudler/LocalAI/pkg/model" "github.com/mudler/LocalAI/pkg/system" "github.com/mudler/LocalAI/pkg/xsysinfo" "github.com/mudler/xlog" @@ -241,12 +242,19 @@ func NewApplicationConfig(o ...AppOption) *ApplicationConfig { Context: context.Background(), UploadLimitMB: 15, Debug: true, - AgentJobRetentionDays: 30, // Default: 30 days - LRUEvictionMaxRetries: 30, // Default: 30 retries - LRUEvictionRetryInterval: 1 * time.Second, // Default: 1 second - WatchDogInterval: 500 * time.Millisecond, // Default: 500ms - TracingMaxItems: 1024, - TracingMaxBodyBytes: 64 * 1024, // 64 KiB - caps each request/response body in the trace buffer + AgentJobRetentionDays: 30, // Default: 30 days + LRUEvictionMaxRetries: 30, // Default: 30 retries + LRUEvictionRetryInterval: 1 * time.Second, // Default: 1 second + // WatchDogInterval is intentionally left at the zero value here. + // The startup loader applies a persisted runtime_settings.json value + // only when the interval is still 0 (its "not set by env var" + // heuristic, matching the idle/busy timeouts); a non-zero baseline + // default would defeat that and silently revert a UI-saved Check + // Interval to the default on every restart (#10601). The effective + // 500ms default is supplied at the watchdog layer (DefaultWatchdogInterval) + // when the value is still 0. 
+ TracingMaxItems: 1024, + TracingMaxBodyBytes: 64 * 1024, // 64 KiB - caps each request/response body in the trace buffer AgentPool: AgentPoolConfig{ Enabled: true, Timeout: "5m", @@ -1097,7 +1105,7 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings { if o.WatchDogInterval > 0 { watchdogInterval = o.WatchDogInterval.String() } else { - watchdogInterval = "2s" // default + watchdogInterval = model.DefaultWatchdogInterval.String() // default: 500ms } var lruEvictionRetryInterval string if o.LRUEvictionRetryInterval > 0 { diff --git a/pkg/model/watchdog_options.go b/pkg/model/watchdog_options.go index d11eb2371..e7c1eddfe 100644 --- a/pkg/model/watchdog_options.go +++ b/pkg/model/watchdog_options.go @@ -60,10 +60,17 @@ func WithIdleTimeout(timeout time.Duration) WatchDogOption { } } -// WithWatchdogCheck sets the watchdog check duration +// WithWatchdogInterval sets the watchdog check interval. A non-positive +// interval is ignored so the DefaultWatchdogInterval set by +// DefaultWatchDogOptions is preserved: callers pass the raw +// ApplicationConfig value, which is 0 when neither an env var nor a +// persisted setting configured it (#10601), and a 0 interval would otherwise +// turn the watchdog loop into a busy spin. func WithWatchdogInterval(interval time.Duration) WatchDogOption { return func(o *WatchDogOptions) { - o.watchdogInterval = interval + if interval > 0 { + o.watchdogInterval = interval + } } }