From eb32cd90737f5730dcafd2cc18af0490da88c073 Mon Sep 17 00:00:00 2001 From: Richard Palethorpe Date: Fri, 3 Jul 2026 17:00:37 +0100 Subject: [PATCH] feat(realtime): eager blocking pipeline warm-up + /backend/load API (#10662) Realtime sessions previously lazy-loaded each pipeline sub-model (VAD, transcription, LLM, TTS) on first use, so every cold session paid a per-request model-load stall and load errors only surfaced mid-stream. Warm the whole pipeline eagerly and blockingly at session start (including the voice-gate speaker-recognition model, which an enforced gate blocks each utterance on; compaction's summary_model stays lazy since it only runs off the response path): - Add backend.PreloadModel / PreloadModelByName as the single load path for every modality (no transcription special-case; backend-omitted configs are deprecated). - The realtime session blocks on Model.Warmup and returns a model_load_error to the client if any stage fails to load; updateSession warms in the background. Opt out per pipeline with pipeline.disable_warmup, exposed as a UI toggle via the config-metadata registry. Add a LocalAI-native POST /backend/load (and /v1/backend/load) that pre-loads a model -- expanding realtime pipelines into their sub-models -- as the inverse of /backend/shutdown. There is one preload engine (backend.PreloadStages): the realtime Warmup methods, /backend/load and the --load-to-memory startup flag all use it, so --load-to-memory now also expands pipeline models and records load-failure traces. Pipeline sub-model alias resolution is likewise shared (ModelConfigLoader.LoadResolvedModelConfig). Surface the endpoint everywhere an admin manages models: - MCP admin tool load_model (httpapi + inproc clients, safety/catalog prompts, catalog/dispatch tests). - "Load into memory" action in the React models UI. - Swagger regenerated; docs moved to the general backend-monitor page since it is not realtime-specific. Fix a Traces UI crash ("json: unsupported value: -Inf"): audio-snippet RMS/peak now floor at a finite dBFS, and backend-trace data is sanitized to drop non-finite floats before marshaling. The sanitizer is copy-on-write -- it runs on every RecordBackendTrace, so containers are only re-allocated on the paths that actually changed. Migrate core/http/openresponses_test.go onto the prebuilt mock-backend the rest of the http suite already uses -- it was the last spec still pointing at a real HuggingFace model, so it 404'd wherever no vision backend was built -- and fix its item_reference specs to send the spec's "id" field instead of "item_id", which the handler never accepted. Assisted-by: Claude:claude-opus-4-8 Claude Code Signed-off-by: Richard Palethorpe --- core/application/startup.go | 17 +- core/backend/options.go | 16 ++ core/backend/preload.go | 122 +++++++++++++++ core/backend/preload_internal_test.go | 146 ++++++++++++++++++ core/config/meta/registry.go | 7 + core/config/model_config.go | 12 ++ core/config/model_config_loader.go | 19 +++ .../model_config_loader_resolve_test.go} | 20 +-- core/http/endpoints/localai/backend_load.go | 54 +++++++ .../endpoints/localai/backend_load_test.go | 102 ++++++++++++ .../endpoints/mcp/localai_assistant_test.go | 3 + core/http/endpoints/openai/realtime.go | 77 +++++++-- .../endpoints/openai/realtime_doubles_test.go | 2 + core/http/endpoints/openai/realtime_model.go | 53 +++---- .../endpoints/openai/realtime_voicegate.go | 6 +- .../endpoints/openai/realtime_warmup_test.go | 64 ++++++++ core/http/openresponses_test.go | 61 ++++++-- core/http/react-ui/src/pages/Manage.jsx | 26 ++++ core/http/react-ui/src/utils/api.js | 3 + core/http/react-ui/src/utils/config.js | 1 + core/http/routes/localai.go | 6 + core/schema/localai.go | 18 +++ core/trace/audio_snippet.go | 14 +- core/trace/audio_snippet_test.go | 32 ++++ core/trace/backend_trace.go | 105 ++++++++++--- core/trace/backend_trace_sanitize_test.go | 80 ++++++++++ docs/content/advanced/vram-management.md | 2 + docs/content/features/authentication.md | 2 +- docs/content/features/backend-monitor.md | 40 ++++- docs/content/features/openai-realtime.md | 33 ++++ pkg/mcp/localaitools/client.go | 4 + pkg/mcp/localaitools/coverage_test.go | 1 + pkg/mcp/localaitools/fakes_test.go | 9 ++ pkg/mcp/localaitools/httpapi/client.go | 10 ++ pkg/mcp/localaitools/httpapi/routes.go | 1 + pkg/mcp/localaitools/inproc/client.go | 11 ++ .../localaitools/inproc/load_model_test.go | 71 +++++++++ pkg/mcp/localaitools/prompts/10_safety.md | 2 +- pkg/mcp/localaitools/prompts/20_tools.md | 1 + pkg/mcp/localaitools/server_test.go | 2 + pkg/mcp/localaitools/tools.go | 1 + pkg/mcp/localaitools/tools_models.go | 16 ++ swagger/docs.go | 70 +++++++++ swagger/swagger.json | 70 +++++++++ swagger/swagger.yaml | 51 ++++++ 45 files changed, 1364 insertions(+), 99 deletions(-) create mode 100644 core/backend/preload.go create mode 100644 core/backend/preload_internal_test.go rename core/{http/endpoints/openai/realtime_model_alias_test.go => config/model_config_loader_resolve_test.go} (63%) create mode 100644 core/http/endpoints/localai/backend_load.go create mode 100644 core/http/endpoints/localai/backend_load_test.go create mode 100644 core/http/endpoints/openai/realtime_warmup_test.go create mode 100644 core/trace/backend_trace_sanitize_test.go create mode 100644 pkg/mcp/localaitools/inproc/load_model_test.go diff --git a/core/application/startup.go b/core/application/startup.go index cf341dfa6..1cf323d34 100644 --- a/core/application/startup.go +++ b/core/application/startup.go @@ -473,20 +473,13 @@ func New(opts ...config.AppOption) (*Application, error) { if options.LoadToMemory != nil && !options.SingleBackend { for _, m := range options.LoadToMemory { - cfg, err := application.ModelConfigLoader().LoadModelConfigFileByNameDefaultOptions(m, options) - if err != nil { + xlog.Debug("Auto loading model into memory from file", "model", m) + // Same path as POST /backend/load: a realtime pipeline model expands + // to its sub-models, and load failures are recorded as model_load + // traces. + if _, err := backend.PreloadModelByName(options.Context, application.ModelConfigLoader(), application.ModelLoader(), options, m); err != nil { return nil, err } - - xlog.Debug("Auto loading model into memory from file", "model", m, "file", cfg.Model) - - o := backend.ModelOptions(*cfg, options) - - var backendErr error - _, backendErr = application.ModelLoader().Load(o...) - if backendErr != nil { - return nil, backendErr - } } } diff --git a/core/backend/options.go b/core/backend/options.go index 9ae22dd22..028ef3062 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -52,6 +52,22 @@ func ModelLoadTraceObserver(appConfig *config.ApplicationConfig) func(model.Back } } +// PreloadModel warms a model into memory without running any inference, so the +// first real request doesn't pay the backend's cold-start load cost. It uses +// the same ModelOptions + ml.Load path the modality functions use, so a +// subsequent inference call hits the loader cache instead of reloading. Load +// failures are recorded and returned; callers that warm models opportunistically +// (e.g. realtime session warm-up) typically log and continue, since the lazy +// path will retry on first use. +func PreloadModel(ctx context.Context, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) error { + opts := ModelOptions(modelConfig, appConfig, model.WithContext(ctx)) + if _, err := ml.Load(opts...); err != nil { + recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil) + return err + } + return nil +} + // recordModelLoadFailure records a backend trace when model loading fails. func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, backend string, err error, data map[string]any) { if !appConfig.EnableTracing { diff --git a/core/backend/preload.go b/core/backend/preload.go new file mode 100644 index 000000000..103d36efc --- /dev/null +++ b/core/backend/preload.go @@ -0,0 +1,122 @@ +package backend + +import ( + "context" + "errors" + "fmt" + "sync" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/xlog" +) + +// PreloadModelByName loads the named model into memory so the first request +// that uses it pays no cold-start load cost — the inverse of shutting a model +// down. If the model is a realtime pipeline (its config declares a `pipeline:` +// block), each configured sub-model (VAD, transcription, LLM, TTS, +// sound_detection, voice_recognition) is loaded concurrently instead of the +// pipeline stub, which has no backend of its own. It returns the model names +// actually loaded and a joined error naming each sub-model that failed (nil on +// full success); a partial pipeline load reports both the loaded names and the +// failures so the caller can surface exactly what is and isn't resident. +// Compaction's summary_model is deliberately left cold: it is only invoked off +// the response path, so it can stay lazy. +func PreloadModelByName(ctx context.Context, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, name string) ([]string, error) { + cfg, err := cl.LoadModelConfigFileByNameDefaultOptions(name, appConfig) + if err != nil { + return nil, err + } + + stages, err := pipelineStages(cl, &cfg.Pipeline, ml.ModelPath) + if err != nil { + return nil, err + } + if len(stages) == 0 { + // Not a pipeline: load the model's own backend directly. + if err := PreloadModel(ctx, ml, *cfg, appConfig); err != nil { + return nil, err + } + return []string{cfg.Name}, nil + } + return PreloadStages(ctx, ml, appConfig, stages) +} + +// PreloadStage names one pipeline sub-model to preload and the resolved config +// to load it from (nil = stage absent, skipped). Role labels the pipeline slot +// in errors and logs. +type PreloadStage struct { + Role string + Cfg *config.ModelConfig +} + +// loadStage is PreloadModel behind a seam so PreloadStages can be unit-tested +// without spawning real backends. +var loadStage = PreloadModel + +// pipelineStages resolves each populated pipeline stage to its concrete model +// config, following a single alias hop — the same resolution the realtime +// pipeline itself uses. A stage that fails to resolve is a misconfiguration, +// so it fails fast rather than being deferred to load. A pipeline with no +// stages set returns nil, which callers treat as "not a pipeline". +func pipelineStages(cl *config.ModelConfigLoader, p *config.Pipeline, modelPath string) ([]PreloadStage, error) { + voiceRec := "" + if p.VoiceRecognition != nil { + voiceRec = p.VoiceRecognition.Model + } + var stages []PreloadStage + for _, s := range []struct{ role, name string }{ + {"vad", p.VAD}, + {"transcription", p.Transcription}, + {"llm", p.LLM}, + {"tts", p.TTS}, + {"sound_detection", p.SoundDetection}, + {"voice_recognition", voiceRec}, + } { + if s.name == "" { + continue + } + cfg, err := cl.LoadResolvedModelConfig(s.name, modelPath) + if err != nil { + return nil, fmt.Errorf("%s (%s): %w", s.role, s.name, err) + } + stages = append(stages, PreloadStage{Role: s.role, Cfg: cfg}) + } + return stages, nil +} + +// PreloadStages loads every present stage at once and waits for all of them, so +// a pipeline warms in the time of its slowest stage rather than the sum. Absent +// (nil-config) stages are skipped. A failed stage does not cancel the others — +// they all run to completion so the joined error names every broken stage at +// once, alongside the names that did load. +func PreloadStages(ctx context.Context, ml *model.ModelLoader, appConfig *config.ApplicationConfig, stages []PreloadStage) ([]string, error) { + var ( + wg sync.WaitGroup + mu sync.Mutex + loaded []string + errs []error + ) + for _, s := range stages { + if s.Cfg == nil { + continue + } + wg.Add(1) + go func(s PreloadStage) { + defer wg.Done() + if err := loadStage(ctx, ml, *s.Cfg, appConfig); err != nil { + xlog.Warn("preload: failed to load pipeline sub-model", "stage", s.Role, "model", s.Cfg.Name, "error", err) + mu.Lock() + errs = append(errs, fmt.Errorf("%s (%s): %w", s.Role, s.Cfg.Name, err)) + mu.Unlock() + return + } + xlog.Debug("preload: loaded pipeline sub-model", "stage", s.Role, "model", s.Cfg.Name) + mu.Lock() + loaded = append(loaded, s.Cfg.Name) + mu.Unlock() + }(s) + } + wg.Wait() + return loaded, errors.Join(errs...) +} diff --git a/core/backend/preload_internal_test.go b/core/backend/preload_internal_test.go new file mode 100644 index 000000000..f92d2b015 --- /dev/null +++ b/core/backend/preload_internal_test.go @@ -0,0 +1,146 @@ +package backend + +import ( + "context" + "errors" + "os" + "path/filepath" + "sync" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/pkg/model" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("pipelineStages", func() { + seed := func(dir string, names ...string) *config.ModelConfigLoader { + for _, n := range names { + yaml := "name: " + n + "\nbackend: fake-backend\n" + Expect(os.WriteFile(filepath.Join(dir, n+".yaml"), []byte(yaml), 0o644)).To(Succeed()) + } + cl := config.NewModelConfigLoader(dir) + Expect(cl.LoadModelConfigsFromPath(dir)).To(Succeed()) + return cl + } + + It("resolves only the populated stages, in load order", func() { + dir := GinkgoT().TempDir() + cl := seed(dir, "vad-m", "stt-m", "llm-m", "tts-m") + + stages, err := pipelineStages(cl, &config.Pipeline{ + VAD: "vad-m", + Transcription: "stt-m", + LLM: "llm-m", + TTS: "tts-m", + }, dir) + Expect(err).ToNot(HaveOccurred()) + + roles := make([]string, len(stages)) + names := make([]string, len(stages)) + for i, s := range stages { + roles[i] = s.Role + names[i] = s.Cfg.Name + } + Expect(roles).To(Equal([]string{"vad", "transcription", "llm", "tts"})) + Expect(names).To(Equal([]string{"vad-m", "stt-m", "llm-m", "tts-m"})) + }) + + It("skips unset stages and includes sound_detection and voice_recognition when set", func() { + dir := GinkgoT().TempDir() + cl := seed(dir, "stt-m", "ced", "spk") + + stages, err := pipelineStages(cl, &config.Pipeline{ + Transcription: "stt-m", + SoundDetection: "ced", + VoiceRecognition: &config.PipelineVoiceRecognition{Model: "spk"}, + }, dir) + Expect(err).ToNot(HaveOccurred()) + + roles := make([]string, len(stages)) + for i, s := range stages { + roles[i] = s.Role + } + Expect(roles).To(ConsistOf("transcription", "sound_detection", "voice_recognition")) + }) + + It("returns nil for a pipeline with no stages (not a pipeline)", func() { + dir := GinkgoT().TempDir() + cl := seed(dir) + + stages, err := pipelineStages(cl, &config.Pipeline{}, dir) + Expect(err).ToNot(HaveOccurred()) + Expect(stages).To(BeNil()) + }) +}) + +var _ = Describe("PreloadStages", func() { + var ( + mu sync.Mutex + seen []string + ) + + // stubLoader swaps the loadStage seam for a recorder so no real backends + // are spawned; errFor injects per-model failures. + stubLoader := func(errFor map[string]error) { + loadStage = func(_ context.Context, _ *model.ModelLoader, cfg config.ModelConfig, _ *config.ApplicationConfig) error { + mu.Lock() + seen = append(seen, cfg.Name) + mu.Unlock() + return errFor[cfg.Name] + } + } + + BeforeEach(func() { + seen = nil + }) + AfterEach(func() { + loadStage = PreloadModel + }) + + mkStage := func(role, name string) PreloadStage { + return PreloadStage{Role: role, Cfg: &config.ModelConfig{Name: name}} + } + + It("loads every present stage, skips absent (nil-config) ones, and returns the loaded names", func() { + stubLoader(nil) + + loaded, err := PreloadStages(context.Background(), nil, nil, []PreloadStage{ + mkStage("vad", "vad-m"), + {Role: "transcription"}, // absent stage + mkStage("llm", "llm-m"), + }) + + Expect(err).ToNot(HaveOccurred()) + Expect(loaded).To(ConsistOf("vad-m", "llm-m")) + // Barrier: every stage has run by the time PreloadStages returns, so + // reading seen without the lock here is safe. + Expect(seen).To(ConsistOf("vad-m", "llm-m")) + }) + + It("reports a joined error naming each failed stage while still loading the rest", func() { + stubLoader(map[string]error{ + "vad-m": errors.New("vad boom"), + "tts-m": errors.New("tts boom"), + }) + + loaded, err := PreloadStages(context.Background(), nil, nil, []PreloadStage{ + mkStage("vad", "vad-m"), + mkStage("llm", "llm-m"), + mkStage("tts", "tts-m"), + }) + + // Every stage ran (a failure does not cancel the others)... + Expect(seen).To(ConsistOf("vad-m", "llm-m", "tts-m")) + // ...the stage that loaded fine is reported as loaded... + Expect(loaded).To(ConsistOf("llm-m")) + // ...and the joined error names every broken stage and its cause. + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("vad (vad-m)")) + Expect(err.Error()).To(ContainSubstring("vad boom")) + Expect(err.Error()).To(ContainSubstring("tts (tts-m)")) + Expect(err.Error()).To(ContainSubstring("tts boom")) + Expect(err.Error()).ToNot(ContainSubstring("llm")) + }) +}) diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go index b8200cd41..4fa555d65 100644 --- a/core/config/meta/registry.go +++ b/core/config/meta/registry.go @@ -599,6 +599,13 @@ func DefaultRegistry() map[string]FieldMetaOverride { Component: "toggle", Order: 89, }, + "pipeline.disable_warmup": { + Section: "pipeline", + Label: "Disable Warmup", + Description: "Turn off eager pre-loading of the pipeline's sub-models at realtime session start. By default LocalAI loads every configured sub-model backend (VAD, transcription, LLM, TTS, sound detection, voice recognition) before the session starts and blocks until they are ready, so the first turn pays no cold-start cost and a model that fails to load is reported at session start instead of mid-call. Enable this to restore the lazy 'load on first use' behavior — session start no longer waits on loading and load errors surface on the first turn instead. Useful to keep idle sessions from holding model memory they may never use.", + Component: "toggle", + Order: 90, + }, // --- Functions --- "function.grammar.parallel_calls": { diff --git a/core/config/model_config.go b/core/config/model_config.go index 69dda331b..0038f4f8d 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -656,6 +656,18 @@ type Pipeline struct { // to benefit. A client session.update still overrides type and eagerness // per session; retranscribe is server-side only. Unset keeps server_vad. TurnDetection PipelineTurnDetection `yaml:"turn_detection,omitempty" json:"turn_detection,omitempty"` + + // DisableWarmup turns off eager pre-loading of the pipeline's sub-models at + // realtime session start. By default (false) LocalAI loads every configured + // sub-model backend (VAD, transcription, LLM, TTS, sound detection, voice + // recognition) into memory (concurrently) before the + // session is announced and blocks until they are ready, so the first turn + // pays no cold-start cost and a model that fails to load surfaces as an error + // at session start rather than mid-call. Set true to restore the lazy "load + // on first use" behavior — session start no longer blocks on loading and + // load errors surface on first use instead (e.g. to keep idle sessions from + // holding model memory they may never use). + DisableWarmup bool `yaml:"disable_warmup,omitempty" json:"disable_warmup,omitempty"` } // PipelineCompaction configures summarize-then-drop for a realtime pipeline. diff --git a/core/config/model_config_loader.go b/core/config/model_config_loader.go index e2f43e83f..29319cbec 100644 --- a/core/config/model_config_loader.go +++ b/core/config/model_config_loader.go @@ -155,6 +155,25 @@ func (bcl *ModelConfigLoader) LoadModelConfigFileByNameDefaultOptions(modelName ModelPath(appConfig.SystemState.Model.ModelsPath)) } +// LoadResolvedModelConfig loads a model config by name and follows a single +// alias hop, so a caller that references an alias (e.g. a pipeline with +// `llm: default`) gets the alias target's full config (Backend, Model, ...) +// rather than the alias stub with an empty Backend. Without this the alias +// survives unresolved into model loading and fails downstream — notably in +// distributed mode with "backend name is empty". Mirrors the top-level alias +// resolution in core/http/middleware/request.go. +func (bcl *ModelConfigLoader) LoadResolvedModelConfig(modelName, modelPath string) (*ModelConfig, error) { + cfg, err := bcl.LoadModelConfigFileByName(modelName, modelPath) + if err != nil { + return nil, err + } + resolved, _, err := bcl.ResolveAlias(cfg) + if err != nil { + return nil, err + } + return resolved, nil +} + // This format is currently only used when reading a single file at startup, passed in via ApplicationConfig.ConfigFile func (bcl *ModelConfigLoader) LoadMultipleModelConfigsSingleFile(file string, opts ...ConfigLoaderOption) error { bcl.Lock() diff --git a/core/http/endpoints/openai/realtime_model_alias_test.go b/core/config/model_config_loader_resolve_test.go similarity index 63% rename from core/http/endpoints/openai/realtime_model_alias_test.go rename to core/config/model_config_loader_resolve_test.go index 77179d963..961693b01 100644 --- a/core/http/endpoints/openai/realtime_model_alias_test.go +++ b/core/config/model_config_loader_resolve_test.go @@ -1,4 +1,4 @@ -package openai +package config_test import ( "os" @@ -10,14 +10,14 @@ import ( "github.com/mudler/LocalAI/core/config" ) -// loadPipelineSubModel must resolve a pipeline sub-model that references an -// alias (e.g. `llm: default`) one hop to the alias target's full config — so -// the effective backend is the target's backend, not the empty backend of the -// alias stub. This mirrors the top-level alias resolution done in -// core/http/middleware/request.go, which the realtime pipeline previously +// LoadResolvedModelConfig must resolve a model that references an alias +// (e.g. a pipeline with `llm: default`) one hop to the alias target's full +// config — so the effective backend is the target's backend, not the empty +// backend of the alias stub. This mirrors the top-level alias resolution done +// in core/http/middleware/request.go, which the realtime pipeline previously // skipped (failing in distributed mode with "backend name is empty"). -var _ = Describe("loadPipelineSubModel", func() { - It("resolves a sub-model alias one hop to the target's config", func() { +var _ = Describe("LoadResolvedModelConfig", func() { + It("resolves an alias one hop to the target's config", func() { tmpDir := GinkgoT().TempDir() // A real model config with a concrete backend. @@ -38,13 +38,13 @@ alias: real-llm Expect(cl.LoadModelConfigsFromPath(tmpDir)).To(Succeed()) // Resolving the alias must follow the hop to the target's full config. - resolved, err := loadPipelineSubModel(cl, "default", tmpDir) + resolved, err := cl.LoadResolvedModelConfig("default", tmpDir) Expect(err).NotTo(HaveOccurred()) Expect(resolved.IsAlias()).To(BeFalse()) Expect(resolved.Backend).To(Equal("llama-cpp")) // A non-alias name must load unchanged. - direct, err := loadPipelineSubModel(cl, "real-llm", tmpDir) + direct, err := cl.LoadResolvedModelConfig("real-llm", tmpDir) Expect(err).NotTo(HaveOccurred()) Expect(direct.Backend).To(Equal("llama-cpp")) Expect(direct.Name).To(Equal("real-llm")) diff --git a/core/http/endpoints/localai/backend_load.go b/core/http/endpoints/localai/backend_load.go new file mode 100644 index 000000000..84d6396a3 --- /dev/null +++ b/core/http/endpoints/localai/backend_load.go @@ -0,0 +1,54 @@ +package localai + +import ( + "net/http" + + "github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/backend" + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/xlog" +) + +// LoadModelEndpoint pre-loads a model into memory by name — the inverse of +// /backend/shutdown. For a realtime pipeline model every configured sub-model +// (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded; for a regular +// model its own backend is loaded. The call blocks until loading finishes so +// clients can drive warm-up explicitly and learn up front whether a model +// fails to load. +// @Summary Pre-load a model into memory +// @Description Loads the named model (or, for a realtime pipeline, all of its sub-models) into memory so subsequent requests pay no cold-start cost. The inverse of /backend/shutdown. +// @Tags monitoring +// @Accept json +// @Produce json +// @Param request body schema.ModelLoadRequest true "Model to load" +// @Success 200 {object} schema.ModelLoadResponse "Model loaded" +// @Failure 400 {object} schema.ModelLoadResponse "Missing model name" +// @Failure 500 {object} schema.ModelLoadResponse "Load failed (Loaded lists any sub-models that did load)" +// @Router /backend/load [post] +func LoadModelEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { + return func(c echo.Context) error { + input := new(schema.ModelLoadRequest) + if err := c.Bind(input); err != nil { + return err + } + if input.Model == "" { + return c.JSON(http.StatusBadRequest, schema.ModelLoadResponse{Message: "model is required"}) + } + + loaded, err := backend.PreloadModelByName(c.Request().Context(), cl, ml, appConfig, input.Model) + if err != nil { + xlog.Error("failed to pre-load model", "model", input.Model, "loaded", loaded, "error", err) + return c.JSON(http.StatusInternalServerError, schema.ModelLoadResponse{ + Loaded: loaded, + Message: "failed to load model: " + err.Error(), + }) + } + + return c.JSON(http.StatusOK, schema.ModelLoadResponse{ + Loaded: loaded, + Message: "model loaded", + }) + } +} diff --git a/core/http/endpoints/localai/backend_load_test.go b/core/http/endpoints/localai/backend_load_test.go new file mode 100644 index 000000000..8022af73f --- /dev/null +++ b/core/http/endpoints/localai/backend_load_test.go @@ -0,0 +1,102 @@ +package localai_test + +import ( + "bytes" + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + + "github.com/labstack/echo/v4" + "github.com/mudler/LocalAI/core/config" + . "github.com/mudler/LocalAI/core/http/endpoints/localai" + "github.com/mudler/LocalAI/core/schema" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/system" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +var _ = Describe("LoadModelEndpoint (/backend/load)", func() { + var ( + app *echo.Echo + tempDir string + configLoader *config.ModelConfigLoader + modelLoader *model.ModelLoader + appConfig *config.ApplicationConfig + ) + + post := func(body string) *httptest.ResponseRecorder { + req := httptest.NewRequest(http.MethodPost, "/backend/load", bytes.NewBufferString(body)) + req.Header.Set(echo.HeaderContentType, echo.MIMEApplicationJSON) + rec := httptest.NewRecorder() + app.ServeHTTP(rec, req) + return rec + } + + decode := func(rec *httptest.ResponseRecorder) schema.ModelLoadResponse { + var resp schema.ModelLoadResponse + Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed()) + return resp + } + + writeConfig := func(name, contents string) { + Expect(os.WriteFile(filepath.Join(tempDir, name+".yaml"), []byte(contents), 0o600)).To(Succeed()) + } + + BeforeEach(func() { + var err error + tempDir, err = os.MkdirTemp("", "backend-load-test-*") + Expect(err).NotTo(HaveOccurred()) + + systemState, err := system.GetSystemState(system.WithModelPath(tempDir)) + Expect(err).NotTo(HaveOccurred()) + + appConfig = config.NewApplicationConfig(config.WithSystemState(systemState)) + configLoader = config.NewModelConfigLoader(tempDir) + modelLoader = model.NewModelLoader(systemState) // no backends installed + + app = echo.New() + app.POST("/backend/load", LoadModelEndpoint(configLoader, modelLoader, appConfig)) + }) + + AfterEach(func() { + _ = os.RemoveAll(tempDir) + }) + + It("rejects a request with no model name", func() { + rec := post(`{}`) + Expect(rec.Code).To(Equal(http.StatusBadRequest)) + Expect(decode(rec).Message).To(ContainSubstring("model is required")) + }) + + It("reports a load failure for a regular model with nothing loaded", func() { + writeConfig("solo", "name: solo\n") + + rec := post(`{"model":"solo"}`) + Expect(rec.Code).To(Equal(http.StatusInternalServerError)) + + resp := decode(rec) + Expect(resp.Loaded).To(BeEmpty()) + Expect(resp.Message).To(ContainSubstring("failed to load model")) + }) + + It("expands a pipeline model and reports each sub-model that failed to load", func() { + writeConfig("voicebot", "name: voicebot\npipeline:\n vad: vad-m\n transcription: stt-m\n llm: llm-m\n tts: tts-m\n") + writeConfig("vad-m", "name: vad-m\n") + writeConfig("stt-m", "name: stt-m\n") + writeConfig("llm-m", "name: llm-m\n") + writeConfig("tts-m", "name: tts-m\n") + + rec := post(`{"model":"voicebot"}`) + Expect(rec.Code).To(Equal(http.StatusInternalServerError)) + + resp := decode(rec) + Expect(resp.Message).To(ContainSubstring("failed to load model")) + // The pipeline stub itself is never loaded; its sub-models are what the + // endpoint tries, so the error names them rather than "voicebot". + Expect(resp.Message).To(ContainSubstring("vad-m")) + Expect(resp.Message).ToNot(ContainSubstring("voicebot")) + }) +}) diff --git a/core/http/endpoints/mcp/localai_assistant_test.go b/core/http/endpoints/mcp/localai_assistant_test.go index 8de7355c6..817dea0c7 100644 --- a/core/http/endpoints/mcp/localai_assistant_test.go +++ b/core/http/endpoints/mcp/localai_assistant_test.go @@ -51,6 +51,9 @@ func (stubClient) EditModelConfig(_ context.Context, _ string, _ map[string]any) return nil } func (stubClient) ReloadModels(_ context.Context) error { return nil } +func (stubClient) LoadModel(_ context.Context, model string) ([]string, error) { + return []string{model}, nil +} func (stubClient) SetAlias(_ context.Context, _, _ string) error { return nil } diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index 94c8a1a65..cdff7aec5 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -7,6 +7,7 @@ import ( "encoding/binary" "encoding/hex" "encoding/json" + "errors" "fmt" "math" "os" @@ -266,6 +267,12 @@ type Model interface { // grpcerrors.IsLiveTranscriptionUnsupported. TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error) PredictConfig() *config.ModelConfig + // Warmup eagerly loads the pipeline's sub-model backends into memory so the + // first realtime turn doesn't pay each backend's cold-start load cost. Loads + // run concurrently; Warmup blocks until they all finish and returns a joined + // error naming every stage that failed to load (nil if all succeeded), so a + // caller can surface model-load failures at session start instead of mid-call. + Warmup(ctx context.Context) error } var upgrader = websocket.Upgrader{ @@ -583,18 +590,8 @@ func runRealtimeSession(application *application.Application, t Transport, model } session.ModelInterface = m - if session.SummaryModel != "" { - summaryModelName := session.SummaryModel - sid := sessionID - session.summarizerFactory = func() (Model, error) { - summaryCfg, lerr := application.ModelConfigLoader().LoadModelConfigFileByNameDefaultOptions(summaryModelName, application.ApplicationConfig()) - if lerr != nil { - return nil, fmt.Errorf("load summary model config %q: %w", summaryModelName, lerr) - } - return newModel(&summaryCfg.Pipeline, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), evaluator, buildRealtimeRoutingContext(application, sid)) - } - } - + // The voice gate is built before the warm-up below so its + // speaker-recognition model can warm alongside the pipeline stages. if cfg.Pipeline.VoiceGateEnabled() { gate, gerr := newVoiceGate( *cfg.Pipeline.VoiceRecognition, @@ -612,6 +609,47 @@ func runRealtimeSession(application *application.Application, t Transport, model xlog.Info("realtime voice recognition gate enabled", "mode", gate.cfg.Mode, "when", gate.cfg.When) } + // Warm the pipeline's sub-model backends before announcing the session. + // Loads run concurrently but we block here until they all finish, so a model + // that fails to load (missing weights, bad backend, OOM) surfaces as an error + // at session start rather than stalling — or failing — mid-call on the first + // turn (VAD on the first audio chunk, STT at end-of-speech, LLM on the first + // reply, TTS on the first spoken output). On success the backends are already + // resident, so the first turn pays no cold-start cost. Opt out per pipeline + // with `pipeline.disable_warmup: true` to restore lazy load-on-first-use + // (errors then surface on first use instead of at session start). + if !cfg.Pipeline.DisableWarmup { + warmErr := make(chan error, 1) + go func() { warmErr <- m.Warmup(context.Background()) }() + // The voice-gate model warms concurrently with the pipeline stages: an + // enforced gate blocks each utterance on speaker resolution, so its + // cold-start would otherwise land on the first turn too. (Compaction's + // summary_model stays lazy — it only runs off the response path.) + var gateErr error + if session.voiceGate != nil { + _, gateErr = backend.PreloadStages(context.Background(), application.ModelLoader(), application.ApplicationConfig(), []backend.PreloadStage{ + {Role: "voice_recognition", Cfg: session.voiceGate.recCfg}, + }) + } + if err := errors.Join(<-warmErr, gateErr); err != nil { + xlog.Error("realtime warmup failed", "model", model, "error", err) + sendError(t, "model_load_error", "Failed to load pipeline models: "+err.Error(), "", "") + return + } + } + + if session.SummaryModel != "" { + summaryModelName := session.SummaryModel + sid := sessionID + session.summarizerFactory = func() (Model, error) { + summaryCfg, lerr := application.ModelConfigLoader().LoadModelConfigFileByNameDefaultOptions(summaryModelName, application.ApplicationConfig()) + if lerr != nil { + return nil, fmt.Errorf("load summary model config %q: %w", summaryModelName, lerr) + } + return newModel(&summaryCfg.Pipeline, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), evaluator, buildRealtimeRoutingContext(application, sid)) + } + } + // Store the session and notify the transport (for WebRTC audio track handling) sessionLock.Lock() sessions[sessionID] = session @@ -1125,6 +1163,21 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode return err } session.ModelInterface = m + // A session.update that swaps the model/voice rebuilds the pipeline, so + // warm the new backends too (unless opted out) — otherwise the next turn + // pays the cold-start load the original session warm-up already avoided. + // Unlike session start this stays non-blocking: updateSession runs under + // the global sessionLock, so blocking on a multi-second load here would + // stall every other session. Load errors are logged (and still surface on + // first use); per-stage failures are already warned inside + // backend.PreloadStages. + if !session.ModelConfig.Pipeline.DisableWarmup { + go func() { + if err := m.Warmup(context.Background()); err != nil { + xlog.Error("realtime warmup failed after session.update", "error", err) + } + }() + } } if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.TurnDetectionSet { diff --git a/core/http/endpoints/openai/realtime_doubles_test.go b/core/http/endpoints/openai/realtime_doubles_test.go index 6dc1c6ca5..fe52e1c64 100644 --- a/core/http/endpoints/openai/realtime_doubles_test.go +++ b/core/http/endpoints/openai/realtime_doubles_test.go @@ -174,6 +174,8 @@ func (m *fakeModel) TranscribeLive(_ context.Context, _ string, onEvent func(bac func (m *fakeModel) PredictConfig() *config.ModelConfig { return m.cfg } +func (m *fakeModel) Warmup(ctx context.Context) error { return nil } + // fakeLiveSession records what semantic_vad fed and closed; closeEvents are // replayed through onEvent during Close, mimicking the backend's finalize // flush (trailing delta + Final) landing before Close returns. diff --git a/core/http/endpoints/openai/realtime_model.go b/core/http/endpoints/openai/realtime_model.go index 71f553980..0449daee3 100644 --- a/core/http/endpoints/openai/realtime_model.go +++ b/core/http/endpoints/openai/realtime_model.go @@ -110,6 +110,15 @@ func (m *transcriptOnlyModel) PredictConfig() *config.ModelConfig { return nil } +func (m *transcriptOnlyModel) Warmup(ctx context.Context) error { + _, err := backend.PreloadStages(ctx, m.modelLoader, m.appConfig, []backend.PreloadStage{ + {Role: "vad", Cfg: m.VADConfig}, + {Role: "transcription", Cfg: m.TranscriptionConfig}, + {Role: "sound_detection", Cfg: m.SoundDetectionConfig}, + }) + return err +} + func (m *wrappedModel) VAD(ctx context.Context, request *schema.VADRequest) (*schema.VADResponse, error) { return backend.VAD(request, ctx, m.modelLoader, m.appConfig, *m.VADConfig) } @@ -360,6 +369,17 @@ func (m *wrappedModel) PredictConfig() *config.ModelConfig { return m.LLMConfig } +func (m *wrappedModel) Warmup(ctx context.Context) error { + _, err := backend.PreloadStages(ctx, m.modelLoader, m.appConfig, []backend.PreloadStage{ + {Role: "vad", Cfg: m.VADConfig}, + {Role: "transcription", Cfg: m.TranscriptionConfig}, + {Role: "llm", Cfg: m.LLMConfig}, + {Role: "tts", Cfg: m.TTSConfig}, + {Role: "sound_detection", Cfg: m.SoundDetectionConfig}, + }) + return err +} + // wavStreamHeaderBytes is the size of the WAV header that backend.ModelTTSStream // emits as its first audio callback; the sample rate lives at byte offset 24. const wavStreamHeaderBytes = 44 @@ -440,7 +460,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL if pipeline.SoundDetection == "" { return nil, nil } - cfg, err := loadPipelineSubModel(cl, pipeline.SoundDetection, ml.ModelPath) + cfg, err := cl.LoadResolvedModelConfig(pipeline.SoundDetection, ml.ModelPath) if err != nil { return nil, fmt.Errorf("failed to load sound detection config: %w", err) } @@ -451,7 +471,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL } func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) { - cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath) + cfgVAD, err := cl.LoadResolvedModelConfig(pipeline.VAD, ml.ModelPath) if err != nil { return nil, nil, fmt.Errorf("failed to load backend config: %w", err) @@ -461,7 +481,7 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig return nil, nil, fmt.Errorf("failed to validate config: %w", err) } - cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath) + cfgSST, err := cl.LoadResolvedModelConfig(pipeline.Transcription, ml.ModelPath) if err != nil { return nil, nil, fmt.Errorf("failed to load backend config: %w", err) @@ -550,30 +570,11 @@ func buildRealtimeRoutingContext(a *application.Application, sessionID string) * } } -// loadPipelineSubModel loads a pipeline sub-model config by name and follows a -// single alias hop, so a pipeline that references an alias (e.g. `llm: default`) -// gets the alias target's full config (Backend, Model, ...) rather than the -// alias stub with an empty Backend. Without this the alias survives unresolved -// into model loading and fails downstream — notably in distributed mode with -// "backend name is empty". Mirrors the top-level alias resolution in -// core/http/middleware/request.go. -func loadPipelineSubModel(cl *config.ModelConfigLoader, name, modelPath string) (*config.ModelConfig, error) { - cfg, err := cl.LoadModelConfigFileByName(name, modelPath) - if err != nil { - return nil, err - } - resolved, _, err := cl.ResolveAlias(cfg) - if err != nil { - return nil, err - } - return resolved, nil -} - // returns and loads either a wrapped model or a model that support audio-to-audio func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, evaluator *templates.Evaluator, routing *RealtimeRoutingContext) (Model, error) { xlog.Debug("Creating new model pipeline model", "pipeline", pipeline) - cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath) + cfgVAD, err := cl.LoadResolvedModelConfig(pipeline.VAD, ml.ModelPath) if err != nil { return nil, fmt.Errorf("failed to load backend config: %w", err) @@ -584,7 +585,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model } // TODO: Do we always need a transcription model? It can be disabled. Note that any-to-any instruction following models don't transcribe as such, so if transcription is required it is a separate process - cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath) + cfgSST, err := cl.LoadResolvedModelConfig(pipeline.Transcription, ml.ModelPath) if err != nil { return nil, fmt.Errorf("failed to load backend config: %w", err) @@ -616,7 +617,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model xlog.Debug("Loading a wrapped model") // Otherwise we want to return a wrapped model, which is a "virtual" model that re-uses other models to perform operations - cfgLLM, err := loadPipelineSubModel(cl, pipeline.LLM, ml.ModelPath) + cfgLLM, err := cl.LoadResolvedModelConfig(pipeline.LLM, ml.ModelPath) if err != nil { return nil, fmt.Errorf("failed to load backend config: %w", err) @@ -631,7 +632,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model applyPipelineReasoning(cfgLLM, *pipeline) applyPipelineThinking(cfgLLM, *pipeline) - cfgTTS, err := loadPipelineSubModel(cl, pipeline.TTS, ml.ModelPath) + cfgTTS, err := cl.LoadResolvedModelConfig(pipeline.TTS, ml.ModelPath) if err != nil { return nil, fmt.Errorf("failed to load backend config: %w", err) diff --git a/core/http/endpoints/openai/realtime_voicegate.go b/core/http/endpoints/openai/realtime_voicegate.go index 9bd6f10f2..475b45e8f 100644 --- a/core/http/endpoints/openai/realtime_voicegate.go +++ b/core/http/endpoints/openai/realtime_voicegate.go @@ -21,6 +21,7 @@ type namedEmbedding struct { // drive the realtime pipeline. type voiceGate struct { cfg config.PipelineVoiceRecognition // normalized + recCfg *config.ModelConfig // resolved speaker-recognition model, for warm-up registry voicerecognition.Registry // identify mode (nil otherwise) refEmbeds []namedEmbedding // verify mode, pre-embedded refs refAudios []config.VoiceReference // verify + anti-spoofing: ref paths @@ -72,7 +73,9 @@ func newVoiceGate( return nil, err } - recCfg, err := cl.LoadModelConfigFileByName(cfg.Model, ml.ModelPath) + // Resolved like every other pipeline sub-model (one alias hop), so an + // aliased voice_recognition model gets its target's backend. + recCfg, err := cl.LoadResolvedModelConfig(cfg.Model, ml.ModelPath) if err != nil { return nil, fmt.Errorf("voice_recognition: failed to load model %q: %w", cfg.Model, err) } @@ -82,6 +85,7 @@ func newVoiceGate( g := &voiceGate{ cfg: cfg, + recCfg: recCfg, registry: registry, embedFn: func(ctx context.Context, wavPath string) ([]float32, error) { res, err := backend.VoiceEmbed(ctx, wavPath, ml, appConfig, *recCfg) diff --git a/core/http/endpoints/openai/realtime_warmup_test.go b/core/http/endpoints/openai/realtime_warmup_test.go new file mode 100644 index 000000000..ec511fc2a --- /dev/null +++ b/core/http/endpoints/openai/realtime_warmup_test.go @@ -0,0 +1,64 @@ +package openai + +import ( + "context" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/system" +) + +// Warmup delegates to backend.PreloadStages (its concurrency, nil-skipping and +// error-joining semantics are pinned in core/backend). These specs pin the +// wiring instead: each realtime model type must warm exactly its configured +// stages under the right pipeline-role labels. No backends are installed, so +// every attempted stage fails to load — the joined error is the proof of which +// stages were attempted and how they were labeled. +var _ = Describe("realtime model Warmup wiring", func() { + newLoader := func() (*model.ModelLoader, *config.ApplicationConfig) { + systemState, err := system.GetSystemState(system.WithModelPath(GinkgoT().TempDir())) + Expect(err).ToNot(HaveOccurred()) + appConfig := config.NewApplicationConfig(config.WithSystemState(systemState)) + return model.NewModelLoader(systemState), appConfig + } + + It("wrappedModel warms every configured stage under its pipeline role", func() { + ml, appConfig := newLoader() + m := &wrappedModel{ + VADConfig: &config.ModelConfig{Name: "vad-m"}, + TranscriptionConfig: &config.ModelConfig{Name: "stt-m"}, + LLMConfig: &config.ModelConfig{Name: "llm-m"}, + TTSConfig: &config.ModelConfig{Name: "tts-m"}, + SoundDetectionConfig: &config.ModelConfig{Name: "ced-m"}, + modelLoader: ml, + appConfig: appConfig, + } + + err := m.Warmup(context.Background()) + Expect(err).To(HaveOccurred()) + for _, stage := range []string{"vad (vad-m)", "transcription (stt-m)", "llm (llm-m)", "tts (tts-m)", "sound_detection (ced-m)"} { + Expect(err.Error()).To(ContainSubstring(stage)) + } + }) + + It("transcriptOnlyModel warms its stages and skips absent ones", func() { + ml, appConfig := newLoader() + m := &transcriptOnlyModel{ + VADConfig: &config.ModelConfig{Name: "vad-m"}, + TranscriptionConfig: &config.ModelConfig{Name: "stt-m"}, + // SoundDetectionConfig nil: an absent stage must be skipped, not + // fail the warm-up. + modelLoader: ml, + appConfig: appConfig, + } + + err := m.Warmup(context.Background()) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("vad (vad-m)")) + Expect(err.Error()).To(ContainSubstring("transcription (stt-m)")) + Expect(err.Error()).ToNot(ContainSubstring("sound_detection")) + }) +}) diff --git a/core/http/openresponses_test.go b/core/http/openresponses_test.go index 046d624b0..4e6eca7b7 100644 --- a/core/http/openresponses_test.go +++ b/core/http/openresponses_test.go @@ -7,6 +7,7 @@ import ( "io" "net/http" "os" + "path/filepath" "strings" "time" @@ -29,6 +30,8 @@ const testModel = "Qwen3-VL-2B-Instruct-Q4_K_M" var _ = Describe("Open Responses API", func() { var app *echo.Echo + var localApp *application.Application + var localModelDir string var c context.Context var cancel context.CancelFunc @@ -38,28 +41,47 @@ var _ = Describe("Open Responses API", func() { Context("API with ephemeral models", func() { BeforeEach(func(sc SpecContext) { - var err error + // This suite exercises the /v1/responses HTTP/protocol contract + // (Content-Type, SSE framing, response envelope, error shapes), + // not real inference — so it runs against the same prebuilt + // mock-backend the rest of the http suite uses instead of + // downloading a real model. Skip cleanly when it isn't built. + if mockBackendPath == "" { + Skip("mock-backend binary not built; run 'make build-mock-backend'") + } - backendPath := os.Getenv("BACKENDS_PATH") + var err error c, cancel = context.WithCancel(context.Background()) + // Isolated model dir carrying a single config named after testModel + // but served by the mock backend, so the responses endpoint can + // resolve and load the model without any real backend build. + localModelDir, err = os.MkdirTemp("", "openresponses-models-") + Expect(err).ToNot(HaveOccurred()) + + mockModelYAML := "name: " + testModel + "\n" + + "backend: mock-backend\n" + + "parameters:\n" + + " model: mock-model.bin\n" + Expect(os.WriteFile(filepath.Join(localModelDir, testModel+".yaml"), []byte(mockModelYAML), 0644)).To(Succeed()) + systemState, err := system.GetSystemState( - system.WithBackendPath(backendPath), - system.WithModelPath(modelDir), + system.WithBackendPath(backendDir), + system.WithModelPath(localModelDir), ) Expect(err).ToNot(HaveOccurred()) - application, err := application.New( + localApp, err = application.New( append(commonOpts, config.WithContext(c), config.WithSystemState(systemState), config.WithApiKeys([]string{apiKey}), - config.WithModelsURL("https://huggingface.co/unsloth/Qwen3-VL-2B-Instruct-GGUF"), )...) Expect(err).ToNot(HaveOccurred()) + localApp.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath) - app, err = API(application) + app, err = API(localApp) Expect(err).ToNot(HaveOccurred()) go func() { @@ -80,14 +102,24 @@ var _ = Describe("Open Responses API", func() { }) AfterEach(func(sc SpecContext) { + // Synchronous app shutdown first — context-cancel cleanup is async + // and races test-binary exit, orphaning mock-backend children. + if localApp != nil { + _ = localApp.Shutdown() + localApp = nil + } cancel() if app != nil { ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) defer cancel() err := app.Shutdown(ctx) Expect(err).ToNot(HaveOccurred()) + app = nil + } + if localModelDir != "" { + _ = os.RemoveAll(localModelDir) + localModelDir = "" } - }) Context("HTTP Protocol Compliance", func() { @@ -969,13 +1001,16 @@ var _ = Describe("Open Responses API", func() { Expect(ok).To(BeTrue()) Expect(itemID).ToNot(BeEmpty()) - // Now create a new response with item_reference + // Now create a new response with item_reference. Per the OpenAI + // Responses spec (and this server's parser in + // endpoints/openresponses/responses.go) an item_reference carries + // the referenced item in the "id" field, not "item_id". reqBody2 := map[string]any{ "model": testModel, "input": []any{ map[string]any{ - "type": "item_reference", - "item_id": itemID, + "type": "item_reference", + "id": itemID, }, map[string]any{ "type": "message", @@ -1005,8 +1040,8 @@ var _ = Describe("Open Responses API", func() { "model": testModel, "input": []any{ map[string]any{ - "type": "item_reference", - "item_id": "nonexistent_item_id", + "type": "item_reference", + "id": "nonexistent_item_id", }, }, } diff --git a/core/http/react-ui/src/pages/Manage.jsx b/core/http/react-ui/src/pages/Manage.jsx index 16d04f709..a5e19dbe8 100644 --- a/core/http/react-ui/src/pages/Manage.jsx +++ b/core/http/react-ui/src/pages/Manage.jsx @@ -146,6 +146,7 @@ export default function Manage() { const [distributedMode, setDistributedMode] = useState(false) const [togglingModels, setTogglingModels] = useState(new Set()) const [pinningModels, setPinningModels] = useState(new Set()) + const [loadingModels, setLoadingModels] = useState(new Set()) // Expanded row state — keyed by `${tab}:${id}` so switching tabs doesn't // collide and a single row is open at a time per tab. const [expandedKey, setExpandedKey] = useState(null) @@ -313,6 +314,26 @@ export default function Manage() { }) } + // Pre-load a model (or all of a realtime pipeline's sub-models) into memory. + // The /backend/load call blocks until loading finishes, so the menu item shows + // a loading state while in flight and reports the outcome on completion. + const handleLoadModel = async (modelName) => { + setLoadingModels(prev => new Set(prev).add(modelName)) + try { + await backendControlApi.load({ model: modelName }) + addToast(`Loaded ${modelName}`, 'success') + setTimeout(fetchLoadedModels, 500) + } catch (err) { + addToast(`Failed to load: ${err.message}`, 'error') + } finally { + setLoadingModels(prev => { + const next = new Set(prev) + next.delete(modelName) + return next + }) + } + } + const handleDeleteModel = (modelName) => { setConfirmDialog({ title: 'Delete Model', @@ -687,6 +708,11 @@ export default function Manage() { label: model.disabled ? 'Enable model' : 'Disable model', onClick: () => handleToggleModel(model.id, model.disabled), disabled: togglingModels.has(model.id) }, + { key: 'load', icon: 'fa-bolt', + label: loadingModels.has(model.id) ? 'Loading…' : 'Load into memory', + onClick: () => handleLoadModel(model.id), + hidden: isRunning || !!model.disabled, + disabled: loadingModels.has(model.id) }, { key: 'stop', icon: 'fa-stop', label: 'Stop model', onClick: () => handleStopModel(model.id), hidden: !isRunning }, { key: 'pin', icon: 'fa-thumbtack', diff --git a/core/http/react-ui/src/utils/api.js b/core/http/react-ui/src/utils/api.js index 8da0bffbd..b31048296 100644 --- a/core/http/react-ui/src/utils/api.js +++ b/core/http/react-ui/src/utils/api.js @@ -352,6 +352,9 @@ export const realtimeApi = { // Backend control export const backendControlApi = { shutdown: (body) => postJSON(API_CONFIG.endpoints.backendShutdown, body), + // Pre-load a model (or all of a realtime pipeline's sub-models) into memory. + // body: { model: "" }. Inverse of shutdown. + load: (body) => postJSON(API_CONFIG.endpoints.backendLoad, body), } // System info diff --git a/core/http/react-ui/src/utils/config.js b/core/http/react-ui/src/utils/config.js index d3db6ce2a..0fa0703b3 100644 --- a/core/http/react-ui/src/utils/config.js +++ b/core/http/react-ui/src/utils/config.js @@ -106,6 +106,7 @@ export const API_CONFIG = { video: '/video', backendMonitor: '/backend/monitor', backendShutdown: '/backend/shutdown', + backendLoad: '/backend/load', modelsApply: '/models/apply', modelsDelete: (name) => `/models/delete/${name}`, modelsAvailable: '/models/available', diff --git a/core/http/routes/localai.go b/core/http/routes/localai.go index 763623a7f..5ef94539c 100644 --- a/core/http/routes/localai.go +++ b/core/http/routes/localai.go @@ -207,9 +207,14 @@ func RegisterLocalAIRoutes(router *echo.Echo, backendMonitorService := monitoring.NewBackendMonitorService(ml, cl, appConfig) // Split out for now router.GET("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService), adminMiddleware) router.POST("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService), adminMiddleware) + // /backend/load is the inverse of /backend/shutdown: pre-load a model (or all + // of a realtime pipeline's sub-models) into memory so clients can drive + // warm-up explicitly instead of paying the cold-start cost on first use. + router.POST("/backend/load", localai.LoadModelEndpoint(cl, ml, appConfig), adminMiddleware) // The v1/* urls are exactly the same as above - makes local e2e testing easier if they are registered. router.GET("/v1/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService), adminMiddleware) router.POST("/v1/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService), adminMiddleware) + router.POST("/v1/backend/load", localai.LoadModelEndpoint(cl, ml, appConfig), adminMiddleware) // Traces and backend logs (monitoring) router.GET("/api/traces", localai.GetAPITracesEndpoint(), adminMiddleware) @@ -245,6 +250,7 @@ func RegisterLocalAIRoutes(router *echo.Echo, "metrics": "/metrics", "backend_monitor": "/backend/monitor", "backend_shutdown": "/backend/shutdown", + "backend_load": "/backend/load", "system": "/system", "version": "/version", "traces": "/api/traces", diff --git a/core/schema/localai.go b/core/schema/localai.go index 41b513ce9..9fb42bf6d 100644 --- a/core/schema/localai.go +++ b/core/schema/localai.go @@ -11,6 +11,24 @@ type BackendMonitorRequest struct { BasicModelRequest } +// ModelLoadRequest asks LocalAI to pre-load a model into memory by name, so the +// first request that uses it pays no cold-start load cost. For a realtime +// pipeline model, every configured sub-model (VAD, transcription, LLM, TTS, +// sound_detection, voice_recognition) is loaded instead of the pipeline stub. +// It is the inverse of the /backend/shutdown request. +type ModelLoadRequest struct { + BasicModelRequest +} + +// ModelLoadResponse reports the outcome of a /backend/load call. +type ModelLoadResponse struct { + // Loaded lists the model names actually resident in memory after the call. + // For a pipeline model these are its sub-models, not the pipeline name. + Loaded []string `json:"loaded"` + // Message is a short human-readable status ("model loaded", or an error). + Message string `json:"message"` +} + type TokenMetricsRequest struct { BasicModelRequest } diff --git a/core/trace/audio_snippet.go b/core/trace/audio_snippet.go index 628a6acb9..1c2dc98b8 100644 --- a/core/trace/audio_snippet.go +++ b/core/trace/audio_snippet.go @@ -14,6 +14,16 @@ import ( // MaxSnippetSeconds is the maximum number of seconds of audio captured per trace. const MaxSnippetSeconds = 30 +// silenceFloorDBFS is the dBFS value reported for digital silence (RMS or peak +// of zero). The true level is -∞ dBFS; reporting a finite floor keeps the +// metric present and meaningful in the Traces UI (a scrubbed nil would read as +// "missing" rather than "silent"). -120 dBFS sits well below 16-bit PCM's +// ~-90 dBFS least-significant-bit floor, so it reads unambiguously as +// "effectively silent". JSON-marshal safety for any non-finite float that does +// reach a trace is owned centrally by RecordBackendTrace's sanitizer — this +// floor is about presentation, not transport. +const silenceFloorDBFS = -120.0 + // AudioSnippet captures the first MaxSnippetSeconds of a WAV file and computes // quality metrics. The result is a map suitable for merging into a BackendTrace // Data field. maxBytes caps the embedded base64 waveform so a single TTS or @@ -63,7 +73,7 @@ func AudioSnippetFromPCM(pcm []byte, sampleRate, totalPCMBytes, maxBytes int) ma snippetDuration := float64(len(samples)) / float64(sampleRate) rms := sound.CalculateRMS16(samples) - rmsDBFS := -math.Inf(1) + rmsDBFS := silenceFloorDBFS if rms > 0 { rmsDBFS = 20 * math.Log10(rms/32768.0) } @@ -78,7 +88,7 @@ func AudioSnippetFromPCM(pcm []byte, sampleRate, totalPCMBytes, maxBytes int) ma } dcSum += int64(s) } - peakDBFS := -math.Inf(1) + peakDBFS := silenceFloorDBFS if peak > 0 { peakDBFS = 20 * math.Log10(float64(peak)/32768.0) } diff --git a/core/trace/audio_snippet_test.go b/core/trace/audio_snippet_test.go index e330403cc..2fec4a91d 100644 --- a/core/trace/audio_snippet_test.go +++ b/core/trace/audio_snippet_test.go @@ -1,6 +1,9 @@ package trace_test import ( + "encoding/json" + "math" + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -47,3 +50,32 @@ var _ = Describe("AudioSnippetFromPCM byte cap", func() { Expect(out).To(HaveKey("audio_wav_base64")) }) }) + +// Silent audio (RMS/peak of zero) has a true level of -∞ dBFS, but emitting +// -Inf made the whole /api/backend-traces response fail to JSON-marshal and +// blanked the Traces UI. The metrics must instead be finite and serializable. +var _ = Describe("AudioSnippetFromPCM silent audio dBFS", func() { + pcm := makePCM(snippetSeconds, snippetSampleRate) // all zeros == digital silence + totalPCM := len(pcm) + + It("reports finite dBFS for silence instead of -Inf", func() { + out := trace.AudioSnippetFromPCM(pcm, snippetSampleRate, totalPCM, 0) + + rms, ok := out["audio_rms_dbfs"].(float64) + Expect(ok).To(BeTrue()) + Expect(math.IsInf(rms, 0)).To(BeFalse(), "silent RMS must not be ±Inf") + Expect(math.IsNaN(rms)).To(BeFalse()) + + peak, ok := out["audio_peak_dbfs"].(float64) + Expect(ok).To(BeTrue()) + Expect(math.IsInf(peak, 0)).To(BeFalse(), "silent peak must not be ±Inf") + Expect(math.IsNaN(peak)).To(BeFalse()) + }) + + It("produces a snippet that round-trips through encoding/json", func() { + out := trace.AudioSnippetFromPCM(pcm, snippetSampleRate, totalPCM, 0) + + _, err := json.Marshal(out) + Expect(err).ToNot(HaveOccurred(), "silent-audio metrics must be JSON-marshalable") + }) +}) diff --git a/core/trace/backend_trace.go b/core/trace/backend_trace.go index a3d04d466..8d8ef6d10 100644 --- a/core/trace/backend_trace.go +++ b/core/trace/backend_trace.go @@ -3,6 +3,8 @@ package trace import ( "encoding/json" "fmt" + "maps" + "math" "slices" "sync" "time" @@ -116,8 +118,13 @@ func RecordBackendTrace(t BackendTrace) { backendMu.Lock() maxBody := backendMaxBodyBytes backendMu.Unlock() - if t.Data != nil && maxBody > 0 { - t.Data = capDataStrings(t.Data, maxBody) + // Always walk Data, even with no body cap configured: besides capping + // oversized strings (maxBody > 0), the walk replaces non-finite floats + // (Inf/NaN) that encoding/json cannot marshal. A single such value — e.g. a + // -Inf dBFS audio metric from a silent clip — would otherwise fail the whole + // /api/backend-traces response and blank the Traces UI. + if t.Data != nil { + t.Data = sanitizeData(t.Data, maxBody) } select { case backendLogChan <- &t: @@ -126,32 +133,90 @@ func RecordBackendTrace(t BackendTrace) { } } -// capDataStrings walks a trace Data map and replaces any string value (at any -// depth) that exceeds maxBytes with a fixed-size marker that names the -// original byte count. The replacement is intentionally short and not valid -// base64/JSON: the goal is to flag "this was dropped" cheaply, not to keep a -// partial value that the UI might try to render. Non-string scalars and -// non-map containers pass through untouched so structural fields like -// total_deltas or audio_sample_rate remain useful. -func capDataStrings(data map[string]any, maxBytes int) map[string]any { - out := make(map[string]any, len(data)) - for k, v := range data { - out[k] = capValue(v, maxBytes) - } +// sanitizeData walks a trace Data map (recursing into nested maps and slices) +// and makes every value safe for the /api/backend-traces JSON response: +// +// - When maxBytes > 0, any string longer than maxBytes is replaced with a +// fixed-size marker that names the original byte count. The replacement is +// intentionally short and not valid base64/JSON: it flags "this was dropped" +// cheaply rather than keeping a partial value the UI might try to render. +// - Non-finite floats (Inf/NaN) are replaced with nil regardless of maxBytes, +// because encoding/json refuses to marshal them and one bad value would fail +// the entire response. +// +// Other scalars (ints, bools, finite floats) pass through untouched so +// structural fields like total_deltas or audio_sample_rate remain useful. +// +// The walk is copy-on-write: it runs on every RecordBackendTrace call, and in +// the common case nothing needs rewriting, so containers are only re-allocated +// on the paths that actually changed and untouched values keep their original +// interface boxes instead of paying a per-value re-boxing allocation. +func sanitizeData(data map[string]any, maxBytes int) map[string]any { + out, _ := sanitizeMap(data, maxBytes) return out } -func capValue(v any, maxBytes int) any { +func sanitizeMap(m map[string]any, maxBytes int) (map[string]any, bool) { + var out map[string]any + for k, v := range m { + nv, changed := sanitizeValue(v, maxBytes) + if changed && out == nil { + // First change: fork the map. Entries already visited were + // unchanged, so a full copy then overwriting as we go is exact. + out = make(map[string]any, len(m)) + maps.Copy(out, m) + } + if out != nil { + out[k] = nv + } + } + if out == nil { + return m, false + } + return out, true +} + +func sanitizeSlice(s []any, maxBytes int) ([]any, bool) { + var out []any + for i, v := range s { + nv, changed := sanitizeValue(v, maxBytes) + if changed && out == nil { + out = make([]any, len(s)) + copy(out, s) + } + if out != nil { + out[i] = nv + } + } + if out == nil { + return s, false + } + return out, true +} + +func sanitizeValue(v any, maxBytes int) (any, bool) { switch val := v.(type) { case string: - if len(val) > maxBytes { - return fmt.Sprintf("", len(val)) + if maxBytes > 0 && len(val) > maxBytes { + return fmt.Sprintf("", len(val)), true } - return val + return v, false + case float64: + if math.IsInf(val, 0) || math.IsNaN(val) { + return nil, true + } + return v, false + case float32: + if f := float64(val); math.IsInf(f, 0) || math.IsNaN(f) { + return nil, true + } + return v, false case map[string]any: - return capDataStrings(val, maxBytes) + return sanitizeMap(val, maxBytes) + case []any: + return sanitizeSlice(val, maxBytes) default: - return v + return v, false } } diff --git a/core/trace/backend_trace_sanitize_test.go b/core/trace/backend_trace_sanitize_test.go new file mode 100644 index 000000000..e6f4eec4f --- /dev/null +++ b/core/trace/backend_trace_sanitize_test.go @@ -0,0 +1,80 @@ +package trace_test + +import ( + "encoding/json" + "math" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/mudler/LocalAI/core/trace" +) + +// encoding/json cannot marshal ±Inf or NaN. The /api/backend-traces endpoint +// serializes the whole buffer with one json call, so a single non-finite float +// in any trace's Data map (e.g. a -Inf dBFS audio metric from a silent clip) +// would fail the entire response and blank the Traces UI. RecordBackendTrace +// must scrub those values regardless of whether a body cap is configured. +var _ = Describe("RecordBackendTrace non-finite float sanitization", func() { + BeforeEach(func() { + // maxBodyBytes 0 == no body cap: float sanitization must still run. + trace.InitBackendTracingIfEnabled(64, 0) + trace.ClearBackendTraces() + }) + + It("replaces ±Inf and NaN with nil so the response stays JSON-marshalable", func() { + trace.RecordBackendTrace(trace.BackendTrace{ + Timestamp: time.Now(), + Type: trace.BackendTraceTranscription, + ModelName: "m", + Data: map[string]any{ + "audio_rms_dbfs": math.Inf(-1), + "audio_peak_dbfs": math.Inf(1), + "weird": math.NaN(), + "audio_duration_s": 1.5, // finite siblings must survive + }, + }) + + Eventually(trace.GetBackendTraces).Should(HaveLen(1)) + got := trace.GetBackendTraces()[0] + + Expect(got.Data["audio_rms_dbfs"]).To(BeNil()) + Expect(got.Data["audio_peak_dbfs"]).To(BeNil()) + Expect(got.Data["weird"]).To(BeNil()) + Expect(got.Data["audio_duration_s"]).To(Equal(1.5), "finite floats must pass through untouched") + + _, err := json.Marshal(trace.GetBackendTraces()) + Expect(err).ToNot(HaveOccurred(), "the whole trace buffer must marshal even with non-finite inputs") + }) + + It("scrubs non-finite floats nested in maps and slices", func() { + trace.RecordBackendTrace(trace.BackendTrace{ + Timestamp: time.Now(), + Type: trace.BackendTraceLLM, + ModelName: "m", + Data: map[string]any{ + "nested": map[string]any{ + "logprob": math.Inf(-1), + "ok": 0.25, + }, + "scores": []any{1.0, math.Inf(1), math.NaN()}, + }, + }) + + Eventually(trace.GetBackendTraces).Should(HaveLen(1)) + got := trace.GetBackendTraces()[0] + + nested := got.Data["nested"].(map[string]any) + Expect(nested["logprob"]).To(BeNil()) + Expect(nested["ok"]).To(Equal(0.25)) + + scores := got.Data["scores"].([]any) + Expect(scores[0]).To(Equal(1.0)) + Expect(scores[1]).To(BeNil()) + Expect(scores[2]).To(BeNil()) + + _, err := json.Marshal(trace.GetBackendTraces()) + Expect(err).ToNot(HaveOccurred()) + }) +}) diff --git a/docs/content/advanced/vram-management.md b/docs/content/advanced/vram-management.md index ee7c346be..ffa08b894 100644 --- a/docs/content/advanced/vram-management.md +++ b/docs/content/advanced/vram-management.md @@ -381,6 +381,8 @@ curl -X POST http://localhost:8080/backend/shutdown \ To stop all models, you'll need to call the endpoint for each loaded model individually, or use the web UI to stop all models at once. +Conversely, you can pre-load a model into memory ahead of its first request with `POST /backend/load` (the inverse of shutdown) — see [Backend Monitor]({{%relref "features/backend-monitor" %}}). + ### Best Practices 1. **Monitor VRAM usage**: Use `nvidia-smi` (for NVIDIA GPUs) or similar tools to monitor actual VRAM usage diff --git a/docs/content/features/authentication.md b/docs/content/features/authentication.md index ffaa43b34..8ca2aa4cd 100644 --- a/docs/content/features/authentication.md +++ b/docs/content/features/authentication.md @@ -166,7 +166,7 @@ When authentication is enabled, the following endpoints require admin role: - `GET /api/backend-traces`, `POST /api/backend-traces/clear` - `GET /api/backend-logs/*`, `POST /api/backend-logs/*/clear` - `GET /api/resources`, `GET /api/settings`, `POST /api/settings` -- `GET /system`, `GET /backend/monitor`, `POST /backend/shutdown` +- `GET /system`, `GET /backend/monitor`, `POST /backend/shutdown`, `POST /backend/load` **P2P:** - `GET /api/p2p/*` diff --git a/docs/content/features/backend-monitor.md b/docs/content/features/backend-monitor.md index 0d23c05a5..9af35d80a 100644 --- a/docs/content/features/backend-monitor.md +++ b/docs/content/features/backend-monitor.md @@ -5,7 +5,9 @@ weight = 20 url = "/features/backend-monitor/" +++ -LocalAI provides endpoints to monitor and manage running backends. The `/backend/monitor` endpoint reports the status and resource usage of loaded models, and `/backend/shutdown` allows stopping a model's backend process. +LocalAI provides endpoints to monitor and manage running backends. The `/backend/monitor` endpoint reports the status and resource usage of loaded models, `/backend/load` pre-loads a model into memory, and `/backend/shutdown` allows stopping a model's backend process. + +All three are admin-only. ## Monitor API @@ -62,6 +64,42 @@ curl "http://localhost:8080/backend/monitor?model=my-model" } ``` +## Load API + +Pre-loads a model into memory ahead of its first request, so that request pays no cold-start load cost. It is the inverse of the Shutdown API and works for any model, not just realtime pipelines. + +- **Method:** `POST` +- **Endpoints:** `/backend/load`, `/v1/backend/load` + +### Request + +| Parameter | Type | Required | Description | +|-----------|----------|----------|------------------------------| +| `model` | `string` | Yes | Name of the model to load | + +### Behavior + +- For a regular model, its own backend is loaded. +- For a **realtime pipeline** model (a config with a `pipeline:` block), every configured sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded concurrently instead of the pipeline stub, which has no backend of its own. + +The call blocks until loading finishes and reports which model names became resident, so partial failures are visible. + +### Usage + +```bash +curl -X POST http://localhost:8080/backend/load \ + -H "Content-Type: application/json" \ + -d '{"model": "my-model"}' +``` + +### Example response + +```json +{ "loaded": ["my-model"], "message": "model loaded" } +``` + +On failure the call returns `500` with `loaded` listing whichever sub-models did load and `message` naming the failures. + ## Shutdown API - **Method:** `POST` diff --git a/docs/content/features/openai-realtime.md b/docs/content/features/openai-realtime.md index f339b21d7..7db47c105 100644 --- a/docs/content/features/openai-realtime.md +++ b/docs/content/features/openai-realtime.md @@ -56,6 +56,39 @@ pipeline: All streaming flags are off by default, so existing pipelines are unaffected. +### Model warm-up (cold start) + +Without warm-up the pipeline's models are loaded into memory only on first use *within* a session: the VAD on the first audio chunk, transcription at the first end-of-speech, the LLM on the first reply, and TTS on the first spoken output. On a cold session this staggers a load delay across those first few interactions — and a model that fails to load (missing weights, wrong backend, out of memory) only fails part-way through the first turn. + +To avoid that, LocalAI **warms the pipeline by default**: it loads the VAD, transcription, LLM and TTS backends into memory *before* the session is announced, and the session start **blocks until they are all ready**. The loads run concurrently, so the wait is the slowest single model, not the sum. This means: + +- The first turn pays no cold-start cost — every backend is already resident. +- **Model-load errors surface at session start.** If any stage fails to load, the session is not started and the client receives a `model_load_error` instead of `session.created`, so a broken pipeline fails fast and visibly rather than mid-call. + +Set `disable_warmup: true` to restore the lazy "load on first use" behavior — session start no longer waits on loading and load errors surface on the first turn instead. Useful if you want idle sessions to avoid holding model memory they may never use: + +```yaml +name: gpt-realtime +pipeline: + vad: silero-vad-ggml + transcription: whisper-large-turbo + llm: qwen3-4b + tts: tts-1 + disable_warmup: true # lazily load each model on first use instead of at session start +``` + +#### Pre-loading a pipeline on demand + +Warm-up only fires when a realtime session opens. To load a pipeline into memory ahead of time — e.g. to warm it right after boot, or when running with `disable_warmup: true` — POST the model name to the admin-only `/backend/load` endpoint. For a pipeline model it loads every configured sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) concurrently: + +```bash +curl -X POST http://localhost:8080/backend/load \ + -H "Content-Type: application/json" \ + -d '{"model": "gpt-realtime"}' +``` + +The endpoint is not realtime-specific — it pre-loads any model. See [Backend Monitor]({{%relref "features/backend-monitor" %}}) for the full request/response reference (it is the inverse of `/backend/shutdown`). + ### Turn detection Turn detection decides when the user has finished speaking and the pipeline should respond. Two modes are supported, matching the OpenAI session schema: diff --git a/pkg/mcp/localaitools/client.go b/pkg/mcp/localaitools/client.go index f6f6114be..6a747a5f0 100644 --- a/pkg/mcp/localaitools/client.go +++ b/pkg/mcp/localaitools/client.go @@ -36,6 +36,10 @@ type LocalAIClient interface { DeleteModel(ctx context.Context, name string) error EditModelConfig(ctx context.Context, name string, patch map[string]any) error ReloadModels(ctx context.Context) error + // LoadModel pre-loads a model into memory by name (the inverse of shutting + // it down). For a realtime pipeline model every configured sub-model is + // loaded; it returns the model names that became resident. + LoadModel(ctx context.Context, model string) ([]string, error) ImportModelURI(ctx context.Context, req ImportModelURIRequest) (*ImportModelURIResponse, error) // ---- Model aliases ---- diff --git a/pkg/mcp/localaitools/coverage_test.go b/pkg/mcp/localaitools/coverage_test.go index 39a2ab544..51d0a85d4 100644 --- a/pkg/mcp/localaitools/coverage_test.go +++ b/pkg/mcp/localaitools/coverage_test.go @@ -49,6 +49,7 @@ var toolToHTTPRoute = map[string]string{ ToolDeleteModel: "POST /models/delete/:name", ToolEditModelConfig: "PATCH /api/models/config-json/:name", ToolReloadModels: "POST /models/reload", + ToolLoadModel: "POST /backend/load", ToolInstallBackend: "POST /backends/apply", ToolUpgradeBackend: "POST /backends/upgrade/:name", ToolToggleModelState: "PUT /models/toggle-state/:name/:action", diff --git a/pkg/mcp/localaitools/fakes_test.go b/pkg/mcp/localaitools/fakes_test.go index 388245ad2..e5f88fa36 100644 --- a/pkg/mcp/localaitools/fakes_test.go +++ b/pkg/mcp/localaitools/fakes_test.go @@ -35,6 +35,7 @@ type fakeClient struct { setAlias func(string, string) error listAliases func() ([]AliasInfo, error) reloadModels func() error + loadModel func(string) ([]string, error) listBackends func() ([]Backend, error) listKnownBackends func() ([]schema.KnownBackend, error) installBackend func(InstallBackendRequest) (string, error) @@ -169,6 +170,14 @@ func (f *fakeClient) ReloadModels(_ context.Context) error { return nil } +func (f *fakeClient) LoadModel(_ context.Context, model string) ([]string, error) { + f.record("LoadModel", model) + if f.loadModel != nil { + return f.loadModel(model) + } + return []string{model}, nil +} + func (f *fakeClient) ListBackends(_ context.Context) ([]Backend, error) { f.record("ListBackends", nil) if f.listBackends != nil { diff --git a/pkg/mcp/localaitools/httpapi/client.go b/pkg/mcp/localaitools/httpapi/client.go index 90ec332e2..fa248ed53 100644 --- a/pkg/mcp/localaitools/httpapi/client.go +++ b/pkg/mcp/localaitools/httpapi/client.go @@ -338,6 +338,16 @@ func (c *Client) ReloadModels(ctx context.Context) error { return c.do(ctx, http.MethodPost, routeModelsReload, nil, nil) } +func (c *Client) LoadModel(ctx context.Context, model string) ([]string, error) { + // On a load failure the endpoint returns a non-2xx whose body (carrying the + // per-sub-model failure detail) is folded into the HTTPError by c.do. + var resp schema.ModelLoadResponse + if err := c.do(ctx, http.MethodPost, routeBackendLoad, map[string]string{"model": model}, &resp); err != nil { + return nil, err + } + return resp.Loaded, nil +} + // ---- Model aliases ---- // SetAlias is swap-first: it PATCHes the alias config (a deep-merge that diff --git a/pkg/mcp/localaitools/httpapi/routes.go b/pkg/mcp/localaitools/httpapi/routes.go index cc552b728..85b1a1da5 100644 --- a/pkg/mcp/localaitools/httpapi/routes.go +++ b/pkg/mcp/localaitools/httpapi/routes.go @@ -19,6 +19,7 @@ const ( routeModelImport = "/models/import" routeAliases = "/api/aliases" routeModelsReload = "/models/reload" + routeBackendLoad = "/backend/load" routeBackends = "/backends" routeBackendsKnown = "/backends/known" routeBackendsApply = "/backends/apply" diff --git a/pkg/mcp/localaitools/inproc/client.go b/pkg/mcp/localaitools/inproc/client.go index e62934ccc..25602006d 100644 --- a/pkg/mcp/localaitools/inproc/client.go +++ b/pkg/mcp/localaitools/inproc/client.go @@ -13,6 +13,7 @@ import ( "path/filepath" "github.com/google/uuid" + "github.com/mudler/LocalAI/core/backend" "github.com/mudler/LocalAI/core/config" "github.com/mudler/LocalAI/core/gallery" "github.com/mudler/LocalAI/core/gallery/importers" @@ -302,6 +303,16 @@ func (c *Client) ReloadModels(_ context.Context) error { return c.ConfigLoader.LoadModelConfigsFromPath(c.SystemState.Model.ModelsPath) } +func (c *Client) LoadModel(ctx context.Context, model string) ([]string, error) { + if c.ConfigLoader == nil || c.ModelLoader == nil { + return nil, errors.New("model loader not available") + } + // Reuse the same preload path the REST /backend/load endpoint uses, so a + // pipeline model loads all its sub-models and the behaviour stays identical + // across the in-process and HTTP clients. + return backend.PreloadModelByName(ctx, c.ConfigLoader, c.ModelLoader, c.AppConfig, model) +} + // ---- Model aliases ---- // SetAlias is swap-first to match the httpapi client: PatchConfig swaps an diff --git a/pkg/mcp/localaitools/inproc/load_model_test.go b/pkg/mcp/localaitools/inproc/load_model_test.go new file mode 100644 index 000000000..e7def6c11 --- /dev/null +++ b/pkg/mcp/localaitools/inproc/load_model_test.go @@ -0,0 +1,71 @@ +package inproc + +import ( + "context" + "os" + "path/filepath" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/pkg/model" + "github.com/mudler/LocalAI/pkg/system" +) + +var _ = Describe("inproc.Client LoadModel", func() { + var ( + ctx context.Context + tempDir string + cl *config.ModelConfigLoader + ml *model.ModelLoader + c *Client + seedModel func(name, body string) + ) + + BeforeEach(func() { + ctx = context.Background() + tempDir = GinkgoT().TempDir() + systemState, err := system.GetSystemState(system.WithModelPath(tempDir)) + Expect(err).ToNot(HaveOccurred()) + appConfig := config.NewApplicationConfig(config.WithSystemState(systemState)) + cl = config.NewModelConfigLoader(tempDir) + ml = model.NewModelLoader(systemState) // no backends installed + c = New(appConfig, systemState, cl, ml, nil) + + seedModel = func(name, body string) { + Expect(os.WriteFile(filepath.Join(tempDir, name+".yaml"), []byte(body), 0o644)).To(Succeed()) + Expect(cl.LoadModelConfigsFromPath(tempDir)).To(Succeed()) + } + }) + + It("errors when the model loader is unavailable", func() { + noLoader := New(c.AppConfig, c.SystemState, cl, nil, nil) + _, err := noLoader.LoadModel(ctx, "anything") + Expect(err).To(MatchError(ContainSubstring("model loader not available"))) + }) + + It("loads a regular model through the model loader", func() { + seedModel("solo", "name: solo\n") + // No backend is installed in the test env, so the load itself fails — but + // the call must exercise the single-model path and surface that error + // rather than panicking or silently succeeding. + loaded, err := c.LoadModel(ctx, "solo") + Expect(err).To(HaveOccurred()) + Expect(loaded).To(BeEmpty()) + }) + + It("expands a pipeline model into its sub-models", func() { + seedModel("voicebot", "name: voicebot\npipeline:\n vad: vad-m\n llm: llm-m\n") + seedModel("vad-m", "name: vad-m\n") + seedModel("llm-m", "name: llm-m\n") + + loaded, err := c.LoadModel(ctx, "voicebot") + // Sub-models can't load without backends, so the joined error names them + // — proving the pipeline stub was expanded rather than loaded directly. + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("vad-m")) + Expect(err.Error()).ToNot(ContainSubstring("voicebot")) + Expect(loaded).To(BeEmpty()) + }) +}) diff --git a/pkg/mcp/localaitools/prompts/10_safety.md b/pkg/mcp/localaitools/prompts/10_safety.md index a4c8d8f57..badecb55b 100644 --- a/pkg/mcp/localaitools/prompts/10_safety.md +++ b/pkg/mcp/localaitools/prompts/10_safety.md @@ -2,7 +2,7 @@ These rules are non-negotiable. The user trusts you to operate their server without unintended changes. -1. **Confirm before mutating.** Before calling any of these tools — `install_model`, `import_model_uri`, `delete_model`, `install_backend`, `upgrade_backend`, `edit_model_config`, `reload_models`, `toggle_model_state`, `toggle_model_pinned` — first state in plain language what you are about to do (which tool, which target, which arguments) and wait for the user's explicit confirmation in the next turn. "Yes", "do it", "go ahead", "proceed" all count as confirmation. Anything else does not. +1. **Confirm before mutating.** Before calling any of these tools — `install_model`, `import_model_uri`, `delete_model`, `install_backend`, `upgrade_backend`, `edit_model_config`, `reload_models`, `load_model`, `toggle_model_state`, `toggle_model_pinned` — first state in plain language what you are about to do (which tool, which target, which arguments) and wait for the user's explicit confirmation in the next turn. "Yes", "do it", "go ahead", "proceed" all count as confirmation. Anything else does not. 2. **Disambiguate before mutating.** If the user's request is ambiguous (several gallery candidates match, the model name has multiple installed versions, the backend has variants), present the candidates as a numbered list and ask the user to pick before calling any mutating tool. diff --git a/pkg/mcp/localaitools/prompts/20_tools.md b/pkg/mcp/localaitools/prompts/20_tools.md index b26e2c333..6d076444f 100644 --- a/pkg/mcp/localaitools/prompts/20_tools.md +++ b/pkg/mcp/localaitools/prompts/20_tools.md @@ -24,5 +24,6 @@ The MCP `tools/list` endpoint also exposes the full input schema for each of the - `upgrade_backend` — Upgrade an installed backend by name. - `edit_model_config` — Patch (deep-merge) JSON into an installed model's config. - `reload_models` — Reload all model configs from disk. +- `load_model` — Pre-load a model into memory so the first request pays no cold-start cost. For a realtime pipeline model, every sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded. Inverse of stopping a model. - `toggle_model_state` — Enable or disable a model (`action`: `enable` or `disable`). - `toggle_model_pinned` — Pin or unpin a model (`action`: `pin` or `unpin`). diff --git a/pkg/mcp/localaitools/server_test.go b/pkg/mcp/localaitools/server_test.go index 052ca1e8b..387d837ca 100644 --- a/pkg/mcp/localaitools/server_test.go +++ b/pkg/mcp/localaitools/server_test.go @@ -92,6 +92,7 @@ var expectedFullCatalog = sortedStrings( ToolListInstalledModels, ToolListKnownBackends, ToolListNodes, + ToolLoadModel, ToolReloadModels, ToolSetAlias, ToolSetBranding, @@ -166,6 +167,7 @@ var _ = Describe("Tool dispatch", func() { {ToolUpgradeBackend, map[string]any{"name": "llama-cpp"}, "UpgradeBackend"}, {ToolEditModelConfig, map[string]any{"name": "foo", "patch": map[string]any{"context_size": 4096}}, "EditModelConfig"}, {ToolReloadModels, struct{}{}, "ReloadModels"}, + {ToolLoadModel, map[string]any{"model": "test-model"}, "LoadModel"}, {ToolToggleModelState, map[string]any{"name": "foo", "action": "enable"}, "ToggleModelState"}, {ToolToggleModelPinned, map[string]any{"name": "foo", "action": "pin"}, "ToggleModelPinned"}, {ToolSetAlias, map[string]any{"name": "gpt-4", "target": "real"}, "SetAlias"}, diff --git a/pkg/mcp/localaitools/tools.go b/pkg/mcp/localaitools/tools.go index 263bd791e..525c97a12 100644 --- a/pkg/mcp/localaitools/tools.go +++ b/pkg/mcp/localaitools/tools.go @@ -31,6 +31,7 @@ const ( ToolDeleteModel = "delete_model" ToolEditModelConfig = "edit_model_config" ToolReloadModels = "reload_models" + ToolLoadModel = "load_model" ToolInstallBackend = "install_backend" ToolUpgradeBackend = "upgrade_backend" ToolToggleModelState = "toggle_model_state" diff --git a/pkg/mcp/localaitools/tools_models.go b/pkg/mcp/localaitools/tools_models.go index 85f937a14..d652198a4 100644 --- a/pkg/mcp/localaitools/tools_models.go +++ b/pkg/mcp/localaitools/tools_models.go @@ -65,6 +65,22 @@ func registerModelTools(s *mcp.Server, client LocalAIClient, opts Options) { return } + mcp.AddTool(s, &mcp.Tool{ + Name: ToolLoadModel, + Description: "Pre-load a model into memory by name so the first request pays no cold-start cost (the inverse of shutting a model down). For a realtime pipeline model every configured sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded. Returns the model names that became resident. Requires user confirmation per safety rule 1.", + }, func(ctx context.Context, _ *mcp.CallToolRequest, args struct { + Model string `json:"model" jsonschema:"The installed model name to load into memory."` + }) (*mcp.CallToolResult, any, error) { + if args.Model == "" { + return errorResultf("model is required"), nil, nil + } + loaded, err := client.LoadModel(ctx, args.Model) + if err != nil { + return errorResult(err), nil, nil + } + return jsonResult(map[string]any{"loaded": loaded}), nil, nil + }) + mcp.AddTool(s, &mcp.Tool{ Name: ToolInstallModel, Description: "Install a model from a gallery. Requires explicit user confirmation per safety rule 1. Returns a job id; poll with get_job_status.", diff --git a/swagger/docs.go b/swagger/docs.go index 3bcbf569f..ec23de1aa 100644 --- a/swagger/docs.go +++ b/swagger/docs.go @@ -1443,6 +1443,52 @@ const docTemplate = `{ "responses": {} } }, + "/backend/load": { + "post": { + "description": "Loads the named model (or, for a realtime pipeline, all of its sub-models) into memory so subsequent requests pay no cold-start cost. The inverse of /backend/shutdown.", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "monitoring" + ], + "summary": "Pre-load a model into memory", + "parameters": [ + { + "description": "Model to load", + "name": "request", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/schema.ModelLoadRequest" + } + } + ], + "responses": { + "200": { + "description": "Model loaded", + "schema": { + "$ref": "#/definitions/schema.ModelLoadResponse" + } + }, + "400": { + "description": "Missing model name", + "schema": { + "$ref": "#/definitions/schema.ModelLoadResponse" + } + }, + "500": { + "description": "Load failed (Loaded lists any sub-models that did load)", + "schema": { + "$ref": "#/definitions/schema.ModelLoadResponse" + } + } + } + } + }, "/backend/monitor": { "get": { "tags": [ @@ -5136,6 +5182,30 @@ const docTemplate = `{ } } }, + "schema.ModelLoadRequest": { + "type": "object", + "properties": { + "model": { + "type": "string" + } + } + }, + "schema.ModelLoadResponse": { + "type": "object", + "properties": { + "loaded": { + "description": "Loaded lists the model names actually resident in memory after the call.\nFor a pipeline model these are its sub-models, not the pipeline name.", + "type": "array", + "items": { + "type": "string" + } + }, + "message": { + "description": "Message is a short human-readable status (\"model loaded\", or an error).", + "type": "string" + } + } + }, "schema.ModelsDataResponse": { "type": "object", "properties": { diff --git a/swagger/swagger.json b/swagger/swagger.json index 212b62c2f..32baa866d 100644 --- a/swagger/swagger.json +++ b/swagger/swagger.json @@ -1440,6 +1440,52 @@ "responses": {} } }, + "/backend/load": { + "post": { + "description": "Loads the named model (or, for a realtime pipeline, all of its sub-models) into memory so subsequent requests pay no cold-start cost. The inverse of /backend/shutdown.", + "consumes": [ + "application/json" + ], + "produces": [ + "application/json" + ], + "tags": [ + "monitoring" + ], + "summary": "Pre-load a model into memory", + "parameters": [ + { + "description": "Model to load", + "name": "request", + "in": "body", + "required": true, + "schema": { + "$ref": "#/definitions/schema.ModelLoadRequest" + } + } + ], + "responses": { + "200": { + "description": "Model loaded", + "schema": { + "$ref": "#/definitions/schema.ModelLoadResponse" + } + }, + "400": { + "description": "Missing model name", + "schema": { + "$ref": "#/definitions/schema.ModelLoadResponse" + } + }, + "500": { + "description": "Load failed (Loaded lists any sub-models that did load)", + "schema": { + "$ref": "#/definitions/schema.ModelLoadResponse" + } + } + } + } + }, "/backend/monitor": { "get": { "tags": [ @@ -5133,6 +5179,30 @@ } } }, + "schema.ModelLoadRequest": { + "type": "object", + "properties": { + "model": { + "type": "string" + } + } + }, + "schema.ModelLoadResponse": { + "type": "object", + "properties": { + "loaded": { + "description": "Loaded lists the model names actually resident in memory after the call.\nFor a pipeline model these are its sub-models, not the pipeline name.", + "type": "array", + "items": { + "type": "string" + } + }, + "message": { + "description": "Message is a short human-readable status (\"model loaded\", or an error).", + "type": "string" + } + } + }, "schema.ModelsDataResponse": { "type": "object", "properties": { diff --git a/swagger/swagger.yaml b/swagger/swagger.yaml index e005f09f3..ae158410a 100644 --- a/swagger/swagger.yaml +++ b/swagger/swagger.yaml @@ -1362,6 +1362,25 @@ definitions: $ref: '#/definitions/schema.ToolCall' type: array type: object + schema.ModelLoadRequest: + properties: + model: + type: string + type: object + schema.ModelLoadResponse: + properties: + loaded: + description: |- + Loaded lists the model names actually resident in memory after the call. + For a pipeline model these are its sub-models, not the pipeline name. + items: + type: string + type: array + message: + description: Message is a short human-readable status ("model loaded", or + an error). + type: string + type: object schema.ModelsDataResponse: properties: data: @@ -3510,6 +3529,38 @@ paths: summary: Bidirectional realtime audio transform over WebSocket. tags: - audio + /backend/load: + post: + consumes: + - application/json + description: Loads the named model (or, for a realtime pipeline, all of its + sub-models) into memory so subsequent requests pay no cold-start cost. The + inverse of /backend/shutdown. + parameters: + - description: Model to load + in: body + name: request + required: true + schema: + $ref: '#/definitions/schema.ModelLoadRequest' + produces: + - application/json + responses: + "200": + description: Model loaded + schema: + $ref: '#/definitions/schema.ModelLoadResponse' + "400": + description: Missing model name + schema: + $ref: '#/definitions/schema.ModelLoadResponse' + "500": + description: Load failed (Loaded lists any sub-models that did load) + schema: + $ref: '#/definitions/schema.ModelLoadResponse' + summary: Pre-load a model into memory + tags: + - monitoring /backend/monitor: get: parameters: