feat(realtime): eager blocking pipeline warm-up + /backend/load API (#10662)

Realtime sessions previously lazy-loaded each pipeline sub-model (VAD, transcription, LLM, TTS) on first use, so every cold session paid a per-request model-load stall and load errors only surfaced mid-stream.

Warm the whole pipeline eagerly and blockingly at session start (including the voice-gate speaker-recognition model, which an enforced gate blocks each utterance on; compaction's summary_model stays lazy since it only runs off the response path):

- Add backend.PreloadModel / PreloadModelByName as the single load path for every modality (no transcription special-case; backend-omitted configs are deprecated).
- The realtime session blocks on Model.Warmup and returns a model_load_error to the client if any stage fails to load; updateSession warms in the background. Opt out per pipeline with pipeline.disable_warmup, exposed as a UI toggle via the config-metadata registry.

Add a LocalAI-native POST /backend/load (and /v1/backend/load) that pre-loads a model -- expanding realtime pipelines into their sub-models -- as the inverse of /backend/shutdown. There is one preload engine (backend.PreloadStages): the realtime Warmup methods, /backend/load and the --load-to-memory startup flag all use it, so --load-to-memory now also expands pipeline models and records load-failure traces. Pipeline sub-model alias resolution is likewise shared (ModelConfigLoader.LoadResolvedModelConfig).

Surface the endpoint everywhere an admin manages models:

- MCP admin tool load_model (httpapi + inproc clients, safety/catalog prompts, catalog/dispatch tests).
- "Load into memory" action in the React models UI.
- Swagger regenerated; docs moved to the general backend-monitor page since it is not realtime-specific.

Fix a Traces UI crash ("json: unsupported value: -Inf"): audio-snippet RMS/peak now floor at a finite dBFS, and backend-trace data is sanitized to drop non-finite floats before marshaling. The sanitizer is copy-on-write -- it runs on every RecordBackendTrace, so containers are only re-allocated on the paths that actually changed.

Migrate core/http/openresponses_test.go onto the prebuilt mock-backend the rest of the http suite already uses -- it was the last spec still pointing at a real HuggingFace model, so it 404'd wherever no vision backend was built -- and fix its item_reference specs to send the spec's "id" field instead of "item_id", which the handler never accepted.

Assisted-by: Claude:claude-opus-4-8 Claude Code
Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-07-03 12:57:02 -04:00 · 2026-07-03 17:00:37 +01:00
parent 80ec22945a
commit eb32cd9073
45 changed files with 1364 additions and 99 deletions
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -473,20 +473,13 @@ func New(opts ...config.AppOption) (*Application, error) {

 	if options.LoadToMemory != nil && !options.SingleBackend {
 		for _, m := range options.LoadToMemory {
-			cfg, err := application.ModelConfigLoader().LoadModelConfigFileByNameDefaultOptions(m, options)
-			if err != nil {
+			xlog.Debug("Auto loading model into memory from file", "model", m)
+			// Same path as POST /backend/load: a realtime pipeline model expands
+			// to its sub-models, and load failures are recorded as model_load
+			// traces.
+			if _, err := backend.PreloadModelByName(options.Context, application.ModelConfigLoader(), application.ModelLoader(), options, m); err != nil {
 				return nil, err
 			}
-
-			xlog.Debug("Auto loading model into memory from file", "model", m, "file", cfg.Model)
-
-			o := backend.ModelOptions(*cfg, options)
-
-			var backendErr error
-			_, backendErr = application.ModelLoader().Load(o...)
-			if backendErr != nil {
-				return nil, backendErr
-			}
 		}
 	}

--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -52,6 +52,22 @@ func ModelLoadTraceObserver(appConfig *config.ApplicationConfig) func(model.Back
 	}
 }

+// PreloadModel loads modelConfig's backend into memory without running any
+// inference, so the first real request skips the cold-start load. It builds its
+// options with ModelOptions and calls ml.Load, the same path the modality
+// functions use, so a subsequent inference call hits the loader cache instead
+// of reloading. A load failure is passed to recordModelLoadFailure and
+// returned; whether it is fatal is the caller's call — realtime session start
+// aborts with model_load_error, while the session.update re-warm only logs it.
+func PreloadModel(ctx context.Context, ml *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig) error {
+	_, err := ml.Load(ModelOptions(modelConfig, appConfig, model.WithContext(ctx))...)
+	if err != nil {
+		recordModelLoadFailure(appConfig, modelConfig.Name, modelConfig.Backend, err, nil)
+	}
+	// err is nil on success, so a single return covers both outcomes.
+	return err
+}
+
 // recordModelLoadFailure records a backend trace when model loading fails.
 func recordModelLoadFailure(appConfig *config.ApplicationConfig, modelName, backend string, err error, data map[string]any) {
 	if !appConfig.EnableTracing {
--- a/core/backend/preload.go
+++ b/core/backend/preload.go
@@ -0,0 +1,122 @@
+package backend
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"sync"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/xlog"
+)
+
+// PreloadModelByName brings the named model into memory ahead of its first
+// request — the inverse of shutting a model down. A realtime pipeline model (one
+// whose config sets any `pipeline:` stage) has no backend of its own, so its
+// configured sub-models (VAD, transcription, LLM, TTS, sound_detection,
+// voice_recognition) are loaded concurrently in its place; any other model
+// loads its own backend. The result is the names actually loaded plus a joined
+// error naming each sub-model that failed (nil on full success), so a partial
+// pipeline load reports both what is resident and what is not. A config lookup
+// or stage-resolution failure returns before anything is loaded. Compaction's
+// summary_model is deliberately left cold: it is only invoked off the response
+// path, so it can stay lazy.
+func PreloadModelByName(ctx context.Context, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, name string) ([]string, error) {
+	cfg, err := cl.LoadModelConfigFileByNameDefaultOptions(name, appConfig)
+	if err != nil {
+		return nil, err
+	}
+
+	stages, err := pipelineStages(cl, &cfg.Pipeline, ml.ModelPath)
+	switch {
+	case err != nil:
+		return nil, err
+	case len(stages) > 0:
+		return PreloadStages(ctx, ml, appConfig, stages)
+	}
+	// No pipeline stages: this is a regular model, so load its own backend.
+	if err := PreloadModel(ctx, ml, *cfg, appConfig); err != nil {
+		return nil, err
+	}
+	return []string{cfg.Name}, nil
+}
+
+// PreloadStage names one pipeline sub-model to preload and the resolved config
+// to load it from (nil = stage absent, skipped). Role labels the pipeline slot
+// in errors and logs.
+type PreloadStage struct {
+	Role string              // pipeline slot label, e.g. "vad" or "llm"
+	Cfg  *config.ModelConfig // resolved sub-model config; nil marks an absent stage
+}
+
+// loadStage is PreloadModel behind a seam so PreloadStages can be unit-tested
+// without spawning real backends; specs that swap it must restore it afterwards.
+var loadStage = PreloadModel
+
+// pipelineStages turns every populated pipeline slot into a PreloadStage holding
+// its concrete model config, following a single alias hop — the same resolution
+// the realtime pipeline itself uses. Slots are visited in a fixed order (vad,
+// transcription, llm, tts, sound_detection, voice_recognition) and empty ones
+// are skipped. A slot that fails to resolve is a misconfiguration, so the first
+// such error is returned immediately instead of being deferred to load time. A
+// pipeline with no slots set yields nil, which callers treat as "not a pipeline".
+func pipelineStages(cl *config.ModelConfigLoader, p *config.Pipeline, modelPath string) ([]PreloadStage, error) {
+	type slot struct{ role, name string }
+	slots := []slot{
+		{"vad", p.VAD},
+		{"transcription", p.Transcription},
+		{"llm", p.LLM},
+		{"tts", p.TTS},
+		{"sound_detection", p.SoundDetection},
+	}
+	// voice_recognition lives in an optional sub-block, so it only contributes
+	// a model name when that block is present.
+	if vr := p.VoiceRecognition; vr != nil {
+		slots = append(slots, slot{"voice_recognition", vr.Model})
+	}
+
+	var stages []PreloadStage
+	for _, sl := range slots {
+		if sl.name == "" {
+			continue
+		}
+		resolved, err := cl.LoadResolvedModelConfig(sl.name, modelPath)
+		if err != nil {
+			return nil, fmt.Errorf("%s (%s): %w", sl.role, sl.name, err)
+		}
+		stages = append(stages, PreloadStage{Role: sl.role, Cfg: resolved})
+	}
+	return stages, nil
+}
+
+// PreloadStages loads every present stage at once and waits for all of them, so
+// a pipeline warms in the time of its slowest stage rather than the sum. Absent
+// (nil-config) stages are skipped. A failed stage does not cancel the others —
+// they all run to completion so the joined error names every broken stage at
+// once, alongside the names that did load. Loaded names and joined errors are
+// reported in stage order, not completion order, so the output is stable.
+func PreloadStages(ctx context.Context, ml *model.ModelLoader, appConfig *config.ApplicationConfig, stages []PreloadStage) ([]string, error) {
+	// Each goroutine writes only its own slot, so no mutex is needed; wg.Wait
+	// publishes the writes before the slots are read below.
+	results := make([]error, len(stages))
+	var wg sync.WaitGroup
+	for i, s := range stages {
+		if s.Cfg == nil {
+			continue
+		}
+		wg.Add(1)
+		go func(i int, s PreloadStage) {
+			defer wg.Done()
+			if err := loadStage(ctx, ml, *s.Cfg, appConfig); err != nil {
+				xlog.Warn("preload: failed to load pipeline sub-model", "stage", s.Role, "model", s.Cfg.Name, "error", err)
+				results[i] = fmt.Errorf("%s (%s): %w", s.Role, s.Cfg.Name, err)
+				return
+			}
+			xlog.Debug("preload: loaded pipeline sub-model", "stage", s.Role, "model", s.Cfg.Name)
+		}(i, s)
+	}
+	wg.Wait()
+	var loaded []string
+	for i, s := range stages {
+		if s.Cfg != nil && results[i] == nil {
+			loaded = append(loaded, s.Cfg.Name)
+		}
+	}
+	return loaded, errors.Join(results...) // Join drops the nil slots
+}
--- a/core/backend/preload_internal_test.go
+++ b/core/backend/preload_internal_test.go
@@ -0,0 +1,146 @@
+package backend
+
+import (
+	"context"
+	"errors"
+	"os"
+	"path/filepath"
+	"sync"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/model"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs for pipelineStages: each spec seeds a temp dir with minimal model
+// configs and checks which pipeline slots come back as stages.
+var _ = Describe("pipelineStages", func() {
+	// seed writes one "<name>.yaml" config per name into dir (each with a
+	// placeholder backend) and returns a loader that has read them.
+	seed := func(dir string, names ...string) *config.ModelConfigLoader {
+		for _, n := range names {
+			yaml := "name: " + n + "\nbackend: fake-backend\n"
+			Expect(os.WriteFile(filepath.Join(dir, n+".yaml"), []byte(yaml), 0o644)).To(Succeed())
+		}
+		cl := config.NewModelConfigLoader(dir)
+		Expect(cl.LoadModelConfigsFromPath(dir)).To(Succeed())
+		return cl
+	}
+
+	It("resolves only the populated stages, in load order", func() {
+		dir := GinkgoT().TempDir()
+		cl := seed(dir, "vad-m", "stt-m", "llm-m", "tts-m")
+
+		stages, err := pipelineStages(cl, &config.Pipeline{
+			VAD:           "vad-m",
+			Transcription: "stt-m",
+			LLM:           "llm-m",
+			TTS:           "tts-m",
+		}, dir)
+		Expect(err).ToNot(HaveOccurred())
+
+		roles := make([]string, len(stages))
+		names := make([]string, len(stages))
+		for i, s := range stages {
+			roles[i] = s.Role
+			names[i] = s.Cfg.Name
+		}
+		// Equal (not ConsistOf): the slot order is part of the contract here.
+		Expect(roles).To(Equal([]string{"vad", "transcription", "llm", "tts"}))
+		Expect(names).To(Equal([]string{"vad-m", "stt-m", "llm-m", "tts-m"}))
+	})
+
+	It("skips unset stages and includes sound_detection and voice_recognition when set", func() {
+		dir := GinkgoT().TempDir()
+		cl := seed(dir, "stt-m", "ced", "spk")
+
+		stages, err := pipelineStages(cl, &config.Pipeline{
+			Transcription:    "stt-m",
+			SoundDetection:   "ced",
+			VoiceRecognition: &config.PipelineVoiceRecognition{Model: "spk"},
+		}, dir)
+		Expect(err).ToNot(HaveOccurred())
+
+		roles := make([]string, len(stages))
+		for i, s := range stages {
+			roles[i] = s.Role
+		}
+		Expect(roles).To(ConsistOf("transcription", "sound_detection", "voice_recognition"))
+	})
+
+	It("returns nil for a pipeline with no stages (not a pipeline)", func() {
+		dir := GinkgoT().TempDir()
+		cl := seed(dir)
+
+		stages, err := pipelineStages(cl, &config.Pipeline{}, dir)
+		Expect(err).ToNot(HaveOccurred())
+		Expect(stages).To(BeNil())
+	})
+})
+
+// Specs for PreloadStages: the loadStage seam is replaced by a recorder so no
+// real backends are spawned, and AfterEach puts the real loader back.
+var _ = Describe("PreloadStages", func() {
+	// seen records every model name the stub loader was asked to load; mu
+	// guards it because PreloadStages calls the loader from several goroutines.
+	var (
+		mu   sync.Mutex
+		seen []string
+	)
+
+	// stubLoader swaps the loadStage seam for a recorder so no real backends
+	// are spawned; errFor injects per-model failures.
+	stubLoader := func(errFor map[string]error) {
+		loadStage = func(_ context.Context, _ *model.ModelLoader, cfg config.ModelConfig, _ *config.ApplicationConfig) error {
+			mu.Lock()
+			seen = append(seen, cfg.Name)
+			mu.Unlock()
+			// Indexing a nil map is fine and yields a nil error.
+			return errFor[cfg.Name]
+		}
+	}
+
+	BeforeEach(func() {
+		seen = nil
+	})
+	AfterEach(func() {
+		loadStage = PreloadModel
+	})
+
+	// mkStage builds a present stage whose config carries only a name.
+	mkStage := func(role, name string) PreloadStage {
+		return PreloadStage{Role: role, Cfg: &config.ModelConfig{Name: name}}
+	}
+
+	It("loads every present stage, skips absent (nil-config) ones, and returns the loaded names", func() {
+		stubLoader(nil)
+
+		loaded, err := PreloadStages(context.Background(), nil, nil, []PreloadStage{
+			mkStage("vad", "vad-m"),
+			{Role: "transcription"}, // absent stage
+			mkStage("llm", "llm-m"),
+		})
+
+		Expect(err).ToNot(HaveOccurred())
+		Expect(loaded).To(ConsistOf("vad-m", "llm-m"))
+		// Barrier: every stage has run by the time PreloadStages returns, so
+		// reading seen without the lock here is safe.
+		Expect(seen).To(ConsistOf("vad-m", "llm-m"))
+	})
+
+	It("reports a joined error naming each failed stage while still loading the rest", func() {
+		stubLoader(map[string]error{
+			"vad-m": errors.New("vad boom"),
+			"tts-m": errors.New("tts boom"),
+		})
+
+		loaded, err := PreloadStages(context.Background(), nil, nil, []PreloadStage{
+			mkStage("vad", "vad-m"),
+			mkStage("llm", "llm-m"),
+			mkStage("tts", "tts-m"),
+		})
+
+		// Every stage ran (a failure does not cancel the others)...
+		Expect(seen).To(ConsistOf("vad-m", "llm-m", "tts-m"))
+		// ...the stage that loaded fine is reported as loaded...
+		Expect(loaded).To(ConsistOf("llm-m"))
+		// ...and the joined error names every broken stage and its cause.
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("vad (vad-m)"))
+		Expect(err.Error()).To(ContainSubstring("vad boom"))
+		Expect(err.Error()).To(ContainSubstring("tts (tts-m)"))
+		Expect(err.Error()).To(ContainSubstring("tts boom"))
+		Expect(err.Error()).ToNot(ContainSubstring("llm"))
+	})
+})
--- a/core/config/meta/registry.go
+++ b/core/config/meta/registry.go
@@ -599,6 +599,13 @@ func DefaultRegistry() map[string]FieldMetaOverride {
 			Component:   "toggle",
 			Order:       89,
 		},
+		"pipeline.disable_warmup": {
+			Section:     "pipeline",
+			Label:       "Disable Warmup",
+			Description: "Turn off eager pre-loading of the pipeline's sub-models at realtime session start. By default LocalAI loads every configured sub-model backend (VAD, transcription, LLM, TTS, sound detection, voice recognition) before the session starts and blocks until they are ready, so the first turn pays no cold-start cost and a model that fails to load is reported at session start instead of mid-call. Enable this to restore the lazy 'load on first use' behavior — session start no longer waits on loading and load errors surface on the first turn instead. Useful to keep idle sessions from holding model memory they may never use.",
+			Component:   "toggle",
+			Order:       90,
+		},

 		// --- Functions ---
 		"function.grammar.parallel_calls": {
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -656,6 +656,18 @@ type Pipeline struct {
 	// to benefit. A client session.update still overrides type and eagerness
 	// per session; retranscribe is server-side only. Unset keeps server_vad.
 	TurnDetection PipelineTurnDetection `yaml:"turn_detection,omitempty" json:"turn_detection,omitempty"`
+
+	// DisableWarmup turns off eager pre-loading of the pipeline's sub-models at
+	// realtime session start. By default (false) LocalAI loads every configured
+	// sub-model backend (VAD, transcription, LLM, TTS, sound detection, voice
+	// recognition) into memory (concurrently) before the
+	// session is announced and blocks until they are ready, so the first turn
+	// pays no cold-start cost and a model that fails to load surfaces as an error
+	// at session start rather than mid-call. Set true to restore the lazy "load
+	// on first use" behavior — session start no longer blocks on loading and
+	// load errors surface on first use instead (e.g. to keep idle sessions from
+	// holding model memory they may never use).
+	DisableWarmup bool `yaml:"disable_warmup,omitempty" json:"disable_warmup,omitempty"`
 }

 // PipelineCompaction configures summarize-then-drop for a realtime pipeline.
--- a/core/config/model_config_loader.go
+++ b/core/config/model_config_loader.go
@@ -155,6 +155,25 @@ func (bcl *ModelConfigLoader) LoadModelConfigFileByNameDefaultOptions(modelName
 		ModelPath(appConfig.SystemState.Model.ModelsPath))
 }

+// LoadResolvedModelConfig looks up modelName and then follows a single alias
+// hop, so a reference to an alias (e.g. a pipeline with `llm: default`) yields
+// the alias target's full config (Backend, Model, ...) instead of the alias
+// stub with an empty Backend. Left unresolved, the alias reaches model loading
+// and fails downstream — notably in distributed mode with "backend name is
+// empty". Mirrors the top-level alias resolution in
+// core/http/middleware/request.go.
+func (bcl *ModelConfigLoader) LoadResolvedModelConfig(modelName, modelPath string) (*ModelConfig, error) {
+	stub, err := bcl.LoadModelConfigFileByName(modelName, modelPath)
+	if err != nil {
+		return nil, err
+	}
+	// Only the resolved config is needed; ResolveAlias's second result is unused.
+	target, _, err := bcl.ResolveAlias(stub)
+	if err != nil {
+		return nil, err
+	}
+	return target, nil
+}
+
 // This format is currently only used when reading a single file at startup, passed in via ApplicationConfig.ConfigFile
 func (bcl *ModelConfigLoader) LoadMultipleModelConfigsSingleFile(file string, opts ...ConfigLoaderOption) error {
 	bcl.Lock()
--- a/core/http/endpoints/openai/realtime_model_alias_test.go
+++ b/core/http/endpoints/openai/realtime_model_alias_test.go
@@ -1,4 +1,4 @@
-package openai
+package config_test

 import (
 	"os"
@@ -10,14 +10,14 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 )

-// loadPipelineSubModel must resolve a pipeline sub-model that references an
-// alias (e.g. `llm: default`) one hop to the alias target's full config — so
-// the effective backend is the target's backend, not the empty backend of the
-// alias stub. This mirrors the top-level alias resolution done in
-// core/http/middleware/request.go, which the realtime pipeline previously
+// LoadResolvedModelConfig must resolve a model that references an alias
+// (e.g. a pipeline with `llm: default`) one hop to the alias target's full
+// config — so the effective backend is the target's backend, not the empty
+// backend of the alias stub. This mirrors the top-level alias resolution done
+// in core/http/middleware/request.go, which the realtime pipeline previously
 // skipped (failing in distributed mode with "backend name is empty").
-var _ = Describe("loadPipelineSubModel", func() {
-	It("resolves a sub-model alias one hop to the target's config", func() {
+var _ = Describe("LoadResolvedModelConfig", func() {
+	It("resolves an alias one hop to the target's config", func() {
 		tmpDir := GinkgoT().TempDir()

 		// A real model config with a concrete backend.
@@ -38,13 +38,13 @@ alias: real-llm
 		Expect(cl.LoadModelConfigsFromPath(tmpDir)).To(Succeed())

 		// Resolving the alias must follow the hop to the target's full config.
-		resolved, err := loadPipelineSubModel(cl, "default", tmpDir)
+		resolved, err := cl.LoadResolvedModelConfig("default", tmpDir)
 		Expect(err).NotTo(HaveOccurred())
 		Expect(resolved.IsAlias()).To(BeFalse())
 		Expect(resolved.Backend).To(Equal("llama-cpp"))

 		// A non-alias name must load unchanged.
-		direct, err := loadPipelineSubModel(cl, "real-llm", tmpDir)
+		direct, err := cl.LoadResolvedModelConfig("real-llm", tmpDir)
 		Expect(err).NotTo(HaveOccurred())
 		Expect(direct.Backend).To(Equal("llama-cpp"))
 		Expect(direct.Name).To(Equal("real-llm"))
--- a/core/http/endpoints/localai/backend_load.go
+++ b/core/http/endpoints/localai/backend_load.go
@@ -0,0 +1,54 @@
+package localai
+
+import (
+	"net/http"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/backend"
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/xlog"
+)
+
+// LoadModelEndpoint pre-loads a model into memory by name — the inverse of
+// /backend/shutdown. For a realtime pipeline model every configured sub-model
+// (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded; for a regular
+// model its own backend is loaded. The call blocks until loading finishes so
+// clients can drive warm-up explicitly and learn up front whether a model
+// fails to load.
+// @Summary Pre-load a model into memory
+// @Description Loads the named model (or, for a realtime pipeline, all of its sub-models) into memory so subsequent requests pay no cold-start cost. The inverse of /backend/shutdown.
+// @Tags monitoring
+// @Accept json
+// @Produce json
+// @Param request body schema.ModelLoadRequest true "Model to load"
+// @Success 200 {object} schema.ModelLoadResponse "Model loaded"
+// @Failure 400 {object} schema.ModelLoadResponse "Missing model name"
+// @Failure 500 {object} schema.ModelLoadResponse "Load failed (Loaded lists any sub-models that did load)"
+// @Router /backend/load [post]
+func LoadModelEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc {
+	return func(c echo.Context) error {
+		input := new(schema.ModelLoadRequest)
+		if err := c.Bind(input); err != nil {
+			return err
+		}
+		if input.Model == "" {
+			return c.JSON(http.StatusBadRequest, schema.ModelLoadResponse{Message: "model is required"})
+		}
+
+		loaded, err := backend.PreloadModelByName(c.Request().Context(), cl, ml, appConfig, input.Model)
+		if err != nil {
+			xlog.Error("failed to pre-load model", "model", input.Model, "loaded", loaded, "error", err)
+			return c.JSON(http.StatusInternalServerError, schema.ModelLoadResponse{
+				Loaded:  loaded,
+				Message: "failed to load model: " + err.Error(),
+			})
+		}
+
+		return c.JSON(http.StatusOK, schema.ModelLoadResponse{
+			Loaded:  loaded,
+			Message: "model loaded",
+		})
+	}
+}
--- a/core/http/endpoints/localai/backend_load_test.go
+++ b/core/http/endpoints/localai/backend_load_test.go
@@ -0,0 +1,102 @@
+package localai_test
+
+import (
+	"bytes"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+
+	"github.com/labstack/echo/v4"
+	"github.com/mudler/LocalAI/core/config"
+	. "github.com/mudler/LocalAI/core/http/endpoints/localai"
+	"github.com/mudler/LocalAI/core/schema"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/system"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs for POST /backend/load. The model loader is built with no backends
+// installed, so every load attempt fails; the specs check request validation
+// and how failures are reported, not a successful load.
+var _ = Describe("LoadModelEndpoint (/backend/load)", func() {
+	var (
+		app          *echo.Echo
+		tempDir      string
+		configLoader *config.ModelConfigLoader
+		modelLoader  *model.ModelLoader
+		appConfig    *config.ApplicationConfig
+	)
+
+	// post sends body as a JSON POST to /backend/load and returns the recorder.
+	post := func(body string) *httptest.ResponseRecorder {
+		req := httptest.NewRequest(http.MethodPost, "/backend/load", bytes.NewBufferString(body))
+		req.Header.Set(echo.HeaderContentType, echo.MIMEApplicationJSON)
+		rec := httptest.NewRecorder()
+		app.ServeHTTP(rec, req)
+		return rec
+	}
+
+	// decode parses the recorded response body as a ModelLoadResponse.
+	decode := func(rec *httptest.ResponseRecorder) schema.ModelLoadResponse {
+		var resp schema.ModelLoadResponse
+		Expect(json.Unmarshal(rec.Body.Bytes(), &resp)).To(Succeed())
+		return resp
+	}
+
+	// writeConfig drops "<name>.yaml" with the given contents into tempDir.
+	writeConfig := func(name, contents string) {
+		Expect(os.WriteFile(filepath.Join(tempDir, name+".yaml"), []byte(contents), 0o600)).To(Succeed())
+	}
+
+	BeforeEach(func() {
+		var err error
+		tempDir, err = os.MkdirTemp("", "backend-load-test-*")
+		Expect(err).NotTo(HaveOccurred())
+
+		systemState, err := system.GetSystemState(system.WithModelPath(tempDir))
+		Expect(err).NotTo(HaveOccurred())
+
+		appConfig = config.NewApplicationConfig(config.WithSystemState(systemState))
+		configLoader = config.NewModelConfigLoader(tempDir)
+		modelLoader = model.NewModelLoader(systemState) // no backends installed
+
+		app = echo.New()
+		app.POST("/backend/load", LoadModelEndpoint(configLoader, modelLoader, appConfig))
+	})
+
+	AfterEach(func() {
+		// Best-effort cleanup; a leftover temp dir must not fail the spec.
+		_ = os.RemoveAll(tempDir)
+	})
+
+	It("rejects a request with no model name", func() {
+		rec := post(`{}`)
+		Expect(rec.Code).To(Equal(http.StatusBadRequest))
+		Expect(decode(rec).Message).To(ContainSubstring("model is required"))
+	})
+
+	It("reports a load failure for a regular model with nothing loaded", func() {
+		writeConfig("solo", "name: solo\n")
+
+		rec := post(`{"model":"solo"}`)
+		Expect(rec.Code).To(Equal(http.StatusInternalServerError))
+
+		resp := decode(rec)
+		Expect(resp.Loaded).To(BeEmpty())
+		Expect(resp.Message).To(ContainSubstring("failed to load model"))
+	})
+
+	It("expands a pipeline model and reports each sub-model that failed to load", func() {
+		writeConfig("voicebot", "name: voicebot\npipeline:\n  vad: vad-m\n  transcription: stt-m\n  llm: llm-m\n  tts: tts-m\n")
+		writeConfig("vad-m", "name: vad-m\n")
+		writeConfig("stt-m", "name: stt-m\n")
+		writeConfig("llm-m", "name: llm-m\n")
+		writeConfig("tts-m", "name: tts-m\n")
+
+		rec := post(`{"model":"voicebot"}`)
+		Expect(rec.Code).To(Equal(http.StatusInternalServerError))
+
+		resp := decode(rec)
+		Expect(resp.Message).To(ContainSubstring("failed to load model"))
+		// The pipeline stub itself is never loaded; its sub-models are what the
+		// endpoint tries, so the error names them rather than "voicebot".
+		Expect(resp.Message).To(ContainSubstring("vad-m"))
+		Expect(resp.Message).ToNot(ContainSubstring("voicebot"))
+	})
+})
--- a/core/http/endpoints/mcp/localai_assistant_test.go
+++ b/core/http/endpoints/mcp/localai_assistant_test.go
@@ -51,6 +51,9 @@ func (stubClient) EditModelConfig(_ context.Context, _ string, _ map[string]any)
 	return nil
 }
 func (stubClient) ReloadModels(_ context.Context) error { return nil }
+// LoadModel is a stub that always succeeds and reports the requested model
+// name as the only one loaded.
+func (stubClient) LoadModel(_ context.Context, model string) ([]string, error) {
+	return []string{model}, nil
+}
 func (stubClient) SetAlias(_ context.Context, _, _ string) error {
 	return nil
 }
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -7,6 +7,7 @@ import (
 	"encoding/binary"
 	"encoding/hex"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"math"
 	"os"
@@ -266,6 +267,12 @@ type Model interface {
 	// grpcerrors.IsLiveTranscriptionUnsupported.
 	TranscribeLive(ctx context.Context, language string, onEvent func(backend.LiveTranscriptionEvent)) (backend.LiveTranscriptionSession, error)
 	PredictConfig() *config.ModelConfig
+	// Warmup eagerly loads the pipeline's sub-model backends into memory so the
+	// first realtime turn doesn't pay each backend's cold-start load cost. Loads
+	// run concurrently; Warmup blocks until they all finish and returns a joined
+	// error naming every stage that failed to load (nil if all succeeded), so a
+	// caller can surface model-load failures at session start instead of mid-call.
+	Warmup(ctx context.Context) error
 }

 var upgrader = websocket.Upgrader{
@@ -583,18 +590,8 @@ func runRealtimeSession(application *application.Application, t Transport, model
 	}
 	session.ModelInterface = m

-	if session.SummaryModel != "" {
-		summaryModelName := session.SummaryModel
-		sid := sessionID
-		session.summarizerFactory = func() (Model, error) {
-			summaryCfg, lerr := application.ModelConfigLoader().LoadModelConfigFileByNameDefaultOptions(summaryModelName, application.ApplicationConfig())
-			if lerr != nil {
-				return nil, fmt.Errorf("load summary model config %q: %w", summaryModelName, lerr)
-			}
-			return newModel(&summaryCfg.Pipeline, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), evaluator, buildRealtimeRoutingContext(application, sid))
-		}
-	}
-
+	// The voice gate is built before the warm-up below so its
+	// speaker-recognition model can warm alongside the pipeline stages.
 	if cfg.Pipeline.VoiceGateEnabled() {
 		gate, gerr := newVoiceGate(
 			*cfg.Pipeline.VoiceRecognition,
@@ -612,6 +609,47 @@ func runRealtimeSession(application *application.Application, t Transport, model
 		xlog.Info("realtime voice recognition gate enabled", "mode", gate.cfg.Mode, "when", gate.cfg.When)
 	}

+	// Warm the pipeline's sub-model backends before announcing the session.
+	// Loads run concurrently but we block here until they all finish, so a model
+	// that fails to load (missing weights, bad backend, OOM) surfaces as an error
+	// at session start rather than stalling — or failing — mid-call on the first
+	// turn (VAD on the first audio chunk, STT at end-of-speech, LLM on the first
+	// reply, TTS on the first spoken output). On success the backends are already
+	// resident, so the first turn pays no cold-start cost. Opt out per pipeline
+	// with `pipeline.disable_warmup: true` to restore lazy load-on-first-use
+	// (errors then surface on first use instead of at session start).
+	if !cfg.Pipeline.DisableWarmup {
+		warmErr := make(chan error, 1)
+		go func() { warmErr <- m.Warmup(context.Background()) }()
+		// The voice-gate model warms concurrently with the pipeline stages: an
+		// enforced gate blocks each utterance on speaker resolution, so its
+		// cold-start would otherwise land on the first turn too. (Compaction's
+		// summary_model stays lazy — it only runs off the response path.)
+		var gateErr error
+		if session.voiceGate != nil {
+			_, gateErr = backend.PreloadStages(context.Background(), application.ModelLoader(), application.ApplicationConfig(), []backend.PreloadStage{
+				{Role: "voice_recognition", Cfg: session.voiceGate.recCfg},
+			})
+		}
+		if err := errors.Join(<-warmErr, gateErr); err != nil {
+			xlog.Error("realtime warmup failed", "model", model, "error", err)
+			sendError(t, "model_load_error", "Failed to load pipeline models: "+err.Error(), "", "")
+			return
+		}
+	}
+
+	if session.SummaryModel != "" {
+		summaryModelName := session.SummaryModel
+		sid := sessionID
+		session.summarizerFactory = func() (Model, error) {
+			summaryCfg, lerr := application.ModelConfigLoader().LoadModelConfigFileByNameDefaultOptions(summaryModelName, application.ApplicationConfig())
+			if lerr != nil {
+				return nil, fmt.Errorf("load summary model config %q: %w", summaryModelName, lerr)
+			}
+			return newModel(&summaryCfg.Pipeline, application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig(), evaluator, buildRealtimeRoutingContext(application, sid))
+		}
+	}
+
 	// Store the session and notify the transport (for WebRTC audio track handling)
 	sessionLock.Lock()
 	sessions[sessionID] = session
@@ -1125,6 +1163,21 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
 			return err
 		}
 		session.ModelInterface = m
+		// A session.update that swaps the model/voice rebuilds the pipeline, so
+		// warm the new backends too (unless opted out) — otherwise the next turn
+		// pays the cold-start load the original session warm-up already avoided.
+		// Unlike session start this stays non-blocking: updateSession runs under
+		// the global sessionLock, so blocking on a multi-second load here would
+		// stall every other session. Load errors are logged (and still surface on
+		// first use); per-stage failures are already warned inside
+		// backend.PreloadStages.
+		if !session.ModelConfig.Pipeline.DisableWarmup {
+			go func() {
+				if err := m.Warmup(context.Background()); err != nil {
+					xlog.Error("realtime warmup failed after session.update", "error", err)
+				}
+			}()
+		}
 	}

 	if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.TurnDetectionSet {
--- a/core/http/endpoints/openai/realtime_doubles_test.go
+++ b/core/http/endpoints/openai/realtime_doubles_test.go
@@ -174,6 +174,10 @@ func (m *fakeModel) TranscribeLive(_ context.Context, _ string, onEvent func(bac
 
 func (m *fakeModel) PredictConfig() *config.ModelConfig { return m.cfg }
 
+// Warmup is a no-op for the test double: it ignores its context and always
+// reports success.
+func (m *fakeModel) Warmup(_ context.Context) error { return nil }
+
 // fakeLiveSession records what semantic_vad fed and closed; closeEvents are
 // replayed through onEvent during Close, mimicking the backend's finalize
 // flush (trailing delta + Final) landing before Close returns.
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -110,6 +110,15 @@ func (m *transcriptOnlyModel) PredictConfig() *config.ModelConfig {
 	return nil
 }

+func (m *transcriptOnlyModel) Warmup(ctx context.Context) error {
+	_, err := backend.PreloadStages(ctx, m.modelLoader, m.appConfig, []backend.PreloadStage{ // first result is discarded; only the load error is reported
+		{Role: "vad", Cfg: m.VADConfig},
+		{Role: "transcription", Cfg: m.TranscriptionConfig},
+		{Role: "sound_detection", Cfg: m.SoundDetectionConfig}, // optional stage: a nil Cfg is skipped rather than failing the warm-up
+	})
+	return err
+}
+
 func (m *wrappedModel) VAD(ctx context.Context, request *schema.VADRequest) (*schema.VADResponse, error) {
 	return backend.VAD(request, ctx, m.modelLoader, m.appConfig, *m.VADConfig)
 }
@@ -360,6 +369,17 @@ func (m *wrappedModel) PredictConfig() *config.ModelConfig {
 	return m.LLMConfig
 }

+func (m *wrappedModel) Warmup(ctx context.Context) error {
+	_, err := backend.PreloadStages(ctx, m.modelLoader, m.appConfig, []backend.PreloadStage{ // first result is discarded; only the load error is reported
+		{Role: "vad", Cfg: m.VADConfig},
+		{Role: "transcription", Cfg: m.TranscriptionConfig},
+		{Role: "llm", Cfg: m.LLMConfig},
+		{Role: "tts", Cfg: m.TTSConfig},
+		{Role: "sound_detection", Cfg: m.SoundDetectionConfig}, // optional stage: a nil Cfg is skipped rather than failing the warm-up
+	})
+	return err
+}
+
 // wavStreamHeaderBytes is the size of the WAV header that backend.ModelTTSStream
 // emits as its first audio callback; the sample rate lives at byte offset 24.
 const wavStreamHeaderBytes = 44
@@ -440,7 +460,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL
 	if pipeline.SoundDetection == "" {
 		return nil, nil
 	}
-	cfg, err := loadPipelineSubModel(cl, pipeline.SoundDetection, ml.ModelPath)
+	cfg, err := cl.LoadResolvedModelConfig(pipeline.SoundDetection, ml.ModelPath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to load sound detection config: %w", err)
 	}
@@ -451,7 +471,7 @@ func loadSoundDetectionConfig(pipeline *config.Pipeline, cl *config.ModelConfigL
 }

 func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) (Model, *config.ModelConfig, error) {
-	cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath)
+	cfgVAD, err := cl.LoadResolvedModelConfig(pipeline.VAD, ml.ModelPath)
 	if err != nil {

 		return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -461,7 +481,7 @@ func newTranscriptionOnlyModel(pipeline *config.Pipeline, cl *config.ModelConfig
 		return nil, nil, fmt.Errorf("failed to validate config: %w", err)
 	}

-	cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath)
+	cfgSST, err := cl.LoadResolvedModelConfig(pipeline.Transcription, ml.ModelPath)
 	if err != nil {

 		return nil, nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -550,30 +570,11 @@ func buildRealtimeRoutingContext(a *application.Application, sessionID string) *
 	}
 }

-// loadPipelineSubModel loads a pipeline sub-model config by name and follows a
-// single alias hop, so a pipeline that references an alias (e.g. `llm: default`)
-// gets the alias target's full config (Backend, Model, ...) rather than the
-// alias stub with an empty Backend. Without this the alias survives unresolved
-// into model loading and fails downstream — notably in distributed mode with
-// "backend name is empty". Mirrors the top-level alias resolution in
-// core/http/middleware/request.go.
-func loadPipelineSubModel(cl *config.ModelConfigLoader, name, modelPath string) (*config.ModelConfig, error) {
-	cfg, err := cl.LoadModelConfigFileByName(name, modelPath)
-	if err != nil {
-		return nil, err
-	}
-	resolved, _, err := cl.ResolveAlias(cfg)
-	if err != nil {
-		return nil, err
-	}
-	return resolved, nil
-}
-
 // returns and loads either a wrapped model or a model that support audio-to-audio
 func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, evaluator *templates.Evaluator, routing *RealtimeRoutingContext) (Model, error) {
 	xlog.Debug("Creating new model pipeline model", "pipeline", pipeline)

-	cfgVAD, err := loadPipelineSubModel(cl, pipeline.VAD, ml.ModelPath)
+	cfgVAD, err := cl.LoadResolvedModelConfig(pipeline.VAD, ml.ModelPath)
 	if err != nil {

 		return nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -584,7 +585,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 	}

 	// TODO: Do we always need a transcription model? It can be disabled. Note that any-to-any instruction following models don't transcribe as such, so if transcription is required it is a separate process
-	cfgSST, err := loadPipelineSubModel(cl, pipeline.Transcription, ml.ModelPath)
+	cfgSST, err := cl.LoadResolvedModelConfig(pipeline.Transcription, ml.ModelPath)
 	if err != nil {

 		return nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -616,7 +617,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 	xlog.Debug("Loading a wrapped model")

 	// Otherwise we want to return a wrapped model, which is a "virtual" model that re-uses other models to perform operations
-	cfgLLM, err := loadPipelineSubModel(cl, pipeline.LLM, ml.ModelPath)
+	cfgLLM, err := cl.LoadResolvedModelConfig(pipeline.LLM, ml.ModelPath)
 	if err != nil {

 		return nil, fmt.Errorf("failed to load backend config: %w", err)
@@ -631,7 +632,7 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 	applyPipelineReasoning(cfgLLM, *pipeline)
 	applyPipelineThinking(cfgLLM, *pipeline)

-	cfgTTS, err := loadPipelineSubModel(cl, pipeline.TTS, ml.ModelPath)
+	cfgTTS, err := cl.LoadResolvedModelConfig(pipeline.TTS, ml.ModelPath)
 	if err != nil {

 		return nil, fmt.Errorf("failed to load backend config: %w", err)
--- a/core/http/endpoints/openai/realtime_voicegate.go
+++ b/core/http/endpoints/openai/realtime_voicegate.go
@@ -21,6 +21,7 @@ type namedEmbedding struct {
 // drive the realtime pipeline.
 type voiceGate struct {
 	cfg       config.PipelineVoiceRecognition // normalized
+	recCfg    *config.ModelConfig             // resolved speaker-recognition model, for warm-up
 	registry  voicerecognition.Registry       // identify mode (nil otherwise)
 	refEmbeds []namedEmbedding                // verify mode, pre-embedded refs
 	refAudios []config.VoiceReference         // verify + anti-spoofing: ref paths
@@ -72,7 +73,9 @@ func newVoiceGate(
 		return nil, err
 	}

-	recCfg, err := cl.LoadModelConfigFileByName(cfg.Model, ml.ModelPath)
+	// Resolved like every other pipeline sub-model (one alias hop), so an
+	// aliased voice_recognition model gets its target's backend.
+	recCfg, err := cl.LoadResolvedModelConfig(cfg.Model, ml.ModelPath)
 	if err != nil {
 		return nil, fmt.Errorf("voice_recognition: failed to load model %q: %w", cfg.Model, err)
 	}
@@ -82,6 +85,7 @@ func newVoiceGate(

 	g := &voiceGate{
 		cfg:      cfg,
+		recCfg:   recCfg,
 		registry: registry,
 		embedFn: func(ctx context.Context, wavPath string) ([]float32, error) {
 			res, err := backend.VoiceEmbed(ctx, wavPath, ml, appConfig, *recCfg)
--- a/core/http/endpoints/openai/realtime_warmup_test.go
+++ b/core/http/endpoints/openai/realtime_warmup_test.go
@@ -0,0 +1,64 @@
+package openai
+
+import (
+	"context"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/system"
+)
+
+// Warmup delegates to backend.PreloadStages (its concurrency, nil-skipping and
+// error-joining semantics are pinned in core/backend). These specs pin the
+// wiring instead: each realtime model type must warm exactly its configured
+// stages under the right pipeline-role labels. No backends are installed, so
+// every attempted stage fails to load — the joined error is the proof of which
+// stages were attempted and how they were labeled.
+var _ = Describe("realtime model Warmup wiring", func() {
+	newLoader := func() (*model.ModelLoader, *config.ApplicationConfig) {
+		systemState, err := system.GetSystemState(system.WithModelPath(GinkgoT().TempDir()))
+		Expect(err).ToNot(HaveOccurred())
+		appConfig := config.NewApplicationConfig(config.WithSystemState(systemState))
+		return model.NewModelLoader(systemState), appConfig
+	}
+
+	It("wrappedModel warms every configured stage under its pipeline role", func() {
+		ml, appConfig := newLoader()
+		m := &wrappedModel{
+			VADConfig:            &config.ModelConfig{Name: "vad-m"},
+			TranscriptionConfig:  &config.ModelConfig{Name: "stt-m"},
+			LLMConfig:            &config.ModelConfig{Name: "llm-m"},
+			TTSConfig:            &config.ModelConfig{Name: "tts-m"},
+			SoundDetectionConfig: &config.ModelConfig{Name: "ced-m"},
+			modelLoader:          ml,
+			appConfig:            appConfig,
+		}
+
+		err := m.Warmup(context.Background())
+		Expect(err).To(HaveOccurred())
+		for _, stage := range []string{"vad (vad-m)", "transcription (stt-m)", "llm (llm-m)", "tts (tts-m)", "sound_detection (ced-m)"} {
+			Expect(err.Error()).To(ContainSubstring(stage))
+		}
+	})
+
+	It("transcriptOnlyModel warms its stages and skips absent ones", func() {
+		ml, appConfig := newLoader()
+		m := &transcriptOnlyModel{
+			VADConfig:           &config.ModelConfig{Name: "vad-m"},
+			TranscriptionConfig: &config.ModelConfig{Name: "stt-m"},
+			// SoundDetectionConfig nil: an absent stage must be skipped, not
+			// fail the warm-up.
+			modelLoader: ml,
+			appConfig:   appConfig,
+		}
+
+		err := m.Warmup(context.Background())
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("vad (vad-m)"))
+		Expect(err.Error()).To(ContainSubstring("transcription (stt-m)"))
+		Expect(err.Error()).ToNot(ContainSubstring("sound_detection"))
+	})
+})
--- a/core/http/openresponses_test.go
+++ b/core/http/openresponses_test.go
@@ -7,6 +7,7 @@ import (
 	"io"
 	"net/http"
 	"os"
+	"path/filepath"
 	"strings"
 	"time"

@@ -29,6 +30,8 @@ const testModel = "Qwen3-VL-2B-Instruct-Q4_K_M"

 var _ = Describe("Open Responses API", func() {
 	var app *echo.Echo
+	var localApp *application.Application
+	var localModelDir string
 	var c context.Context
 	var cancel context.CancelFunc

@@ -38,28 +41,47 @@ var _ = Describe("Open Responses API", func() {

 	Context("API with ephemeral models", func() {
 		BeforeEach(func(sc SpecContext) {
-			var err error
+			// Set up before the Skip below: AfterEach still runs when skipped and calls cancel().
+			c, cancel = context.WithCancel(context.Background())
 
-			backendPath := os.Getenv("BACKENDS_PATH")
+			// This suite exercises the /v1/responses HTTP/protocol contract (Content-Type,
+			// SSE framing, response envelope, error shapes), not real inference — so it
+			// runs against the prebuilt mock-backend the rest of the http suite uses
+			// instead of downloading a real model. Skip cleanly when it isn't built.
+			if mockBackendPath == "" {
+				Skip("mock-backend binary not built; run 'make build-mock-backend'")
+			}
 
-			c, cancel = context.WithCancel(context.Background())
+			var err error
 
+			// Isolated model dir carrying a single config named after testModel
+			// but served by the mock backend, so the responses endpoint can
+			// resolve and load the model without any real backend build.
+			localModelDir, err = os.MkdirTemp("", "openresponses-models-")
+			Expect(err).ToNot(HaveOccurred())
+
+			mockModelYAML := "name: " + testModel + "\n" +
+				"backend: mock-backend\n" +
+				"parameters:\n" +
+				"  model: mock-model.bin\n"
+			Expect(os.WriteFile(filepath.Join(localModelDir, testModel+".yaml"), []byte(mockModelYAML), 0644)).To(Succeed())
+
 			systemState, err := system.GetSystemState(
-				system.WithBackendPath(backendPath),
-				system.WithModelPath(modelDir),
+				system.WithBackendPath(backendDir),
+				system.WithModelPath(localModelDir),
 			)
 			Expect(err).ToNot(HaveOccurred())

-			application, err := application.New(
+			localApp, err = application.New(
 				append(commonOpts,
 					config.WithContext(c),
 					config.WithSystemState(systemState),
 					config.WithApiKeys([]string{apiKey}),
-					config.WithModelsURL("https://huggingface.co/unsloth/Qwen3-VL-2B-Instruct-GGUF"),
 				)...)
 			Expect(err).ToNot(HaveOccurred())
+			localApp.ModelLoader().SetExternalBackend("mock-backend", mockBackendPath)

-			app, err = API(application)
+			app, err = API(localApp)
 			Expect(err).ToNot(HaveOccurred())

 			go func() {
@@ -80,14 +102,24 @@ var _ = Describe("Open Responses API", func() {
 		})

 		AfterEach(func(sc SpecContext) {
+			// Synchronous app shutdown first — context-cancel cleanup is async
+			// and races test-binary exit, orphaning mock-backend children.
+			if localApp != nil {
+				_ = localApp.Shutdown()
+				localApp = nil
+			}
 			cancel()
 			if app != nil {
 				ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 				defer cancel()
 				err := app.Shutdown(ctx)
 				Expect(err).ToNot(HaveOccurred())
+				app = nil
+			}
+			if localModelDir != "" {
+				_ = os.RemoveAll(localModelDir)
+				localModelDir = ""
 			}
-
 		})

 		Context("HTTP Protocol Compliance", func() {
@@ -969,13 +1001,16 @@ var _ = Describe("Open Responses API", func() {
 				Expect(ok).To(BeTrue())
 				Expect(itemID).ToNot(BeEmpty())

-				// Now create a new response with item_reference
+				// Now create a new response with item_reference. Per the OpenAI
+				// Responses spec (and this server's parser in
+				// endpoints/openresponses/responses.go) an item_reference carries
+				// the referenced item in the "id" field, not "item_id".
 				reqBody2 := map[string]any{
 					"model": testModel,
 					"input": []any{
 						map[string]any{
-							"type":    "item_reference",
-							"item_id": itemID,
+							"type": "item_reference",
+							"id":   itemID,
 						},
 						map[string]any{
 							"type":    "message",
@@ -1005,8 +1040,8 @@ var _ = Describe("Open Responses API", func() {
 					"model": testModel,
 					"input": []any{
 						map[string]any{
-							"type":    "item_reference",
-							"item_id": "nonexistent_item_id",
+							"type": "item_reference",
+							"id":   "nonexistent_item_id",
 						},
 					},
 				}
--- a/core/http/react-ui/src/pages/Manage.jsx
+++ b/core/http/react-ui/src/pages/Manage.jsx
@@ -146,6 +146,7 @@ export default function Manage() {
  const [distributedMode, setDistributedMode] = useState(false)
  const [togglingModels, setTogglingModels] = useState(new Set())
  const [pinningModels, setPinningModels] = useState(new Set())
+  const [loadingModels, setLoadingModels] = useState(new Set())
  // Expanded row state — keyed by `${tab}:${id}` so switching tabs doesn't
  // collide and a single row is open at a time per tab.
  const [expandedKey, setExpandedKey] = useState(null)
@@ -313,6 +314,26 @@ export default function Manage() {
    })
  }

+  // Pre-load a model (or all of a realtime pipeline's sub-models) into memory.
+  // The /backend/load call blocks until loading finishes, so the menu item shows
+  // a loading state while in flight and reports the outcome (naming the model) on completion.
+  const handleLoadModel = async (modelName) => {
+    setLoadingModels(prev => new Set(prev).add(modelName))
+    try {
+      await backendControlApi.load({ model: modelName })
+      addToast(`Loaded ${modelName}`, 'success')
+      setTimeout(fetchLoadedModels, 500)
+    } catch (err) {
+      addToast(`Failed to load ${modelName}: ${err.message}`, 'error')
+    } finally {
+      setLoadingModels(prev => {
+        const next = new Set(prev)
+        next.delete(modelName)
+        return next
+      })
+    }
+  }
+
  const handleDeleteModel = (modelName) => {
    setConfirmDialog({
      title: 'Delete Model',
@@ -687,6 +708,11 @@ export default function Manage() {
                              label: model.disabled ? 'Enable model' : 'Disable model',
                              onClick: () => handleToggleModel(model.id, model.disabled),
                              disabled: togglingModels.has(model.id) },
+                            { key: 'load', icon: 'fa-bolt',
+                              label: loadingModels.has(model.id) ? 'Loading…' : 'Load into memory',
+                              onClick: () => handleLoadModel(model.id),
+                              hidden: isRunning || !!model.disabled,
+                              disabled: loadingModels.has(model.id) },
                            { key: 'stop', icon: 'fa-stop', label: 'Stop model',
                              onClick: () => handleStopModel(model.id), hidden: !isRunning },
                            { key: 'pin', icon: 'fa-thumbtack',
--- a/core/http/react-ui/src/utils/api.js
+++ b/core/http/react-ui/src/utils/api.js
@@ -352,6 +352,9 @@ export const realtimeApi = {
 // Backend control
 export const backendControlApi = {
  shutdown: (body) => postJSON(API_CONFIG.endpoints.backendShutdown, body),
+  // Pre-load a model (or all of a realtime pipeline's sub-models) into memory.
+  // body: { model: "<name>" }. Inverse of shutdown.
+  load: (body) => postJSON(API_CONFIG.endpoints.backendLoad, body),
 }

 // System info
--- a/core/http/react-ui/src/utils/config.js
+++ b/core/http/react-ui/src/utils/config.js
@@ -106,6 +106,7 @@ export const API_CONFIG = {
    video: '/video',
    backendMonitor: '/backend/monitor',
    backendShutdown: '/backend/shutdown',
+    backendLoad: '/backend/load',
    modelsApply: '/models/apply',
    modelsDelete: (name) => `/models/delete/${name}`,
    modelsAvailable: '/models/available',
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -207,9 +207,14 @@ func RegisterLocalAIRoutes(router *echo.Echo,
 	backendMonitorService := monitoring.NewBackendMonitorService(ml, cl, appConfig) // Split out for now
 	router.GET("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService), adminMiddleware)
 	router.POST("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService), adminMiddleware)
+	// /backend/load is the inverse of /backend/shutdown: pre-load a model (or all
+	// of a realtime pipeline's sub-models) into memory so clients can drive
+	// warm-up explicitly instead of paying the cold-start cost on first use.
+	router.POST("/backend/load", localai.LoadModelEndpoint(cl, ml, appConfig), adminMiddleware)
 	// The v1/* urls are exactly the same as above - makes local e2e testing easier if they are registered.
 	router.GET("/v1/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService), adminMiddleware)
 	router.POST("/v1/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService), adminMiddleware)
+	router.POST("/v1/backend/load", localai.LoadModelEndpoint(cl, ml, appConfig), adminMiddleware)

 	// Traces and backend logs (monitoring)
 	router.GET("/api/traces", localai.GetAPITracesEndpoint(), adminMiddleware)
@@ -245,6 +250,7 @@ func RegisterLocalAIRoutes(router *echo.Echo,
 			"metrics":              "/metrics",
 			"backend_monitor":      "/backend/monitor",
 			"backend_shutdown":     "/backend/shutdown",
+			"backend_load":         "/backend/load",
 			"system":               "/system",
 			"version":              "/version",
 			"traces":               "/api/traces",
--- a/core/schema/localai.go
+++ b/core/schema/localai.go
@@ -11,6 +11,24 @@ type BackendMonitorRequest struct {
 	BasicModelRequest
 }

+// ModelLoadRequest asks LocalAI to pre-load a model into memory by name, so the
+// first request that uses it pays no cold-start load cost. For a realtime
+// pipeline model, every configured sub-model (VAD, transcription, LLM, TTS,
+// sound_detection, voice_recognition) is loaded instead of the pipeline stub.
+// It is the inverse of the /backend/shutdown request.
+type ModelLoadRequest struct {
+	BasicModelRequest
+}
+
+// ModelLoadResponse reports the outcome of a /backend/load call.
+type ModelLoadResponse struct {
+	// Loaded lists the model names actually resident in memory after the call.
+	// For a pipeline model these are its sub-models, not the pipeline name.
+	Loaded []string `json:"loaded"`
+	// Message is a short human-readable status ("model loaded", or an error).
+	Message string `json:"message"`
+}
+
 type TokenMetricsRequest struct {
 	BasicModelRequest
 }
--- a/core/trace/audio_snippet.go
+++ b/core/trace/audio_snippet.go
@@ -14,6 +14,16 @@ import (
 // MaxSnippetSeconds is the maximum number of seconds of audio captured per trace.
 const MaxSnippetSeconds = 30

+// silenceFloorDBFS is the dBFS value reported for digital silence (RMS or peak
+// of zero). The true level is -∞ dBFS; reporting a finite floor keeps the
+// metric present and meaningful in the Traces UI (a scrubbed nil would read as
+// "missing" rather than "silent"). -120 dBFS sits well below 16-bit PCM's
+// ~-90 dBFS least-significant-bit floor, so it reads unambiguously as
+// "effectively silent". JSON-marshal safety for any non-finite float that does
+// reach a trace is owned centrally by RecordBackendTrace's sanitizer — this
+// floor is about presentation, not transport.
+const silenceFloorDBFS = -120.0
+
 // AudioSnippet captures the first MaxSnippetSeconds of a WAV file and computes
 // quality metrics. The result is a map suitable for merging into a BackendTrace
 // Data field. maxBytes caps the embedded base64 waveform so a single TTS or
@@ -63,7 +73,7 @@ func AudioSnippetFromPCM(pcm []byte, sampleRate, totalPCMBytes, maxBytes int) ma
 	snippetDuration := float64(len(samples)) / float64(sampleRate)

 	rms := sound.CalculateRMS16(samples)
-	rmsDBFS := -math.Inf(1)
+	rmsDBFS := silenceFloorDBFS
 	if rms > 0 {
 		rmsDBFS = 20 * math.Log10(rms/32768.0)
 	}
@@ -78,7 +88,7 @@ func AudioSnippetFromPCM(pcm []byte, sampleRate, totalPCMBytes, maxBytes int) ma
 		}
 		dcSum += int64(s)
 	}
-	peakDBFS := -math.Inf(1)
+	peakDBFS := silenceFloorDBFS
 	if peak > 0 {
 		peakDBFS = 20 * math.Log10(float64(peak)/32768.0)
 	}
--- a/core/trace/audio_snippet_test.go
+++ b/core/trace/audio_snippet_test.go
@@ -1,6 +1,9 @@
 package trace_test

 import (
+	"encoding/json"
+	"math"
+
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"

@@ -47,3 +50,32 @@ var _ = Describe("AudioSnippetFromPCM byte cap", func() {
 		Expect(out).To(HaveKey("audio_wav_base64"))
 	})
 })
+
+// Silent audio (RMS/peak of zero) has a true level of -∞ dBFS, but emitting
+// -Inf made the whole /api/backend-traces response fail to JSON-marshal and
+// blanked the Traces UI. The metrics must instead be finite and serializable.
+var _ = Describe("AudioSnippetFromPCM silent audio dBFS", func() {
+	pcm := makePCM(snippetSeconds, snippetSampleRate) // all zeros == digital silence
+	totalPCM := len(pcm)
+
+	It("reports finite dBFS for silence instead of -Inf", func() {
+		out := trace.AudioSnippetFromPCM(pcm, snippetSampleRate, totalPCM, 0)
+
+		rms, ok := out["audio_rms_dbfs"].(float64)
+		Expect(ok).To(BeTrue())
+		Expect(math.IsInf(rms, 0)).To(BeFalse(), "silent RMS must not be ±Inf")
+		Expect(math.IsNaN(rms)).To(BeFalse())
+
+		peak, ok := out["audio_peak_dbfs"].(float64)
+		Expect(ok).To(BeTrue())
+		Expect(math.IsInf(peak, 0)).To(BeFalse(), "silent peak must not be ±Inf")
+		Expect(math.IsNaN(peak)).To(BeFalse())
+	})
+
+	It("produces a snippet that round-trips through encoding/json", func() {
+		out := trace.AudioSnippetFromPCM(pcm, snippetSampleRate, totalPCM, 0)
+
+		_, err := json.Marshal(out)
+		Expect(err).ToNot(HaveOccurred(), "silent-audio metrics must be JSON-marshalable")
+	})
+})
--- a/core/trace/backend_trace.go
+++ b/core/trace/backend_trace.go
@@ -3,6 +3,8 @@ package trace
 import (
 	"encoding/json"
 	"fmt"
+	"maps"
+	"math"
 	"slices"
 	"sync"
 	"time"
@@ -116,8 +118,13 @@ func RecordBackendTrace(t BackendTrace) {
 	backendMu.Lock()
 	maxBody := backendMaxBodyBytes
 	backendMu.Unlock()
-	if t.Data != nil && maxBody > 0 {
-		t.Data = capDataStrings(t.Data, maxBody)
+	// Always walk Data, even with no body cap configured: besides capping
+	// oversized strings (maxBody > 0), the walk replaces non-finite floats
+	// (Inf/NaN) that encoding/json cannot marshal. A single such value — e.g. a
+	// -Inf dBFS audio metric from a silent clip — would otherwise fail the whole
+	// /api/backend-traces response and blank the Traces UI.
+	if t.Data != nil {
+		t.Data = sanitizeData(t.Data, maxBody)
 	}
 	select {
 	case backendLogChan <- &t:
@@ -126,32 +133,100 @@ func RecordBackendTrace(t BackendTrace) {
 	}
 }
 
-// capDataStrings walks a trace Data map and replaces any string value (at any
-// depth) that exceeds maxBytes with a fixed-size marker that names the
-// original byte count. The replacement is intentionally short and not valid
-// base64/JSON: the goal is to flag "this was dropped" cheaply, not to keep a
-// partial value that the UI might try to render. Non-string scalars and
-// non-map containers pass through untouched so structural fields like
-// total_deltas or audio_sample_rate remain useful.
-func capDataStrings(data map[string]any, maxBytes int) map[string]any {
-	out := make(map[string]any, len(data))
-	for k, v := range data {
-		out[k] = capValue(v, maxBytes)
-	}
+// sanitizeData walks a trace Data map (recursing into nested maps and slices)
+// and makes every value safe for the /api/backend-traces JSON response:
+//
+//   - When maxBytes > 0, any string longer than maxBytes is replaced with a
+//     fixed-size marker that names the original byte count. The replacement is
+//     intentionally short and not valid base64/JSON: it flags "this was dropped"
+//     cheaply rather than keeping a partial value the UI might try to render.
+//   - Non-finite floats (Inf/NaN) are replaced with nil regardless of maxBytes,
+//     because encoding/json refuses to marshal them and one bad value would fail
+//     the entire response.
+//
+// Other scalars (ints, bools, finite floats) pass through untouched so
+// structural fields like total_deltas or audio_sample_rate remain useful.
+//
+// The walk is copy-on-write: it runs on every RecordBackendTrace call, and in
+// the common case nothing needs rewriting, so containers are only re-allocated
+// on the paths that actually changed and untouched values keep their original
+// interface boxes instead of paying a per-value re-boxing allocation.
+func sanitizeData(data map[string]any, maxBytes int) map[string]any {
+	out, _ := sanitizeMap(data, maxBytes)
 	return out
 }
 
-func capValue(v any, maxBytes int) any {
+// sanitizeMap applies sanitizeValue to every entry of m. It returns m itself
+// and false when no entry needed rewriting; otherwise it returns a copy with
+// the rewritten entries and true. m is never modified.
+func sanitizeMap(m map[string]any, maxBytes int) (map[string]any, bool) {
+	var out map[string]any
+	for k, v := range m {
+		nv, changed := sanitizeValue(v, maxBytes)
+		if changed && out == nil {
+			// First change: fork the map. Entries already visited were
+			// unchanged, so a full copy then overwriting as we go is exact.
+			out = make(map[string]any, len(m))
+			maps.Copy(out, m)
+		}
+		if out != nil {
+			out[k] = nv
+		}
+	}
+	if out == nil {
+		return m, false
+	}
+	return out, true
+}
+
+// sanitizeSlice is the slice counterpart of sanitizeMap: it returns s itself
+// and false when no element changed, or a copy with the rewritten elements
+// and true. s is never modified.
+func sanitizeSlice(s []any, maxBytes int) ([]any, bool) {
+	var out []any
+	for i, v := range s {
+		nv, changed := sanitizeValue(v, maxBytes)
+		if changed && out == nil {
+			out = make([]any, len(s))
+			copy(out, s)
+		}
+		if out != nil {
+			out[i] = nv
+		}
+	}
+	if out == nil {
+		return s, false
+	}
+	return out, true
+}
+
+// sanitizeValue returns the sanitized form of v and whether it differs from
+// v: over-long strings (only when maxBytes > 0) become a truncation marker,
+// non-finite float64/float32 values become nil, and map[string]any / []any
+// values are sanitized recursively. Every other type is returned as-is.
+func sanitizeValue(v any, maxBytes int) (any, bool) {
 	switch val := v.(type) {
 	case string:
-		if len(val) > maxBytes {
-			return fmt.Sprintf("<truncated: %d bytes>", len(val))
+		if maxBytes > 0 && len(val) > maxBytes {
+			return fmt.Sprintf("<truncated: %d bytes>", len(val)), true
 		}
-		return val
+		return v, false
+	case float64:
+		if math.IsInf(val, 0) || math.IsNaN(val) {
+			return nil, true
+		}
+		return v, false
+	case float32:
+		if f := float64(val); math.IsInf(f, 0) || math.IsNaN(f) {
+			return nil, true
+		}
+		return v, false
 	case map[string]any:
-		return capDataStrings(val, maxBytes)
+		return sanitizeMap(val, maxBytes)
+	case []any:
+		return sanitizeSlice(val, maxBytes)
 	default:
-		return v
+		return v, false
 	}
 }
 
--- a/core/trace/backend_trace_sanitize_test.go
+++ b/core/trace/backend_trace_sanitize_test.go
@@ -0,0 +1,80 @@
+package trace_test
+
+import (
+	"encoding/json"
+	"math"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/trace"
+)
+
+// encoding/json cannot marshal ±Inf or NaN. The /api/backend-traces endpoint
+// serializes the whole buffer with one json call, so a single non-finite float
+// in any trace's Data map (e.g. a -Inf dBFS audio metric from a silent clip)
+// would fail the entire response and blank the Traces UI. RecordBackendTrace
+// must scrub those values regardless of whether a body cap is configured.
+var _ = Describe("RecordBackendTrace non-finite float sanitization", func() {
+	BeforeEach(func() {
+		// maxBodyBytes 0 == no body cap: float sanitization must still run.
+		trace.InitBackendTracingIfEnabled(64, 0)
+		trace.ClearBackendTraces()
+	})
+
+	It("replaces ±Inf and NaN with nil so the response stays JSON-marshalable", func() {
+		trace.RecordBackendTrace(trace.BackendTrace{
+			Timestamp: time.Now(),
+			Type:      trace.BackendTraceTranscription,
+			ModelName: "m",
+			Data: map[string]any{
+				"audio_rms_dbfs":   math.Inf(-1),
+				"audio_peak_dbfs":  math.Inf(1),
+				"weird":            math.NaN(),
+				"audio_duration_s": 1.5, // finite siblings must survive
+			},
+		})
+
+		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
+		got := trace.GetBackendTraces()[0]
+
+		Expect(got.Data["audio_rms_dbfs"]).To(BeNil())
+		Expect(got.Data["audio_peak_dbfs"]).To(BeNil())
+		Expect(got.Data["weird"]).To(BeNil())
+		Expect(got.Data["audio_duration_s"]).To(Equal(1.5), "finite floats must pass through untouched")
+
+		_, err := json.Marshal(trace.GetBackendTraces())
+		Expect(err).ToNot(HaveOccurred(), "the whole trace buffer must marshal even with non-finite inputs")
+	})
+
+	It("scrubs non-finite floats nested in maps and slices", func() {
+		trace.RecordBackendTrace(trace.BackendTrace{
+			Timestamp: time.Now(),
+			Type:      trace.BackendTraceLLM,
+			ModelName: "m",
+			Data: map[string]any{
+				"nested": map[string]any{
+					"logprob": math.Inf(-1),
+					"ok":      0.25,
+				},
+				"scores": []any{1.0, math.Inf(1), math.NaN()},
+			},
+		})
+
+		Eventually(trace.GetBackendTraces).Should(HaveLen(1))
+		got := trace.GetBackendTraces()[0]
+
+		nested := got.Data["nested"].(map[string]any)
+		Expect(nested["logprob"]).To(BeNil())
+		Expect(nested["ok"]).To(Equal(0.25))
+
+		scores := got.Data["scores"].([]any)
+		Expect(scores[0]).To(Equal(1.0))
+		Expect(scores[1]).To(BeNil())
+		Expect(scores[2]).To(BeNil())
+
+		_, err := json.Marshal(trace.GetBackendTraces())
+		Expect(err).ToNot(HaveOccurred())
+	})
+})
--- a/docs/content/advanced/vram-management.md
+++ b/docs/content/advanced/vram-management.md
@@ -381,6 +381,8 @@ curl -X POST http://localhost:8080/backend/shutdown \

 To stop all models, you'll need to call the endpoint for each loaded model individually, or use the web UI to stop all models at once.

+Conversely, you can pre-load a model into memory ahead of its first request with `POST /backend/load` (the inverse of shutdown) — see [Backend Monitor]({{%relref "features/backend-monitor" %}}).
+
 ### Best Practices

 1. **Monitor VRAM usage**: Use `nvidia-smi` (for NVIDIA GPUs) or similar tools to monitor actual VRAM usage
--- a/docs/content/features/authentication.md
+++ b/docs/content/features/authentication.md
@@ -166,7 +166,7 @@ When authentication is enabled, the following endpoints require admin role:
 - `GET /api/backend-traces`, `POST /api/backend-traces/clear`
 - `GET /api/backend-logs/*`, `POST /api/backend-logs/*/clear`
 - `GET /api/resources`, `GET /api/settings`, `POST /api/settings`
- `GET /system`, `GET /backend/monitor`, `POST /backend/shutdown`
+- `GET /system`, `GET /backend/monitor`, `POST /backend/shutdown`, `POST /backend/load`

 **P2P:**
 - `GET /api/p2p/*`
--- a/docs/content/features/backend-monitor.md
+++ b/docs/content/features/backend-monitor.md
@@ -5,7 +5,9 @@ weight = 20
 url = "/features/backend-monitor/"
 +++

-LocalAI provides endpoints to monitor and manage running backends. The `/backend/monitor` endpoint reports the status and resource usage of loaded models, and `/backend/shutdown` allows stopping a model's backend process.
+LocalAI provides endpoints to monitor and manage running backends. The `/backend/monitor` endpoint reports the status and resource usage of loaded models, `/backend/load` pre-loads a model into memory, and `/backend/shutdown` allows stopping a model's backend process.
+
+All three require the admin role when authentication is enabled.

 ## Monitor API

@@ -62,6 +64,42 @@ curl "http://localhost:8080/backend/monitor?model=my-model"
 }
 ```

+## Load API
+
+Pre-loads a model into memory ahead of its first request, so that request pays no cold-start load cost. It is the inverse of the Shutdown API and works for any model, not just realtime pipelines.
+
+- **Method:** `POST`
+- **Endpoints:** `/backend/load`, `/v1/backend/load`
+
+### Request
+
+| Parameter | Type     | Required | Description                  |
+|-----------|----------|----------|------------------------------|
+| `model`   | `string` | Yes      | Name of the model to load    |
+
+### Behavior
+
+- For a regular model, its own backend is loaded.
+- For a **realtime pipeline** model (a config with a `pipeline:` block), the pipeline stub has no backend of its own, so every configured sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded concurrently in its place.
+
+The call blocks until loading finishes and reports which model names became resident (for a pipeline, its sub-models rather than the pipeline name), so partial failures are visible.
+
+### Usage
+
+```bash
+curl -X POST http://localhost:8080/backend/load \
+  -H "Content-Type: application/json" \
+  -d '{"model": "my-model"}'
+```
+
+### Example response
+
+```json
+{ "loaded": ["my-model"], "message": "model loaded" }
+```
+
+On failure the call returns `500` with `loaded` listing whichever sub-models did load and `message` naming the failures. A request that omits `model` returns `400`.
+
 ## Shutdown API

 - **Method:** `POST`
--- a/docs/content/features/openai-realtime.md
+++ b/docs/content/features/openai-realtime.md
@@ -56,6 +56,39 @@ pipeline:

 All streaming flags are off by default, so existing pipelines are unaffected.

+### Model warm-up (cold start)
+
+Without warm-up the pipeline's models are loaded into memory only on first use *within* a session: the VAD on the first audio chunk, transcription at the first end-of-speech, the LLM on the first reply, and TTS on the first spoken output. On a cold session this staggers a load delay across those first few interactions — and a model that fails to load (missing weights, wrong backend, out of memory) only fails part-way through the first turn.
+
+To avoid that, LocalAI **warms the pipeline by default**: it loads the VAD, transcription, LLM and TTS backends into memory *before* the session is announced, and the session start **blocks until they are all ready**. The loads run concurrently, so the wait is the slowest single model, not the sum. This means:
+
+- The first turn pays no cold-start cost — every backend is already resident.
+- **Model-load errors surface at session start.** If any stage fails to load, the session is not started and the client receives a `model_load_error` instead of `session.created`, so a broken pipeline fails fast and visibly rather than mid-call.
+
+Set `disable_warmup: true` to restore the lazy "load on first use" behavior — session start no longer waits on loading and load errors surface on the first turn instead. This is useful if you want idle sessions to avoid holding memory for models they may never use:
+
+```yaml
+name: gpt-realtime
+pipeline:
+  vad: silero-vad-ggml
+  transcription: whisper-large-turbo
+  llm: qwen3-4b
+  tts: tts-1
+  disable_warmup: true   # lazily load each model on first use instead of at session start
+```
+
+#### Pre-loading a pipeline on demand
+
+Warm-up only fires when a realtime session opens. To load a pipeline into memory ahead of time — e.g. to warm it right after boot, or when running with `disable_warmup: true` — POST the model name to the admin-only `/backend/load` endpoint. For a pipeline model it loads every configured sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) concurrently:
+
+```bash
+curl -X POST http://localhost:8080/backend/load \
+  -H "Content-Type: application/json" \
+  -d '{"model": "gpt-realtime"}'
+```
+
+The endpoint is not realtime-specific — it pre-loads any model and is the inverse of `/backend/shutdown`. See [Backend Monitor]({{%relref "features/backend-monitor" %}}) for the full request/response reference.
+
 ### Turn detection

 Turn detection decides when the user has finished speaking and the pipeline should respond. Two modes are supported, matching the OpenAI session schema:
--- a/pkg/mcp/localaitools/client.go
+++ b/pkg/mcp/localaitools/client.go
@@ -36,6 +36,10 @@ type LocalAIClient interface {
 	DeleteModel(ctx context.Context, name string) error
 	EditModelConfig(ctx context.Context, name string, patch map[string]any) error
 	ReloadModels(ctx context.Context) error
+	// LoadModel pre-loads a model into memory by name (the inverse of shutting
+	// it down). For a realtime pipeline model every configured sub-model is
+	// loaded; it returns the model names that became resident.
+	LoadModel(ctx context.Context, model string) ([]string, error)
 	ImportModelURI(ctx context.Context, req ImportModelURIRequest) (*ImportModelURIResponse, error)

 	// ---- Model aliases ----
--- a/pkg/mcp/localaitools/coverage_test.go
+++ b/pkg/mcp/localaitools/coverage_test.go
@@ -49,6 +49,7 @@ var toolToHTTPRoute = map[string]string{
 	ToolDeleteModel:       "POST /models/delete/:name",
 	ToolEditModelConfig:   "PATCH /api/models/config-json/:name",
 	ToolReloadModels:      "POST /models/reload",
+	ToolLoadModel:         "POST /backend/load",
 	ToolInstallBackend:    "POST /backends/apply",
 	ToolUpgradeBackend:    "POST /backends/upgrade/:name",
 	ToolToggleModelState:  "PUT /models/toggle-state/:name/:action",
--- a/pkg/mcp/localaitools/fakes_test.go
+++ b/pkg/mcp/localaitools/fakes_test.go
@@ -35,6 +35,7 @@ type fakeClient struct {
 	setAlias            func(string, string) error
 	listAliases         func() ([]AliasInfo, error)
 	reloadModels        func() error
+	loadModel           func(string) ([]string, error)
 	listBackends        func() ([]Backend, error)
 	listKnownBackends   func() ([]schema.KnownBackend, error)
 	installBackend      func(InstallBackendRequest) (string, error)
@@ -169,6 +170,14 @@ func (f *fakeClient) ReloadModels(_ context.Context) error {
 	return nil
 }

+func (f *fakeClient) LoadModel(_ context.Context, model string) ([]string, error) {
+	f.record("LoadModel", model)
+	if f.loadModel != nil {
+		return f.loadModel(model)
+	}
+	return []string{model}, nil
+}
+
 func (f *fakeClient) ListBackends(_ context.Context) ([]Backend, error) {
 	f.record("ListBackends", nil)
 	if f.listBackends != nil {
--- a/pkg/mcp/localaitools/httpapi/client.go
+++ b/pkg/mcp/localaitools/httpapi/client.go
@@ -338,6 +338,16 @@ func (c *Client) ReloadModels(ctx context.Context) error {
 	return c.do(ctx, http.MethodPost, routeModelsReload, nil, nil)
 }

+func (c *Client) LoadModel(ctx context.Context, model string) ([]string, error) {
+	// On a load failure the endpoint returns a non-2xx whose body (carrying the
+	// per-sub-model failure detail) is folded into the HTTPError by c.do.
+	var resp schema.ModelLoadResponse
+	if err := c.do(ctx, http.MethodPost, routeBackendLoad, map[string]string{"model": model}, &resp); err != nil {
+		return nil, err
+	}
+	return resp.Loaded, nil
+}
+
 // ---- Model aliases ----

 // SetAlias is swap-first: it PATCHes the alias config (a deep-merge that
--- a/pkg/mcp/localaitools/httpapi/routes.go
+++ b/pkg/mcp/localaitools/httpapi/routes.go
@@ -19,6 +19,7 @@ const (
 	routeModelImport     = "/models/import"
 	routeAliases         = "/api/aliases"
 	routeModelsReload    = "/models/reload"
+	routeBackendLoad     = "/backend/load"
 	routeBackends        = "/backends"
 	routeBackendsKnown   = "/backends/known"
 	routeBackendsApply   = "/backends/apply"
--- a/pkg/mcp/localaitools/inproc/client.go
+++ b/pkg/mcp/localaitools/inproc/client.go
@@ -13,6 +13,7 @@ import (
 	"path/filepath"

 	"github.com/google/uuid"
+	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/gallery/importers"
@@ -302,6 +303,16 @@ func (c *Client) ReloadModels(_ context.Context) error {
 	return c.ConfigLoader.LoadModelConfigsFromPath(c.SystemState.Model.ModelsPath)
 }

+func (c *Client) LoadModel(ctx context.Context, model string) ([]string, error) {
+	if c.ConfigLoader == nil || c.ModelLoader == nil {
+		return nil, errors.New("model loader not available")
+	}
+	// Reuse the same preload path the REST /backend/load endpoint uses, so a
+	// pipeline model loads all its sub-models and the behaviour stays identical
+	// across the in-process and HTTP clients.
+	return backend.PreloadModelByName(ctx, c.ConfigLoader, c.ModelLoader, c.AppConfig, model)
+}
+
 // ---- Model aliases ----

 // SetAlias is swap-first to match the httpapi client: PatchConfig swaps an
--- a/pkg/mcp/localaitools/inproc/load_model_test.go
+++ b/pkg/mcp/localaitools/inproc/load_model_test.go
@@ -0,0 +1,71 @@
+package inproc
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/system"
+)
+
+var _ = Describe("inproc.Client LoadModel", func() {
+	var (
+		ctx       context.Context
+		tempDir   string
+		cl        *config.ModelConfigLoader
+		ml        *model.ModelLoader
+		c         *Client
+		seedModel func(name, body string)
+	)
+
+	BeforeEach(func() {
+		ctx = context.Background()
+		tempDir = GinkgoT().TempDir()
+		systemState, err := system.GetSystemState(system.WithModelPath(tempDir))
+		Expect(err).ToNot(HaveOccurred())
+		appConfig := config.NewApplicationConfig(config.WithSystemState(systemState))
+		cl = config.NewModelConfigLoader(tempDir)
+		ml = model.NewModelLoader(systemState) // no backends installed
+		c = New(appConfig, systemState, cl, ml, nil)
+
+		seedModel = func(name, body string) {
+			Expect(os.WriteFile(filepath.Join(tempDir, name+".yaml"), []byte(body), 0o644)).To(Succeed())
+			Expect(cl.LoadModelConfigsFromPath(tempDir)).To(Succeed())
+		}
+	})
+
+	It("errors when the model loader is unavailable", func() {
+		noLoader := New(c.AppConfig, c.SystemState, cl, nil, nil)
+		_, err := noLoader.LoadModel(ctx, "anything")
+		Expect(err).To(MatchError(ContainSubstring("model loader not available")))
+	})
+
+	It("loads a regular model through the model loader", func() {
+		seedModel("solo", "name: solo\n")
+		// No backend is installed in the test env, so the load itself fails — but
+		// the call must exercise the single-model path and surface that error
+		// rather than panicking or silently succeeding.
+		loaded, err := c.LoadModel(ctx, "solo")
+		Expect(err).To(HaveOccurred())
+		Expect(loaded).To(BeEmpty())
+	})
+
+	It("expands a pipeline model into its sub-models", func() {
+		seedModel("voicebot", "name: voicebot\npipeline:\n  vad: vad-m\n  llm: llm-m\n")
+		seedModel("vad-m", "name: vad-m\n")
+		seedModel("llm-m", "name: llm-m\n")
+
+		loaded, err := c.LoadModel(ctx, "voicebot")
+		// Sub-models can't load without backends, so the joined error names them
+		// — proving the pipeline stub was expanded rather than loaded directly.
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("vad-m"))
+		Expect(err.Error()).ToNot(ContainSubstring("voicebot"))
+		Expect(loaded).To(BeEmpty())
+	})
+})
--- a/pkg/mcp/localaitools/prompts/10_safety.md
+++ b/pkg/mcp/localaitools/prompts/10_safety.md
@@ -2,7 +2,7 @@

 These rules are non-negotiable. The user trusts you to operate their server without unintended changes.

-1. **Confirm before mutating.** Before calling any of these tools — `install_model`, `import_model_uri`, `delete_model`, `install_backend`, `upgrade_backend`, `edit_model_config`, `reload_models`, `toggle_model_state`, `toggle_model_pinned` — first state in plain language what you are about to do (which tool, which target, which arguments) and wait for the user's explicit confirmation in the next turn. "Yes", "do it", "go ahead", "proceed" all count as confirmation. Anything else does not.
+1. **Confirm before mutating.** Before calling any of these tools — `install_model`, `import_model_uri`, `delete_model`, `install_backend`, `upgrade_backend`, `edit_model_config`, `reload_models`, `load_model`, `toggle_model_state`, `toggle_model_pinned` — first state in plain language what you are about to do (which tool, which target, which arguments) and wait for the user's explicit confirmation in the next turn. "Yes", "do it", "go ahead", "proceed" all count as confirmation. Anything else does not.

 2. **Disambiguate before mutating.** If the user's request is ambiguous (several gallery candidates match, the model name has multiple installed versions, the backend has variants), present the candidates as a numbered list and ask the user to pick before calling any mutating tool.

--- a/pkg/mcp/localaitools/prompts/20_tools.md
+++ b/pkg/mcp/localaitools/prompts/20_tools.md
@@ -24,5 +24,6 @@ The MCP `tools/list` endpoint also exposes the full input schema for each of the
 - `upgrade_backend` — Upgrade an installed backend by name.
 - `edit_model_config` — Patch (deep-merge) JSON into an installed model's config.
 - `reload_models` — Reload all model configs from disk.
+- `load_model` — Pre-load a model into memory so the first request pays no cold-start cost. For a realtime pipeline model, every sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded. Inverse of stopping a model.
 - `toggle_model_state` — Enable or disable a model (`action`: `enable` or `disable`).
 - `toggle_model_pinned` — Pin or unpin a model (`action`: `pin` or `unpin`).
--- a/pkg/mcp/localaitools/server_test.go
+++ b/pkg/mcp/localaitools/server_test.go
@@ -92,6 +92,7 @@ var expectedFullCatalog = sortedStrings(
 	ToolListInstalledModels,
 	ToolListKnownBackends,
 	ToolListNodes,
+	ToolLoadModel,
 	ToolReloadModels,
 	ToolSetAlias,
 	ToolSetBranding,
@@ -166,6 +167,7 @@ var _ = Describe("Tool dispatch", func() {
 		{ToolUpgradeBackend, map[string]any{"name": "llama-cpp"}, "UpgradeBackend"},
 		{ToolEditModelConfig, map[string]any{"name": "foo", "patch": map[string]any{"context_size": 4096}}, "EditModelConfig"},
 		{ToolReloadModels, struct{}{}, "ReloadModels"},
+		{ToolLoadModel, map[string]any{"model": "test-model"}, "LoadModel"},
 		{ToolToggleModelState, map[string]any{"name": "foo", "action": "enable"}, "ToggleModelState"},
 		{ToolToggleModelPinned, map[string]any{"name": "foo", "action": "pin"}, "ToggleModelPinned"},
 		{ToolSetAlias, map[string]any{"name": "gpt-4", "target": "real"}, "SetAlias"},
--- a/pkg/mcp/localaitools/tools.go
+++ b/pkg/mcp/localaitools/tools.go
@@ -31,6 +31,7 @@ const (
 	ToolDeleteModel       = "delete_model"
 	ToolEditModelConfig   = "edit_model_config"
 	ToolReloadModels      = "reload_models"
+	ToolLoadModel         = "load_model"
 	ToolInstallBackend    = "install_backend"
 	ToolUpgradeBackend    = "upgrade_backend"
 	ToolToggleModelState  = "toggle_model_state"
--- a/pkg/mcp/localaitools/tools_models.go
+++ b/pkg/mcp/localaitools/tools_models.go
@@ -65,6 +65,22 @@ func registerModelTools(s *mcp.Server, client LocalAIClient, opts Options) {
 		return
 	}

+	mcp.AddTool(s, &mcp.Tool{
+		Name:        ToolLoadModel,
+		Description: "Pre-load a model into memory by name so the first request pays no cold-start cost (the inverse of shutting a model down). For a realtime pipeline model every configured sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded. Returns the model names that became resident. Requires user confirmation per safety rule 1.",
+	}, func(ctx context.Context, _ *mcp.CallToolRequest, args struct {
+		Model string `json:"model" jsonschema:"The installed model name to load into memory."`
+	}) (*mcp.CallToolResult, any, error) {
+		if args.Model == "" {
+			return errorResultf("model is required"), nil, nil
+		}
+		loaded, err := client.LoadModel(ctx, args.Model)
+		if err != nil {
+			return errorResult(err), nil, nil
+		}
+		return jsonResult(map[string]any{"loaded": loaded}), nil, nil
+	})
+
 	mcp.AddTool(s, &mcp.Tool{
 		Name:        ToolInstallModel,
 		Description: "Install a model from a gallery. Requires explicit user confirmation per safety rule 1. Returns a job id; poll with get_job_status.",
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -1443,6 +1443,52 @@ const docTemplate = `{
                "responses": {}
            }
        },
+        "/backend/load": {
+            "post": {
+                "description": "Loads the named model (or, for a realtime pipeline, all of its sub-models) into memory so subsequent requests pay no cold-start cost. The inverse of /backend/shutdown.",
+                "consumes": [
+                    "application/json"
+                ],
+                "produces": [
+                    "application/json"
+                ],
+                "tags": [
+                    "monitoring"
+                ],
+                "summary": "Pre-load a model into memory",
+                "parameters": [
+                    {
+                        "description": "Model to load",
+                        "name": "request",
+                        "in": "body",
+                        "required": true,
+                        "schema": {
+                            "$ref": "#/definitions/schema.ModelLoadRequest"
+                        }
+                    }
+                ],
+                "responses": {
+                    "200": {
+                        "description": "Model loaded",
+                        "schema": {
+                            "$ref": "#/definitions/schema.ModelLoadResponse"
+                        }
+                    },
+                    "400": {
+                        "description": "Missing model name",
+                        "schema": {
+                            "$ref": "#/definitions/schema.ModelLoadResponse"
+                        }
+                    },
+                    "500": {
+                        "description": "Load failed (Loaded lists any sub-models that did load)",
+                        "schema": {
+                            "$ref": "#/definitions/schema.ModelLoadResponse"
+                        }
+                    }
+                }
+            }
+        },
        "/backend/monitor": {
            "get": {
                "tags": [
@@ -5136,6 +5182,30 @@ const docTemplate = `{
                }
            }
        },
+        "schema.ModelLoadRequest": {
+            "type": "object",
+            "properties": {
+                "model": {
+                    "type": "string"
+                }
+            }
+        },
+        "schema.ModelLoadResponse": {
+            "type": "object",
+            "properties": {
+                "loaded": {
+                    "description": "Loaded lists the model names actually resident in memory after the call.\nFor a pipeline model these are its sub-models, not the pipeline name.",
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "message": {
+                    "description": "Message is a short human-readable status (\"model loaded\", or an error).",
+                    "type": "string"
+                }
+            }
+        },
        "schema.ModelsDataResponse": {
            "type": "object",
            "properties": {
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -1440,6 +1440,52 @@
                "responses": {}
            }
        },
+        "/backend/load": {
+            "post": {
+                "description": "Loads the named model (or, for a realtime pipeline, all of its sub-models) into memory so subsequent requests pay no cold-start cost. The inverse of /backend/shutdown.",
+                "consumes": [
+                    "application/json"
+                ],
+                "produces": [
+                    "application/json"
+                ],
+                "tags": [
+                    "monitoring"
+                ],
+                "summary": "Pre-load a model into memory",
+                "parameters": [
+                    {
+                        "description": "Model to load",
+                        "name": "request",
+                        "in": "body",
+                        "required": true,
+                        "schema": {
+                            "$ref": "#/definitions/schema.ModelLoadRequest"
+                        }
+                    }
+                ],
+                "responses": {
+                    "200": {
+                        "description": "Model loaded",
+                        "schema": {
+                            "$ref": "#/definitions/schema.ModelLoadResponse"
+                        }
+                    },
+                    "400": {
+                        "description": "Missing model name",
+                        "schema": {
+                            "$ref": "#/definitions/schema.ModelLoadResponse"
+                        }
+                    },
+                    "500": {
+                        "description": "Load failed (Loaded lists any sub-models that did load)",
+                        "schema": {
+                            "$ref": "#/definitions/schema.ModelLoadResponse"
+                        }
+                    }
+                }
+            }
+        },
        "/backend/monitor": {
            "get": {
                "tags": [
@@ -5133,6 +5179,30 @@
                }
            }
        },
+        "schema.ModelLoadRequest": {
+            "type": "object",
+            "properties": {
+                "model": {
+                    "type": "string"
+                }
+            }
+        },
+        "schema.ModelLoadResponse": {
+            "type": "object",
+            "properties": {
+                "loaded": {
+                    "description": "Loaded lists the model names actually resident in memory after the call.\nFor a pipeline model these are its sub-models, not the pipeline name.",
+                    "type": "array",
+                    "items": {
+                        "type": "string"
+                    }
+                },
+                "message": {
+                    "description": "Message is a short human-readable status (\"model loaded\", or an error).",
+                    "type": "string"
+                }
+            }
+        },
        "schema.ModelsDataResponse": {
            "type": "object",
            "properties": {
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -1362,6 +1362,25 @@ definitions:
          $ref: '#/definitions/schema.ToolCall'
        type: array
    type: object
+  schema.ModelLoadRequest:
+    properties:
+      model:
+        type: string
+    type: object
+  schema.ModelLoadResponse:
+    properties:
+      loaded:
+        description: |-
+          Loaded lists the model names actually resident in memory after the call.
+          For a pipeline model these are its sub-models, not the pipeline name.
+        items:
+          type: string
+        type: array
+      message:
+        description: Message is a short human-readable status ("model loaded", or
+          an error).
+        type: string
+    type: object
  schema.ModelsDataResponse:
    properties:
      data:
@@ -3510,6 +3529,38 @@ paths:
      summary: Bidirectional realtime audio transform over WebSocket.
      tags:
      - audio
+  /backend/load:
+    post:
+      consumes:
+      - application/json
+      description: Loads the named model (or, for a realtime pipeline, all of its
+        sub-models) into memory so subsequent requests pay no cold-start cost. The
+        inverse of /backend/shutdown.
+      parameters:
+      - description: Model to load
+        in: body
+        name: request
+        required: true
+        schema:
+          $ref: '#/definitions/schema.ModelLoadRequest'
+      produces:
+      - application/json
+      responses:
+        "200":
+          description: Model loaded
+          schema:
+            $ref: '#/definitions/schema.ModelLoadResponse'
+        "400":
+          description: Missing model name
+          schema:
+            $ref: '#/definitions/schema.ModelLoadResponse'
+        "500":
+          description: Load failed (Loaded lists any sub-models that did load)
+          schema:
+            $ref: '#/definitions/schema.ModelLoadResponse'
+      summary: Pre-load a model into memory
+      tags:
+      - monitoring
  /backend/monitor:
    get:
      parameters: