feat(realtime): eager blocking pipeline warm-up + /backend/load API (#10662)

Realtime sessions previously lazy-loaded each pipeline sub-model (VAD, transcription, LLM, TTS) on first use, so every cold session paid a per-request model-load stall and load errors only surfaced mid-stream.

Warm the whole pipeline eagerly and blockingly at session start (including the voice-gate speaker-recognition model, which an enforced gate blocks each utterance on; compaction's summary_model stays lazy since it only runs off the response path):

- Add backend.PreloadModel / PreloadModelByName as the single load path for every modality (no transcription special-case; backend-omitted configs are deprecated).
- The realtime session blocks on Model.Warmup and returns a model_load_error to the client if any stage fails to load; updateSession warms in the background. Opt out per pipeline with pipeline.disable_warmup, exposed as a UI toggle via the config-metadata registry.

Add a LocalAI-native POST /backend/load (and /v1/backend/load) that pre-loads a model -- expanding realtime pipelines into their sub-models -- as the inverse of /backend/shutdown. There is one preload engine (backend.PreloadStages): the realtime Warmup methods, /backend/load and the --load-to-memory startup flag all use it, so --load-to-memory now also expands pipeline models and records load-failure traces. Pipeline sub-model alias resolution is likewise shared (ModelConfigLoader.LoadResolvedModelConfig).

Surface the endpoint everywhere an admin manages models:

- MCP admin tool load_model (httpapi + inproc clients, safety/catalog prompts, catalog/dispatch tests).
- "Load into memory" action in the React models UI.
- Swagger regenerated; docs moved to the general backend-monitor page since it is not realtime-specific.

Fix a Traces UI crash ("json: unsupported value: -Inf"): audio-snippet RMS/peak now floor at a finite dBFS, and backend-trace data is sanitized to drop non-finite floats before marshaling.
The sanitizer is copy-on-write -- it runs on every RecordBackendTrace, so containers are only re-allocated on the paths that actually changed.

Migrate core/http/openresponses_test.go onto the prebuilt mock-backend the rest of the http suite already uses -- it was the last spec still pointing at a real HuggingFace model, so it 404'd wherever no vision backend was built -- and fix its item_reference specs to send the spec's "id" field instead of "item_id", which the handler never accepted.

Assisted-by: Claude:claude-opus-4-8 Claude Code
Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-07-03 21:07:33 -04:00 · 2026-07-03 17:00:37 +01:00
parent 80ec22945a
commit eb32cd9073
45 changed files with 1364 additions and 99 deletions
--- a/pkg/mcp/localaitools/client.go
+++ b/pkg/mcp/localaitools/client.go
@@ -36,6 +36,10 @@ type LocalAIClient interface {
 	DeleteModel(ctx context.Context, name string) error
 	EditModelConfig(ctx context.Context, name string, patch map[string]any) error
 	ReloadModels(ctx context.Context) error
+	// LoadModel pre-loads a model into memory by name (the inverse of shutting
+	// it down). For a realtime pipeline model every configured sub-model is
+	// loaded; it returns the model names that became resident.
+	LoadModel(ctx context.Context, model string) ([]string, error)
 	ImportModelURI(ctx context.Context, req ImportModelURIRequest) (*ImportModelURIResponse, error)

 	// ---- Model aliases ----
--- a/pkg/mcp/localaitools/coverage_test.go
+++ b/pkg/mcp/localaitools/coverage_test.go
@@ -49,6 +49,7 @@ var toolToHTTPRoute = map[string]string{
 	ToolDeleteModel:       "POST /models/delete/:name",
 	ToolEditModelConfig:   "PATCH /api/models/config-json/:name",
 	ToolReloadModels:      "POST /models/reload",
+	ToolLoadModel:         "POST /backend/load",
 	ToolInstallBackend:    "POST /backends/apply",
 	ToolUpgradeBackend:    "POST /backends/upgrade/:name",
 	ToolToggleModelState:  "PUT /models/toggle-state/:name/:action",
--- a/pkg/mcp/localaitools/fakes_test.go
+++ b/pkg/mcp/localaitools/fakes_test.go
@@ -35,6 +35,7 @@ type fakeClient struct {
 	setAlias            func(string, string) error
 	listAliases         func() ([]AliasInfo, error)
 	reloadModels        func() error
+	loadModel           func(string) ([]string, error)
 	listBackends        func() ([]Backend, error)
 	listKnownBackends   func() ([]schema.KnownBackend, error)
 	installBackend      func(InstallBackendRequest) (string, error)
@@ -169,6 +170,14 @@ func (f *fakeClient) ReloadModels(_ context.Context) error {
 	return nil
 }

+func (f *fakeClient) LoadModel(_ context.Context, model string) ([]string, error) {
+	f.record("LoadModel", model) // note the call name and its argument
+	if stub := f.loadModel; stub != nil {
+		return stub(model) // a test-supplied hook decides the outcome
+	}
+	return []string{model}, nil // no hook set: answer with just the requested name
+}
+
 func (f *fakeClient) ListBackends(_ context.Context) ([]Backend, error) {
 	f.record("ListBackends", nil)
 	if f.listBackends != nil {
--- a/pkg/mcp/localaitools/httpapi/client.go
+++ b/pkg/mcp/localaitools/httpapi/client.go
@@ -338,6 +338,16 @@ func (c *Client) ReloadModels(ctx context.Context) error {
 	return c.do(ctx, http.MethodPost, routeModelsReload, nil, nil)
 }

+func (c *Client) LoadModel(ctx context.Context, model string) ([]string, error) {
+	// c.do folds a non-2xx body (the per-sub-model failure detail) into HTTPError.
+	body := map[string]string{"model": model}
+	var out schema.ModelLoadResponse
+	if err := c.do(ctx, http.MethodPost, routeBackendLoad, body, &out); err != nil {
+		return nil, err
+	}
+	return out.Loaded, nil
+}
+
 // ---- Model aliases ----

 // SetAlias is swap-first: it PATCHes the alias config (a deep-merge that
--- a/pkg/mcp/localaitools/httpapi/routes.go
+++ b/pkg/mcp/localaitools/httpapi/routes.go
@@ -19,6 +19,7 @@ const (
 	routeModelImport     = "/models/import"
 	routeAliases         = "/api/aliases"
 	routeModelsReload    = "/models/reload"
+	routeBackendLoad     = "/backend/load"
 	routeBackends        = "/backends"
 	routeBackendsKnown   = "/backends/known"
 	routeBackendsApply   = "/backends/apply"
--- a/pkg/mcp/localaitools/inproc/client.go
+++ b/pkg/mcp/localaitools/inproc/client.go
@@ -13,6 +13,7 @@ import (
 	"path/filepath"

 	"github.com/google/uuid"
+	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/gallery"
 	"github.com/mudler/LocalAI/core/gallery/importers"
@@ -302,6 +303,16 @@ func (c *Client) ReloadModels(_ context.Context) error {
 	return c.ConfigLoader.LoadModelConfigsFromPath(c.SystemState.Model.ModelsPath)
 }

+func (c *Client) LoadModel(ctx context.Context, model string) ([]string, error) {
+	cl, ml := c.ConfigLoader, c.ModelLoader
+	if cl == nil || ml == nil {
+		return nil, errors.New("model loader not available")
+	}
+	// Delegate to the preload path behind the REST /backend/load endpoint so a
+	// pipeline model expands into its sub-models exactly as the HTTP client sees.
+	return backend.PreloadModelByName(ctx, cl, ml, c.AppConfig, model)
+}
+
 // ---- Model aliases ----

 // SetAlias is swap-first to match the httpapi client: PatchConfig swaps an
--- a/pkg/mcp/localaitools/inproc/load_model_test.go
+++ b/pkg/mcp/localaitools/inproc/load_model_test.go
@@ -0,0 +1,71 @@
+package inproc
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/model"
+	"github.com/mudler/LocalAI/pkg/system"
+)
+
+var _ = Describe("inproc.Client LoadModel", func() { // specs for Client.LoadModel against a temp models dir
+	var (
+		ctx       context.Context
+		tempDir   string
+		cl        *config.ModelConfigLoader
+		ml        *model.ModelLoader
+		c         *Client
+		seedModel func(name, body string) // helper assigned in BeforeEach
+	)
+
+	BeforeEach(func() { // rebuilds the temp dir, loaders and client before each spec
+		ctx = context.Background()
+		tempDir = GinkgoT().TempDir()
+		systemState, err := system.GetSystemState(system.WithModelPath(tempDir))
+		Expect(err).ToNot(HaveOccurred())
+		appConfig := config.NewApplicationConfig(config.WithSystemState(systemState))
+		cl = config.NewModelConfigLoader(tempDir)
+		ml = model.NewModelLoader(systemState) // no backends installed
+		c = New(appConfig, systemState, cl, ml, nil)
+
+		seedModel = func(name, body string) { // writes <name>.yaml into tempDir, then reloads configs from it
+			Expect(os.WriteFile(filepath.Join(tempDir, name+".yaml"), []byte(body), 0o644)).To(Succeed())
+			Expect(cl.LoadModelConfigsFromPath(tempDir)).To(Succeed())
+		}
+	})
+
+	It("errors when the model loader is unavailable", func() {
+		noLoader := New(c.AppConfig, c.SystemState, cl, nil, nil) // same wiring, but a nil ModelLoader
+		_, err := noLoader.LoadModel(ctx, "anything")
+		Expect(err).To(MatchError(ContainSubstring("model loader not available")))
+	})
+
+	It("loads a regular model through the model loader", func() {
+		seedModel("solo", "name: solo\n")
+		// No backend is installed in the test env, so the load itself fails — but
+		// the call must exercise the single-model path and surface that error
+		// rather than panicking or silently succeeding.
+		loaded, err := c.LoadModel(ctx, "solo")
+		Expect(err).To(HaveOccurred())
+		Expect(loaded).To(BeEmpty()) // nothing is reported as loaded on failure
+	})
+
+	It("expands a pipeline model into its sub-models", func() {
+		seedModel("voicebot", "name: voicebot\npipeline:\n  vad: vad-m\n  llm: llm-m\n")
+		seedModel("vad-m", "name: vad-m\n")
+		seedModel("llm-m", "name: llm-m\n")
+
+		loaded, err := c.LoadModel(ctx, "voicebot")
+		// Sub-models can't load without backends, so the joined error names them
+		// — proving the pipeline stub was expanded rather than loaded directly.
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("vad-m")) // only vad-m is asserted here; llm-m is not checked
+		Expect(err.Error()).ToNot(ContainSubstring("voicebot")) // the pipeline's own name must not appear in the error
+		Expect(loaded).To(BeEmpty())
+	})
+})
--- a/pkg/mcp/localaitools/prompts/10_safety.md
+++ b/pkg/mcp/localaitools/prompts/10_safety.md
@@ -2,7 +2,7 @@

 These rules are non-negotiable. The user trusts you to operate their server without unintended changes.

-1. **Confirm before mutating.** Before calling any of these tools — `install_model`, `import_model_uri`, `delete_model`, `install_backend`, `upgrade_backend`, `edit_model_config`, `reload_models`, `toggle_model_state`, `toggle_model_pinned` — first state in plain language what you are about to do (which tool, which target, which arguments) and wait for the user's explicit confirmation in the next turn. "Yes", "do it", "go ahead", "proceed" all count as confirmation. Anything else does not.
+1. **Confirm before mutating.** Before calling any of these tools — `install_model`, `import_model_uri`, `delete_model`, `install_backend`, `upgrade_backend`, `edit_model_config`, `reload_models`, `load_model`, `toggle_model_state`, `toggle_model_pinned` — first state in plain language what you are about to do (which tool, which target, which arguments) and wait for the user's explicit confirmation in the next turn. "Yes", "do it", "go ahead", "proceed" all count as confirmation. Anything else does not.

 2. **Disambiguate before mutating.** If the user's request is ambiguous (several gallery candidates match, the model name has multiple installed versions, the backend has variants), present the candidates as a numbered list and ask the user to pick before calling any mutating tool.

--- a/pkg/mcp/localaitools/prompts/20_tools.md
+++ b/pkg/mcp/localaitools/prompts/20_tools.md
@@ -24,5 +24,6 @@ The MCP `tools/list` endpoint also exposes the full input schema for each of the
 - `upgrade_backend` — Upgrade an installed backend by name.
 - `edit_model_config` — Patch (deep-merge) JSON into an installed model's config.
 - `reload_models` — Reload all model configs from disk.
+- `load_model` — Pre-load a model into memory so the first request pays no cold-start cost. For a realtime pipeline model, every sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded. Inverse of stopping a model.
 - `toggle_model_state` — Enable or disable a model (`action`: `enable` or `disable`).
 - `toggle_model_pinned` — Pin or unpin a model (`action`: `pin` or `unpin`).
--- a/pkg/mcp/localaitools/server_test.go
+++ b/pkg/mcp/localaitools/server_test.go
@@ -92,6 +92,7 @@ var expectedFullCatalog = sortedStrings(
 	ToolListInstalledModels,
 	ToolListKnownBackends,
 	ToolListNodes,
+	ToolLoadModel,
 	ToolReloadModels,
 	ToolSetAlias,
 	ToolSetBranding,
@@ -166,6 +167,7 @@ var _ = Describe("Tool dispatch", func() {
 		{ToolUpgradeBackend, map[string]any{"name": "llama-cpp"}, "UpgradeBackend"},
 		{ToolEditModelConfig, map[string]any{"name": "foo", "patch": map[string]any{"context_size": 4096}}, "EditModelConfig"},
 		{ToolReloadModels, struct{}{}, "ReloadModels"},
+		{ToolLoadModel, map[string]any{"model": "test-model"}, "LoadModel"},
 		{ToolToggleModelState, map[string]any{"name": "foo", "action": "enable"}, "ToggleModelState"},
 		{ToolToggleModelPinned, map[string]any{"name": "foo", "action": "pin"}, "ToggleModelPinned"},
 		{ToolSetAlias, map[string]any{"name": "gpt-4", "target": "real"}, "SetAlias"},
--- a/pkg/mcp/localaitools/tools.go
+++ b/pkg/mcp/localaitools/tools.go
@@ -31,6 +31,7 @@ const (
 	ToolDeleteModel       = "delete_model"
 	ToolEditModelConfig   = "edit_model_config"
 	ToolReloadModels      = "reload_models"
+	ToolLoadModel         = "load_model"
 	ToolInstallBackend    = "install_backend"
 	ToolUpgradeBackend    = "upgrade_backend"
 	ToolToggleModelState  = "toggle_model_state"
--- a/pkg/mcp/localaitools/tools_models.go
+++ b/pkg/mcp/localaitools/tools_models.go
@@ -65,6 +65,22 @@ func registerModelTools(s *mcp.Server, client LocalAIClient, opts Options) {
 		return
 	}

+	mcp.AddTool(s, &mcp.Tool{ // registers the load_model tool and its handler on s
+		Name:        ToolLoadModel,
+		Description: "Pre-load a model into memory by name so the first request pays no cold-start cost (the inverse of shutting a model down). For a realtime pipeline model every configured sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded. Returns the model names that became resident. Requires user confirmation per safety rule 1.",
+	}, func(ctx context.Context, _ *mcp.CallToolRequest, args struct {
+		Model string `json:"model" jsonschema:"The installed model name to load into memory."`
+	}) (*mcp.CallToolResult, any, error) {
+		if args.Model == "" { // reject an empty name before touching the client
+			return errorResultf("model is required"), nil, nil
+		}
+		loaded, err := client.LoadModel(ctx, args.Model)
+		if err != nil {
+			return errorResult(err), nil, nil // failure travels in the tool result; the handler's own error stays nil
+		}
+		return jsonResult(map[string]any{"loaded": loaded}), nil, nil // reply payload: {"loaded": [...]}
+	})
+
 	mcp.AddTool(s, &mcp.Tool{
 		Name:        ToolInstallModel,
 		Description: "Install a model from a gallery. Requires explicit user confirmation per safety rule 1. Returns a job id; poll with get_job_status.",