feat(realtime): eager blocking pipeline warm-up + /backend/load API (#10662)

Realtime sessions previously lazy-loaded each pipeline sub-model (VAD,
transcription, LLM, TTS) on first use, so every cold session paid a
per-request model-load stall and load errors only surfaced mid-stream.

Warm the whole pipeline eagerly at session start, blocking until the
warm-up completes. This includes the voice-gate speaker-recognition
model, because an enforced gate blocks each utterance on it;
compaction's summary_model stays lazy since it only runs off the
response path:
- Add backend.PreloadModel / PreloadModelByName as the single load path
  for every modality (no transcription special-case; backend-omitted
  configs are deprecated).
- The realtime session blocks on Model.Warmup and returns a
  model_load_error to the client if any stage fails to load;
  updateSession warms in the background. Opt out per pipeline with
  pipeline.disable_warmup, exposed as a UI toggle via the
  config-metadata registry.

Add a LocalAI-native POST /backend/load (and /v1/backend/load) that
pre-loads a model -- expanding realtime pipelines into their sub-models
-- as the inverse of /backend/shutdown. There is one preload engine
(backend.PreloadStages): the realtime Warmup methods, /backend/load and
the --load-to-memory startup flag all use it, so --load-to-memory now
also expands pipeline models and records load-failure traces. Pipeline
sub-model alias resolution is likewise shared
(ModelConfigLoader.LoadResolvedModelConfig). Surface the endpoint
everywhere an admin manages models:
- MCP admin tool load_model (httpapi + inproc clients, safety/catalog
  prompts, catalog/dispatch tests).
- "Load into memory" action in the React models UI.
- Swagger regenerated; docs moved to the general backend-monitor page
  since it is not realtime-specific.

Fix a Traces UI crash ("json: unsupported value: -Inf"): audio-snippet
RMS/peak now floor at a finite dBFS, and backend-trace data is sanitized
to drop non-finite floats before marshaling. Because the sanitizer runs
on every RecordBackendTrace, it is copy-on-write: containers are
re-allocated only on the paths that actually changed.

Migrate core/http/openresponses_test.go onto the prebuilt mock-backend
the rest of the http suite already uses -- it was the last spec still
pointing at a real HuggingFace model, so it 404'd wherever no vision
backend was built -- and fix its item_reference specs to send the
spec's "id" field instead of "item_id", which the handler never
accepted.

Assisted-by: Claude:claude-opus-4-8 Claude Code

Signed-off-by: Richard Palethorpe <io@richiejp.com>
This commit is contained in:
Richard Palethorpe
2026-07-03 17:00:37 +01:00
committed by GitHub
parent 80ec22945a
commit eb32cd9073
45 changed files with 1364 additions and 99 deletions

View File

@@ -36,6 +36,10 @@ type LocalAIClient interface {
DeleteModel(ctx context.Context, name string) error
EditModelConfig(ctx context.Context, name string, patch map[string]any) error
ReloadModels(ctx context.Context) error
// LoadModel pre-loads a model into memory by name (the inverse of shutting
// it down). For a realtime pipeline model every configured sub-model is
// loaded; it returns the model names that became resident.
LoadModel(ctx context.Context, model string) ([]string, error)
ImportModelURI(ctx context.Context, req ImportModelURIRequest) (*ImportModelURIResponse, error)
// ---- Model aliases ----

View File

@@ -49,6 +49,7 @@ var toolToHTTPRoute = map[string]string{
ToolDeleteModel: "POST /models/delete/:name",
ToolEditModelConfig: "PATCH /api/models/config-json/:name",
ToolReloadModels: "POST /models/reload",
ToolLoadModel: "POST /backend/load",
ToolInstallBackend: "POST /backends/apply",
ToolUpgradeBackend: "POST /backends/upgrade/:name",
ToolToggleModelState: "PUT /models/toggle-state/:name/:action",

View File

@@ -35,6 +35,7 @@ type fakeClient struct {
setAlias func(string, string) error
listAliases func() ([]AliasInfo, error)
reloadModels func() error
loadModel func(string) ([]string, error)
listBackends func() ([]Backend, error)
listKnownBackends func() ([]schema.KnownBackend, error)
installBackend func(InstallBackendRequest) (string, error)
@@ -169,6 +170,14 @@ func (f *fakeClient) ReloadModels(_ context.Context) error {
return nil
}
// LoadModel records the call under the name "LoadModel" and then answers
// from the test-supplied loadModel hook when one is set. Without a hook it
// reports the requested model as the only one loaded.
func (f *fakeClient) LoadModel(_ context.Context, model string) ([]string, error) {
	f.record("LoadModel", model)

	hook := f.loadModel
	if hook == nil {
		// Default stub: pretend exactly the requested model became resident.
		return []string{model}, nil
	}
	return hook(model)
}
func (f *fakeClient) ListBackends(_ context.Context) ([]Backend, error) {
f.record("ListBackends", nil)
if f.listBackends != nil {

View File

@@ -338,6 +338,16 @@ func (c *Client) ReloadModels(ctx context.Context) error {
return c.do(ctx, http.MethodPost, routeModelsReload, nil, nil)
}
// LoadModel asks the server to pre-load model by POSTing {"model": model} to
// the backend-load route, and returns the Loaded list from the decoded
// response.
//
// On a load failure the endpoint answers with a non-2xx status; its body
// (carrying the per-sub-model failure detail) is folded into the HTTPError
// by c.do, so that error is returned unchanged.
func (c *Client) LoadModel(ctx context.Context, model string) ([]string, error) {
	payload := map[string]string{"model": model}

	var out schema.ModelLoadResponse
	err := c.do(ctx, http.MethodPost, routeBackendLoad, payload, &out)
	if err != nil {
		return nil, err
	}
	return out.Loaded, nil
}
// ---- Model aliases ----
// SetAlias is swap-first: it PATCHes the alias config (a deep-merge that

View File

@@ -19,6 +19,7 @@ const (
routeModelImport = "/models/import"
routeAliases = "/api/aliases"
routeModelsReload = "/models/reload"
routeBackendLoad = "/backend/load"
routeBackends = "/backends"
routeBackendsKnown = "/backends/known"
routeBackendsApply = "/backends/apply"

View File

@@ -13,6 +13,7 @@ import (
"path/filepath"
"github.com/google/uuid"
"github.com/mudler/LocalAI/core/backend"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/gallery"
"github.com/mudler/LocalAI/core/gallery/importers"
@@ -302,6 +303,16 @@ func (c *Client) ReloadModels(_ context.Context) error {
return c.ConfigLoader.LoadModelConfigsFromPath(c.SystemState.Model.ModelsPath)
}
// LoadModel pre-loads model in-process by delegating to
// backend.PreloadModelByName. It fails fast with "model loader not
// available" when the client was built without a config loader or a model
// loader.
func (c *Client) LoadModel(ctx context.Context, model string) ([]string, error) {
	haveLoaders := c.ConfigLoader != nil && c.ModelLoader != nil
	if !haveLoaders {
		return nil, errors.New("model loader not available")
	}

	// Go through the same preload path as the REST /backend/load endpoint so
	// that a pipeline model loads all its sub-models and the in-process and
	// HTTP clients behave identically.
	loaded, err := backend.PreloadModelByName(ctx, c.ConfigLoader, c.ModelLoader, c.AppConfig, model)
	return loaded, err
}
// ---- Model aliases ----
// SetAlias is swap-first to match the httpapi client: PatchConfig swaps an

View File

@@ -0,0 +1,71 @@
package inproc
import (
"context"
"os"
"path/filepath"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/LocalAI/pkg/system"
)
// Specs for inproc.Client.LoadModel. Every spec runs against a fresh temp
// models directory and a model loader created with no backends installed, so
// an actual load is expected to fail; the specs assert on how it fails.
var _ = Describe("inproc.Client LoadModel", func() {
var (
ctx context.Context
tempDir string
cl *config.ModelConfigLoader
ml *model.ModelLoader
c *Client
// seedModel writes <name>.yaml with the given body into tempDir and
// reloads the config loader from that directory.
seedModel func(name, body string)
)
// Rebuild the whole fixture (system state, app config, loaders, client)
// before each spec so specs do not share model configs.
BeforeEach(func() {
ctx = context.Background()
tempDir = GinkgoT().TempDir()
systemState, err := system.GetSystemState(system.WithModelPath(tempDir))
Expect(err).ToNot(HaveOccurred())
appConfig := config.NewApplicationConfig(config.WithSystemState(systemState))
cl = config.NewModelConfigLoader(tempDir)
ml = model.NewModelLoader(systemState) // no backends installed
c = New(appConfig, systemState, cl, ml, nil)
seedModel = func(name, body string) {
Expect(os.WriteFile(filepath.Join(tempDir, name+".yaml"), []byte(body), 0o644)).To(Succeed())
Expect(cl.LoadModelConfigsFromPath(tempDir)).To(Succeed())
}
})
// A client constructed with a nil model loader must refuse up front.
It("errors when the model loader is unavailable", func() {
noLoader := New(c.AppConfig, c.SystemState, cl, nil, nil)
_, err := noLoader.LoadModel(ctx, "anything")
Expect(err).To(MatchError(ContainSubstring("model loader not available")))
})
It("loads a regular model through the model loader", func() {
seedModel("solo", "name: solo\n")
// No backend is installed in the test env, so the load itself fails — but
// the call must exercise the single-model path and surface that error
// rather than panicking or silently succeeding.
loaded, err := c.LoadModel(ctx, "solo")
Expect(err).To(HaveOccurred())
Expect(loaded).To(BeEmpty())
})
It("expands a pipeline model into its sub-models", func() {
// Seed the pipeline config plus the two sub-model configs it references.
seedModel("voicebot", "name: voicebot\npipeline:\n vad: vad-m\n llm: llm-m\n")
seedModel("vad-m", "name: vad-m\n")
seedModel("llm-m", "name: llm-m\n")
loaded, err := c.LoadModel(ctx, "voicebot")
// Sub-models can't load without backends, so the joined error names them
// — proving the pipeline stub was expanded rather than loaded directly.
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("vad-m"))
// The pipeline's own name must not appear: only its sub-models were tried.
Expect(err.Error()).ToNot(ContainSubstring("voicebot"))
Expect(loaded).To(BeEmpty())
})
})

View File

@@ -2,7 +2,7 @@
These rules are non-negotiable. The user trusts you to operate their server without unintended changes.
1. **Confirm before mutating.** Before calling any of these tools — `install_model`, `import_model_uri`, `delete_model`, `install_backend`, `upgrade_backend`, `edit_model_config`, `reload_models`, `toggle_model_state`, `toggle_model_pinned` — first state in plain language what you are about to do (which tool, which target, which arguments) and wait for the user's explicit confirmation in the next turn. "Yes", "do it", "go ahead", "proceed" all count as confirmation. Anything else does not.
1. **Confirm before mutating.** Before calling any of these tools — `install_model`, `import_model_uri`, `delete_model`, `install_backend`, `upgrade_backend`, `edit_model_config`, `reload_models`, `load_model`, `toggle_model_state`, `toggle_model_pinned` — first state in plain language what you are about to do (which tool, which target, which arguments) and wait for the user's explicit confirmation in the next turn. "Yes", "do it", "go ahead", "proceed" all count as confirmation. Anything else does not.
2. **Disambiguate before mutating.** If the user's request is ambiguous (several gallery candidates match, the model name has multiple installed versions, the backend has variants), present the candidates as a numbered list and ask the user to pick before calling any mutating tool.

View File

@@ -24,5 +24,6 @@ The MCP `tools/list` endpoint also exposes the full input schema for each of the
- `upgrade_backend` — Upgrade an installed backend by name.
- `edit_model_config` — Patch (deep-merge) JSON into an installed model's config.
- `reload_models` — Reload all model configs from disk.
- `load_model` — Pre-load a model into memory so the first request pays no cold-start cost. For a realtime pipeline model, every sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded. Inverse of stopping a model.
- `toggle_model_state` — Enable or disable a model (`action`: `enable` or `disable`).
- `toggle_model_pinned` — Pin or unpin a model (`action`: `pin` or `unpin`).

View File

@@ -92,6 +92,7 @@ var expectedFullCatalog = sortedStrings(
ToolListInstalledModels,
ToolListKnownBackends,
ToolListNodes,
ToolLoadModel,
ToolReloadModels,
ToolSetAlias,
ToolSetBranding,
@@ -166,6 +167,7 @@ var _ = Describe("Tool dispatch", func() {
{ToolUpgradeBackend, map[string]any{"name": "llama-cpp"}, "UpgradeBackend"},
{ToolEditModelConfig, map[string]any{"name": "foo", "patch": map[string]any{"context_size": 4096}}, "EditModelConfig"},
{ToolReloadModels, struct{}{}, "ReloadModels"},
{ToolLoadModel, map[string]any{"model": "test-model"}, "LoadModel"},
{ToolToggleModelState, map[string]any{"name": "foo", "action": "enable"}, "ToggleModelState"},
{ToolToggleModelPinned, map[string]any{"name": "foo", "action": "pin"}, "ToggleModelPinned"},
{ToolSetAlias, map[string]any{"name": "gpt-4", "target": "real"}, "SetAlias"},

View File

@@ -31,6 +31,7 @@ const (
ToolDeleteModel = "delete_model"
ToolEditModelConfig = "edit_model_config"
ToolReloadModels = "reload_models"
ToolLoadModel = "load_model"
ToolInstallBackend = "install_backend"
ToolUpgradeBackend = "upgrade_backend"
ToolToggleModelState = "toggle_model_state"

View File

@@ -65,6 +65,22 @@ func registerModelTools(s *mcp.Server, client LocalAIClient, opts Options) {
return
}
// Register the load_model tool: it takes a single "model" argument and
// forwards it to client.LoadModel.
mcp.AddTool(s, &mcp.Tool{
Name: ToolLoadModel,
Description: "Pre-load a model into memory by name so the first request pays no cold-start cost (the inverse of shutting a model down). For a realtime pipeline model every configured sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded. Returns the model names that became resident. Requires user confirmation per safety rule 1.",
}, func(ctx context.Context, _ *mcp.CallToolRequest, args struct {
Model string `json:"model" jsonschema:"The installed model name to load into memory."`
}) (*mcp.CallToolResult, any, error) {
// Reject a missing model name before calling the client.
if args.Model == "" {
return errorResultf("model is required"), nil, nil
}
loaded, err := client.LoadModel(ctx, args.Model)
if err != nil {
// The failure is returned inside the tool result; the handler's own
// error return stays nil.
return errorResult(err), nil, nil
}
// On success, reply with the loaded model names under the "loaded" key.
return jsonResult(map[string]any{"loaded": loaded}), nil, nil
})
mcp.AddTool(s, &mcp.Tool{
Name: ToolInstallModel,
Description: "Install a model from a gallery. Requires explicit user confirmation per safety rule 1. Returns a job id; poll with get_job_status.",