mirror of
https://github.com/mudler/LocalAI.git
synced 2026-07-03 21:07:33 -04:00
feat(realtime): eager blocking pipeline warm-up + /backend/load API (#10662)
Realtime sessions previously lazy-loaded each pipeline sub-model (VAD,
transcription, LLM, TTS) on first use, so every cold session paid a
per-request model-load stall and load errors only surfaced mid-stream.
Warm the whole pipeline eagerly at session start, blocking until it is
ready (this includes the voice-gate speaker-recognition model, because
an enforced gate blocks every utterance on it; compaction's
summary_model stays lazy since it only runs off the response path):
- Add backend.PreloadModel / PreloadModelByName as the single load path
for every modality (no transcription special-case; backend-omitted
configs are deprecated).
- The realtime session blocks on Model.Warmup and returns a
model_load_error to the client if any stage fails to load;
updateSession warms in the background. Opt out per pipeline with
pipeline.disable_warmup, exposed as a UI toggle via the
config-metadata registry.
Add a LocalAI-native POST /backend/load (and /v1/backend/load) that
pre-loads a model -- expanding realtime pipelines into their sub-models
-- as the inverse of /backend/shutdown. There is one preload engine
(backend.PreloadStages): the realtime Warmup methods, /backend/load and
the --load-to-memory startup flag all use it, so --load-to-memory now
also expands pipeline models and records load-failure traces. Pipeline
sub-model alias resolution is likewise shared
(ModelConfigLoader.LoadResolvedModelConfig). Surface the endpoint
everywhere an admin manages models:
- MCP admin tool load_model (httpapi + inproc clients, safety/catalog
prompts, catalog/dispatch tests).
- "Load into memory" action in the React models UI.
- Swagger regenerated; docs moved to the general backend-monitor page
since it is not realtime-specific.
Fix a Traces UI crash ("json: unsupported value: -Inf"): audio-snippet
RMS/peak now floor at a finite dBFS, and backend-trace data is sanitized
to drop non-finite floats before marshaling. The sanitizer is
copy-on-write -- it runs on every RecordBackendTrace, so containers are
only re-allocated on the paths that actually changed.
Migrate core/http/openresponses_test.go onto the prebuilt mock-backend
the rest of the http suite already uses -- it was the last spec still
pointing at a real HuggingFace model, so it 404'd wherever no vision
backend was built -- and fix its item_reference specs to send the
spec's "id" field instead of "item_id", which the handler never
accepted.
Assisted-by: Claude:claude-opus-4-8 Claude Code
Signed-off-by: Richard Palethorpe <io@richiejp.com>
This commit is contained in:
committed by
GitHub
parent
80ec22945a
commit
eb32cd9073
@@ -36,6 +36,10 @@ type LocalAIClient interface {
|
||||
DeleteModel(ctx context.Context, name string) error
|
||||
EditModelConfig(ctx context.Context, name string, patch map[string]any) error
|
||||
ReloadModels(ctx context.Context) error
|
||||
// LoadModel pre-loads a model into memory by name (the inverse of shutting
|
||||
// it down). For a realtime pipeline model every configured sub-model is
|
||||
// loaded; it returns the model names that became resident.
|
||||
LoadModel(ctx context.Context, model string) ([]string, error)
|
||||
ImportModelURI(ctx context.Context, req ImportModelURIRequest) (*ImportModelURIResponse, error)
|
||||
|
||||
// ---- Model aliases ----
|
||||
|
||||
@@ -49,6 +49,7 @@ var toolToHTTPRoute = map[string]string{
|
||||
ToolDeleteModel: "POST /models/delete/:name",
|
||||
ToolEditModelConfig: "PATCH /api/models/config-json/:name",
|
||||
ToolReloadModels: "POST /models/reload",
|
||||
ToolLoadModel: "POST /backend/load",
|
||||
ToolInstallBackend: "POST /backends/apply",
|
||||
ToolUpgradeBackend: "POST /backends/upgrade/:name",
|
||||
ToolToggleModelState: "PUT /models/toggle-state/:name/:action",
|
||||
|
||||
@@ -35,6 +35,7 @@ type fakeClient struct {
|
||||
setAlias func(string, string) error
|
||||
listAliases func() ([]AliasInfo, error)
|
||||
reloadModels func() error
|
||||
loadModel func(string) ([]string, error)
|
||||
listBackends func() ([]Backend, error)
|
||||
listKnownBackends func() ([]schema.KnownBackend, error)
|
||||
installBackend func(InstallBackendRequest) (string, error)
|
||||
@@ -169,6 +170,14 @@ func (f *fakeClient) ReloadModels(_ context.Context) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *fakeClient) LoadModel(_ context.Context, model string) ([]string, error) {
|
||||
f.record("LoadModel", model)
|
||||
if f.loadModel != nil {
|
||||
return f.loadModel(model)
|
||||
}
|
||||
return []string{model}, nil
|
||||
}
|
||||
|
||||
func (f *fakeClient) ListBackends(_ context.Context) ([]Backend, error) {
|
||||
f.record("ListBackends", nil)
|
||||
if f.listBackends != nil {
|
||||
|
||||
@@ -338,6 +338,16 @@ func (c *Client) ReloadModels(ctx context.Context) error {
|
||||
return c.do(ctx, http.MethodPost, routeModelsReload, nil, nil)
|
||||
}
|
||||
|
||||
func (c *Client) LoadModel(ctx context.Context, model string) ([]string, error) {
|
||||
// On a load failure the endpoint returns a non-2xx whose body (carrying the
|
||||
// per-sub-model failure detail) is folded into the HTTPError by c.do.
|
||||
var resp schema.ModelLoadResponse
|
||||
if err := c.do(ctx, http.MethodPost, routeBackendLoad, map[string]string{"model": model}, &resp); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return resp.Loaded, nil
|
||||
}
|
||||
|
||||
// ---- Model aliases ----
|
||||
|
||||
// SetAlias is swap-first: it PATCHes the alias config (a deep-merge that
|
||||
|
||||
@@ -19,6 +19,7 @@ const (
|
||||
routeModelImport = "/models/import"
|
||||
routeAliases = "/api/aliases"
|
||||
routeModelsReload = "/models/reload"
|
||||
routeBackendLoad = "/backend/load"
|
||||
routeBackends = "/backends"
|
||||
routeBackendsKnown = "/backends/known"
|
||||
routeBackendsApply = "/backends/apply"
|
||||
|
||||
@@ -13,6 +13,7 @@ import (
|
||||
"path/filepath"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/mudler/LocalAI/core/backend"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/gallery"
|
||||
"github.com/mudler/LocalAI/core/gallery/importers"
|
||||
@@ -302,6 +303,16 @@ func (c *Client) ReloadModels(_ context.Context) error {
|
||||
return c.ConfigLoader.LoadModelConfigsFromPath(c.SystemState.Model.ModelsPath)
|
||||
}
|
||||
|
||||
func (c *Client) LoadModel(ctx context.Context, model string) ([]string, error) {
|
||||
if c.ConfigLoader == nil || c.ModelLoader == nil {
|
||||
return nil, errors.New("model loader not available")
|
||||
}
|
||||
// Reuse the same preload path the REST /backend/load endpoint uses, so a
|
||||
// pipeline model loads all its sub-models and the behaviour stays identical
|
||||
// across the in-process and HTTP clients.
|
||||
return backend.PreloadModelByName(ctx, c.ConfigLoader, c.ModelLoader, c.AppConfig, model)
|
||||
}
|
||||
|
||||
// ---- Model aliases ----
|
||||
|
||||
// SetAlias is swap-first to match the httpapi client: PatchConfig swaps an
|
||||
|
||||
71
pkg/mcp/localaitools/inproc/load_model_test.go
Normal file
71
pkg/mcp/localaitools/inproc/load_model_test.go
Normal file
@@ -0,0 +1,71 @@
|
||||
package inproc
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/LocalAI/pkg/system"
|
||||
)
|
||||
|
||||
var _ = Describe("inproc.Client LoadModel", func() {
|
||||
var (
|
||||
ctx context.Context
|
||||
tempDir string
|
||||
cl *config.ModelConfigLoader
|
||||
ml *model.ModelLoader
|
||||
c *Client
|
||||
seedModel func(name, body string)
|
||||
)
|
||||
|
||||
BeforeEach(func() {
|
||||
ctx = context.Background()
|
||||
tempDir = GinkgoT().TempDir()
|
||||
systemState, err := system.GetSystemState(system.WithModelPath(tempDir))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
appConfig := config.NewApplicationConfig(config.WithSystemState(systemState))
|
||||
cl = config.NewModelConfigLoader(tempDir)
|
||||
ml = model.NewModelLoader(systemState) // no backends installed
|
||||
c = New(appConfig, systemState, cl, ml, nil)
|
||||
|
||||
seedModel = func(name, body string) {
|
||||
Expect(os.WriteFile(filepath.Join(tempDir, name+".yaml"), []byte(body), 0o644)).To(Succeed())
|
||||
Expect(cl.LoadModelConfigsFromPath(tempDir)).To(Succeed())
|
||||
}
|
||||
})
|
||||
|
||||
It("errors when the model loader is unavailable", func() {
|
||||
noLoader := New(c.AppConfig, c.SystemState, cl, nil, nil)
|
||||
_, err := noLoader.LoadModel(ctx, "anything")
|
||||
Expect(err).To(MatchError(ContainSubstring("model loader not available")))
|
||||
})
|
||||
|
||||
It("loads a regular model through the model loader", func() {
|
||||
seedModel("solo", "name: solo\n")
|
||||
// No backend is installed in the test env, so the load itself fails — but
|
||||
// the call must exercise the single-model path and surface that error
|
||||
// rather than panicking or silently succeeding.
|
||||
loaded, err := c.LoadModel(ctx, "solo")
|
||||
Expect(err).To(HaveOccurred())
|
||||
Expect(loaded).To(BeEmpty())
|
||||
})
|
||||
|
||||
It("expands a pipeline model into its sub-models", func() {
|
||||
seedModel("voicebot", "name: voicebot\npipeline:\n vad: vad-m\n llm: llm-m\n")
|
||||
seedModel("vad-m", "name: vad-m\n")
|
||||
seedModel("llm-m", "name: llm-m\n")
|
||||
|
||||
loaded, err := c.LoadModel(ctx, "voicebot")
|
||||
// Sub-models can't load without backends, so the joined error names them
|
||||
// — proving the pipeline stub was expanded rather than loaded directly.
|
||||
Expect(err).To(HaveOccurred())
|
||||
Expect(err.Error()).To(ContainSubstring("vad-m"))
|
||||
Expect(err.Error()).ToNot(ContainSubstring("voicebot"))
|
||||
Expect(loaded).To(BeEmpty())
|
||||
})
|
||||
})
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
These rules are non-negotiable. The user trusts you to operate their server without unintended changes.
|
||||
|
||||
1. **Confirm before mutating.** Before calling any of these tools — `install_model`, `import_model_uri`, `delete_model`, `install_backend`, `upgrade_backend`, `edit_model_config`, `reload_models`, `toggle_model_state`, `toggle_model_pinned` — first state in plain language what you are about to do (which tool, which target, which arguments) and wait for the user's explicit confirmation in the next turn. "Yes", "do it", "go ahead", "proceed" all count as confirmation. Anything else does not.
|
||||
1. **Confirm before mutating.** Before calling any of these tools — `install_model`, `import_model_uri`, `delete_model`, `install_backend`, `upgrade_backend`, `edit_model_config`, `reload_models`, `load_model`, `toggle_model_state`, `toggle_model_pinned` — first state in plain language what you are about to do (which tool, which target, which arguments) and wait for the user's explicit confirmation in the next turn. "Yes", "do it", "go ahead", "proceed" all count as confirmation. Anything else does not.
|
||||
|
||||
2. **Disambiguate before mutating.** If the user's request is ambiguous (several gallery candidates match, the model name has multiple installed versions, the backend has variants), present the candidates as a numbered list and ask the user to pick before calling any mutating tool.
|
||||
|
||||
|
||||
@@ -24,5 +24,6 @@ The MCP `tools/list` endpoint also exposes the full input schema for each of the
|
||||
- `upgrade_backend` — Upgrade an installed backend by name.
|
||||
- `edit_model_config` — Patch (deep-merge) JSON into an installed model's config.
|
||||
- `reload_models` — Reload all model configs from disk.
|
||||
- `load_model` — Pre-load a model into memory so the first request pays no cold-start cost. For a realtime pipeline model, every sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded. Inverse of stopping a model.
|
||||
- `toggle_model_state` — Enable or disable a model (`action`: `enable` or `disable`).
|
||||
- `toggle_model_pinned` — Pin or unpin a model (`action`: `pin` or `unpin`).
|
||||
|
||||
@@ -92,6 +92,7 @@ var expectedFullCatalog = sortedStrings(
|
||||
ToolListInstalledModels,
|
||||
ToolListKnownBackends,
|
||||
ToolListNodes,
|
||||
ToolLoadModel,
|
||||
ToolReloadModels,
|
||||
ToolSetAlias,
|
||||
ToolSetBranding,
|
||||
@@ -166,6 +167,7 @@ var _ = Describe("Tool dispatch", func() {
|
||||
{ToolUpgradeBackend, map[string]any{"name": "llama-cpp"}, "UpgradeBackend"},
|
||||
{ToolEditModelConfig, map[string]any{"name": "foo", "patch": map[string]any{"context_size": 4096}}, "EditModelConfig"},
|
||||
{ToolReloadModels, struct{}{}, "ReloadModels"},
|
||||
{ToolLoadModel, map[string]any{"model": "test-model"}, "LoadModel"},
|
||||
{ToolToggleModelState, map[string]any{"name": "foo", "action": "enable"}, "ToggleModelState"},
|
||||
{ToolToggleModelPinned, map[string]any{"name": "foo", "action": "pin"}, "ToggleModelPinned"},
|
||||
{ToolSetAlias, map[string]any{"name": "gpt-4", "target": "real"}, "SetAlias"},
|
||||
|
||||
@@ -31,6 +31,7 @@ const (
|
||||
ToolDeleteModel = "delete_model"
|
||||
ToolEditModelConfig = "edit_model_config"
|
||||
ToolReloadModels = "reload_models"
|
||||
ToolLoadModel = "load_model"
|
||||
ToolInstallBackend = "install_backend"
|
||||
ToolUpgradeBackend = "upgrade_backend"
|
||||
ToolToggleModelState = "toggle_model_state"
|
||||
|
||||
@@ -65,6 +65,22 @@ func registerModelTools(s *mcp.Server, client LocalAIClient, opts Options) {
|
||||
return
|
||||
}
|
||||
|
||||
mcp.AddTool(s, &mcp.Tool{
|
||||
Name: ToolLoadModel,
|
||||
Description: "Pre-load a model into memory by name so the first request pays no cold-start cost (the inverse of shutting a model down). For a realtime pipeline model every configured sub-model (VAD, transcription, LLM, TTS, sound_detection, voice_recognition) is loaded. Returns the model names that became resident. Requires user confirmation per safety rule 1.",
|
||||
}, func(ctx context.Context, _ *mcp.CallToolRequest, args struct {
|
||||
Model string `json:"model" jsonschema:"The installed model name to load into memory."`
|
||||
}) (*mcp.CallToolResult, any, error) {
|
||||
if args.Model == "" {
|
||||
return errorResultf("model is required"), nil, nil
|
||||
}
|
||||
loaded, err := client.LoadModel(ctx, args.Model)
|
||||
if err != nil {
|
||||
return errorResult(err), nil, nil
|
||||
}
|
||||
return jsonResult(map[string]any{"loaded": loaded}), nil, nil
|
||||
})
|
||||
|
||||
mcp.AddTool(s, &mcp.Tool{
|
||||
Name: ToolInstallModel,
|
||||
Description: "Install a model from a gallery. Requires explicit user confirmation per safety rule 1. Returns a job id; poll with get_job_status.",
|
||||
|
||||
Reference in New Issue
Block a user