Files
LocalAI/core/backend/preload.go
Richard Palethorpe eb32cd9073 feat(realtime): eager blocking pipeline warm-up + /backend/load API (#10662)
Realtime sessions previously lazy-loaded each pipeline sub-model (VAD,
transcription, LLM, TTS) on first use, so every cold session paid a
per-request model-load stall and load errors only surfaced mid-stream.

Warm the whole pipeline eagerly at session start, blocking until it is
ready. This includes the voice-gate speaker-recognition model, which an
enforced gate blocks each utterance on; compaction's summary_model stays
lazy since it only runs off the response path:
- Add backend.PreloadModel / PreloadModelByName as the single load path
  for every modality (no transcription special-case; backend-omitted
  configs are deprecated).
- The realtime session blocks on Model.Warmup and returns a
  model_load_error to the client if any stage fails to load;
  updateSession warms in the background. Opt out per pipeline with
  pipeline.disable_warmup, exposed as a UI toggle via the
  config-metadata registry.

Add a LocalAI-native POST /backend/load (and /v1/backend/load) that
pre-loads a model -- expanding realtime pipelines into their sub-models
-- as the inverse of /backend/shutdown. There is one preload engine
(backend.PreloadStages): the realtime Warmup methods, /backend/load and
the --load-to-memory startup flag all use it, so --load-to-memory now
also expands pipeline models and records load-failure traces. Pipeline
sub-model alias resolution is likewise shared
(ModelConfigLoader.LoadResolvedModelConfig). Surface the endpoint
everywhere an admin manages models:
- MCP admin tool load_model (httpapi + inproc clients, safety/catalog
  prompts, catalog/dispatch tests).
- "Load into memory" action in the React models UI.
- Swagger regenerated; docs moved to the general backend-monitor page
  since it is not realtime-specific.

Fix a Traces UI crash ("json: unsupported value: -Inf"): audio-snippet
RMS/peak now floor at a finite dBFS, and backend-trace data is sanitized
to drop non-finite floats before marshaling. The sanitizer is
copy-on-write -- it runs on every RecordBackendTrace, so containers are
only re-allocated on the paths that actually changed.

Migrate core/http/openresponses_test.go onto the prebuilt mock-backend
the rest of the http suite already uses -- it was the last spec still
pointing at a real HuggingFace model, so it 404'd wherever no vision
backend was built -- and fix its item_reference specs to send the
spec's "id" field instead of "item_id", which the handler never
accepted.

Assisted-by: Claude:claude-opus-4-8 Claude Code

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-07-03 18:00:37 +02:00

123 lines
4.3 KiB
Go

package backend
import (
"context"
"errors"
"fmt"
"sync"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/xlog"
)
// PreloadModelByName looks up the config registered under name and loads the
// model it describes ahead of first use, so the first real request does not
// pay the cold-start load cost (the counterpart of shutting a model down).
//
// Two cases are distinguished by what pipelineStages returns for the config's
// Pipeline section:
//
//   - At least one stage is set: the model is treated as a pipeline. Its
//     sub-models are handed to PreloadStages and the pipeline entry itself is
//     not loaded. The returned names are the sub-models that loaded; the error
//     joins one entry per sub-model that failed, so a partial load reports
//     both.
//   - No stage is set: the model's own config goes through PreloadModel and,
//     on success, the single returned name is the config's Name.
//
// A failed config lookup, a stage that cannot be resolved, or a failed
// non-pipeline load returns nil names together with the error. Only the
// stages enumerated by pipelineStages are considered; compaction's
// summary_model is not one of them and therefore stays lazily loaded.
func PreloadModelByName(ctx context.Context, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, name string) ([]string, error) {
	modelCfg, err := cl.LoadModelConfigFileByNameDefaultOptions(name, appConfig)
	if err != nil {
		return nil, err
	}

	subModels, err := pipelineStages(cl, &modelCfg.Pipeline, ml.ModelPath)
	if err != nil {
		return nil, err
	}

	// Pipeline: warm the sub-models rather than the pipeline entry.
	if len(subModels) > 0 {
		return PreloadStages(ctx, ml, appConfig, subModels)
	}

	// Plain model: load its own backend directly.
	if err = PreloadModel(ctx, ml, *modelCfg, appConfig); err != nil {
		return nil, err
	}
	return []string{modelCfg.Name}, nil
}
// PreloadStage pairs one pipeline slot with the model config to preload for
// it.
//
// Role is the slot label (for example "vad" or "llm"); PreloadStages uses it
// only to annotate log lines and error messages. Cfg is the resolved config
// that is passed to the loader; a nil Cfg marks the stage as absent and
// PreloadStages skips it.
type PreloadStage struct {
Role string
Cfg *config.ModelConfig
}
// loadStage is the function PreloadStages calls to load a single stage. It
// defaults to PreloadModel and is held in a package-level variable so tests
// can swap in a stub and exercise PreloadStages without spawning real
// backends.
var loadStage = PreloadModel
// pipelineStages turns the populated slots of a pipeline config into
// PreloadStage values, in the fixed order vad, transcription, llm, tts,
// sound_detection, voice_recognition. Slots whose model name is empty are
// skipped (voice_recognition counts as empty when p.VoiceRecognition is nil).
//
// Each remaining name is resolved with cl.LoadResolvedModelConfig against
// modelPath. The first slot that fails to resolve aborts the whole call with
// an error of the form "<role> (<name>): <cause>" — an unresolvable stage is
// treated as a misconfiguration, not something to defer until load time.
//
// When no slot is populated the result is a nil slice, which callers read as
// "this config is not a pipeline".
func pipelineStages(cl *config.ModelConfigLoader, p *config.Pipeline, modelPath string) ([]PreloadStage, error) {
	type slot struct {
		role  string
		model string
	}

	// voice_recognition lives behind an optional sub-struct, unlike the
	// other slots which are plain name fields.
	var voiceRecModel string
	if vr := p.VoiceRecognition; vr != nil {
		voiceRecModel = vr.Model
	}

	slots := []slot{
		{role: "vad", model: p.VAD},
		{role: "transcription", model: p.Transcription},
		{role: "llm", model: p.LLM},
		{role: "tts", model: p.TTS},
		{role: "sound_detection", model: p.SoundDetection},
		{role: "voice_recognition", model: voiceRecModel},
	}

	var resolved []PreloadStage
	for _, sl := range slots {
		if sl.model != "" {
			stageCfg, err := cl.LoadResolvedModelConfig(sl.model, modelPath)
			if err != nil {
				return nil, fmt.Errorf("%s (%s): %w", sl.role, sl.model, err)
			}
			resolved = append(resolved, PreloadStage{Role: sl.role, Cfg: stageCfg})
		}
	}
	return resolved, nil
}
// PreloadStages loads every present stage concurrently and waits for all of
// them, so a pipeline warms in roughly the time of its slowest stage rather
// than the sum of all stages.
//
// Stages with a nil Cfg are skipped. A failing stage does not cancel the
// others: every load runs to completion so the returned error (built with
// errors.Join) names each broken stage as "<role> (<model>): <cause>", and
// the returned slice lists the Cfg.Name of each stage that did load. The error
// is nil when nothing failed, and the slice is nil when nothing loaded.
//
// Both the returned names and the joined errors follow the order of the
// stages argument, independent of which load happens to finish first, so
// callers and tests see a stable result for the same input.
//
// ctx, ml and appConfig are passed unchanged to the stage loader (loadStage).
func PreloadStages(ctx context.Context, ml *model.ModelLoader, appConfig *config.ApplicationConfig, stages []PreloadStage) ([]string, error) {
	// One result slot per stage. Each goroutine writes only its own index,
	// so no lock is required, and wg.Wait orders those writes before the
	// reads below.
	failures := make([]error, len(stages))

	var wg sync.WaitGroup
	for i, s := range stages {
		if s.Cfg == nil {
			continue
		}
		wg.Add(1)
		go func(i int, s PreloadStage) {
			defer wg.Done()
			if err := loadStage(ctx, ml, *s.Cfg, appConfig); err != nil {
				xlog.Warn("preload: failed to load pipeline sub-model", "stage", s.Role, "model", s.Cfg.Name, "error", err)
				failures[i] = fmt.Errorf("%s (%s): %w", s.Role, s.Cfg.Name, err)
				return
			}
			xlog.Debug("preload: loaded pipeline sub-model", "stage", s.Role, "model", s.Cfg.Name)
		}(i, s)
	}
	wg.Wait()

	// Assemble the outputs in stage order rather than completion order.
	var (
		loaded []string
		errs   []error
	)
	for i, s := range stages {
		if s.Cfg == nil {
			continue
		}
		if failures[i] != nil {
			errs = append(errs, failures[i])
			continue
		}
		loaded = append(loaded, s.Cfg.Name)
	}
	return loaded, errors.Join(errs...)
}