From 7270a98ce5377f061acf73c8d9b16da89287cf49 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe
Date: Wed, 11 Feb 2026 13:18:05 +0000
Subject: [PATCH] fix(realtime): Use user provided voice and allow pipeline
 models to have no backend (#8415)

* fix(realtime): Use the voice provided by the user or none at all

Signed-off-by: Richard Palethorpe

* fix(ui,config): Allow pipeline models to have no backend and use same
  validation in frontend

Signed-off-by: Richard Palethorpe

---------

Signed-off-by: Richard Palethorpe
Co-authored-by: Ettore Di Giacinto
---
 core/config/model_config_loader.go     | 72 ++++++++++++++------------
 core/config/model_config_test.go       | 15 ++++--
 core/config/model_test.go              |  2 +-
 core/http/app_test.go                  |  4 +-
 core/http/endpoints/openai/realtime.go | 22 ++++++--
 core/http/views/model-editor.html      |  7 +--
 6 files changed, 75 insertions(+), 47 deletions(-)

diff --git a/core/config/model_config_loader.go b/core/config/model_config_loader.go
index 02724a5d6..664899230 100644
--- a/core/config/model_config_loader.go
+++ b/core/config/model_config_loader.go
@@ -76,42 +76,35 @@ func (lo *LoadOptions) Apply(options ...ConfigLoaderOption) {
 	}
 }
 
-// TODO: either in the next PR or the next commit, I want to merge these down into a single function that looks at the first few characters of the file to determine if we need to deserialize to []BackendConfig or BackendConfig
-func readMultipleModelConfigsFromFile(file string, opts ...ConfigLoaderOption) ([]*ModelConfig, error) {
-	c := &[]*ModelConfig{}
+// readModelConfigsFromFile reads a config file that may contain either a single
+// ModelConfig or an array of ModelConfigs. It tries to unmarshal as an array first,
+// then falls back to a single config if that fails.
+func readModelConfigsFromFile(file string, opts ...ConfigLoaderOption) ([]*ModelConfig, error) {
 	f, err := os.ReadFile(file)
 	if err != nil {
-		return nil, fmt.Errorf("readMultipleModelConfigsFromFile cannot read config file %q: %w", file, err)
-	}
-	if err := yaml.Unmarshal(f, c); err != nil {
-		return nil, fmt.Errorf("readMultipleModelConfigsFromFile cannot unmarshal config file %q: %w", file, err)
+		return nil, fmt.Errorf("readModelConfigsFromFile cannot read config file %q: %w", file, err)
 	}
 
-	for _, cc := range *c {
-		cc.modelConfigFile = file
-		cc.SetDefaults(opts...)
+	// Try to unmarshal as array first
+	var configs []*ModelConfig
+	if err := yaml.Unmarshal(f, &configs); err == nil && len(configs) > 0 {
+		for _, cc := range configs {
+			cc.modelConfigFile = file
+			cc.SetDefaults(opts...)
+		}
+		return configs, nil
 	}
 
-	return *c, nil
-}
-
-func readModelConfigFromFile(file string, opts ...ConfigLoaderOption) (*ModelConfig, error) {
-	lo := &LoadOptions{}
-	lo.Apply(opts...)
-
+	// Fall back to single config
 	c := &ModelConfig{}
-	f, err := os.ReadFile(file)
-	if err != nil {
-		return nil, fmt.Errorf("readModelConfigFromFile cannot read config file %q: %w", file, err)
-	}
 	if err := yaml.Unmarshal(f, c); err != nil {
-		return nil, fmt.Errorf("readModelConfigFromFile cannot unmarshal config file %q: %w", file, err)
+		return nil, fmt.Errorf("readModelConfigsFromFile cannot unmarshal config file %q: %w", file, err)
 	}
-	c.SetDefaults(opts...)
-	c.modelConfigFile = file
-	return c, nil
+	c.SetDefaults(opts...)
+
+	return []*ModelConfig{c}, nil
 }
 
 // Load a config file for a model
@@ -163,7 +156,7 @@ func (bcl *ModelConfigLoader) LoadModelConfigFileByNameDefaultOptions(modelName
 func (bcl *ModelConfigLoader) LoadMultipleModelConfigsSingleFile(file string, opts ...ConfigLoaderOption) error {
 	bcl.Lock()
 	defer bcl.Unlock()
-	c, err := readMultipleModelConfigsFromFile(file, opts...)
+	c, err := readModelConfigsFromFile(file, opts...)
 	if err != nil {
 		return fmt.Errorf("cannot load config file: %w", err)
 	}
@@ -181,11 +174,18 @@ func (bcl *ModelConfigLoader) LoadMultipleModelConfigsSingleFile(file string, op
 func (bcl *ModelConfigLoader) ReadModelConfig(file string, opts ...ConfigLoaderOption) error {
 	bcl.Lock()
 	defer bcl.Unlock()
-	c, err := readModelConfigFromFile(file, opts...)
+	configs, err := readModelConfigsFromFile(file, opts...)
 	if err != nil {
 		return fmt.Errorf("ReadModelConfig cannot read config file %q: %w", file, err)
 	}
+	if len(configs) == 0 {
+		return fmt.Errorf("ReadModelConfig: no configs found in file %q", file)
+	}
+	if len(configs) > 1 {
+		xlog.Warn("ReadModelConfig: read more than one config from file, only using first", "file", file, "configs", len(configs))
+	}
 
+	c := configs[0]
 	if valid, err := c.Validate(); valid {
 		bcl.configs[c.Name] = *c
 	} else {
@@ -375,15 +375,23 @@ func (bcl *ModelConfigLoader) LoadModelConfigsFromPath(path string, opts ...Conf
 			strings.HasPrefix(file.Name(), ".") {
 			continue
 		}
-		c, err := readModelConfigFromFile(filepath.Join(path, file.Name()), opts...)
+
+		filePath := filepath.Join(path, file.Name())
+
+		// Read config(s) - handles both single and array formats
+		configs, err := readModelConfigsFromFile(filePath, opts...)
 		if err != nil {
 			xlog.Error("LoadModelConfigsFromPath cannot read config file", "error", err, "File Name", file.Name())
 			continue
 		}
-		if valid, validationErr := c.Validate(); valid {
-			bcl.configs[c.Name] = *c
-		} else {
-			xlog.Error("config is not valid", "error", validationErr, "Name", c.Name)
+
+		// Validate and store each config
+		for _, c := range configs {
+			if valid, validationErr := c.Validate(); valid {
+				bcl.configs[c.Name] = *c
+			} else {
+				xlog.Error("config is not valid", "error", validationErr, "Name", c.Name)
+			}
 		}
 	}
 
diff --git a/core/config/model_config_test.go b/core/config/model_config_test.go
index a086d95f6..9926774b1 100644
--- a/core/config/model_config_test.go
+++ b/core/config/model_config_test.go
@@ -25,7 +25,8 @@ known_usecases:
 - COMPLETION
 `)
 			Expect(err).ToNot(HaveOccurred())
-			config, err := readModelConfigFromFile(tmp.Name())
+			configs, err := readModelConfigsFromFile(tmp.Name())
+			config := configs[0]
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			valid, err := config.Validate()
@@ -43,7 +44,8 @@ backend: "foo-bar"
 parameters:
   model: "foo-bar"`)
 			Expect(err).ToNot(HaveOccurred())
-			config, err := readModelConfigFromFile(tmp.Name())
+			configs, err := readModelConfigsFromFile(tmp.Name())
+			config := configs[0]
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			// two configs in config.yaml
@@ -62,7 +64,8 @@ parameters:
 			defer os.Remove(tmp.Name())
 			_, err = io.Copy(tmp, resp.Body)
 			Expect(err).To(BeNil())
-			config, err = readModelConfigFromFile(tmp.Name())
+			configs, err = readModelConfigsFromFile(tmp.Name())
+			config = configs[0]
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			// two configs in config.yaml
@@ -188,7 +191,8 @@ mcp:
   }
 }`)
 			Expect(err).ToNot(HaveOccurred())
-			config, err := readModelConfigFromFile(tmp.Name())
+			configs, err := readModelConfigsFromFile(tmp.Name())
+			config := configs[0]
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			valid, err := config.Validate()
@@ -218,7 +222,8 @@ mcp:
   }
 }`)
 			Expect(err).ToNot(HaveOccurred())
-			config, err := readModelConfigFromFile(tmp.Name())
+			configs, err := readModelConfigsFromFile(tmp.Name())
+			config := configs[0]
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			valid, err := config.Validate()
diff --git a/core/config/model_test.go b/core/config/model_test.go
index f127f8f56..4e4036185 100644
--- a/core/config/model_test.go
+++ b/core/config/model_test.go
@@ -16,7 +16,7 @@ var _ = Describe("Test cases for config related functions", func() {
 	Context("Test Read configuration functions", func() {
 		configFile = os.Getenv("CONFIG_FILE")
 		It("Test readConfigFile", func() {
-			config, err := readMultipleModelConfigsFromFile(configFile)
+			config, err := readModelConfigsFromFile(configFile)
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			// two configs in config.yaml
diff --git a/core/http/app_test.go b/core/http/app_test.go
index 749f1f02a..eee66ab1e 100644
--- a/core/http/app_test.go
+++ b/core/http/app_test.go
@@ -336,6 +336,7 @@ var _ = Describe("API test", func() {
 						Name: "bert",
 						URL:  bertEmbeddingsURL,
 					},
+					Overrides: map[string]interface{}{"backend": "llama-cpp"},
 				},
 				{
 					Metadata: gallery.Metadata{
@@ -953,7 +954,8 @@ parameters:
 			It("returns the models list", func() {
 				models, err := client.ListModels(context.TODO())
 				Expect(err).ToNot(HaveOccurred())
-				Expect(len(models.Models)).To(Equal(7)) // If "config.yaml" should be included, this should be 8?
+				// A model called "bert" can be present in the model directory depending on the order of the tests
+				Expect(len(models.Models)).To(BeNumerically(">=", 8))
 			})
 			It("can generate completions via ggml", func() {
 				if runtime.GOOS != "linux" {
diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index 6339c7cd3..383212cff 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -183,14 +183,13 @@ func registerRealtime(application *application.Application, model string) func(c
 		}
 
 		sttModel := cfg.Pipeline.Transcription
-		ttsModel := cfg.Pipeline.TTS
 
 		sessionID := generateSessionID()
 		session := &Session{
 			ID:                sessionID,
 			TranscriptionOnly: false,
 			Model:             model,
-			Voice:             ttsModel,
+			Voice:             cfg.TTSConfig.Voice,
 			ModelConfig:       cfg,
 			TurnDetection: &types.TurnDetectionUnion{
 				ServerVad: &types.ServerVad{
@@ -557,13 +556,13 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
 			session.InputAudioTranscription = &types.AudioTranscription{}
 		}
 		session.InputAudioTranscription.Model = cfg.Pipeline.Transcription
-		session.Voice = cfg.Pipeline.TTS
+		session.Voice = cfg.TTSConfig.Voice
 		session.Model = rt.Model
 		session.ModelConfig = cfg
 	}
 
 	if rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Voice != "" {
-		xlog.Warn("Ignoring voice setting; not implemented", "voice", rt.Audio.Output.Voice)
+		session.Voice = string(rt.Audio.Output.Voice)
 	}
 
 	if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil {
@@ -746,6 +745,10 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 	tr, err := session.ModelInterface.Transcribe(ctx, f.Name(), session.InputAudioTranscription.Language, false, false, session.InputAudioTranscription.Prompt)
 	if err != nil {
 		sendError(c, "transcription_failed", err.Error(), "", "event_TODO")
+		return
+	} else if tr == nil {
+		sendError(c, "transcription_failed", "transcribe result is nil", "", "event_TODO")
+		return
 	}
 
 	transcript = tr.Text
@@ -1006,7 +1009,16 @@ func generateResponse(session *Session, utt []byte, transcript string, conv *Con
 		sendError(c, "tts_error", fmt.Sprintf("Failed to read TTS audio: %v", err), "", item.Assistant.ID)
 		return
 	}
-	audioString := base64.StdEncoding.EncodeToString(audioBytes)
+
+	// Strip WAV header (44 bytes) to get raw PCM data
+	// The OpenAI Realtime API expects raw PCM, not WAV files
+	const wavHeaderSize = 44
+	pcmData := audioBytes
+	if len(audioBytes) > wavHeaderSize {
+		pcmData = audioBytes[wavHeaderSize:]
+	}
+
+	audioString := base64.StdEncoding.EncodeToString(pcmData)
 
 	sendEvent(c, types.ResponseOutputAudioTranscriptDeltaEvent{
 		ServerEventBase: types.ServerEventBase{},
diff --git a/core/http/views/model-editor.html b/core/http/views/model-editor.html
index 11b1bab93..d01a91ae7 100644
--- a/core/http/views/model-editor.html
+++ b/core/http/views/model-editor.html
@@ -1026,7 +1026,8 @@ parameters:
                 if (!config.name) {
                     throw new Error('Model name is required');
                 }
-                if (!config.backend) {
+                const isPipeline = config.pipeline && (config.pipeline.vad || config.pipeline.transcription || config.pipeline.tts || config.pipeline.llm);
+                if (!isPipeline && !config.backend) {
                     throw new Error('Backend is required');
                 }
                 if (!config.parameters || !config.parameters.model) {
@@ -1041,7 +1042,6 @@ parameters:
 
         async saveConfig() {
             try {
-                // Validate before saving
                 const yamlContent = this.yamlEditor.getValue();
                 const config = jsyaml.load(yamlContent);
 
@@ -1052,7 +1052,8 @@ parameters:
                 if (!config.name) {
                     throw new Error('Model name is required');
                 }
-                if (!config.backend) {
+                const isPipeline = config.pipeline && (config.pipeline.vad || config.pipeline.transcription || config.pipeline.tts || config.pipeline.llm);
+                if (!isPipeline && !config.backend) {
                     throw new Error('Backend is required');
                 }
                 if (!config.parameters || !config.parameters.model) {
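
Below is a minimal, self-contained sketch of the parsing strategy that the new
readModelConfigsFromFile adopts in this patch: try to unmarshal the YAML as an
array of configs first, and fall back to a single config document if that
fails. The parseModelConfigs helper, the reduced ModelConfig struct, and the
gopkg.in/yaml.v3 import are illustrative stand-ins and are not part of the
patch or of LocalAI's actual types.

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// ModelConfig is a cut-down stand-in for the real config type.
type ModelConfig struct {
	Name    string `yaml:"name"`
	Backend string `yaml:"backend"`
}

// parseModelConfigs accepts either a single YAML document or a YAML array
// and always returns a slice, mirroring the fallback logic in the patch.
func parseModelConfigs(data []byte) ([]*ModelConfig, error) {
	// Try the multi-config (array) form first.
	var configs []*ModelConfig
	if err := yaml.Unmarshal(data, &configs); err == nil && len(configs) > 0 {
		return configs, nil
	}

	// Fall back to a single config document.
	c := &ModelConfig{}
	if err := yaml.Unmarshal(data, c); err != nil {
		return nil, fmt.Errorf("cannot unmarshal config: %w", err)
	}
	return []*ModelConfig{c}, nil
}

func main() {
	single := []byte("name: whisper\nbackend: whisper")
	multi := []byte("- name: llm\n  backend: llama-cpp\n- name: tts\n  backend: piper")

	for _, in := range [][]byte{single, multi} {
		cfgs, err := parseModelConfigs(in)
		if err != nil {
			panic(err)
		}
		fmt.Println("parsed", len(cfgs), "config(s), first:", cfgs[0].Name)
	}
}

Attempting the array form first means multi-model files keep working unchanged,
while the common single-model layout is still accepted, which is why the loader
can now treat both file shapes through one code path.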