From 7270a98ce5377f061acf73c8d9b16da89287cf49 Mon Sep 17 00:00:00 2001
From: Richard Palethorpe
Date: Wed, 11 Feb 2026 13:18:05 +0000
Subject: [PATCH] fix(realtime): Use user provided voice and allow pipeline
 models to have no backend (#8415)

* fix(realtime): Use the voice provided by the user or none at all

Signed-off-by: Richard Palethorpe

* fix(ui,config): Allow pipeline models to have no backend and use same
  validation in frontend

Signed-off-by: Richard Palethorpe

---------

Signed-off-by: Richard Palethorpe
Co-authored-by: Ettore Di Giacinto
---
 core/config/model_config_loader.go     | 72 ++++++++++++++------------
 core/config/model_config_test.go       | 15 ++++--
 core/config/model_test.go              |  2 +-
 core/http/app_test.go                  |  4 +-
 core/http/endpoints/openai/realtime.go | 22 ++++++--
 core/http/views/model-editor.html      |  7 +--
 6 files changed, 75 insertions(+), 47 deletions(-)

diff --git a/core/config/model_config_loader.go b/core/config/model_config_loader.go
index 02724a5d6..664899230 100644
--- a/core/config/model_config_loader.go
+++ b/core/config/model_config_loader.go
@@ -76,42 +76,35 @@ func (lo *LoadOptions) Apply(options ...ConfigLoaderOption) {
 	}
 }
 
-// TODO: either in the next PR or the next commit, I want to merge these down into a single function that looks at the first few characters of the file to determine if we need to deserialize to []BackendConfig or BackendConfig
-func readMultipleModelConfigsFromFile(file string, opts ...ConfigLoaderOption) ([]*ModelConfig, error) {
-	c := &[]*ModelConfig{}
+// readModelConfigsFromFile reads a config file that may contain either a single
+// ModelConfig or an array of ModelConfigs. It tries to unmarshal as an array first,
+// then falls back to a single config if that fails.
+func readModelConfigsFromFile(file string, opts ...ConfigLoaderOption) ([]*ModelConfig, error) {
 	f, err := os.ReadFile(file)
 	if err != nil {
-		return nil, fmt.Errorf("readMultipleModelConfigsFromFile cannot read config file %q: %w", file, err)
-	}
-	if err := yaml.Unmarshal(f, c); err != nil {
-		return nil, fmt.Errorf("readMultipleModelConfigsFromFile cannot unmarshal config file %q: %w", file, err)
+		return nil, fmt.Errorf("readModelConfigsFromFile cannot read config file %q: %w", file, err)
 	}
 
-	for _, cc := range *c {
-		cc.modelConfigFile = file
-		cc.SetDefaults(opts...)
+	// Try to unmarshal as array first
+	var configs []*ModelConfig
+	if err := yaml.Unmarshal(f, &configs); err == nil && len(configs) > 0 {
+		for _, cc := range configs {
+			cc.modelConfigFile = file
+			cc.SetDefaults(opts...)
+		}
+		return configs, nil
 	}
 
-	return *c, nil
-}
-
-func readModelConfigFromFile(file string, opts ...ConfigLoaderOption) (*ModelConfig, error) {
-	lo := &LoadOptions{}
-	lo.Apply(opts...)
-
+	// Fall back to single config
 	c := &ModelConfig{}
-	f, err := os.ReadFile(file)
-	if err != nil {
-		return nil, fmt.Errorf("readModelConfigFromFile cannot read config file %q: %w", file, err)
-	}
 	if err := yaml.Unmarshal(f, c); err != nil {
-		return nil, fmt.Errorf("readModelConfigFromFile cannot unmarshal config file %q: %w", file, err)
+		return nil, fmt.Errorf("readModelConfigsFromFile cannot unmarshal config file %q: %w", file, err)
 	}
-	c.SetDefaults(opts...)
-	c.modelConfigFile = file
-	return c, nil
+	c.SetDefaults(opts...)
+
+	return []*ModelConfig{c}, nil
 }
 
 // Load a config file for a model
@@ -163,7 +156,7 @@ func (bcl *ModelConfigLoader) LoadModelConfigFileByNameDefaultOptions(modelName
 func (bcl *ModelConfigLoader) LoadMultipleModelConfigsSingleFile(file string, opts ...ConfigLoaderOption) error {
 	bcl.Lock()
 	defer bcl.Unlock()
-	c, err := readMultipleModelConfigsFromFile(file, opts...)
+	c, err := readModelConfigsFromFile(file, opts...)
 	if err != nil {
 		return fmt.Errorf("cannot load config file: %w", err)
 	}
@@ -181,11 +174,18 @@ func (bcl *ModelConfigLoader) LoadMultipleModelConfigsSingleFile(file string, op
 func (bcl *ModelConfigLoader) ReadModelConfig(file string, opts ...ConfigLoaderOption) error {
 	bcl.Lock()
 	defer bcl.Unlock()
-	c, err := readModelConfigFromFile(file, opts...)
+	configs, err := readModelConfigsFromFile(file, opts...)
 	if err != nil {
 		return fmt.Errorf("ReadModelConfig cannot read config file %q: %w", file, err)
 	}
+	if len(configs) == 0 {
+		return fmt.Errorf("ReadModelConfig: no configs found in file %q", file)
+	}
+	if len(configs) > 1 {
+		xlog.Warn("ReadModelConfig: read more than one config from file, only using first", "file", file, "configs", len(configs))
+	}
 
+	c := configs[0]
 	if valid, err := c.Validate(); valid {
 		bcl.configs[c.Name] = *c
 	} else {
@@ -375,15 +375,23 @@ func (bcl *ModelConfigLoader) LoadModelConfigsFromPath(path string, opts ...Conf
 			strings.HasPrefix(file.Name(), ".") {
 			continue
 		}
-		c, err := readModelConfigFromFile(filepath.Join(path, file.Name()), opts...)
+
+		filePath := filepath.Join(path, file.Name())
+
+		// Read config(s) - handles both single and array formats
+		configs, err := readModelConfigsFromFile(filePath, opts...)
 		if err != nil {
 			xlog.Error("LoadModelConfigsFromPath cannot read config file", "error", err, "File Name", file.Name())
 			continue
 		}
-		if valid, validationErr := c.Validate(); valid {
-			bcl.configs[c.Name] = *c
-		} else {
-			xlog.Error("config is not valid", "error", validationErr, "Name", c.Name)
+
+		// Validate and store each config
+		for _, c := range configs {
+			if valid, validationErr := c.Validate(); valid {
+				bcl.configs[c.Name] = *c
+			} else {
+				xlog.Error("config is not valid", "error", validationErr, "Name", c.Name)
+			}
 		}
 	}
 
diff --git a/core/config/model_config_test.go b/core/config/model_config_test.go
index a086d95f6..9926774b1 100644
--- a/core/config/model_config_test.go
+++ b/core/config/model_config_test.go
@@ -25,7 +25,8 @@ known_usecases:
 - COMPLETION
 `)
 			Expect(err).ToNot(HaveOccurred())
-			config, err := readModelConfigFromFile(tmp.Name())
+			configs, err := readModelConfigsFromFile(tmp.Name())
+			config := configs[0]
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			valid, err := config.Validate()
@@ -43,7 +44,8 @@ backend: "foo-bar"
 parameters:
   model: "foo-bar"`)
 			Expect(err).ToNot(HaveOccurred())
-			config, err := readModelConfigFromFile(tmp.Name())
+			configs, err := readModelConfigsFromFile(tmp.Name())
+			config := configs[0]
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			// two configs in config.yaml
@@ -62,7 +64,8 @@ parameters:
 			defer os.Remove(tmp.Name())
 			_, err = io.Copy(tmp, resp.Body)
 			Expect(err).To(BeNil())
-			config, err = readModelConfigFromFile(tmp.Name())
+			configs, err = readModelConfigsFromFile(tmp.Name())
+			config = configs[0]
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			// two configs in config.yaml
@@ -188,7 +191,8 @@ mcp:
   }
 }`)
 			Expect(err).ToNot(HaveOccurred())
-			config, err := readModelConfigFromFile(tmp.Name())
+			configs, err := readModelConfigsFromFile(tmp.Name())
+			config := configs[0]
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			valid, err := config.Validate()
@@ -218,7 +222,8 @@ mcp:
   }
 }`)
 			Expect(err).ToNot(HaveOccurred())
-			config, err := readModelConfigFromFile(tmp.Name())
+			configs, err := readModelConfigsFromFile(tmp.Name())
+			config := configs[0]
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			valid, err := config.Validate()
diff --git a/core/config/model_test.go b/core/config/model_test.go
index f127f8f56..4e4036185 100644
--- a/core/config/model_test.go
+++ b/core/config/model_test.go
@@ -16,7 +16,7 @@ var _ = Describe("Test cases for config related functions", func() {
 	Context("Test Read configuration functions", func() {
 		configFile = os.Getenv("CONFIG_FILE")
 		It("Test readConfigFile", func() {
-			config, err := readMultipleModelConfigsFromFile(configFile)
+			config, err := readModelConfigsFromFile(configFile)
 			Expect(err).To(BeNil())
 			Expect(config).ToNot(BeNil())
 			// two configs in config.yaml
diff --git a/core/http/app_test.go b/core/http/app_test.go
index 749f1f02a..eee66ab1e 100644
--- a/core/http/app_test.go
+++ b/core/http/app_test.go
@@ -336,6 +336,7 @@ var _ = Describe("API test", func() {
 						Name: "bert",
 						URL:  bertEmbeddingsURL,
 					},
+					Overrides: map[string]interface{}{"backend": "llama-cpp"},
 				},
 				{
 					Metadata: gallery.Metadata{
@@ -953,7 +954,8 @@ parameters:
 			It("returns the models list", func() {
 				models, err := client.ListModels(context.TODO())
 				Expect(err).ToNot(HaveOccurred())
-				Expect(len(models.Models)).To(Equal(7)) // If "config.yaml" should be included, this should be 8?
+				// A model called "bert" can be present in the model directory depending on the order of the tests
+				Expect(len(models.Models)).To(BeNumerically(">=", 8))
 			})
 			It("can generate completions via ggml", func() {
 				if runtime.GOOS != "linux" {
diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index 6339c7cd3..383212cff 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -183,14 +183,13 @@ func registerRealtime(application *application.Application, model string) func(c
 		}
 
 		sttModel := cfg.Pipeline.Transcription
-		ttsModel := cfg.Pipeline.TTS
 
 		sessionID := generateSessionID()
 		session := &Session{
 			ID:                sessionID,
 			TranscriptionOnly: false,
 			Model:             model,
-			Voice:             ttsModel,
+			Voice:             cfg.TTSConfig.Voice,
 			ModelConfig:       cfg,
 			TurnDetection: &types.TurnDetectionUnion{
 				ServerVad: &types.ServerVad{
@@ -557,13 +556,13 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
 			session.InputAudioTranscription = &types.AudioTranscription{}
 		}
 		session.InputAudioTranscription.Model = cfg.Pipeline.Transcription
-		session.Voice = cfg.Pipeline.TTS
+		session.Voice = cfg.TTSConfig.Voice
 		session.Model = rt.Model
 		session.ModelConfig = cfg
 	}
 
 	if rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Voice != "" {
-		xlog.Warn("Ignoring voice setting; not implemented", "voice", rt.Audio.Output.Voice)
+		session.Voice = string(rt.Audio.Output.Voice)
 	}
 
 	if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil {
@@ -746,6 +745,10 @@ func commitUtterance(ctx context.Context, utt []byte, session *Session, conv *Co
 	tr, err := session.ModelInterface.Transcribe(ctx, f.Name(), session.InputAudioTranscription.Language, false, false, session.InputAudioTranscription.Prompt)
 	if err != nil {
 		sendError(c, "transcription_failed", err.Error(), "", "event_TODO")
+		return
+	} else if tr == nil {
+		sendError(c, "transcription_failed", "transcribe result is nil", "", "event_TODO")
+		return
 	}
 
 	transcript = tr.Text
@@ -1006,7 +1009,16 @@ func generateResponse(session *Session, utt []byte, transcript string, conv *Con
 		sendError(c, "tts_error", fmt.Sprintf("Failed to read TTS audio: %v", err), "", item.Assistant.ID)
 		return
 	}
-	audioString := base64.StdEncoding.EncodeToString(audioBytes)
+
+	// Strip WAV header (44 bytes) to get raw PCM data
+	// The OpenAI Realtime API expects raw PCM, not WAV files
+	const wavHeaderSize = 44
+	pcmData := audioBytes
+	if len(audioBytes) > wavHeaderSize {
+		pcmData = audioBytes[wavHeaderSize:]
+	}
+
+	audioString := base64.StdEncoding.EncodeToString(pcmData)
 
 	sendEvent(c, types.ResponseOutputAudioTranscriptDeltaEvent{
 		ServerEventBase: types.ServerEventBase{},
diff --git a/core/http/views/model-editor.html b/core/http/views/model-editor.html
index 11b1bab93..d01a91ae7 100644
--- a/core/http/views/model-editor.html
+++ b/core/http/views/model-editor.html
@@ -1026,7 +1026,8 @@ parameters:
                 if (!config.name) {
                     throw new Error('Model name is required');
                 }
-                if (!config.backend) {
+                const isPipeline = config.pipeline && (config.pipeline.vad || config.pipeline.transcription || config.pipeline.tts || config.pipeline.llm);
+                if (!isPipeline && !config.backend) {
                     throw new Error('Backend is required');
                 }
                 if (!config.parameters || !config.parameters.model) {
@@ -1041,7 +1042,6 @@ parameters:
 
         async saveConfig() {
             try {
-                // Validate before saving
                 const yamlContent = this.yamlEditor.getValue();
                 const config = jsyaml.load(yamlContent);
 
@@ -1052,7 +1052,8 @@ parameters:
                 if (!config.name) {
                     throw new Error('Model name is required');
                 }
-                if (!config.backend) {
+                const isPipeline = config.pipeline && (config.pipeline.vad || config.pipeline.transcription || config.pipeline.tts || config.pipeline.llm);
+                if (!isPipeline && !config.backend) {
                     throw new Error('Backend is required');
                 }
                 if (!config.parameters || !config.parameters.model) {
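
Below is a minimal, self-contained sketch of the parsing strategy that the new
readModelConfigsFromFile adopts in this patch: try to unmarshal the YAML as an
array of configs first, and fall back to a single config document if that
fails. The parseModelConfigs helper, the reduced ModelConfig struct, and the
gopkg.in/yaml.v3 import are illustrative stand-ins and are not part of the
patch or of LocalAI's actual types.

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// ModelConfig is a cut-down stand-in for the real config type.
type ModelConfig struct {
	Name    string `yaml:"name"`
	Backend string `yaml:"backend"`
}

// parseModelConfigs accepts either a single YAML document or a YAML array
// and always returns a slice, mirroring the fallback logic in the patch.
func parseModelConfigs(data []byte) ([]*ModelConfig, error) {
	// Try the multi-config (array) form first.
	var configs []*ModelConfig
	if err := yaml.Unmarshal(data, &configs); err == nil && len(configs) > 0 {
		return configs, nil
	}

	// Fall back to a single config document.
	c := &ModelConfig{}
	if err := yaml.Unmarshal(data, c); err != nil {
		return nil, fmt.Errorf("cannot unmarshal config: %w", err)
	}
	return []*ModelConfig{c}, nil
}

func main() {
	single := []byte("name: whisper\nbackend: whisper")
	multi := []byte("- name: llm\n  backend: llama-cpp\n- name: tts\n  backend: piper")

	for _, in := range [][]byte{single, multi} {
		cfgs, err := parseModelConfigs(in)
		if err != nil {
			panic(err)
		}
		fmt.Println("parsed", len(cfgs), "config(s), first:", cfgs[0].Name)
	}
}

Attempting the array form first means multi-model files keep working unchanged,
while the common single-model layout is still accepted, which is why the loader
can now treat both file shapes through one code path.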