From edeacf22c4eac3054ead9a74c7ae9079abc1d9bf Mon Sep 17 00:00:00 2001
From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com>
Date: Sat, 13 Jun 2026 01:01:36 +0200
Subject: [PATCH] fix(realtime): keep transcription model on a language-only session.update (#10295)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A transcription session.update that carries only a language (no model) —
e.g. a client forcing the STT input language — has an empty
Transcription.Model. updateSession unconditionally copied that into
session.ModelConfig.Pipeline.Transcription, blanking the pipeline's
configured transcription backend. The next utterance then transcribed
against an empty model and the backend RPC failed with "unimplemented"
(surfaced to the client as transcription_failed), so transcription
silently stopped whenever a language was selected.

Only adopt the incoming transcription model when it is non-empty, and
preserve the existing model otherwise (mirroring updateTransSession).

Signed-off-by: mudler
Co-authored-by: Ettore Di Giacinto
Co-authored-by: Claude Opus 4.8
---
 core/http/endpoints/openai/realtime.go | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index d2e2413c6..63b1a1589 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -990,8 +990,18 @@ func updateSession(session *Session, update *types.SessionUnion, cl *config.Mode
 	}
 
 	if rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil {
-		session.InputAudioTranscription = rt.Audio.Input.Transcription
-		session.ModelConfig.Pipeline.Transcription = rt.Audio.Input.Transcription.Model
+		trUpd := rt.Audio.Input.Transcription
+		// A language-only update (e.g. a client forcing the STT language) carries
+		// an empty Model. Preserve the pipeline's configured transcription backend
+		// instead of blanking it — otherwise the next utterance transcribes against
+		// an empty model and the backend RPC fails with "unimplemented".
+		if trUpd.Model == "" && session.InputAudioTranscription != nil {
+			trUpd.Model = session.InputAudioTranscription.Model
+		}
+		session.InputAudioTranscription = trUpd
+		if trUpd.Model != "" {
+			session.ModelConfig.Pipeline.Transcription = trUpd.Model
+		}
 	}
 
 	if rt.Model != "" || (rt.Audio != nil && rt.Audio.Output != nil && rt.Audio.Output.Voice != "") || (rt.Audio != nil && rt.Audio.Input != nil && rt.Audio.Input.Transcription != nil) {