fix(realtime): always strip reasoning from spoken output

disable_thinking maps to ReasoningConfig.DisableReasoning=true on the LLM config, which the backend reads as enable_thinking=false. But the realtime handler reads that SAME config to drive reasoning extraction, and there DisableReasoning=true means "skip stripping". PredictConfig() returns this LLM config, so both the streamed (speechStreamer) and buffered realtime paths stopped stripping <think>…</think> exactly when disable_thinking was on — leaking raw reasoning to the client whenever the model ignored the enable_thinking hint (e.g. lfm2.5).

Add spokenReasoningConfig() which clears DisableReasoning for extraction (keeping custom tokens/tag pairs) and route both realtime paths through it. Spoken output now always strips reasoning, independent of the backend suppression hint.

Assisted-by: Claude:claude-opus-4-8 go test, golangci-lint
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-06 07:46:15 -04:00 · 2026-06-04 23:18:37 +00:00
parent f48344f2ff
commit cb3609530a
6 changed files with 85 additions and 3 deletions
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -1590,7 +1590,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 		// ExtractReasoningWithConfig is a no-op when no tag pair matches,
 		// so it's safe to apply unconditionally in the no-reasoning branch.
 		if deltaReasoning == "" && deltaContent != "" {
-			deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig)
+			deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig))
 		}
 		reasoningText = deltaReasoning
 		responseWithoutReasoning = deltaContent
@@ -1598,7 +1598,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 		cleanedResponse = deltaContent
 		toolCalls = deltaToolCalls
 	} else {
-		reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig)
+		reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig))
 		textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig)
 		cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig)
 		toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig)
--- a/core/http/endpoints/openai/realtime_doubles_test.go
+++ b/core/http/endpoints/openai/realtime_doubles_test.go
@@ -2,6 +2,7 @@ package openai

 import (
 	"context"
+	"strings"

 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
@@ -48,6 +49,18 @@ func (f *fakeTransport) countEvents(et types.ServerEventType) int {
 	return n
 }

+// transcriptDeltaText returns the concatenated Delta of every recorded
+// transcript delta event, i.e. the text streamed to the client as generated.
+func (f *fakeTransport) transcriptDeltaText() string {
+	var b strings.Builder
+	for _, e := range f.events {
+		if d, ok := e.(types.ResponseOutputAudioTranscriptDeltaEvent); ok { // events of any other type are skipped
+			b.WriteString(d.Delta)
+		}
+	}
+	return b.String()
+}
+
 // fakeModel is a configurable Model double. TTSStream replays ttsStreamChunks
 // and TranscribeStream replays transcribeDeltas, so the handler's streaming
 // paths can be driven deterministically.
--- a/core/http/endpoints/openai/realtime_stream.go
+++ b/core/http/endpoints/openai/realtime_stream.go
@@ -35,6 +35,9 @@ type speechStreamer struct {
 }

 func newSpeechStreamer(ctx context.Context, t Transport, session *Session, responseID, itemID, thinkingStartToken string, reasoningCfg reasoning.Config) *speechStreamer {
+	// Spoken output must never contain reasoning, even when disable_thinking set
+	// DisableReasoning (which would otherwise turn the extractor's stripping off).
+	reasoningCfg = spokenReasoningConfig(reasoningCfg)
 	return &speechStreamer{
 		ctx:        ctx,
 		t:          t,
--- a/core/http/endpoints/openai/realtime_stream_test.go
+++ b/core/http/endpoints/openai/realtime_stream_test.go
@@ -43,6 +43,32 @@ var _ = Describe("speechStreamer", func() {
 		Expect(audio).To(Equal([]byte{7, 7}))
 	})

+	It("strips leaked reasoning even when reasoning is disabled (disable_thinking safety net)", func() {
+		// disable_thinking maps to ReasoningConfig.DisableReasoning=true (it tells
+		// the backend enable_thinking=false). When the model ignores that and emits
+		// thinking anyway, the spoken stream must still not leak it: the streamer is
+		// the last line of defence and always strips reasoning from spoken content.
+		disable := true
+		session := &Session{
+			OutputSampleRate: 24000,
+			ModelInterface:   &fakeModel{},
+			ModelConfig:      &config.ModelConfig{}, // streaming.tts off
+		}
+		t := &fakeTransport{}
+		s := newSpeechStreamer(context.Background(), t, session, "resp1", "item1", "",
+			reasoning.Config{DisableReasoning: &disable})
+
+		s.onToken("<think>secret plan</think>") // reasoning emitted despite DisableReasoning=true
+		s.onToken("The answer is 42.")          // the only text expected to survive
+		content, _, err := s.finish()
+
+		Expect(err).ToNot(HaveOccurred())
+		Expect(content).To(Equal("The answer is 42."))
+		Expect(content).ToNot(ContainSubstring("secret plan"))
+		// The transcript deltas recorded by the fake transport must not carry it either.
+		Expect(t.transcriptDeltaText()).ToNot(ContainSubstring("secret plan"))
+	})
+
 	It("does not synthesize audio when TTS streaming is disabled", func() {
 		m := &fakeModel{ttsStreamChunks: [][]byte{{7}}, ttsStreamRate: 24000}
 		session := &Session{
--- a/core/http/endpoints/openai/realtime_thinking.go
+++ b/core/http/endpoints/openai/realtime_thinking.go
@@ -1,6 +1,9 @@
 package openai

-import "github.com/mudler/LocalAI/core/config"
+import (
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/reasoning"
+)

 // applyPipelineThinking forces the LLM's reasoning/thinking off when the realtime
 // pipeline sets disable_thinking, mapping to the enable_thinking=false backend
@@ -15,3 +18,16 @@ func applyPipelineThinking(llm *config.ModelConfig, pipeline config.Pipeline) {
 	disable := true
 	llm.ReasoningConfig.DisableReasoning = &disable
 }
+
+// spokenReasoningConfig returns a shallow copy of cfg for stripping reasoning
+// OUT of realtime spoken output. ReasoningConfig.DisableReasoning is overloaded:
+// the backend reads it as the "enable_thinking=false" hint (which pipeline
+// disable_thinking sets via applyPipelineThinking), but the reasoning extractor
+// reads it as "skip stripping, assume there is no reasoning". Honouring the latter
+// when extracting for speech would leak raw <think>…</think> whenever the model
+// ignores the suppression hint. Spoken output must never contain reasoning, so
+// DisableReasoning is cleared; custom tokens/tag pairs are passed through as-is.
+func spokenReasoningConfig(cfg reasoning.Config) reasoning.Config {
+	cfg.DisableReasoning = nil // cfg is passed by value, so the caller's config keeps its setting
+	return cfg
+}
--- a/core/http/endpoints/openai/realtime_thinking_test.go
+++ b/core/http/endpoints/openai/realtime_thinking_test.go
@@ -5,6 +5,7 @@ import (
 	. "github.com/onsi/gomega"

 	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/reasoning"
 )

 // applyPipelineThinking lets a realtime pipeline force the LLM's thinking off
@@ -24,3 +25,26 @@ var _ = Describe("applyPipelineThinking", func() {
 		Expect(llm.ReasoningConfig.DisableReasoning).To(BeNil())
 	})
 })
+
+// These specs pin spokenReasoningConfig: it clears DisableReasoning so realtime
+// spoken output always strips reasoning, even though disable_thinking sets it to
+// true on the LLM config (which the backend reads as enable_thinking=false).
+var _ = Describe("spokenReasoningConfig", func() {
+	It("clears DisableReasoning so the extractor still strips leaked reasoning", func() {
+		disable := true
+		out := spokenReasoningConfig(reasoning.Config{DisableReasoning: &disable})
+		Expect(out.DisableReasoning).To(BeNil()) // cleared to nil, not set to a pointer to false
+	})
+
+	It("preserves the other reasoning settings", func() {
+		disable := true
+		out := spokenReasoningConfig(reasoning.Config{
+			DisableReasoning:    &disable,
+			ThinkingStartTokens: []string{"<reason>"},
+			TagPairs:            []reasoning.TagPair{{Start: "<reason>", End: "</reason>"}},
+		})
+		Expect(out.ThinkingStartTokens).To(Equal([]string{"<reason>"}))
+		Expect(out.TagPairs).To(HaveLen(1))
+		Expect(out.TagPairs[0].Start).To(Equal("<reason>")) // only Start is checked; End is not asserted
+	})
+})