diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go
index 078cf4a5b..bc2a80785 100644
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -1590,7 +1590,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
// ExtractReasoningWithConfig is a no-op when no tag pair matches,
// so it's safe to apply unconditionally in the no-reasoning branch.
if deltaReasoning == "" && deltaContent != "" {
- deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig)
+ deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig))
}
reasoningText = deltaReasoning
responseWithoutReasoning = deltaContent
@@ -1598,7 +1598,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
cleanedResponse = deltaContent
toolCalls = deltaToolCalls
} else {
- reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig)
+ reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig))
textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig)
cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig)
toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig)
diff --git a/core/http/endpoints/openai/realtime_doubles_test.go b/core/http/endpoints/openai/realtime_doubles_test.go
index 2a54f3dbe..afb1f5e7a 100644
--- a/core/http/endpoints/openai/realtime_doubles_test.go
+++ b/core/http/endpoints/openai/realtime_doubles_test.go
@@ -2,6 +2,7 @@ package openai
import (
"context"
+ "strings"
"github.com/mudler/LocalAI/core/backend"
"github.com/mudler/LocalAI/core/config"
@@ -48,6 +49,18 @@ func (f *fakeTransport) countEvents(et types.ServerEventType) int {
return n
}
+// transcriptDeltaText concatenates the Delta of every recorded transcript
+// delta event — i.e. the text streamed to the client as it is generated.
+func (f *fakeTransport) transcriptDeltaText() string {
+ var b strings.Builder
+ for _, e := range f.events {
+ if d, ok := e.(types.ResponseOutputAudioTranscriptDeltaEvent); ok {
+ b.WriteString(d.Delta)
+ }
+ }
+ return b.String()
+}
+
// fakeModel is a configurable Model double. TTSStream replays ttsStreamChunks
// and TranscribeStream replays transcribeDeltas, so the handler's streaming
// paths can be driven deterministically.
diff --git a/core/http/endpoints/openai/realtime_stream.go b/core/http/endpoints/openai/realtime_stream.go
index 015f6850e..09526c561 100644
--- a/core/http/endpoints/openai/realtime_stream.go
+++ b/core/http/endpoints/openai/realtime_stream.go
@@ -35,6 +35,9 @@ type speechStreamer struct {
}
func newSpeechStreamer(ctx context.Context, t Transport, session *Session, responseID, itemID, thinkingStartToken string, reasoningCfg reasoning.Config) *speechStreamer {
+ // Spoken output must never contain reasoning, even when disable_thinking set
+ // DisableReasoning (which would otherwise turn the extractor's stripping off).
+ reasoningCfg = spokenReasoningConfig(reasoningCfg)
return &speechStreamer{
ctx: ctx,
t: t,
diff --git a/core/http/endpoints/openai/realtime_stream_test.go b/core/http/endpoints/openai/realtime_stream_test.go
index a6d233175..d8697c331 100644
--- a/core/http/endpoints/openai/realtime_stream_test.go
+++ b/core/http/endpoints/openai/realtime_stream_test.go
@@ -43,6 +43,32 @@ var _ = Describe("speechStreamer", func() {
Expect(audio).To(Equal([]byte{7, 7}))
})
+ It("strips leaked reasoning even when reasoning is disabled (disable_thinking safety net)", func() {
+ // disable_thinking maps to ReasoningConfig.DisableReasoning=true (it tells
+ // the backend enable_thinking=false). When the model ignores that and emits
+ // thinking anyway, the spoken stream must still not leak it: the streamer is
+ // the last line of defence and always strips reasoning from spoken content.
+ disable := true
+ session := &Session{
+ OutputSampleRate: 24000,
+ ModelInterface: &fakeModel{},
+ ModelConfig: &config.ModelConfig{}, // streaming.tts off
+ }
+ t := &fakeTransport{}
+ s := newSpeechStreamer(context.Background(), t, session, "resp1", "item1", "",
+ reasoning.Config{DisableReasoning: &disable})
+
+ s.onToken("secret plan")
+ s.onToken("The answer is 42.")
+ content, _, err := s.finish()
+
+ Expect(err).ToNot(HaveOccurred())
+ Expect(content).To(Equal("The answer is 42."))
+ Expect(content).ToNot(ContainSubstring("secret plan"))
+ // The text streamed to the client must not carry the reasoning either.
+ Expect(t.transcriptDeltaText()).ToNot(ContainSubstring("secret plan"))
+ })
+
It("does not synthesize audio when TTS streaming is disabled", func() {
m := &fakeModel{ttsStreamChunks: [][]byte{{7}}, ttsStreamRate: 24000}
session := &Session{
diff --git a/core/http/endpoints/openai/realtime_thinking.go b/core/http/endpoints/openai/realtime_thinking.go
index 41addf963..8222219af 100644
--- a/core/http/endpoints/openai/realtime_thinking.go
+++ b/core/http/endpoints/openai/realtime_thinking.go
@@ -1,6 +1,9 @@
package openai
-import "github.com/mudler/LocalAI/core/config"
+import (
+ "github.com/mudler/LocalAI/core/config"
+ "github.com/mudler/LocalAI/pkg/reasoning"
+)
// applyPipelineThinking forces the LLM's reasoning/thinking off when the realtime
// pipeline sets disable_thinking, mapping to the enable_thinking=false backend
@@ -15,3 +18,16 @@ func applyPipelineThinking(llm *config.ModelConfig, pipeline config.Pipeline) {
disable := true
llm.ReasoningConfig.DisableReasoning = &disable
}
+
+// spokenReasoningConfig adapts a model's reasoning config for stripping reasoning
+// OUT of realtime spoken output. ReasoningConfig.DisableReasoning is overloaded:
+// the backend reads it as the "enable_thinking=false" hint (which pipeline
+// disable_thinking sets via applyPipelineThinking), but the reasoning extractor
+// reads it as "skip stripping, assume there is no reasoning". Honouring the latter
+// when extracting for speech would leak raw … whenever the model
+// ignores the suppression hint. Spoken output must never contain reasoning, so we
+// always strip: clear DisableReasoning while keeping custom tokens/tag pairs.
+func spokenReasoningConfig(cfg reasoning.Config) reasoning.Config {
+ cfg.DisableReasoning = nil
+ return cfg
+}
diff --git a/core/http/endpoints/openai/realtime_thinking_test.go b/core/http/endpoints/openai/realtime_thinking_test.go
index 6a38fa86d..a056dd0e7 100644
--- a/core/http/endpoints/openai/realtime_thinking_test.go
+++ b/core/http/endpoints/openai/realtime_thinking_test.go
@@ -5,6 +5,7 @@ import (
. "github.com/onsi/gomega"
"github.com/mudler/LocalAI/core/config"
+ "github.com/mudler/LocalAI/pkg/reasoning"
)
// applyPipelineThinking lets a realtime pipeline force the LLM's thinking off
@@ -24,3 +25,26 @@ var _ = Describe("applyPipelineThinking", func() {
Expect(llm.ReasoningConfig.DisableReasoning).To(BeNil())
})
})
+
+// spokenReasoningConfig clears DisableReasoning so realtime spoken output always
+// strips reasoning, even though disable_thinking sets DisableReasoning=true on the
+// LLM config (which the backend reads as enable_thinking=false).
+var _ = Describe("spokenReasoningConfig", func() {
+ It("clears DisableReasoning so the extractor still strips leaked reasoning", func() {
+ disable := true
+ out := spokenReasoningConfig(reasoning.Config{DisableReasoning: &disable})
+ Expect(out.DisableReasoning).To(BeNil())
+ })
+
+ It("preserves the other reasoning settings", func() {
+ disable := true
+ out := spokenReasoningConfig(reasoning.Config{
+ DisableReasoning: &disable,
+ ThinkingStartTokens: []string{""},
+ TagPairs: []reasoning.TagPair{{Start: "", End: ""}},
+ })
+ Expect(out.ThinkingStartTokens).To(Equal([]string{""}))
+ Expect(out.TagPairs).To(HaveLen(1))
+ Expect(out.TagPairs[0].Start).To(Equal(""))
+ })
+})