diff --git a/core/http/endpoints/openai/realtime.go b/core/http/endpoints/openai/realtime.go index 078cf4a5b..bc2a80785 100644 --- a/core/http/endpoints/openai/realtime.go +++ b/core/http/endpoints/openai/realtime.go @@ -1590,7 +1590,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa // ExtractReasoningWithConfig is a no-op when no tag pair matches, // so it's safe to apply unconditionally in the no-reasoning branch. if deltaReasoning == "" && deltaContent != "" { - deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig) + deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig)) } reasoningText = deltaReasoning responseWithoutReasoning = deltaContent @@ -1598,7 +1598,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa cleanedResponse = deltaContent toolCalls = deltaToolCalls } else { - reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig) + reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig)) textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig) cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig) toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig) diff --git a/core/http/endpoints/openai/realtime_doubles_test.go b/core/http/endpoints/openai/realtime_doubles_test.go index 2a54f3dbe..afb1f5e7a 100644 --- a/core/http/endpoints/openai/realtime_doubles_test.go +++ b/core/http/endpoints/openai/realtime_doubles_test.go @@ -2,6 +2,7 @@ package openai import ( "context" + "strings" "github.com/mudler/LocalAI/core/backend" 
"github.com/mudler/LocalAI/core/config" @@ -48,6 +49,18 @@ func (f *fakeTransport) countEvents(et types.ServerEventType) int { return n } +// transcriptDeltaText concatenates the Delta of every recorded transcript +// delta event — i.e. the text streamed to the client as it is generated. +func (f *fakeTransport) transcriptDeltaText() string { + var b strings.Builder + for _, e := range f.events { + if d, ok := e.(types.ResponseOutputAudioTranscriptDeltaEvent); ok { + b.WriteString(d.Delta) + } + } + return b.String() +} + // fakeModel is a configurable Model double. TTSStream replays ttsStreamChunks // and TranscribeStream replays transcribeDeltas, so the handler's streaming // paths can be driven deterministically. diff --git a/core/http/endpoints/openai/realtime_stream.go b/core/http/endpoints/openai/realtime_stream.go index 015f6850e..09526c561 100644 --- a/core/http/endpoints/openai/realtime_stream.go +++ b/core/http/endpoints/openai/realtime_stream.go @@ -35,6 +35,9 @@ type speechStreamer struct { } func newSpeechStreamer(ctx context.Context, t Transport, session *Session, responseID, itemID, thinkingStartToken string, reasoningCfg reasoning.Config) *speechStreamer { + // Spoken output must never contain reasoning, even when disable_thinking set + // DisableReasoning (which would otherwise turn the extractor's stripping off). 
+ reasoningCfg = spokenReasoningConfig(reasoningCfg) return &speechStreamer{ ctx: ctx, t: t, diff --git a/core/http/endpoints/openai/realtime_stream_test.go b/core/http/endpoints/openai/realtime_stream_test.go index a6d233175..d8697c331 100644 --- a/core/http/endpoints/openai/realtime_stream_test.go +++ b/core/http/endpoints/openai/realtime_stream_test.go @@ -43,6 +43,32 @@ var _ = Describe("speechStreamer", func() { Expect(audio).To(Equal([]byte{7, 7})) }) + It("strips leaked reasoning even when reasoning is disabled (disable_thinking safety net)", func() { + // disable_thinking maps to ReasoningConfig.DisableReasoning=true (it tells + // the backend enable_thinking=false). When the model ignores that and emits + // thinking anyway, the spoken stream must still not leak it: the streamer is + // the last line of defence and always strips reasoning from spoken content. + disable := true + session := &Session{ + OutputSampleRate: 24000, + ModelInterface: &fakeModel{}, + ModelConfig: &config.ModelConfig{}, // streaming.tts off + } + t := &fakeTransport{} + s := newSpeechStreamer(context.Background(), t, session, "resp1", "item1", "", + reasoning.Config{DisableReasoning: &disable}) + + s.onToken("<think>secret plan</think>") + s.onToken("The answer is 42.") + content, _, err := s.finish() + + Expect(err).ToNot(HaveOccurred()) + Expect(content).To(Equal("The answer is 42.")) + Expect(content).ToNot(ContainSubstring("secret plan")) + // The text streamed to the client must not carry the reasoning either. 
+ Expect(t.transcriptDeltaText()).ToNot(ContainSubstring("secret plan")) + }) + It("does not synthesize audio when TTS streaming is disabled", func() { m := &fakeModel{ttsStreamChunks: [][]byte{{7}}, ttsStreamRate: 24000} session := &Session{ diff --git a/core/http/endpoints/openai/realtime_thinking.go b/core/http/endpoints/openai/realtime_thinking.go index 41addf963..8222219af 100644 --- a/core/http/endpoints/openai/realtime_thinking.go +++ b/core/http/endpoints/openai/realtime_thinking.go @@ -1,6 +1,9 @@ package openai -import "github.com/mudler/LocalAI/core/config" +import ( + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/pkg/reasoning" +) // applyPipelineThinking forces the LLM's reasoning/thinking off when the realtime // pipeline sets disable_thinking, mapping to the enable_thinking=false backend @@ -15,3 +18,16 @@ func applyPipelineThinking(llm *config.ModelConfig, pipeline config.Pipeline) { disable := true llm.ReasoningConfig.DisableReasoning = &disable } + +// spokenReasoningConfig adapts a model's reasoning config for stripping reasoning +// OUT of realtime spoken output. ReasoningConfig.DisableReasoning is overloaded: +// the backend reads it as the "enable_thinking=false" hint (which pipeline +// disable_thinking sets via applyPipelineThinking), but the reasoning extractor +// reads it as "skip stripping, assume there is no reasoning". Honouring the latter +// when extracting for speech would leak raw reasoning whenever the model +// ignores the suppression hint. Spoken output must never contain reasoning, so we +// always strip: clear DisableReasoning while keeping custom tokens/tag pairs. 
+func spokenReasoningConfig(cfg reasoning.Config) reasoning.Config { + cfg.DisableReasoning = nil + return cfg +} diff --git a/core/http/endpoints/openai/realtime_thinking_test.go b/core/http/endpoints/openai/realtime_thinking_test.go index 6a38fa86d..a056dd0e7 100644 --- a/core/http/endpoints/openai/realtime_thinking_test.go +++ b/core/http/endpoints/openai/realtime_thinking_test.go @@ -5,6 +5,7 @@ import ( . "github.com/onsi/gomega" "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/pkg/reasoning" ) // applyPipelineThinking lets a realtime pipeline force the LLM's thinking off @@ -24,3 +25,26 @@ var _ = Describe("applyPipelineThinking", func() { Expect(llm.ReasoningConfig.DisableReasoning).To(BeNil()) }) }) + +// spokenReasoningConfig clears DisableReasoning so realtime spoken output always +// strips reasoning, even though disable_thinking sets DisableReasoning=true on the +// LLM config (which the backend reads as enable_thinking=false). +var _ = Describe("spokenReasoningConfig", func() { + It("clears DisableReasoning so the extractor still strips leaked reasoning", func() { + disable := true + out := spokenReasoningConfig(reasoning.Config{DisableReasoning: &disable}) + Expect(out.DisableReasoning).To(BeNil()) + }) + + It("preserves the other reasoning settings", func() { + disable := true + out := spokenReasoningConfig(reasoning.Config{ + DisableReasoning: &disable, + ThinkingStartTokens: []string{"<think>"}, + TagPairs: []reasoning.TagPair{{Start: "<think>", End: "</think>"}}, + }) + Expect(out.ThinkingStartTokens).To(Equal([]string{"<think>"})) + Expect(out.TagPairs).To(HaveLen(1)) + Expect(out.TagPairs[0].Start).To(Equal("<think>")) + }) +})