fix(realtime): always strip reasoning from spoken output

disable_thinking maps to ReasoningConfig.DisableReasoning=true on the LLM
config, which the backend reads as enable_thinking=false. But the realtime
handler reads that SAME config to drive reasoning extraction, and there
DisableReasoning=true means "skip stripping". PredictConfig() returns this
LLM config, so both the streamed (speechStreamer) and buffered realtime
paths stopped stripping <think>…</think> exactly when disable_thinking was
on — leaking raw reasoning to the client whenever the model ignored the
enable_thinking hint (e.g. lfm2.5).

Add spokenReasoningConfig(), which clears DisableReasoning for extraction
(keeping custom tokens/tag pairs), and route both realtime paths through it.
Spoken output now always strips reasoning, independent of the backend
suppression hint.

Assisted-by: Claude:claude-opus-4-8 go test, golangci-lint
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-06-04 23:18:37 +00:00
parent f48344f2ff
commit cb3609530a
6 changed files with 85 additions and 3 deletions

View File

@@ -1590,7 +1590,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
// ExtractReasoningWithConfig is a no-op when no tag pair matches,
// so it's safe to apply unconditionally in the no-reasoning branch.
if deltaReasoning == "" && deltaContent != "" {
deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig)
deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig))
}
reasoningText = deltaReasoning
responseWithoutReasoning = deltaContent
@@ -1598,7 +1598,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
cleanedResponse = deltaContent
toolCalls = deltaToolCalls
} else {
reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig)
reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig))
textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig)
cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig)
toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig)

View File

@@ -2,6 +2,7 @@ package openai
import (
"context"
"strings"
"github.com/mudler/LocalAI/core/backend"
"github.com/mudler/LocalAI/core/config"
@@ -48,6 +49,18 @@ func (f *fakeTransport) countEvents(et types.ServerEventType) int {
return n
}
// transcriptDeltaText returns, joined in recording order, the Delta text of
// every recorded event whose dynamic type is
// types.ResponseOutputAudioTranscriptDeltaEvent. Events of any other type are
// skipped, so the result is the transcript text as it was streamed to the
// client.
//
// NOTE(review): f.events is read here without any visible synchronization —
// confirm the transport is quiescent when tests call this.
func (f *fakeTransport) transcriptDeltaText() string {
	var streamed strings.Builder
	for _, ev := range f.events {
		delta, isDelta := ev.(types.ResponseOutputAudioTranscriptDeltaEvent)
		if !isDelta {
			continue
		}
		streamed.WriteString(delta.Delta)
	}
	return streamed.String()
}
// fakeModel is a configurable Model double. TTSStream replays ttsStreamChunks
// and TranscribeStream replays transcribeDeltas, so the handler's streaming
// paths can be driven deterministically.

View File

@@ -35,6 +35,9 @@ type speechStreamer struct {
}
func newSpeechStreamer(ctx context.Context, t Transport, session *Session, responseID, itemID, thinkingStartToken string, reasoningCfg reasoning.Config) *speechStreamer {
// Spoken output must never contain reasoning, even when disable_thinking set
// DisableReasoning (which would otherwise turn the extractor's stripping off).
reasoningCfg = spokenReasoningConfig(reasoningCfg)
return &speechStreamer{
ctx: ctx,
t: t,

View File

@@ -43,6 +43,32 @@ var _ = Describe("speechStreamer", func() {
Expect(audio).To(Equal([]byte{7, 7}))
})
It("strips leaked reasoning even when reasoning is disabled (disable_thinking safety net)", func() {
	// Pipeline disable_thinking becomes ReasoningConfig.DisableReasoning=true,
	// which asks the backend for enable_thinking=false. A model may ignore that
	// request and emit thinking regardless; the streamer is the final guard and
	// has to keep that reasoning out of the spoken content anyway.
	reasoningOff := true
	cfg := reasoning.Config{DisableReasoning: &reasoningOff}

	transport := &fakeTransport{}
	sess := &Session{
		OutputSampleRate: 24000,
		ModelInterface:   &fakeModel{},
		ModelConfig:      &config.ModelConfig{}, // streaming.tts off
	}
	streamer := newSpeechStreamer(context.Background(), transport, sess, "resp1", "item1", "", cfg)

	// Feed a reasoning block first, then the actual answer, as separate tokens.
	for _, tok := range []string{"<think>secret plan</think>", "The answer is 42."} {
		streamer.onToken(tok)
	}

	spoken, _, err := streamer.finish()
	Expect(err).ToNot(HaveOccurred())
	Expect(spoken).To(Equal("The answer is 42."))
	Expect(spoken).ToNot(ContainSubstring("secret plan"))

	// Neither may the transcript deltas already pushed to the client contain it.
	Expect(transport.transcriptDeltaText()).ToNot(ContainSubstring("secret plan"))
})
It("does not synthesize audio when TTS streaming is disabled", func() {
m := &fakeModel{ttsStreamChunks: [][]byte{{7}}, ttsStreamRate: 24000}
session := &Session{

View File

@@ -1,6 +1,9 @@
package openai
import "github.com/mudler/LocalAI/core/config"
import (
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/reasoning"
)
// applyPipelineThinking forces the LLM's reasoning/thinking off when the realtime
// pipeline sets disable_thinking, mapping to the enable_thinking=false backend
@@ -15,3 +18,16 @@ func applyPipelineThinking(llm *config.ModelConfig, pipeline config.Pipeline) {
disable := true
llm.ReasoningConfig.DisableReasoning = &disable
}
// spokenReasoningConfig returns a copy of cfg to be used when stripping
// reasoning out of realtime spoken output: DisableReasoning is reset to nil and
// every other field (custom tokens, tag pairs, …) is carried over as-is.
//
// The reset exists because DisableReasoning has two readers with different
// meanings. The backend treats it as the "enable_thinking=false" hint, which
// applyPipelineThinking sets for pipeline disable_thinking. The reasoning
// extractor treats it as "skip stripping, assume there is no reasoning". If the
// extractor honoured the flag for speech, a model that ignores the suppression
// hint would have its raw <think>…</think> leak into the spoken output, so for
// speech the flag is always cleared and stripping always runs.
func spokenReasoningConfig(cfg reasoning.Config) reasoning.Config {
	// cfg arrived by value, so the caller's struct is untouched; slice fields
	// still share their backing storage with the caller's config.
	forSpeech := cfg
	forSpeech.DisableReasoning = nil
	return forSpeech
}

View File

@@ -5,6 +5,7 @@ import (
. "github.com/onsi/gomega"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/reasoning"
)
// applyPipelineThinking lets a realtime pipeline force the LLM's thinking off
@@ -24,3 +25,26 @@ var _ = Describe("applyPipelineThinking", func() {
Expect(llm.ReasoningConfig.DisableReasoning).To(BeNil())
})
})
// spokenReasoningConfig must hand the extractor a config with DisableReasoning
// cleared, so realtime spoken output is always stripped of reasoning even when
// disable_thinking has set DisableReasoning=true on the LLM config (the value
// the backend reads as enable_thinking=false).
var _ = Describe("spokenReasoningConfig", func() {
	It("clears DisableReasoning so the extractor still strips leaked reasoning", func() {
		reasoningOff := true
		in := reasoning.Config{DisableReasoning: &reasoningOff}

		got := spokenReasoningConfig(in)

		Expect(got.DisableReasoning).To(BeNil())
	})

	It("preserves the other reasoning settings", func() {
		reasoningOff := true
		in := reasoning.Config{
			DisableReasoning:    &reasoningOff,
			ThinkingStartTokens: []string{"<reason>"},
			TagPairs:            []reasoning.TagPair{{Start: "<reason>", End: "</reason>"}},
		}

		got := spokenReasoningConfig(in)

		// Custom start tokens and tag pairs must survive the adaptation.
		Expect(got.ThinkingStartTokens).To(Equal([]string{"<reason>"}))
		Expect(got.TagPairs).To(HaveLen(1))
		Expect(got.TagPairs[0].Start).To(Equal("<reason>"))
	})
})