mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-06 07:46:15 -04:00
fix(realtime): always strip reasoning from spoken output
disable_thinking maps to ReasoningConfig.DisableReasoning=true on the LLM config, which the backend reads as enable_thinking=false. But the realtime handler reads that SAME config to drive reasoning extraction, and there DisableReasoning=true means "skip stripping". PredictConfig() returns this LLM config, so both the streamed (speechStreamer) and buffered realtime paths stopped stripping <think>…</think> exactly when disable_thinking was on — leaking raw reasoning to the client whenever the model ignored the enable_thinking hint (e.g. lfm2.5). Add spokenReasoningConfig() which clears DisableReasoning for extraction (keeping custom tokens/tag pairs) and route both realtime paths through it. Spoken output now always strips reasoning, independent of the backend suppression hint. Assisted-by: Claude:claude-opus-4-8 go test, golangci-lint Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -1590,7 +1590,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
|
||||
// ExtractReasoningWithConfig is a no-op when no tag pair matches,
|
||||
// so it's safe to apply unconditionally in the no-reasoning branch.
|
||||
if deltaReasoning == "" && deltaContent != "" {
|
||||
deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig)
|
||||
deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig))
|
||||
}
|
||||
reasoningText = deltaReasoning
|
||||
responseWithoutReasoning = deltaContent
|
||||
@@ -1598,7 +1598,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
|
||||
cleanedResponse = deltaContent
|
||||
toolCalls = deltaToolCalls
|
||||
} else {
|
||||
reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig)
|
||||
reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, spokenReasoningConfig(config.ReasoningConfig))
|
||||
textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig)
|
||||
cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig)
|
||||
toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig)
|
||||
|
||||
@@ -2,6 +2,7 @@ package openai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/LocalAI/core/backend"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
@@ -48,6 +49,18 @@ func (f *fakeTransport) countEvents(et types.ServerEventType) int {
|
||||
return n
|
||||
}
|
||||
|
||||
// transcriptDeltaText concatenates the Delta of every recorded transcript
|
||||
// delta event — i.e. the text streamed to the client as it is generated.
|
||||
func (f *fakeTransport) transcriptDeltaText() string {
|
||||
var b strings.Builder
|
||||
for _, e := range f.events {
|
||||
if d, ok := e.(types.ResponseOutputAudioTranscriptDeltaEvent); ok {
|
||||
b.WriteString(d.Delta)
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// fakeModel is a configurable Model double. TTSStream replays ttsStreamChunks
|
||||
// and TranscribeStream replays transcribeDeltas, so the handler's streaming
|
||||
// paths can be driven deterministically.
|
||||
|
||||
@@ -35,6 +35,9 @@ type speechStreamer struct {
|
||||
}
|
||||
|
||||
func newSpeechStreamer(ctx context.Context, t Transport, session *Session, responseID, itemID, thinkingStartToken string, reasoningCfg reasoning.Config) *speechStreamer {
|
||||
// Spoken output must never contain reasoning, even when disable_thinking set
|
||||
// DisableReasoning (which would otherwise turn the extractor's stripping off).
|
||||
reasoningCfg = spokenReasoningConfig(reasoningCfg)
|
||||
return &speechStreamer{
|
||||
ctx: ctx,
|
||||
t: t,
|
||||
|
||||
@@ -43,6 +43,32 @@ var _ = Describe("speechStreamer", func() {
|
||||
Expect(audio).To(Equal([]byte{7, 7}))
|
||||
})
|
||||
|
||||
It("strips leaked reasoning even when reasoning is disabled (disable_thinking safety net)", func() {
|
||||
// disable_thinking maps to ReasoningConfig.DisableReasoning=true (it tells
|
||||
// the backend enable_thinking=false). When the model ignores that and emits
|
||||
// thinking anyway, the spoken stream must still not leak it: the streamer is
|
||||
// the last line of defence and always strips reasoning from spoken content.
|
||||
disable := true
|
||||
session := &Session{
|
||||
OutputSampleRate: 24000,
|
||||
ModelInterface: &fakeModel{},
|
||||
ModelConfig: &config.ModelConfig{}, // streaming.tts off
|
||||
}
|
||||
t := &fakeTransport{}
|
||||
s := newSpeechStreamer(context.Background(), t, session, "resp1", "item1", "",
|
||||
reasoning.Config{DisableReasoning: &disable})
|
||||
|
||||
s.onToken("<think>secret plan</think>")
|
||||
s.onToken("The answer is 42.")
|
||||
content, _, err := s.finish()
|
||||
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(content).To(Equal("The answer is 42."))
|
||||
Expect(content).ToNot(ContainSubstring("secret plan"))
|
||||
// The text streamed to the client must not carry the reasoning either.
|
||||
Expect(t.transcriptDeltaText()).ToNot(ContainSubstring("secret plan"))
|
||||
})
|
||||
|
||||
It("does not synthesize audio when TTS streaming is disabled", func() {
|
||||
m := &fakeModel{ttsStreamChunks: [][]byte{{7}}, ttsStreamRate: 24000}
|
||||
session := &Session{
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
package openai
|
||||
|
||||
import "github.com/mudler/LocalAI/core/config"
|
||||
import (
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/pkg/reasoning"
|
||||
)
|
||||
|
||||
// applyPipelineThinking forces the LLM's reasoning/thinking off when the realtime
|
||||
// pipeline sets disable_thinking, mapping to the enable_thinking=false backend
|
||||
@@ -15,3 +18,16 @@ func applyPipelineThinking(llm *config.ModelConfig, pipeline config.Pipeline) {
|
||||
disable := true
|
||||
llm.ReasoningConfig.DisableReasoning = &disable
|
||||
}
|
||||
|
||||
// spokenReasoningConfig adapts a model's reasoning config for stripping reasoning
|
||||
// OUT of realtime spoken output. ReasoningConfig.DisableReasoning is overloaded:
|
||||
// the backend reads it as the "enable_thinking=false" hint (which pipeline
|
||||
// disable_thinking sets via applyPipelineThinking), but the reasoning extractor
|
||||
// reads it as "skip stripping, assume there is no reasoning". Honouring the latter
|
||||
// when extracting for speech would leak raw <think>…</think> whenever the model
|
||||
// ignores the suppression hint. Spoken output must never contain reasoning, so we
|
||||
// always strip: clear DisableReasoning while keeping custom tokens/tag pairs.
|
||||
func spokenReasoningConfig(cfg reasoning.Config) reasoning.Config {
|
||||
cfg.DisableReasoning = nil
|
||||
return cfg
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
. "github.com/onsi/gomega"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/pkg/reasoning"
|
||||
)
|
||||
|
||||
// applyPipelineThinking lets a realtime pipeline force the LLM's thinking off
|
||||
@@ -24,3 +25,26 @@ var _ = Describe("applyPipelineThinking", func() {
|
||||
Expect(llm.ReasoningConfig.DisableReasoning).To(BeNil())
|
||||
})
|
||||
})
|
||||
|
||||
// spokenReasoningConfig clears DisableReasoning so realtime spoken output always
|
||||
// strips reasoning, even though disable_thinking sets DisableReasoning=true on the
|
||||
// LLM config (which the backend reads as enable_thinking=false).
|
||||
var _ = Describe("spokenReasoningConfig", func() {
|
||||
It("clears DisableReasoning so the extractor still strips leaked reasoning", func() {
|
||||
disable := true
|
||||
out := spokenReasoningConfig(reasoning.Config{DisableReasoning: &disable})
|
||||
Expect(out.DisableReasoning).To(BeNil())
|
||||
})
|
||||
|
||||
It("preserves the other reasoning settings", func() {
|
||||
disable := true
|
||||
out := spokenReasoningConfig(reasoning.Config{
|
||||
DisableReasoning: &disable,
|
||||
ThinkingStartTokens: []string{"<reason>"},
|
||||
TagPairs: []reasoning.TagPair{{Start: "<reason>", End: "</reason>"}},
|
||||
})
|
||||
Expect(out.ThinkingStartTokens).To(Equal([]string{"<reason>"}))
|
||||
Expect(out.TagPairs).To(HaveLen(1))
|
||||
Expect(out.TagPairs[0].Start).To(Equal("<reason>"))
|
||||
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user