From 685e4632d7ef96c7419e7d818380e781da8d97a5 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Thu, 4 Jun 2026 16:25:53 +0000
Subject: [PATCH] feat(realtime): pipeline disable_thinking maps to
 enable_thinking off

applyPipelineThinking forces the LLM's ReasoningConfig.DisableReasoning when
pipeline.disable_thinking is set, which gRPCPredictOpts turns into the
enable_thinking=false backend metadata. Applied at newModel construction on
the per-session LLM config copy, so it doesn't leak to other model users and
needs no realtime-specific request plumbing.

Assisted-by: Claude:claude-opus-4-8 go vet
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/http/endpoints/openai/realtime_model.go  |  4 ++-
 .../endpoints/openai/realtime_thinking.go     | 17 ++++++++++++
 .../openai/realtime_thinking_test.go          | 26 +++++++++++++++++++
 3 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 core/http/endpoints/openai/realtime_thinking.go
 create mode 100644 core/http/endpoints/openai/realtime_thinking_test.go

diff --git a/core/http/endpoints/openai/realtime_model.go b/core/http/endpoints/openai/realtime_model.go
index b18439340..8281197a3 100644
--- a/core/http/endpoints/openai/realtime_model.go
+++ b/core/http/endpoints/openai/realtime_model.go
@@ -528,8 +528,10 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model
 		return nil, fmt.Errorf("failed to validate config: %w", err)
 	}
 
-	// Let the pipeline set the LLM's reasoning effort (cfgLLM is a per-session copy).
+	// Let the pipeline set the LLM's reasoning effort and force thinking off
+	// (cfgLLM is a per-session copy). disable_thinking applies after the effort.
 	applyPipelineReasoning(cfgLLM, *pipeline)
+	applyPipelineThinking(cfgLLM, *pipeline)
 
 	cfgTTS, err := cl.LoadModelConfigFileByName(pipeline.TTS, ml.ModelPath)
 	if err != nil {
diff --git a/core/http/endpoints/openai/realtime_thinking.go b/core/http/endpoints/openai/realtime_thinking.go
new file mode 100644
index 000000000..41addf963
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_thinking.go
@@ -0,0 +1,17 @@
+package openai
+
+import "github.com/mudler/LocalAI/core/config"
+
+// applyPipelineThinking turns the LLM's reasoning/thinking off for a realtime
+// session whose pipeline sets disable_thinking. It does so by pointing
+// ReasoningConfig.DisableReasoning at true, which reaches the backend as
+// enable_thinking=false metadata. Callers pass the per-session copy of the LLM
+// config, so other users of the same model are unaffected. A nil config, or a
+// pipeline without disable_thinking, results in no change.
+func applyPipelineThinking(llm *config.ModelConfig, pipeline config.Pipeline) {
+	if llm != nil && pipeline.ThinkingDisabled() {
+		// DisableReasoning is a pointer field, so it needs an addressable true.
+		off := true
+		llm.ReasoningConfig.DisableReasoning = &off
+	}
+}
diff --git a/core/http/endpoints/openai/realtime_thinking_test.go b/core/http/endpoints/openai/realtime_thinking_test.go
new file mode 100644
index 000000000..6a38fa86d
--- /dev/null
+++ b/core/http/endpoints/openai/realtime_thinking_test.go
@@ -0,0 +1,26 @@
+package openai
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	"github.com/mudler/LocalAI/core/config"
+)
+
+// These specs pin applyPipelineThinking: a pipeline with disable_thinking set
+// turns reasoning off on the LLM config, and one without it changes nothing.
+var _ = Describe("applyPipelineThinking", func() {
+	It("disables reasoning on the LLM config when the pipeline disables thinking", func() {
+		off := true
+		cfg := new(config.ModelConfig)
+		applyPipelineThinking(cfg, config.Pipeline{DisableThinking: &off})
+		Expect(cfg.ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*cfg.ReasoningConfig.DisableReasoning).To(BeTrue())
+	})
+
+	It("leaves the LLM config untouched when the pipeline does not set disable_thinking", func() {
+		cfg := new(config.ModelConfig)
+		applyPipelineThinking(cfg, config.Pipeline{})
+		Expect(cfg.ReasoningConfig.DisableReasoning).To(BeNil())
+	})
+})