feat(reasoning): honor per-request reasoning_effort on chat completions (#10082)

The OpenAI `reasoning_effort` field only reached the prompt template; it never toggled the backend's thinking. Map it onto ReasoningConfig.DisableReasoning (which becomes the enable_thinking gRPC metadata) in the request merge, so reasoning_effort="none" disables reasoning per request: the use case from #10072 (run a single Qwen3-style model and turn reasoning off for low-latency tasks while keeping it on for others).

Effort levels (minimal/low/medium/high) enable thinking unless the model config explicitly disabled it (reasoning.disable: true wins and is never re-enabled by a request); "none" always disables.

Closes #10072

Assisted-by: Claude:claude-opus-4-8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-07-19 12:44:10 -04:00 · 2026-05-30 00:09:07 +02:00
parent 4647770316
commit 4a2cc64d07
4 changed files with 191 additions and 1 deletions
--- a/core/backend/options_internal_test.go
+++ b/core/backend/options_internal_test.go
@@ -4,6 +4,7 @@ import (
 	"encoding/json"

 	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/pkg/reasoning"

 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
@@ -42,3 +43,35 @@ var _ = Describe("grpcModelOpts EngineArgs", func() {
 		Expect(opts.EngineArgs).To(BeEmpty())
 	})
 })
+
+// Guards the DisableReasoning -> enable_thinking metadata conversion that the
+// per-request reasoning_effort feature (issue #10072) relies on: the request
+// merge sets ReasoningConfig.DisableReasoning, and gRPCPredictOpts is where it
+// becomes the gRPC PredictOptions.Metadata the backend reads.
+var _ = Describe("gRPCPredictOpts enable_thinking metadata", func() {
+	// withReasoning builds a fully-defaulted config (gRPCPredictOpts dereferences
+	// many pointer fields) and overrides only the reasoning toggle.
+	withReasoning := func(disable *bool) config.ModelConfig {
+		cfg := config.ModelConfig{}
+		cfg.SetDefaults()
+		cfg.ReasoningConfig = reasoning.Config{DisableReasoning: disable}
+		return cfg
+	}
+	disabled := true
+	enabled := false
+
+	It("emits enable_thinking=false when reasoning is disabled", func() {
+		opts := gRPCPredictOpts(withReasoning(&disabled), "/tmp/models")
+		Expect(opts.Metadata).To(HaveKeyWithValue("enable_thinking", "false"))
+	})
+
+	It("emits enable_thinking=true when reasoning is enabled", func() {
+		opts := gRPCPredictOpts(withReasoning(&enabled), "/tmp/models")
+		Expect(opts.Metadata).To(HaveKeyWithValue("enable_thinking", "true"))
+	})
+
+	It("omits enable_thinking when reasoning is unset", func() {
+		opts := gRPCPredictOpts(withReasoning(nil), "/tmp/models")
+		Expect(opts.Metadata).ToNot(HaveKey("enable_thinking"))
+	})
+})
--- a/core/http/middleware/request.go
+++ b/core/http/middleware/request.go
@@ -310,6 +310,26 @@ func mergeOpenAIRequestAndModelConfig(config *config.ModelConfig, input *schema.
 		config.Temperature = input.Temperature
 	}

+	// Map the per-request reasoning_effort onto the reasoning toggle the
+	// backend reads (enable_thinking metadata, set in gRPCPredictOpts).
+	// "none" disables thinking for this request - the use case from #10072,
+	// running a single Qwen3-style model and turning reasoning off per
+	// request. Any explicit effort level enables thinking, UNLESS the model
+	// config explicitly disabled it (DisableReasoning==true wins): an
+	// operator who deliberately turned reasoning off should not be overridden
+	// by a request. "none" always disables, even over a config that
+	// explicitly enabled reasoning: a request can always opt out of thinking.
+	switch strings.ToLower(input.ReasoningEffort) {
+	case "none":
+		disable := true
+		config.ReasoningConfig.DisableReasoning = &disable
+	case "minimal", "low", "medium", "high":
+		if config.ReasoningConfig.DisableReasoning == nil || !*config.ReasoningConfig.DisableReasoning {
+			enable := false
+			config.ReasoningConfig.DisableReasoning = &enable
+		}
+	}
+
 	// Collapse the modern max_completion_tokens alias into the
 	// legacy Maxtokens field so downstream code reads exactly one.
 	// MaxCompletionTokens wins on conflict — it's the canonical
--- a/core/http/middleware/request_test.go
+++ b/core/http/middleware/request_test.go
@@ -597,3 +597,137 @@ var _ = Describe("SetModelAndConfig tool_choice parsing (chat completions)", fun
 		})
 	})
 })
+
+// These tests cover the per-request reasoning_effort -> enable_thinking mapping.
+// The merge lives in mergeOpenAIRequestAndModelConfig (called from
+// SetOpenAIRequest), so they drive the full middleware chain like the
+// production /v1/chat/completions route does. The block builds its own app per
+// test so the model config can be varied (some cases need reasoning.disable set
+// in the model YAML to assert that an explicit config disable wins).
+//
+// Mapping under test (issue #10072):
+//   - reasoning_effort=none                    -> DisableReasoning=true
+//   - reasoning_effort=minimal/low/medium/high -> DisableReasoning=false, UNLESS the
+//     model config explicitly set DisableReasoning=true
+//   - empty / unrecognized                     -> no change
+var _ = Describe("SetModelAndConfig reasoning_effort parsing (chat completions)", func() {
+	var modelDir string
+
+	BeforeEach(func() {
+		var err error
+		modelDir, err = os.MkdirTemp("", "localai-test-models-*")
+		Expect(err).ToNot(HaveOccurred())
+	})
+
+	AfterEach(func() {
+		_ = os.RemoveAll(modelDir)
+	})
+
+	// buildApp writes a model config with the given YAML body and returns an app
+	// plus a pointer to the captured per-request config.
+	buildApp := func(cfgYAML string) (*echo.Echo, **config.ModelConfig) {
+		Expect(os.WriteFile(filepath.Join(modelDir, "test-model.yaml"), []byte(cfgYAML), 0644)).To(Succeed())
+
+		ss := &system.SystemState{Model: system.Model{ModelsPath: modelDir}}
+		appConfig := config.NewApplicationConfig()
+		appConfig.SystemState = ss
+		mcl := config.NewModelConfigLoader(modelDir)
+		ml := model.NewModelLoader(ss)
+		re := NewRequestExtractor(mcl, ml, appConfig)
+
+		captured := new(*config.ModelConfig)
+		app := echo.New()
+		app.POST("/v1/chat/completions",
+			func(c echo.Context) error {
+				if cfg, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig); ok {
+					*captured = cfg
+				}
+				return c.String(http.StatusOK, "ok")
+			},
+			re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
+			func(next echo.HandlerFunc) echo.HandlerFunc {
+				return func(c echo.Context) error {
+					if err := re.SetOpenAIRequest(c); err != nil {
+						return err
+					}
+					return next(c)
+				}
+			},
+		)
+		return app, captured
+	}
+
+	chatReq := func(effort string) string {
+		return `{"model":"test-model",` +
+			`"messages":[{"role":"user","content":"hi"}],` +
+			`"reasoning_effort":` + effort + `}`
+	}
+
+	plainCfg := "name: test-model\nbackend: llama-cpp\n"
+
+	It("disables thinking for reasoning_effort=none", func() {
+		app, captured := buildApp(plainCfg)
+		rec := postJSON(app, "/v1/chat/completions", chatReq(`"none"`))
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeTrue())
+	})
+
+	It("enables thinking for reasoning_effort=high when config is unset", func() {
+		app, captured := buildApp(plainCfg)
+		rec := postJSON(app, "/v1/chat/completions", chatReq(`"high"`))
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeFalse())
+	})
+
+	It("enables thinking for reasoning_effort=high when config explicitly set false", func() {
+		app, captured := buildApp(plainCfg + "reasoning:\n  disable: false\n")
+		rec := postJSON(app, "/v1/chat/completions", chatReq(`"high"`))
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeFalse())
+	})
+
+	It("config wins: reasoning_effort=high cannot re-enable when config explicitly disabled", func() {
+		app, captured := buildApp(plainCfg + "reasoning:\n  disable: true\n")
+		rec := postJSON(app, "/v1/chat/completions", chatReq(`"high"`))
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeTrue())
+	})
+
+	It("is a no-op when reasoning_effort is empty", func() {
+		app, captured := buildApp(plainCfg)
+		rec := postJSON(app, "/v1/chat/completions",
+			`{"model":"test-model","messages":[{"role":"user","content":"hi"}]}`)
+
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).To(BeNil())
+	})
+
+	It("is case-insensitive (None disables, HIGH enables)", func() {
+		app, captured := buildApp(plainCfg)
+		rec := postJSON(app, "/v1/chat/completions", chatReq(`"None"`))
+		Expect(rec.Code).To(Equal(http.StatusOK))
+		Expect(*captured).ToNot(BeNil())
+		Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeTrue())
+
+		app2, captured2 := buildApp(plainCfg)
+		rec2 := postJSON(app2, "/v1/chat/completions", chatReq(`"HIGH"`))
+		Expect(rec2.Code).To(Equal(http.StatusOK))
+		Expect(*captured2).ToNot(BeNil())
+		Expect((*captured2).ReasoningConfig.DisableReasoning).ToNot(BeNil())
+		Expect(*(*captured2).ReasoningConfig.DisableReasoning).To(BeFalse())
+	})
+})
--- a/docs/content/advanced/model-configuration.md
+++ b/docs/content/advanced/model-configuration.md
@@ -412,7 +412,10 @@ These load-time options control how the backend parses `<think>` reasoning block
 | `prefill_assistant` | bool | `true` | When `false`, the trailing assistant message is not pre-filled by the chat template. |

 {{% notice note %}}
-This is the load-time reasoning configuration. The orthogonal per-request `enable_thinking` chat-template kwarg (set via the YAML `reasoning.disable` field) toggles thinking on/off per call without restarting the model.
+This is the load-time reasoning configuration. The orthogonal per-request `enable_thinking` chat-template kwarg toggles thinking on/off per call without restarting the model. It can be driven either by the YAML `reasoning.disable` field (model default) or per request via the OpenAI `reasoning_effort` field on `/v1/chat/completions`:
+
+- `reasoning_effort: "none"` disables thinking for that request (`enable_thinking=false`) - useful to run a single reasoning model like Qwen3 for low-latency tasks while still enabling reasoning on other requests.
+- `reasoning_effort: "minimal" | "low" | "medium" | "high"` enables thinking, unless the model config explicitly set `reasoning.disable: true` (an operator's explicit disable wins and is never re-enabled by a request).
 {{% /notice %}}

 ### Multimodal Backend Options