From 4a2cc64d07fa8834ceec8957fd337bec644f4b00 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 30 May 2026 00:09:07 +0200 Subject: [PATCH] feat(reasoning): honor per-request reasoning_effort on chat completions (#10082) The OpenAI `reasoning_effort` field only reached the prompt template; it never toggled the backend's thinking. Map it onto ReasoningConfig.DisableReasoning (which becomes the enable_thinking gRPC metadata) in the request merge, so reasoning_effort="none" disables reasoning per request: the use case from #10072 (run a single Qwen3-style model and turn reasoning off for low-latency tasks while keeping it on for others). Effort levels (minimal/low/medium/high) enable thinking unless the model config explicitly disabled it (reasoning.disable: true wins and is never re-enabled by a request); "none" always disables. Closes #10072 Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- core/backend/options_internal_test.go | 33 +++++ core/http/middleware/request.go | 20 +++ core/http/middleware/request_test.go | 134 +++++++++++++++++++ docs/content/advanced/model-configuration.md | 5 +- 4 files changed, 191 insertions(+), 1 deletion(-) diff --git a/core/backend/options_internal_test.go b/core/backend/options_internal_test.go index bdce828b3..af5d59992 100644 --- a/core/backend/options_internal_test.go +++ b/core/backend/options_internal_test.go @@ -4,6 +4,7 @@ import ( "encoding/json" "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/pkg/reasoning" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" @@ -42,3 +43,35 @@ var _ = Describe("grpcModelOpts EngineArgs", func() { Expect(opts.EngineArgs).To(BeEmpty()) }) }) + +// Guards the DisableReasoning -> enable_thinking metadata conversion that the +// per-request reasoning_effort feature (issue #10072) relies on: the request +// merge sets ReasoningConfig.DisableReasoning, and gRPCPredictOpts is where it +// becomes the gRPC PredictOptions.Metadata the backend reads. +var _ = Describe("gRPCPredictOpts enable_thinking metadata", func() { + // withReasoning builds a fully-defaulted config (gRPCPredictOpts dereferences + // many pointer fields) and overrides only the reasoning toggle. + withReasoning := func(disable *bool) config.ModelConfig { + cfg := config.ModelConfig{} + cfg.SetDefaults() + cfg.ReasoningConfig = reasoning.Config{DisableReasoning: disable} + return cfg + } + disabled := true + enabled := false + + It("emits enable_thinking=false when reasoning is disabled", func() { + opts := gRPCPredictOpts(withReasoning(&disabled), "/tmp/models") + Expect(opts.Metadata).To(HaveKeyWithValue("enable_thinking", "false")) + }) + + It("emits enable_thinking=true when reasoning is enabled", func() { + opts := gRPCPredictOpts(withReasoning(&enabled), "/tmp/models") + Expect(opts.Metadata).To(HaveKeyWithValue("enable_thinking", "true")) + }) + + It("omits enable_thinking when reasoning is unset", func() { + opts := gRPCPredictOpts(withReasoning(nil), "/tmp/models") + Expect(opts.Metadata).ToNot(HaveKey("enable_thinking")) + }) +}) diff --git a/core/http/middleware/request.go b/core/http/middleware/request.go index 7f3df885e..3e6f3555a 100644 --- a/core/http/middleware/request.go +++ b/core/http/middleware/request.go @@ -310,6 +310,26 @@ func mergeOpenAIRequestAndModelConfig(config *config.ModelConfig, input *schema. 
config.Temperature = input.Temperature } + // Map the per-request reasoning_effort onto the reasoning toggle the + // backend reads (enable_thinking metadata, set in gRPCPredictOpts). + // "none" disables thinking for this request - the use case from #10072, + // running a single Qwen3-style model and turning reasoning off per + // request. Any explicit effort level enables thinking, UNLESS the model + // config explicitly disabled it (DisableReasoning==true wins): an + // operator who deliberately turned reasoning off should not be overridden + // by a request. A value of "none" always disables, since that never + // conflicts with a config that also disables. + switch strings.ToLower(input.ReasoningEffort) { + case "none": + disable := true + config.ReasoningConfig.DisableReasoning = &disable + case "minimal", "low", "medium", "high": + if config.ReasoningConfig.DisableReasoning == nil || !*config.ReasoningConfig.DisableReasoning { + enable := false + config.ReasoningConfig.DisableReasoning = &enable + } + } + // Collapse the modern max_completion_tokens alias into the // legacy Maxtokens field so downstream code reads exactly one. // MaxCompletionTokens wins on conflict — it's the canonical diff --git a/core/http/middleware/request_test.go b/core/http/middleware/request_test.go index cc4e8199e..04c30dee4 100644 --- a/core/http/middleware/request_test.go +++ b/core/http/middleware/request_test.go @@ -597,3 +597,137 @@ var _ = Describe("SetModelAndConfig tool_choice parsing (chat completions)", fun }) }) }) + +// These tests cover the per-request reasoning_effort -> enable_thinking mapping. +// The merge lives in mergeOpenAIRequestAndModelConfig (called from +// SetOpenAIRequest), so they drive the full middleware chain like the +// production /v1/chat/completions route does. The block builds its own app per +// test so the model config can be varied (some cases need reasoning.disable set +// in the model YAML to assert that an explicit config disable wins). 
+// +// Mapping under test (issue #10072): +// - reasoning_effort=none -> DisableReasoning=true +// - reasoning_effort=minimal/low/medium/high -> DisableReasoning=false, UNLESS the +// model config explicitly set true +// - empty / unrecognized -> no change +var _ = Describe("SetModelAndConfig reasoning_effort parsing (chat completions)", func() { + var modelDir string + + BeforeEach(func() { + var err error + modelDir, err = os.MkdirTemp("", "localai-test-models-*") + Expect(err).ToNot(HaveOccurred()) + }) + + AfterEach(func() { + _ = os.RemoveAll(modelDir) + }) + + // buildApp writes a model config with the given YAML body and returns an app + // plus a pointer to the captured per-request config. + buildApp := func(cfgYAML string) (*echo.Echo, **config.ModelConfig) { + Expect(os.WriteFile(filepath.Join(modelDir, "test-model.yaml"), []byte(cfgYAML), 0644)).To(Succeed()) + + ss := &system.SystemState{Model: system.Model{ModelsPath: modelDir}} + appConfig := config.NewApplicationConfig() + appConfig.SystemState = ss + mcl := config.NewModelConfigLoader(modelDir) + ml := model.NewModelLoader(ss) + re := NewRequestExtractor(mcl, ml, appConfig) + + captured := new(*config.ModelConfig) + app := echo.New() + app.POST("/v1/chat/completions", + func(c echo.Context) error { + if cfg, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig); ok { + *captured = cfg + } + return c.String(http.StatusOK, "ok") + }, + re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }), + func(next echo.HandlerFunc) echo.HandlerFunc { + return func(c echo.Context) error { + if err := re.SetOpenAIRequest(c); err != nil { + return err + } + return next(c) + } + }, + ) + return app, captured + } + + chatReq := func(effort string) string { + return `{"model":"test-model",` + + `"messages":[{"role":"user","content":"hi"}],` + + `"reasoning_effort":` + effort + `}` + } + + plainCfg := "name: test-model\nbackend: llama-cpp\n" + + It("disables thinking for 
reasoning_effort=none", func() { + app, captured := buildApp(plainCfg) + rec := postJSON(app, "/v1/chat/completions", chatReq(`"none"`)) + + Expect(rec.Code).To(Equal(http.StatusOK)) + Expect(*captured).ToNot(BeNil()) + Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeTrue()) + }) + + It("enables thinking for reasoning_effort=high when config is unset", func() { + app, captured := buildApp(plainCfg) + rec := postJSON(app, "/v1/chat/completions", chatReq(`"high"`)) + + Expect(rec.Code).To(Equal(http.StatusOK)) + Expect(*captured).ToNot(BeNil()) + Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeFalse()) + }) + + It("enables thinking for reasoning_effort=high when config explicitly set false", func() { + app, captured := buildApp(plainCfg + "reasoning:\n disable: false\n") + rec := postJSON(app, "/v1/chat/completions", chatReq(`"high"`)) + + Expect(rec.Code).To(Equal(http.StatusOK)) + Expect(*captured).ToNot(BeNil()) + Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeFalse()) + }) + + It("config wins: reasoning_effort=high cannot re-enable when config explicitly disabled", func() { + app, captured := buildApp(plainCfg + "reasoning:\n disable: true\n") + rec := postJSON(app, "/v1/chat/completions", chatReq(`"high"`)) + + Expect(rec.Code).To(Equal(http.StatusOK)) + Expect(*captured).ToNot(BeNil()) + Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeTrue()) + }) + + It("is a no-op when reasoning_effort is empty", func() { + app, captured := buildApp(plainCfg) + rec := postJSON(app, "/v1/chat/completions", + `{"model":"test-model","messages":[{"role":"user","content":"hi"}]}`) + + Expect(rec.Code).To(Equal(http.StatusOK)) + Expect(*captured).ToNot(BeNil()) + 
Expect((*captured).ReasoningConfig.DisableReasoning).To(BeNil()) + }) + + It("is case-insensitive (None disables, HIGH enables)", func() { + app, captured := buildApp(plainCfg) + rec := postJSON(app, "/v1/chat/completions", chatReq(`"None"`)) + Expect(rec.Code).To(Equal(http.StatusOK)) + Expect(*captured).ToNot(BeNil()) + Expect((*captured).ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*(*captured).ReasoningConfig.DisableReasoning).To(BeTrue()) + + app2, captured2 := buildApp(plainCfg) + rec2 := postJSON(app2, "/v1/chat/completions", chatReq(`"HIGH"`)) + Expect(rec2.Code).To(Equal(http.StatusOK)) + Expect(*captured2).ToNot(BeNil()) + Expect((*captured2).ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*(*captured2).ReasoningConfig.DisableReasoning).To(BeFalse()) + }) +}) diff --git a/docs/content/advanced/model-configuration.md b/docs/content/advanced/model-configuration.md index 20277f8de..9c6760126 100644 --- a/docs/content/advanced/model-configuration.md +++ b/docs/content/advanced/model-configuration.md @@ -412,7 +412,10 @@ These load-time options control how the backend parses `` reasoning block | `prefill_assistant` | bool | `true` | When `false`, the trailing assistant message is not pre-filled by the chat template. | {{% notice note %}} -This is the load-time reasoning configuration. The orthogonal per-request `enable_thinking` chat-template kwarg (set via the YAML `reasoning.disable` field) toggles thinking on/off per call without restarting the model. +This is the load-time reasoning configuration. The orthogonal per-request `enable_thinking` chat-template kwarg toggles thinking on/off per call without restarting the model. 
It can be driven either by the YAML `reasoning.disable` field (model default) or per request via the OpenAI `reasoning_effort` field on `/v1/chat/completions` (matched case-insensitively; an omitted or unrecognized value leaves the model default unchanged): + +- `reasoning_effort: "none"` disables thinking for that request (`enable_thinking=false`) - useful to run a single reasoning model like Qwen3 for low-latency tasks while still enabling reasoning on other requests. +- `reasoning_effort: "minimal" | "low" | "medium" | "high"` enables thinking (`enable_thinking=true`), unless the model config explicitly set `reasoning.disable: true` (an operator's explicit disable wins and is never re-enabled by a request). {{% /notice %}} ### Multimodal Backend Options