diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 2ca329134..ac5521bc4 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -1944,6 +1944,17 @@ public: body_json["chat_template_kwargs"]["enable_thinking"] = (et_it->second == "true"); } + // Pass reasoning_effort via chat_template_kwargs too: the lever + // jinja templates like gpt-oss (Harmony) / LFM2.5 read, distinct + // from enable_thinking which those templates ignore. + auto re_it = metadata.find("reasoning_effort"); + if (re_it != metadata.end() && !re_it->second.empty()) { + if (!body_json.contains("chat_template_kwargs")) { + body_json["chat_template_kwargs"] = json::object(); + } + body_json["chat_template_kwargs"]["reasoning_effort"] = re_it->second; + } + // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.) SRV_DBG("[CONVERSATION DEBUG] PredictStream: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str()); @@ -2737,6 +2748,17 @@ public: body_json["chat_template_kwargs"]["enable_thinking"] = (predict_et_it->second == "true"); } + // Pass reasoning_effort via chat_template_kwargs too: the lever + // jinja templates like gpt-oss (Harmony) / LFM2.5 read, distinct + // from enable_thinking which those templates ignore. + auto predict_re_it = predict_metadata.find("reasoning_effort"); + if (predict_re_it != predict_metadata.end() && !predict_re_it->second.empty()) { + if (!body_json.contains("chat_template_kwargs")) { + body_json["chat_template_kwargs"] = json::object(); + } + body_json["chat_template_kwargs"]["reasoning_effort"] = predict_re_it->second; + } + // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.) 
SRV_DBG("[CONVERSATION DEBUG] Predict: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str()); diff --git a/core/backend/options.go b/core/backend/options.go index c891b6d67..0274bdb6e 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -239,13 +239,13 @@ func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions { if c.Backend == "cloud-proxy" { opts.Proxy = &pb.ProxyOptions{ - UpstreamUrl: c.Proxy.UpstreamURL, - Mode: c.Proxy.Mode, - Provider: c.Proxy.Provider, - ApiKeyEnv: c.Proxy.APIKeyEnv, - ApiKeyFile: c.Proxy.APIKeyFile, - UpstreamModel: c.Proxy.UpstreamModel, - RequestTimeoutSeconds: int32(c.Proxy.RequestTimeoutSeconds), + UpstreamUrl: c.Proxy.UpstreamURL, + Mode: c.Proxy.Mode, + Provider: c.Proxy.Provider, + ApiKeyEnv: c.Proxy.APIKeyEnv, + ApiKeyFile: c.Proxy.APIKeyFile, + UpstreamModel: c.Proxy.UpstreamModel, + RequestTimeoutSeconds: int32(c.Proxy.RequestTimeoutSeconds), } } @@ -323,6 +323,12 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions metadata["enable_thinking"] = "true" } } + // Forward the effective reasoning effort so the backend can pass it to the + // jinja chat template (chat_template_kwargs.reasoning_effort) — the lever + // models like gpt-oss / LFM2.5 actually read, distinct from enable_thinking. 
+ if c.ReasoningEffort != "" { + metadata["reasoning_effort"] = c.ReasoningEffort + } pbOpts.Metadata = metadata // Logprobs and TopLogprobs are set by the caller if provided diff --git a/core/backend/options_internal_test.go b/core/backend/options_internal_test.go index af5d59992..5e1848f0f 100644 --- a/core/backend/options_internal_test.go +++ b/core/backend/options_internal_test.go @@ -75,3 +75,25 @@ var _ = Describe("gRPCPredictOpts enable_thinking metadata", func() { Expect(opts.Metadata).ToNot(HaveKey("enable_thinking")) }) }) + +// Guards forwarding the effective reasoning_effort into PredictOptions.Metadata, +// where the backend passes it to the jinja chat template (chat_template_kwargs) +// so models like gpt-oss / LFM2.5 honor it. +var _ = Describe("gRPCPredictOpts reasoning_effort metadata", func() { + withEffort := func(effort string) config.ModelConfig { + cfg := config.ModelConfig{} + cfg.SetDefaults() + cfg.ReasoningEffort = effort + return cfg + } + + It("forwards reasoning_effort when set", func() { + opts := gRPCPredictOpts(withEffort("none"), "/tmp/models") + Expect(opts.Metadata).To(HaveKeyWithValue("reasoning_effort", "none")) + }) + + It("omits reasoning_effort when empty", func() { + opts := gRPCPredictOpts(withEffort(""), "/tmp/models") + Expect(opts.Metadata).ToNot(HaveKey("reasoning_effort")) + }) +}) diff --git a/core/config/meta/registry.go b/core/config/meta/registry.go index 54d891106..548b21892 100644 --- a/core/config/meta/registry.go +++ b/core/config/meta/registry.go @@ -128,6 +128,22 @@ func DefaultRegistry() map[string]FieldMetaOverride { Advanced: true, Order: 21, }, + "reasoning_effort": { + Section: "llm", + Label: "Reasoning Effort", + Description: "Default reasoning effort, forwarded to the backend as the reasoning_effort chat_template_kwarg (jinja models like gpt-oss / LFM2.5 honor it). A per-request reasoning_effort overrides it. 
'none' also turns thinking off.", + Component: "select", + Options: []FieldOption{ + {Value: "", Label: "Unset (model default)"}, + {Value: "none", Label: "none (disable thinking)"}, + {Value: "minimal", Label: "minimal"}, + {Value: "low", Label: "low"}, + {Value: "medium", Label: "medium"}, + {Value: "high", Label: "high"}, + }, + Advanced: true, + Order: 22, + }, "cache_type_k": { Section: "llm", Label: "KV Cache Type (K)", @@ -277,6 +293,21 @@ func DefaultRegistry() map[string]FieldMetaOverride { AutocompleteProvider: ProviderModelsVAD, Order: 63, }, + "pipeline.reasoning_effort": { + Section: "pipeline", + Label: "Reasoning Effort", + Description: "Reasoning effort for the pipeline's LLM, forwarded to the backend as the reasoning_effort chat_template_kwarg (jinja models like gpt-oss / LFM2.5 honor it). Overrides the LLM model's own reasoning_effort. 'none' also turns thinking off.", + Component: "select", + Options: []FieldOption{ + {Value: "", Label: "Default (model config)"}, + {Value: "none", Label: "none (disable thinking)"}, + {Value: "minimal", Label: "minimal"}, + {Value: "low", Label: "low"}, + {Value: "medium", Label: "medium"}, + {Value: "high", Label: "high"}, + }, + Order: 64, + }, // --- Functions --- "function.grammar.parallel_calls": { diff --git a/core/config/model_config.go b/core/config/model_config.go index a1b000798..9980c92e8 100644 --- a/core/config/model_config.go +++ b/core/config/model_config.go @@ -63,6 +63,13 @@ type ModelConfig struct { FunctionsConfig functions.FunctionsConfig `yaml:"function,omitempty" json:"function,omitempty"` ReasoningConfig reasoning.Config `yaml:"reasoning,omitempty" json:"reasoning,omitempty"` + // ReasoningEffort is the default reasoning effort (none|minimal|low|medium|high) + // for this model. A per-request reasoning_effort overrides it. It is forwarded + // to the backend as the reasoning_effort chat_template_kwarg (see + // gRPCPredictOpts), so jinja-templated models that key on it — e.g. 
gpt-oss + // (Harmony) or LFM2.5 — honor it; "none" also toggles enable_thinking off. + ReasoningEffort string `yaml:"reasoning_effort,omitempty" json:"reasoning_effort,omitempty"` + FeatureFlag FeatureFlag `yaml:"feature_flags,omitempty" json:"feature_flags,omitempty"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early. // LLM configs (GPT4ALL, Llama.cpp, ...) LLMConfig `yaml:",inline" json:",inline"` @@ -487,6 +494,40 @@ type Pipeline struct { LLM string `yaml:"llm,omitempty" json:"llm,omitempty"` Transcription string `yaml:"transcription,omitempty" json:"transcription,omitempty"` VAD string `yaml:"vad,omitempty" json:"vad,omitempty"` + + // ReasoningEffort sets the reasoning effort (none|minimal|low|medium|high) for + // the pipeline's LLM without editing the LLM model config. Overrides the LLM's + // own reasoning_effort. Unset leaves the LLM model config in charge. + ReasoningEffort string `yaml:"reasoning_effort,omitempty" json:"reasoning_effort,omitempty"` +} + +// ApplyReasoningEffort resolves the effective reasoning effort — a per-request +// value (requestEffort) overrides the config's own ReasoningEffort default — +// stores it on the config so gRPCPredictOpts forwards it to the backend as the +// reasoning_effort chat_template_kwarg, and maps it onto the enable_thinking +// toggle the backend also reads: +// - "none" always disables thinking. +// - any explicit level enables it, UNLESS the config already disabled reasoning +// (an operator's explicit disable wins over a request asking to think). +// +// An empty requestEffort keeps the config's own default. With no effort set +// anywhere it is a no-op, leaving the model's reasoning settings untouched. 
+func (c *ModelConfig) ApplyReasoningEffort(requestEffort string) { + effort := strings.ToLower(requestEffort) // normalise once: the stored value is what gRPCPredictOpts forwards to the chat template + if effort == "" { + effort = strings.ToLower(c.ReasoningEffort) // fall back to the config default, normalised the same way + } + c.ReasoningEffort = effort + switch effort { + case "none": + disable := true + c.ReasoningConfig.DisableReasoning = &disable + case "minimal", "low", "medium", "high": + if c.ReasoningConfig.DisableReasoning == nil || !*c.ReasoningConfig.DisableReasoning { + enable := false + c.ReasoningConfig.DisableReasoning = &enable + } + } } // @Description File configuration for model downloads diff --git a/core/config/reasoning_effort_test.go b/core/config/reasoning_effort_test.go new file mode 100644 index 000000000..8f23ba690 --- /dev/null +++ b/core/config/reasoning_effort_test.go @@ -0,0 +1,52 @@ +package config_test + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/mudler/LocalAI/core/config" +) + +// ApplyReasoningEffort resolves the effective reasoning effort (request value +// overrides the model config default), stores it on the config so it reaches the +// backend, and maps it onto the enable_thinking toggle. 
+var _ = Describe("ModelConfig.ApplyReasoningEffort", func() { + It("uses the request value over the config default", func() { + c := &config.ModelConfig{ReasoningEffort: "high"} + c.ApplyReasoningEffort("none") + Expect(c.ReasoningEffort).To(Equal("none")) + Expect(c.ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*c.ReasoningConfig.DisableReasoning).To(BeTrue()) + }) + + It("falls back to the config default when the request omits it", func() { + c := &config.ModelConfig{ReasoningEffort: "none"} + c.ApplyReasoningEffort("") + Expect(c.ReasoningEffort).To(Equal("none")) + Expect(c.ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*c.ReasoningConfig.DisableReasoning).To(BeTrue()) + }) + + It("enables thinking for an explicit effort level", func() { + c := &config.ModelConfig{} + c.ApplyReasoningEffort("medium") + Expect(c.ReasoningEffort).To(Equal("medium")) + Expect(c.ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*c.ReasoningConfig.DisableReasoning).To(BeFalse()) + }) + + It("does not let a level override an operator's config-level disable", func() { + disabled := true + c := &config.ModelConfig{} + c.ReasoningConfig.DisableReasoning = &disabled + c.ApplyReasoningEffort("high") + Expect(*c.ReasoningConfig.DisableReasoning).To(BeTrue()) + }) + + It("is a no-op on the toggle when no effort is set anywhere", func() { + c := &config.ModelConfig{} + c.ApplyReasoningEffort("") + Expect(c.ReasoningEffort).To(Equal("")) + Expect(c.ReasoningConfig.DisableReasoning).To(BeNil()) + }) +}) diff --git a/core/http/endpoints/openai/realtime_model.go b/core/http/endpoints/openai/realtime_model.go index afb93201b..b9a3adda9 100644 --- a/core/http/endpoints/openai/realtime_model.go +++ b/core/http/endpoints/openai/realtime_model.go @@ -44,10 +44,10 @@ type wrappedModel struct { // deps in. nil-safe: with classifierRegistry == nil the per-turn // routing block in Predict is skipped, preserving today's "one LLM // for the whole session" behaviour. 
- routerDeps *middleware.ClassifierDeps - routerStore router.DecisionStore - routerSessionID string - routerUserID string + routerDeps *middleware.ClassifierDeps + routerStore router.DecisionStore + routerSessionID string + routerUserID string } // anyToAnyModel represent a model which supports Any-to-Any operations @@ -119,6 +119,11 @@ func (m *wrappedModel) Predict(ctx context.Context, messages schema.Messages, im } } + // Surface the resolved reasoning effort to the Go-side template path too + // (jinja models get it via backend metadata in gRPCPredictOpts; Go-templated + // models like gpt-oss read it from the template's .ReasoningEffort). + input.ReasoningEffort = turnCfg.ReasoningEffort + var predInput string var funcs []functions.Function if !turnCfg.TemplateConfig.UseTokenizerTemplate { @@ -449,6 +454,9 @@ func newModel(pipeline *config.Pipeline, cl *config.ModelConfigLoader, ml *model return nil, fmt.Errorf("failed to validate config: %w", err) } + // Let the pipeline set the LLM's reasoning effort (cfgLLM is a per-session copy). + applyPipelineReasoning(cfgLLM, *pipeline) + cfgTTS, err := cl.LoadModelConfigFileByName(pipeline.TTS, ml.ModelPath) if err != nil { diff --git a/core/http/endpoints/openai/realtime_reasoning.go b/core/http/endpoints/openai/realtime_reasoning.go new file mode 100644 index 000000000..0d8eb5f8f --- /dev/null +++ b/core/http/endpoints/openai/realtime_reasoning.go @@ -0,0 +1,16 @@ +package openai + +import "github.com/mudler/LocalAI/core/config" + +// applyPipelineReasoning sets the reasoning effort for a realtime pipeline's LLM +// from the pipeline config, without editing the underlying LLM model config. The +// pipeline value overrides the LLM's own reasoning_effort; when the pipeline does +// not set it, the LLM model config's reasoning_effort (if any) is used. The LLM +// config passed in is the per-session copy returned by the config loader, so this +// does not affect other users of the same model. 
+func applyPipelineReasoning(llm *config.ModelConfig, pipeline config.Pipeline) { + if llm == nil { + return + } + llm.ApplyReasoningEffort(pipeline.ReasoningEffort) +} diff --git a/core/http/endpoints/openai/realtime_reasoning_test.go b/core/http/endpoints/openai/realtime_reasoning_test.go new file mode 100644 index 000000000..f76a2c9c1 --- /dev/null +++ b/core/http/endpoints/openai/realtime_reasoning_test.go @@ -0,0 +1,33 @@ +package openai + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/mudler/LocalAI/core/config" +) + +// applyPipelineReasoning lets a realtime pipeline set the reasoning effort for +// its LLM (forwarded to the backend as reasoning_effort) without editing the LLM +// model config. The pipeline value overrides the LLM's own reasoning_effort. +var _ = Describe("applyPipelineReasoning", func() { + It("applies the pipeline reasoning_effort to the LLM config", func() { + llm := &config.ModelConfig{} + applyPipelineReasoning(llm, config.Pipeline{ReasoningEffort: "none"}) + Expect(llm.ReasoningEffort).To(Equal("none")) + Expect(llm.ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*llm.ReasoningConfig.DisableReasoning).To(BeTrue()) + }) + + It("falls back to the LLM's own reasoning_effort when the pipeline is unset", func() { + llm := &config.ModelConfig{ReasoningEffort: "high"} + applyPipelineReasoning(llm, config.Pipeline{}) + Expect(llm.ReasoningEffort).To(Equal("high")) + Expect(llm.ReasoningConfig.DisableReasoning).ToNot(BeNil()) + Expect(*llm.ReasoningConfig.DisableReasoning).To(BeFalse()) + }) + + It("is nil-safe", func() { + applyPipelineReasoning(nil, config.Pipeline{ReasoningEffort: "low"}) + }) +}) diff --git a/core/http/middleware/request.go b/core/http/middleware/request.go index 3e6f3555a..591ec8b93 100644 --- a/core/http/middleware/request.go +++ b/core/http/middleware/request.go @@ -310,25 +310,13 @@ func mergeOpenAIRequestAndModelConfig(config *config.ModelConfig, input *schema. 
config.Temperature = input.Temperature } - // Map the per-request reasoning_effort onto the reasoning toggle the - // backend reads (enable_thinking metadata, set in gRPCPredictOpts). - // "none" disables thinking for this request - the use case from #10072, - // running a single Qwen3-style model and turning reasoning off per - // request. Any explicit effort level enables thinking, UNLESS the model - // config explicitly disabled it (DisableReasoning==true wins): an - // operator who deliberately turned reasoning off should not be overridden - // by a request. A value of "none" always disables, since that never - // conflicts with a config that also disables. - switch strings.ToLower(input.ReasoningEffort) { - case "none": - disable := true - config.ReasoningConfig.DisableReasoning = &disable - case "minimal", "low", "medium", "high": - if config.ReasoningConfig.DisableReasoning == nil || !*config.ReasoningConfig.DisableReasoning { - enable := false - config.ReasoningConfig.DisableReasoning = &enable - } - } + // Resolve the effective reasoning effort (request overrides the model config + // default), store it so gRPCPredictOpts forwards it to the backend as the + // reasoning_effort chat_template_kwarg (what gpt-oss / LFM2.5 read), and map + // it onto the enable_thinking toggle. "none" disables thinking (the #10072 + // use case); a level enables it unless the config already disabled reasoning + // (an operator's explicit disable wins over a request asking to think). + config.ApplyReasoningEffort(input.ReasoningEffort) // Collapse the modern max_completion_tokens alias into the // legacy Maxtokens field so downstream code reads exactly one. diff --git a/docs/content/advanced/model-configuration.md b/docs/content/advanced/model-configuration.md index 9c6760126..1d3268b11 100644 --- a/docs/content/advanced/model-configuration.md +++ b/docs/content/advanced/model-configuration.md @@ -418,6 +418,26 @@ This is the load-time reasoning configuration. 
The orthogonal per-request `enabl - `reasoning_effort: "minimal" | "low" | "medium" | "high"` enables thinking, unless the model config explicitly set `reasoning.disable: true` (an operator's explicit disable wins and is never re-enabled by a request). {{% /notice %}} +#### `reasoning_effort` as a chat-template kwarg + +`reasoning_effort` is also forwarded to the backend as a `chat_template_kwarg`, so models whose **jinja chat template** keys on it — e.g. gpt-oss (Harmony) or LFM2.5 — honor the **level**, not just the on/off `enable_thinking` flag. This matters for models that ignore `enable_thinking` entirely (LFM2.5 keeps emitting `<think>` for `enable_thinking=false`, but respects `reasoning_effort`). + +Set a per-model default in the config so every request inherits it (a per-request `reasoning_effort` still overrides): + +```yaml +name: my-model +reasoning_effort: none # none | minimal | low | medium | high +``` + +For [realtime pipelines]({{%relref "docs/features/openai-realtime" %}}), set it on the pipeline so it applies to the pipeline's LLM without editing that model's own config: + +```yaml +name: gpt-realtime +pipeline: + llm: lfm2.5 + reasoning_effort: none # overrides the LLM model's own reasoning_effort +``` + ### Multimodal Backend Options | Option | Type | Default | Description |