diff --git a/core/backend/options.go b/core/backend/options.go index de8b4c44e..efe6c649f 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -368,6 +368,25 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions if c.ReasoningEffort != "" { metadata["reasoning_effort"] = c.ReasoningEffort } + // Client request metadata overrides the server-derived reasoning levers and + // reaches every backend through these standalone string keys (Python backends + // read them directly). The reserved blob key is server-owned and skipped. + for k, v := range c.RequestMetadata { + if k == "chat_template_kwargs" { + continue + } + metadata[k] = v + } + // Build the generic chat_template_kwargs blob (model config map + coerced + // metadata) for llama.cpp and write it LAST so a client cannot clobber it. + if blob := c.ResolveChatTemplateKwargs(metadata); len(blob) > 0 { + b, err := json.Marshal(blob) + if err != nil { + xlog.Warn("failed to marshal chat_template_kwargs", "error", err) + } else { + metadata["chat_template_kwargs"] = string(b) + } + } pbOpts.Metadata = metadata // Logprobs and TopLogprobs are set by the caller if provided diff --git a/core/backend/options_internal_test.go b/core/backend/options_internal_test.go index aa07b43bd..e8399849f 100644 --- a/core/backend/options_internal_test.go +++ b/core/backend/options_internal_test.go @@ -161,3 +161,54 @@ var _ = Describe("grpcModelOpts NBatch", func() { Expect(opts.ContextSize).To(BeEquivalentTo(4096), "n_batch must match the effective n_ctx the backend receives") }) }) + +// Guards the generic chat_template_kwargs forwarding: the model config map plus any +// per-request metadata overrides are merged, coerced, and serialised into the +// backend metadata blob that llama.cpp reads. Client metadata also overrides the +// server-derived standalone enable_thinking key (cross-backend consistency). 
+var _ = Describe("gRPCPredictOpts chat_template_kwargs metadata", func() {
+	// baseCfg returns a fresh ModelConfig with SetDefaults applied, so each spec
+	// starts from the same baseline and only sets the fields it cares about.
+	baseCfg := func() config.ModelConfig {
+		cfg := config.ModelConfig{}
+		cfg.SetDefaults()
+		return cfg
+	}
+
+	It("serialises the config map into the chat_template_kwargs blob", func() {
+		cfg := baseCfg()
+		cfg.ChatTemplateKwargs = map[string]any{"preserve_thinking": true}
+		opts := gRPCPredictOpts(cfg, "/tmp/models")
+		Expect(opts.Metadata).To(HaveKey("chat_template_kwargs"))
+		// The metadata value is a JSON string; decode it to assert on typed contents.
+		var blob map[string]any
+		Expect(json.Unmarshal([]byte(opts.Metadata["chat_template_kwargs"]), &blob)).To(Succeed())
+		Expect(blob).To(HaveKeyWithValue("preserve_thinking", true))
+	})
+
+	It("lets client request metadata override the server-derived enable_thinking key", func() {
+		cfg := baseCfg()
+		disable := true
+		cfg.ReasoningConfig = reasoning.Config{DisableReasoning: &disable} // server: enable_thinking=false
+		cfg.RequestMetadata = map[string]string{"enable_thinking": "true"} // client overrides
+		opts := gRPCPredictOpts(cfg, "/tmp/models")
+		// standalone key (Python backends) reflects the client override
+		Expect(opts.Metadata).To(HaveKeyWithValue("enable_thinking", "true"))
+		// blob (llama.cpp) reflects it too, as a real bool
+		var blob map[string]any
+		Expect(json.Unmarshal([]byte(opts.Metadata["chat_template_kwargs"]), &blob)).To(Succeed())
+		Expect(blob).To(HaveKeyWithValue("enable_thinking", true))
+	})
+
+	It("does not let a client clobber the blob via a chat_template_kwargs metadata key", func() {
+		cfg := baseCfg()
+		// Config and client disagree on preserve_thinking; the config value must win.
+		cfg.ChatTemplateKwargs = map[string]any{"preserve_thinking": true}
+		cfg.RequestMetadata = map[string]string{"chat_template_kwargs": "{\"preserve_thinking\": false}"}
+		opts := gRPCPredictOpts(cfg, "/tmp/models")
+		// No separate HaveKey check here: if the key were missing, the lookup would
+		// yield "" and json.Unmarshal would fail on the empty input.
+		var blob map[string]any
+		Expect(json.Unmarshal([]byte(opts.Metadata["chat_template_kwargs"]), &blob)).To(Succeed())
+		Expect(blob).To(HaveKeyWithValue("preserve_thinking", true))
+	})
+
+	It("omits the blob when there is nothing to forward", func() {
+		// Baseline config only: this spec sets neither ChatTemplateKwargs nor RequestMetadata.
+		opts := gRPCPredictOpts(baseCfg(), "/tmp/models")
+		Expect(opts.Metadata).ToNot(HaveKey("chat_template_kwargs"))
+	})
+})