mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-16 04:38:50 -04:00
feat(backend): forward resolved chat_template_kwargs blob to backends
gRPCPredictOpts now merges per-request client metadata over the server-derived enable_thinking/reasoning_effort (reaching all backends via the standalone keys) and serialises the resolved chat_template_kwargs map into a JSON blob for llama.cpp, written last so a client cannot clobber it. Issue #10329. Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -368,6 +368,25 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
|
||||
if c.ReasoningEffort != "" {
|
||||
metadata["reasoning_effort"] = c.ReasoningEffort
|
||||
}
|
||||
// Client request metadata overrides the server-derived reasoning levers and
|
||||
// reaches every backend through these standalone string keys (Python backends
|
||||
// read them directly). The reserved blob key is server-owned and skipped.
|
||||
for k, v := range c.RequestMetadata {
|
||||
if k == "chat_template_kwargs" {
|
||||
continue
|
||||
}
|
||||
metadata[k] = v
|
||||
}
|
||||
// Build the generic chat_template_kwargs blob (model config map + coerced
|
||||
// metadata) for llama.cpp and write it LAST so a client cannot clobber it.
|
||||
if blob := c.ResolveChatTemplateKwargs(metadata); len(blob) > 0 {
|
||||
b, err := json.Marshal(blob)
|
||||
if err != nil {
|
||||
xlog.Warn("failed to marshal chat_template_kwargs", "error", err)
|
||||
} else {
|
||||
metadata["chat_template_kwargs"] = string(b)
|
||||
}
|
||||
}
|
||||
pbOpts.Metadata = metadata
|
||||
|
||||
// Logprobs and TopLogprobs are set by the caller if provided
|
||||
|
||||
@@ -161,3 +161,54 @@ var _ = Describe("grpcModelOpts NBatch", func() {
|
||||
Expect(opts.ContextSize).To(BeEquivalentTo(4096), "n_batch must match the effective n_ctx the backend receives")
|
||||
})
|
||||
})
|
||||
|
||||
// Guards the generic chat_template_kwargs forwarding: the model config map plus any
|
||||
// per-request metadata overrides are merged, coerced, and serialised into the
|
||||
// backend metadata blob that llama.cpp reads. Client metadata also overrides the
|
||||
// server-derived standalone enable_thinking key (cross-backend consistency).
|
||||
var _ = Describe("gRPCPredictOpts chat_template_kwargs metadata", func() {
|
||||
baseCfg := func() config.ModelConfig {
|
||||
cfg := config.ModelConfig{}
|
||||
cfg.SetDefaults()
|
||||
return cfg
|
||||
}
|
||||
|
||||
It("serialises the config map into the chat_template_kwargs blob", func() {
|
||||
cfg := baseCfg()
|
||||
cfg.ChatTemplateKwargs = map[string]any{"preserve_thinking": true}
|
||||
opts := gRPCPredictOpts(cfg, "/tmp/models")
|
||||
Expect(opts.Metadata).To(HaveKey("chat_template_kwargs"))
|
||||
var blob map[string]any
|
||||
Expect(json.Unmarshal([]byte(opts.Metadata["chat_template_kwargs"]), &blob)).To(Succeed())
|
||||
Expect(blob).To(HaveKeyWithValue("preserve_thinking", true))
|
||||
})
|
||||
|
||||
It("lets client request metadata override the server-derived enable_thinking key", func() {
|
||||
cfg := baseCfg()
|
||||
disable := true
|
||||
cfg.ReasoningConfig = reasoning.Config{DisableReasoning: &disable} // server: enable_thinking=false
|
||||
cfg.RequestMetadata = map[string]string{"enable_thinking": "true"} // client overrides
|
||||
opts := gRPCPredictOpts(cfg, "/tmp/models")
|
||||
// standalone key (Python backends) reflects the client override
|
||||
Expect(opts.Metadata).To(HaveKeyWithValue("enable_thinking", "true"))
|
||||
// blob (llama.cpp) reflects it too, as a real bool
|
||||
var blob map[string]any
|
||||
Expect(json.Unmarshal([]byte(opts.Metadata["chat_template_kwargs"]), &blob)).To(Succeed())
|
||||
Expect(blob).To(HaveKeyWithValue("enable_thinking", true))
|
||||
})
|
||||
|
||||
It("does not let a client clobber the blob via a chat_template_kwargs metadata key", func() {
|
||||
cfg := baseCfg()
|
||||
cfg.ChatTemplateKwargs = map[string]any{"preserve_thinking": true}
|
||||
cfg.RequestMetadata = map[string]string{"chat_template_kwargs": "{\"preserve_thinking\": false}"}
|
||||
opts := gRPCPredictOpts(cfg, "/tmp/models")
|
||||
var blob map[string]any
|
||||
Expect(json.Unmarshal([]byte(opts.Metadata["chat_template_kwargs"]), &blob)).To(Succeed())
|
||||
Expect(blob).To(HaveKeyWithValue("preserve_thinking", true))
|
||||
})
|
||||
|
||||
It("omits the blob when there is nothing to forward", func() {
|
||||
opts := gRPCPredictOpts(baseCfg(), "/tmp/models")
|
||||
Expect(opts.Metadata).ToNot(HaveKey("chat_template_kwargs"))
|
||||
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user