From 826d91ddf455f40dc5364491d4eed17105f59254 Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Tue, 16 Jun 2026 08:01:21 +0000
Subject: [PATCH] feat(llama-cpp): generic chat_template_kwargs merge (drop
 per-key blocks)

Replace the per-key enable_thinking/reasoning_effort handling in both the
streaming and non-streaming chat paths with one generic block per path. The
block parses the chat_template_kwargs JSON blob that the Go layer resolves
and passes via request metadata, then merges every key into
body_json["chat_template_kwargs"]. New jinja template levers (e.g.
preserve_thinking) now need no C++ change. Issue #10329.

Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 backend/cpp/llama-cpp/grpc-server.cpp | 71 ++++++++++++++-------------
 1 file changed, 37 insertions(+), 34 deletions(-)

diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 5b032ad4e..8502e9530 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -1922,25 +1922,27 @@ public:
                     body_json["min_p"] = data["min_p"];
                 }
 
-                // Pass enable_thinking via chat_template_kwargs (where oaicompat_chat_params_parse reads it)
+                // Forward the chat_template_kwargs the Go layer resolved (model config
+                // chat_template_kwargs + per-request metadata: enable_thinking,
+                // reasoning_effort, preserve_thinking, ...). One generic merge replaces
+                // the previous per-key handling - new template levers need no C++ change.
+                // oaicompat_chat_params_parse reads these from body_json.
                 const auto& metadata = request->metadata();
-                auto et_it = metadata.find("enable_thinking");
-                if (et_it != metadata.end()) {
-                    if (!body_json.contains("chat_template_kwargs")) {
-                        body_json["chat_template_kwargs"] = json::object();
+                auto ctk_it = metadata.find("chat_template_kwargs");
+                if (ctk_it != metadata.end() && !ctk_it->second.empty()) {
+                    try {
+                        json ctk = json::parse(ctk_it->second);
+                        if (ctk.is_object()) {
+                            if (!body_json.contains("chat_template_kwargs")) {
+                                body_json["chat_template_kwargs"] = json::object();
+                            }
+                            for (auto& el : ctk.items()) {
+                                body_json["chat_template_kwargs"][el.key()] = el.value();
+                            }
+                        }
+                    } catch (const std::exception & e) {
+                        SRV_WRN("failed to parse chat_template_kwargs metadata: %s\n", e.what());
                     }
-                    body_json["chat_template_kwargs"]["enable_thinking"] = (et_it->second == "true");
-                }
-
-                // Pass reasoning_effort via chat_template_kwargs too: the lever
-                // jinja templates like gpt-oss (Harmony) / LFM2.5 read, distinct
-                // from enable_thinking which those templates ignore.
-                auto re_it = metadata.find("reasoning_effort");
-                if (re_it != metadata.end() && !re_it->second.empty()) {
-                    if (!body_json.contains("chat_template_kwargs")) {
-                        body_json["chat_template_kwargs"] = json::object();
-                    }
-                    body_json["chat_template_kwargs"]["reasoning_effort"] = re_it->second;
                 }
 
                 // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)
@@ -2756,25 +2758,26 @@ public:
                     body_json["min_p"] = data["min_p"];
                 }
 
-                // Pass enable_thinking via chat_template_kwargs (where oaicompat_chat_params_parse reads it)
+                // Forward the chat_template_kwargs the Go layer resolved (model config
+                // chat_template_kwargs + per-request metadata: enable_thinking,
+                // reasoning_effort, preserve_thinking, ...). One generic merge replaces
+                // the previous per-key handling - new template levers need no C++ change.
                 const auto& predict_metadata = request->metadata();
-                auto predict_et_it = predict_metadata.find("enable_thinking");
-                if (predict_et_it != predict_metadata.end()) {
-                    if (!body_json.contains("chat_template_kwargs")) {
-                        body_json["chat_template_kwargs"] = json::object();
+                auto predict_ctk_it = predict_metadata.find("chat_template_kwargs");
+                if (predict_ctk_it != predict_metadata.end() && !predict_ctk_it->second.empty()) {
+                    try {
+                        json ctk = json::parse(predict_ctk_it->second);
+                        if (ctk.is_object()) {
+                            if (!body_json.contains("chat_template_kwargs")) {
+                                body_json["chat_template_kwargs"] = json::object();
+                            }
+                            for (auto& el : ctk.items()) {
+                                body_json["chat_template_kwargs"][el.key()] = el.value();
+                            }
+                        }
+                    } catch (const std::exception & e) {
+                        SRV_WRN("failed to parse chat_template_kwargs metadata: %s\n", e.what());
                     }
-                    body_json["chat_template_kwargs"]["enable_thinking"] = (predict_et_it->second == "true");
-                }
-
-                // Pass reasoning_effort via chat_template_kwargs too: the lever
-                // jinja templates like gpt-oss (Harmony) / LFM2.5 read, distinct
-                // from enable_thinking which those templates ignore.
-                auto predict_re_it = predict_metadata.find("reasoning_effort");
-                if (predict_re_it != predict_metadata.end() && !predict_re_it->second.empty()) {
-                    if (!body_json.contains("chat_template_kwargs")) {
-                        body_json["chat_template_kwargs"] = json::object();
-                    }
-                    body_json["chat_template_kwargs"]["reasoning_effort"] = predict_re_it->second;
                 }
 
                 // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)