feat(llama.cpp): consolidate options and respect tokenizer template when enabled (#7120)

* feat(llama.cpp): expose env vars as options for consistency. This allows configuring everything in the model's YAML file rather than relying on global configuration. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(llama.cpp): respect usetokenizertemplate and use llama.cpp templating system to process messages Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * WIP Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Detect whether a template exists if use tokenizer template is enabled Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Better recognition of chat Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Fixes to support tool calls while using templates from tokenizer Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Fixups Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Drop template guessing, fix passing tools to tokenizer Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Extract grammar and other options from chat template, add schema struct Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * WIP Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * WIP Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Automatically set use_jinja Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Cleanups, identify gguf models for chat by default Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * Update docs Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-03 05:51:53 -04:00 · 2025-11-07 21:23:50 +01:00
parent e5e86d0acb
commit 02cc8cbcaa
17 changed files with 974 additions and 545 deletions
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -27,8 +27,6 @@ using grpc::Status;
 // END LocalAI


-
-
 /////////////////////////////////
 ////////////////////////////////
 //////// LOCALAI code starts below here
@@ -37,6 +35,14 @@ using grpc::Status;

 bool loaded_model; // TODO: add a mutex for this, but happens only once loading the model

+// Forward declarations
+static void start_llama_server(server_context& ctx_server);
+static json parse_options(bool streaming, const backend::PredictOptions* predict, const server_context& ctx_server);
+static ggml_type kv_cache_type_from_str(const std::string & s);
+static std::string get_all_kv_cache_types();
+static void add_rpc_devices(std::string servers);
+static void params_parse(server_context& ctx_server, const backend::ModelOptions* request, common_params & params);
+
 static void start_llama_server(server_context& ctx_server) {

    LOG_INF("%s: starting llama server\n", __func__);
@@ -57,9 +63,8 @@ static void start_llama_server(server_context& ctx_server) {
    //     common_chat_templates_source(ctx_server.chat_templates.get()),
    //     common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja).c_str(), ctx_server.params_base.default_template_kwargs);

-    // Reset the chat templates
-    // TODO: We should make this configurable by respecting the option that is already present in LocalAI for vLLM
-    ctx_server.chat_templates.reset();
+    // Keep the chat templates initialized in load_model() so they can be used when UseTokenizerTemplate is enabled
+    // Templates will only be used conditionally in Predict/PredictStream when UseTokenizerTemplate is true and Messages are provided

    ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) {
        ctx_server.process_single_task(std::move(task));
@@ -114,12 +119,55 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const
    data["mirostat_eta"] = predict->mirostateta();
    data["n_keep"] = predict->nkeep();
    data["seed"] = predict->seed();
-    data["grammar"] = predict->grammar();
-    data["prompt"] = predict->prompt();
+    
+
+    std::string grammar_str = predict->grammar();
+    
+ 
+    
+    if (!grammar_str.empty()) {
+        data["grammar"] = grammar_str;
+        SRV_INF("Using grammar: %s\n", grammar_str.c_str());
+    }
+    
+    // Only set prompt if UseTokenizerTemplate is false or if no Messages are provided
+    // When UseTokenizerTemplate is true and Messages are provided, prompt will be set via chat templates in Predict/PredictStream
+    if (!predict->usetokenizertemplate() || predict->messages_size() == 0) {
+        data["prompt"] = predict->prompt();
+    }
+    
+    // Extract tools and tool_choice from proto and add to data JSON
+    if (!predict->tools().empty()) {
+        try {
+            // Parse tools JSON string and add to data
+            json tools_json = json::parse(predict->tools());
+            data["tools"] = tools_json;
+            SRV_INF("Extracted tools from proto: %s\n", predict->tools().c_str());
+        } catch (const json::parse_error& e) {
+            SRV_WRN("Failed to parse tools JSON from proto: %s\n", e.what());
+        }
+    }
+    if (!predict->toolchoice().empty()) {
+        try {
+            // Parse tool_choice JSON string
+            json tool_choice_json = json::parse(predict->toolchoice());
+            // tool_choice can be a string ("auto", "none", "required") or an object
+            // Store it as-is (string or object) so we can convert object to "required" later when adding to body_json
+            if (tool_choice_json.is_string()) {
+                data["tool_choice"] = tool_choice_json.get<std::string>();
+            } else {
+                // Store object as-is so we can detect it later and convert to "required"
+                data["tool_choice"] = tool_choice_json;
+            }
+            SRV_INF("Extracted tool_choice from proto: %s\n", predict->toolchoice().c_str());
+        } catch (const json::parse_error& e) {
+            // If parsing fails, treat as string
+            data["tool_choice"] = predict->toolchoice();
+            SRV_INF("Extracted tool_choice as string: %s\n", predict->toolchoice().c_str());
+        }
+    }
    data["ignore_eos"] = predict->ignoreeos();
    data["embeddings"] = predict->embeddings();
-    // TODO: add back json_schema and let this be controlled by the user
-    // data["json_schema"] = predict->jsonschema();

    // Add the correlationid to json data
    data["correlation_id"] = predict->correlationid();
@@ -253,27 +301,19 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
    params.cpuparams.n_threads = request->threads();
    params.n_gpu_layers = request->ngpulayers();
    params.n_batch = request->nbatch();
+    //params.verbosity = INT_MAX;
+    // Enable all debug logs by setting verbosity threshold to maximum
+    //common_log_set_verbosity_thold(INT_MAX);
    params.n_ubatch = request->nbatch(); // fixes issue with reranking models being limited to 512 tokens (the default n_ubatch size); allows for setting the maximum input amount of tokens thereby avoiding this error "input is too large to process. increase the physical batch size"
-    // Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1
-    //params.n_parallel = 1;
-    const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");
-    if (env_parallel != NULL) {
-        params.n_parallel = std::stoi(env_parallel);
-        params.cont_batching = true;
-    } else {
-        params.n_parallel = 1;
-    }
-
-
-    const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS");
-    if (llama_grpc_servers != NULL) {
-        add_rpc_devices(std::string(llama_grpc_servers));
-    }
    
    // Initialize ctx_shift to false by default (can be overridden by options)
    params.ctx_shift = false;
    // Initialize cache_ram_mib to -1 by default (no limit, can be overridden by options)
    params.cache_ram_mib = -1;
+    // Initialize n_parallel to 1 by default (can be overridden by options)
+    params.n_parallel = 1;
+    // Initialize grpc_servers to empty (can be overridden by options)
+    std::string grpc_servers_option = "";

     // decode options. Options are in form optname:optvale, or if booleans only optname.
    for (int i = 0; i < request->options_size(); i++) {
@@ -290,6 +330,12 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
            } else if (!strcmp(optval, "false") || !strcmp(optval, "0") || !strcmp(optval, "no") || !strcmp(optval, "off") || !strcmp(optval, "disabled")) {
                params.ctx_shift = false;
            }
+        } else if (!strcmp(optname, "use_jinja") || !strcmp(optname, "jinja")) {
+            if (!strcmp(optval, "true") || !strcmp(optval, "1") || !strcmp(optval, "yes") || !strcmp(optval, "on") || !strcmp(optval, "enabled")) {
+                params.use_jinja = true;
+            } else if (!strcmp(optval, "false") || !strcmp(optval, "0") || !strcmp(optval, "no") || !strcmp(optval, "off") || !strcmp(optval, "disabled")) {
+                params.use_jinja = false;
+            }
        } else if (!strcmp(optname, "cache_ram")) {
            if (optval != NULL) {
                try {
@@ -298,6 +344,46 @@ static void params_parse(server_context& ctx_server, const backend::ModelOptions
                    // If conversion fails, keep default value (-1)
                }
            }
+        } else if (!strcmp(optname, "parallel") || !strcmp(optname, "n_parallel")) {
+            if (optval != NULL) {
+                try {
+                    params.n_parallel = std::stoi(optval);
+                    if (params.n_parallel > 1) {
+                        params.cont_batching = true;
+                    }
+                } catch (const std::exception& e) {
+                    // If conversion fails, keep default value (1)
+                }
+            }
+        } else if (!strcmp(optname, "grpc_servers") || !strcmp(optname, "rpc_servers")) {
+            if (optval != NULL) {
+                grpc_servers_option = std::string(optval);
+            }
+        }
+    }
+    
+    // Set params.n_parallel from environment variable if not set via options (fallback)
+    if (params.n_parallel == 1) {
+        const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");
+        if (env_parallel != NULL) {
+            try {
+                params.n_parallel = std::stoi(env_parallel);
+                if (params.n_parallel > 1) {
+                    params.cont_batching = true;
+                }
+            } catch (const std::exception& e) {
+                // If conversion fails, keep default value (1)
+            }
+        }
+    }
+
+    // Add RPC devices from option or environment variable (fallback)
+    if (!grpc_servers_option.empty()) {
+        add_rpc_devices(grpc_servers_option);
+    } else {
+        const char *llama_grpc_servers = std::getenv("LLAMACPP_GRPC_SERVERS");
+        if (llama_grpc_servers != NULL) {
+            add_rpc_devices(std::string(llama_grpc_servers));
        }
    }

@@ -422,6 +508,8 @@ public:
        params_parse(ctx_server, request, params);

        common_init();
+        // Apply the configured verbosity threshold (params.verbosity) after common_init() sets up logging
+        common_log_set_verbosity_thold(params.verbosity);

        llama_backend_init();
        llama_numa_init(params.numa);
@@ -495,46 +583,213 @@ public:
        try {
            std::vector<server_task> tasks;

-            const auto & prompt = data.at("prompt");
+            std::string prompt_str;
+            std::vector<raw_buffer> files; // Declare files early so it's accessible in both branches
+            // Handle chat templates when UseTokenizerTemplate is enabled and Messages are provided
+            if (request->usetokenizertemplate() && request->messages_size() > 0 && ctx_server.chat_templates != nullptr) {
+                // Convert proto Messages to JSON format compatible with oaicompat_chat_params_parse
+                json body_json;
+                json messages_json = json::array();
+                for (int i = 0; i < request->messages_size(); i++) {
+                    const auto& msg = request->messages(i);
+                    json msg_json;
+                    msg_json["role"] = msg.role();
+                    
+                    // Handle content - can be string, null, or array
+                    // For multimodal content, we'll embed images/audio from separate fields
+                    if (!msg.content().empty()) {
+                        msg_json["content"] = msg.content();
+                    } else if (request->images_size() > 0 || request->audios_size() > 0) {
+                        // If no content but has images/audio, create content array
+                        json content_array = json::array();
+                        if (request->images_size() > 0) {
+                            for (int j = 0; j < request->images_size(); j++) {
+                                json image_chunk;
+                                image_chunk["type"] = "image_url";
+                                json image_url;
+                                image_url["url"] = "data:image/jpeg;base64," + request->images(j);
+                                image_chunk["image_url"] = image_url;
+                                content_array.push_back(image_chunk);
+                            }
+                        }
+                        if (request->audios_size() > 0) {
+                            for (int j = 0; j < request->audios_size(); j++) {
+                                json audio_chunk;
+                                audio_chunk["type"] = "input_audio";
+                                json input_audio;
+                                input_audio["data"] = request->audios(j);
+                                input_audio["format"] = "wav"; // default, could be made configurable
+                                audio_chunk["input_audio"] = input_audio;
+                                content_array.push_back(audio_chunk);
+                            }
+                        }
+                        msg_json["content"] = content_array;
+                    }
+                    
+                    // Add optional fields for OpenAI-compatible message format
+                    if (!msg.name().empty()) {
+                        msg_json["name"] = msg.name();
+                    }
+                    if (!msg.tool_call_id().empty()) {
+                        msg_json["tool_call_id"] = msg.tool_call_id();
+                    }
+                    if (!msg.reasoning_content().empty()) {
+                        msg_json["reasoning_content"] = msg.reasoning_content();
+                    }
+                    if (!msg.tool_calls().empty()) {
+                        // Parse tool_calls JSON string and add to message
+                        try {
+                            json tool_calls = json::parse(msg.tool_calls());
+                            msg_json["tool_calls"] = tool_calls;
+                        } catch (const json::parse_error& e) {
+                            SRV_WRN("Failed to parse tool_calls JSON: %s\n", e.what());
+                        }
+                    }
+                    
+                    messages_json.push_back(msg_json);
+                }
+
+                body_json["messages"] = messages_json;
+                body_json["stream"] = true; // PredictStream is always streaming
+                
+                // Check if grammar is provided from Go layer (NoGrammar=false)
+                // If grammar is provided, we must use it and NOT let template generate grammar from tools
+                // oaicompat_chat_params_parse throws an error if both grammar and tools are provided
+                bool has_grammar_from_go = data.contains("grammar") && 
+                    data["grammar"].is_string() && 
+                    !data["grammar"].get<std::string>().empty();
+                
+                // Copy other relevant fields from data that oaicompat_chat_params_parse expects
+                // Tools and tool_choice are only passed when NoGrammar is true (grammar not provided)
+                // When grammar is provided from Go layer, we use it instead of template-generated grammar
+                if (!has_grammar_from_go) {
+                    // NoGrammar=true: pass tools and let template generate grammar
+                    if (data.contains("tools")) {
+                        body_json["tools"] = data["tools"];
+                        std::string tools_str = data["tools"].dump();
+                        SRV_INF("Using tools from data (NoGrammar=true): %s\n", tools_str.c_str());
+                    } else {
+                        SRV_WRN("%s", "No tools found in data - tool calls will not work without tools field\n");
+                    }
+                    if (data.contains("tool_choice")) {
+                        // tool_choice can be a string or object, but oaicompat_chat_params_parse expects a string
+                        // Convert object tool_choice to "required" (since a specific function is requested)
+                        if (data["tool_choice"].is_string()) {
+                            body_json["tool_choice"] = data["tool_choice"].get<std::string>();
+                        } else if (data["tool_choice"].is_object()) {
+                            // Object tool_choice means a specific function is requested, use "required"
+                            body_json["tool_choice"] = "required";
+                            std::string tool_choice_obj_str = data["tool_choice"].dump();
+                            SRV_INF("Converted object tool_choice to 'required': %s\n", tool_choice_obj_str.c_str());
+                        } else {
+                            // Fallback: convert to string
+                            body_json["tool_choice"] = data["tool_choice"].dump();
+                        }
+                        std::string tool_choice_str = body_json["tool_choice"].get<std::string>();
+                        SRV_INF("Using tool_choice: %s\n", tool_choice_str.c_str());
+                    } else {
+                        // Default to "auto" if not specified
+                        body_json["tool_choice"] = "auto";
+                    }
+                } else {
+                    // Grammar is provided from Go layer (NoGrammar=false) - use it, don't pass tools
+                    SRV_INF("%s", "Grammar provided from Go layer - using it instead of template-generated grammar\n");
+                    // Grammar will be copied from data after parsing (it's already in data)
+                }
+                
+                if (data.contains("json_schema")) {
+                    body_json["json_schema"] = data["json_schema"];
+                }
+                // If grammar is provided from Go layer, copy it to body_json so it's preserved
+                // (though oaicompat_chat_params_parse may not use it if tools are present)
+                if (has_grammar_from_go) {
+                    body_json["grammar"] = data["grammar"];
+                }
+                if (data.contains("response_format")) {
+                    body_json["response_format"] = data["response_format"];
+                }
+                if (data.contains("chat_template_kwargs")) {
+                    body_json["chat_template_kwargs"] = data["chat_template_kwargs"];
+                }
+
+                // Use the same approach as server.cpp: call oaicompat_chat_params_parse
+                // This handles all template application, grammar merging, etc. automatically
+                // Files extracted from multimodal content in messages will be added to the files vector
+                // Create parser options with current chat_templates to ensure tmpls is not null
+                oaicompat_parser_options parser_opt = ctx_server.oai_parser_opt;
+                parser_opt.tmpls = ctx_server.chat_templates.get(); // Ensure tmpls is set to current chat_templates
+                json parsed_data = oaicompat_chat_params_parse(body_json, parser_opt, files);
+                
+                // Extract the prompt from parsed data
+                prompt_str = parsed_data.at("prompt").get<std::string>();
+                
+                // Preserve grammar from Go layer if it was provided (NoGrammar=false)
+                // Otherwise, use grammar from parsed_data (template-generated when NoGrammar=true)
+                json preserved_grammar;
+                if (has_grammar_from_go && data.contains("grammar")) {
+                    preserved_grammar = data["grammar"];
+                }
+                
+                // Merge all fields from parsed_data into data (grammar, grammar_triggers, preserved_tokens, etc.)
+                // This ensures all template-generated fields are included
+                for (const auto& item : parsed_data.items()) {
+                    if (item.key() != "prompt") { // Don't overwrite prompt_str, we already extracted it
+                        // If grammar was provided from Go layer, preserve it instead of template-generated grammar
+                        if (item.key() == "grammar" && has_grammar_from_go && !preserved_grammar.is_null()) {
+                            data["grammar"] = preserved_grammar;
+                        } else {
+                            data[item.key()] = item.value();
+                        }
+                    }
+                }
+            } else {
+                // Use prompt directly from data
+                if (data.contains("prompt") && data["prompt"].is_string()) {
+                    prompt_str = data["prompt"].get<std::string>();
+                } else {
+                    prompt_str = request->prompt();
+                }
+            }
+
+            const auto & prompt = prompt_str;
            const auto type = SERVER_TASK_TYPE_COMPLETION;
            // TODO: this log can become very long, put it behind a flag or think about a more compact format
            //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());

-            std::vector<raw_buffer> files;
-            const auto &images_data = data.find("image_data");
-            if (images_data != data.end() && images_data->is_array())
-            {
-                for (const auto &img : *images_data)
+            // If not using chat templates, extract files from image_data/audio_data fields
+            // (If using chat templates, files were already extracted by oaicompat_chat_params_parse)
+            //if (!request->usetokenizertemplate() || request->messages_size() == 0 || ctx_server.chat_templates == nullptr) {
+                const auto &images_data = data.find("image_data");
+                if (images_data != data.end() && images_data->is_array())
                {
-                    auto decoded_data = base64_decode(img["data"].get<std::string>());
-                    files.push_back(decoded_data);
+                    for (const auto &img : *images_data)
+                    {
+                        auto decoded_data = base64_decode(img["data"].get<std::string>());
+                        files.push_back(decoded_data);
+                    }
                }
-            }

-            const auto &audio_data = data.find("audio_data");
-            if (audio_data != data.end() && audio_data->is_array())
-            {
-                for (const auto &audio : *audio_data)
+                const auto &audio_data = data.find("audio_data");
+                if (audio_data != data.end() && audio_data->is_array())
                {
-                    auto decoded_data = base64_decode(audio["data"].get<std::string>());
-                    files.push_back(decoded_data);
+                    for (const auto &audio : *audio_data)
+                    {
+                        auto decoded_data = base64_decode(audio["data"].get<std::string>());
+                        files.push_back(decoded_data);
+                    }
                }
-            }
+           // }

            const bool has_mtmd = ctx_server.mctx != nullptr;

            // process prompt
            std::vector<server_tokens> inputs;
-            if (!prompt.is_string()) {
-                throw std::runtime_error("prompt must be a string");
-            }
-
            if (has_mtmd) {
                // multimodal
-                inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get<std::string>(), files));
+                inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt_str, files));
            } else {
                 // Everything else, including multimodal completions.
-                inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
+                inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt_str, true, true);
            }

            tasks.reserve(inputs.size());
@@ -644,52 +899,219 @@ public:
        try {
            std::vector<server_task> tasks;

-            const auto & prompt = data.at("prompt");
+            std::string prompt_str;
+            std::vector<raw_buffer> files; // Declare files early so it's accessible in both branches
+            // Handle chat templates when UseTokenizerTemplate is enabled and Messages are provided
+            if (request->usetokenizertemplate() && request->messages_size() > 0 && ctx_server.chat_templates != nullptr) {
+                // Convert proto Messages to JSON format compatible with oaicompat_chat_params_parse
+                json body_json;
+                json messages_json = json::array();
+                for (int i = 0; i < request->messages_size(); i++) {
+                    const auto& msg = request->messages(i);
+                    json msg_json;
+                    msg_json["role"] = msg.role();
+                    
+                    // Handle content - can be string, null, or array
+                    // For multimodal content, we'll embed images/audio from separate fields
+                    if (!msg.content().empty()) {
+                        msg_json["content"] = msg.content();
+                    } else if (request->images_size() > 0 || request->audios_size() > 0) {
+                        // If no content but has images/audio, create content array
+                        json content_array = json::array();
+                        if (request->images_size() > 0) {
+                            for (int j = 0; j < request->images_size(); j++) {
+                                json image_chunk;
+                                image_chunk["type"] = "image_url";
+                                json image_url;
+                                image_url["url"] = "data:image/jpeg;base64," + request->images(j);
+                                image_chunk["image_url"] = image_url;
+                                content_array.push_back(image_chunk);
+                            }
+                        }
+                        if (request->audios_size() > 0) {
+                            for (int j = 0; j < request->audios_size(); j++) {
+                                json audio_chunk;
+                                audio_chunk["type"] = "input_audio";
+                                json input_audio;
+                                input_audio["data"] = request->audios(j);
+                                input_audio["format"] = "wav"; // default, could be made configurable
+                                audio_chunk["input_audio"] = input_audio;
+                                content_array.push_back(audio_chunk);
+                            }
+                        }
+                        msg_json["content"] = content_array;
+                    } else if (!msg.tool_calls().empty()) {
+                        // Tool call messages may have null content
+                        msg_json["content"] = json();
+                    }
+                    
+                    // Add optional fields for OpenAI-compatible message format
+                    if (!msg.name().empty()) {
+                        msg_json["name"] = msg.name();
+                    }
+                    if (!msg.tool_call_id().empty()) {
+                        msg_json["tool_call_id"] = msg.tool_call_id();
+                    }
+                    if (!msg.reasoning_content().empty()) {
+                        msg_json["reasoning_content"] = msg.reasoning_content();
+                    }
+                    if (!msg.tool_calls().empty()) {
+                        // Parse tool_calls JSON string and add to message
+                        try {
+                            json tool_calls = json::parse(msg.tool_calls());
+                            msg_json["tool_calls"] = tool_calls;
+                        } catch (const json::parse_error& e) {
+                            SRV_WRN("Failed to parse tool_calls JSON: %s\n", e.what());
+                        }
+                    }
+                    
+                    messages_json.push_back(msg_json);
+                }
+
+                body_json["messages"] = messages_json;
+                body_json["stream"] = false;
+                
+                // Check if grammar is provided from Go layer (NoGrammar=false)
+                // If grammar is provided, we must use it and NOT let template generate grammar from tools
+                // oaicompat_chat_params_parse throws an error if both grammar and tools are provided
+                bool has_grammar_from_go = data.contains("grammar") && 
+                    data["grammar"].is_string() && 
+                    !data["grammar"].get<std::string>().empty();
+                
+                // Copy other relevant fields from data that oaicompat_chat_params_parse expects
+                // Tools and tool_choice are only passed when NoGrammar is true (grammar not provided)
+                // When grammar is provided from Go layer, we use it instead of template-generated grammar
+                if (!has_grammar_from_go) {
+                    // NoGrammar=true: pass tools and let template generate grammar
+                    if (data.contains("tools")) {
+                        body_json["tools"] = data["tools"];
+                        std::string tools_str = data["tools"].dump();
+                        SRV_INF("Using tools from data (NoGrammar=true): %s\n", tools_str.c_str());
+                    } else {
+                        SRV_WRN("%s", "No tools found in data - tool calls will not work without tools field\n");
+                    }
+                    if (data.contains("tool_choice")) {
+                        // tool_choice can be a string or object, but oaicompat_chat_params_parse expects a string
+                        // Convert object tool_choice to "required" (since a specific function is requested)
+                        if (data["tool_choice"].is_string()) {
+                            body_json["tool_choice"] = data["tool_choice"].get<std::string>();
+                        } else if (data["tool_choice"].is_object()) {
+                            // Object tool_choice means a specific function is requested, use "required"
+                            body_json["tool_choice"] = "required";
+                            std::string tool_choice_obj_str = data["tool_choice"].dump();
+                            SRV_INF("Converted object tool_choice to 'required': %s\n", tool_choice_obj_str.c_str());
+                        } else {
+                            // Fallback: convert to string
+                            body_json["tool_choice"] = data["tool_choice"].dump();
+                        }
+                        std::string tool_choice_str = body_json["tool_choice"].get<std::string>();
+                        SRV_INF("Using tool_choice: %s\n", tool_choice_str.c_str());
+                    } else {
+                        // Default to "auto" if not specified
+                        body_json["tool_choice"] = "auto";
+                    }
+                } else {
+                    // Grammar is provided from Go layer (NoGrammar=false) - use it, don't pass tools
+                    SRV_INF("%s", "Grammar provided from Go layer - using it instead of template-generated grammar\n");
+                    // Grammar will be copied from data after parsing (it's already in data)
+                }
+                
+                if (data.contains("json_schema")) {
+                    body_json["json_schema"] = data["json_schema"];
+                }
+                // If grammar is provided from Go layer, copy it to body_json so it's preserved
+                // (tools are not forwarded in that case, so the parser never receives both grammar and tools)
+                if (has_grammar_from_go) {
+                    body_json["grammar"] = data["grammar"];
+                }
+                if (data.contains("response_format")) {
+                    body_json["response_format"] = data["response_format"];
+                }
+                if (data.contains("chat_template_kwargs")) {
+                    body_json["chat_template_kwargs"] = data["chat_template_kwargs"];
+                }
+
+                // Use the same approach as server.cpp: call oaicompat_chat_params_parse
+                // This handles all template application, grammar merging, etc. automatically
+                // Files extracted from multimodal content in messages will be added to the files vector
+                // Create parser options with current chat_templates to ensure tmpls is not null
+                oaicompat_parser_options parser_opt = ctx_server.oai_parser_opt;
+                parser_opt.tmpls = ctx_server.chat_templates.get(); // Ensure tmpls is set to current chat_templates
+                json parsed_data = oaicompat_chat_params_parse(body_json, parser_opt, files);
+                
+                // Extract the prompt from parsed data
+                prompt_str = parsed_data.at("prompt").get<std::string>();
+                
+                // Preserve grammar from Go layer if it was provided (NoGrammar=false)
+                // Otherwise, use grammar from parsed_data (template-generated when NoGrammar=true)
+                json preserved_grammar;
+                if (has_grammar_from_go && data.contains("grammar")) {
+                    preserved_grammar = data["grammar"];
+                }
+                
+                // Merge all fields from parsed_data into data (grammar, grammar_triggers, preserved_tokens, etc.)
+                // This ensures all template-generated fields are included
+                for (const auto& item : parsed_data.items()) {
+                    if (item.key() != "prompt") { // Don't overwrite prompt_str, we already extracted it
+                        // If grammar was provided from Go layer, preserve it instead of template-generated grammar
+                        if (item.key() == "grammar" && has_grammar_from_go && !preserved_grammar.is_null()) {
+                            data["grammar"] = preserved_grammar;
+                        } else {
+                            data[item.key()] = item.value();
+                        }
+                    }
+                }
+            } else {
+                // Use prompt directly from data
+                if (data.contains("prompt") && data["prompt"].is_string()) {
+                    prompt_str = data["prompt"].get<std::string>();
+                } else {
+                    prompt_str = request->prompt();
+                }
+            }
+
+            const auto & prompt = prompt_str;
            const auto type = SERVER_TASK_TYPE_COMPLETION;
            // TODO: this log can become very long, put it behind a flag or think about a more compact format
            //SRV_DBG("Prompt: %s\n", prompt.is_string() ? prompt.get<std::string>().c_str() : prompt.dump(2).c_str());

-            std::vector<raw_buffer> files;
-            const auto &images_data = data.find("image_data");
-           // std::cout << "[PREDICT] Images data: " << images_data->dump(2) << std::endl;
-           
-            if (images_data != data.end() && images_data->is_array())
-            {
-                std::cout << "[PREDICT] Processing " << images_data->size() << " images" << std::endl;
-                for (const auto &img : *images_data)
+            // Extract files from the image_data/audio_data fields of the request data.
+            // NOTE: the guard below is commented out, so this also runs when chat templates are used and files may already have been added by oaicompat_chat_params_parse
+           // if (!request->usetokenizertemplate() || request->messages_size() == 0 || ctx_server.chat_templates == nullptr) {
+                const auto &images_data = data.find("image_data");
+                if (images_data != data.end() && images_data->is_array())
                {
-                    std::cout << "[PREDICT] Processing image" << std::endl;
-                    auto decoded_data = base64_decode(img["data"].get<std::string>());
-                    files.push_back(decoded_data);
+                    std::cout << "[PREDICT] Processing " << images_data->size() << " images" << std::endl;
+                    for (const auto &img : *images_data)
+                    {
+                        std::cout << "[PREDICT] Processing image" << std::endl;
+                        auto decoded_data = base64_decode(img["data"].get<std::string>());
+                        files.push_back(decoded_data);
+                    }
                }
-            }

-            const auto &audio_data = data.find("audio_data");
-            if (audio_data != data.end() && audio_data->is_array())
-            {
-                for (const auto &audio : *audio_data)
+                const auto &audio_data = data.find("audio_data");
+                if (audio_data != data.end() && audio_data->is_array())
                {
-                    auto decoded_data = base64_decode(audio["data"].get<std::string>());
-                    files.push_back(decoded_data);
+                    for (const auto &audio : *audio_data)
+                    {
+                        auto decoded_data = base64_decode(audio["data"].get<std::string>());
+                        files.push_back(decoded_data);
+                    }
                }
-            }
+           // }

            // process files
            const bool has_mtmd = ctx_server.mctx != nullptr;

            // process prompt
            std::vector<server_tokens> inputs;
-            if (!prompt.is_string()) {
-                std::cout << "[PREDICT] Prompt must be a string" << std::endl;
-                throw std::runtime_error("prompt must be a string");
-            }
-
            if (has_mtmd) {
                // multimodal
-                inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get<std::string>(), files));
+                inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt_str, files));
            } else {
                 // Everything else, including multimodal completions.
-                inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true);
+                inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt_str, true, true);
            }

            tasks.reserve(inputs.size());