diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index a26d38626..5292ef0f6 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -1261,6 +1261,42 @@ public:
             body_json["add_generation_prompt"] = data["add_generation_prompt"];
         }
 
+        // Pass sampling parameters to body_json so oaicompat_chat_params_parse respects them
+        // and doesn't overwrite them with defaults in the returned parsed_data
+        if (data.contains("n_predict")) {
+            body_json["max_tokens"] = data["n_predict"];
+        }
+        if (data.contains("ignore_eos")) {
+            body_json["ignore_eos"] = data["ignore_eos"];
+        }
+        if (data.contains("stop")) {
+            body_json["stop"] = data["stop"];
+        }
+        if (data.contains("temperature")) {
+            body_json["temperature"] = data["temperature"];
+        }
+        if (data.contains("top_p")) {
+            body_json["top_p"] = data["top_p"];
+        }
+        if (data.contains("frequency_penalty")) {
+            body_json["frequency_penalty"] = data["frequency_penalty"];
+        }
+        if (data.contains("presence_penalty")) {
+            body_json["presence_penalty"] = data["presence_penalty"];
+        }
+        if (data.contains("seed")) {
+            body_json["seed"] = data["seed"];
+        }
+        if (data.contains("logit_bias")) {
+            body_json["logit_bias"] = data["logit_bias"];
+        }
+        if (data.contains("top_k")) {
+            body_json["top_k"] = data["top_k"];
+        }
+        if (data.contains("min_p")) {
+            body_json["min_p"] = data["min_p"];
+        }
+
         // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)
         SRV_DBG("[CONVERSATION DEBUG] PredictStream: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str());
@@ -1992,6 +2028,42 @@ public:
             body_json["add_generation_prompt"] = data["add_generation_prompt"];
         }
 
+        // Pass sampling parameters to body_json so oaicompat_chat_params_parse respects them
+        // and doesn't overwrite them with defaults in the returned parsed_data
+        if (data.contains("n_predict")) {
+            body_json["max_tokens"] = data["n_predict"];
+        }
+        if (data.contains("ignore_eos")) {
+            body_json["ignore_eos"] = data["ignore_eos"];
+        }
+        if (data.contains("stop")) {
+            body_json["stop"] = data["stop"];
+        }
+        if (data.contains("temperature")) {
+            body_json["temperature"] = data["temperature"];
+        }
+        if (data.contains("top_p")) {
+            body_json["top_p"] = data["top_p"];
+        }
+        if (data.contains("frequency_penalty")) {
+            body_json["frequency_penalty"] = data["frequency_penalty"];
+        }
+        if (data.contains("presence_penalty")) {
+            body_json["presence_penalty"] = data["presence_penalty"];
+        }
+        if (data.contains("seed")) {
+            body_json["seed"] = data["seed"];
+        }
+        if (data.contains("logit_bias")) {
+            body_json["logit_bias"] = data["logit_bias"];
+        }
+        if (data.contains("top_k")) {
+            body_json["top_k"] = data["top_k"];
+        }
+        if (data.contains("min_p")) {
+            body_json["min_p"] = data["min_p"];
+        }
+
         // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)
         SRV_DBG("[CONVERSATION DEBUG] Predict: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str());