diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile index 865f9481f..b3a27fd4b 100644 --- a/backend/cpp/llama-cpp/Makefile +++ b/backend/cpp/llama-cpp/Makefile @@ -1,5 +1,5 @@ -LLAMA_VERSION?=a9883db8ee021cf16783016a60996d41820b5195 +LLAMA_VERSION?=7f3f843c31cd32dc4adc10b393342dfee071c332 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp CMAKE_ARGS?= diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 5c1d07766..6c4fa6946 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -450,6 +451,8 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt // vector; the turboquant fork still uses the legacy scalar. The // LOCALAI_LEGACY_LLAMA_CPP_SPEC macro is injected by // backend/cpp/turboquant/patch-grpc-server.sh for fork builds only. + // Upstream renamed COMMON_SPECULATIVE_TYPE_DRAFT -> ..._DRAFT_SIMPLE + // in ggml-org/llama.cpp#22964; the fork still uses the old name. #ifdef LOCALAI_LEGACY_LLAMA_CPP_SPEC if (params.speculative.type == COMMON_SPECULATIVE_TYPE_NONE) { params.speculative.type = COMMON_SPECULATIVE_TYPE_DRAFT; @@ -458,7 +461,7 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt const bool no_spec_type = params.speculative.types.empty() || (params.speculative.types.size() == 1 && params.speculative.types[0] == COMMON_SPECULATIVE_TYPE_NONE); if (no_spec_type) { - params.speculative.types = { COMMON_SPECULATIVE_TYPE_DRAFT }; + params.speculative.types = { COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE }; } #endif } @@ -701,16 +704,27 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt // Upstream switched to a vector of types (comma-separated for multi-type // chaining via common_speculative_types_from_names). We keep accepting a // single value here, but also tolerate comma-separated lists. + // + // ggml-org/llama.cpp#22964 also renamed the registered names from + // underscore- to dash-separated form, and replaced the bare + // `draft`/`eagle3` aliases with `draft-simple`/`draft-eagle3`. We + // normalize each token here so existing model configs keep working. + auto normalize_spec_name = [](std::string s) -> std::string { + std::replace(s.begin(), s.end(), '_', '-'); + if (s == "draft") return "draft-simple"; + if (s == "eagle3") return "draft-eagle3"; + return s; + }; std::vector names; std::string item; for (char c : optval_str) { if (c == ',') { - if (!item.empty()) { names.push_back(item); item.clear(); } + if (!item.empty()) { names.push_back(normalize_spec_name(item)); item.clear(); } } else { item.push_back(c); } } - if (!item.empty()) names.push_back(item); + if (!item.empty()) names.push_back(normalize_spec_name(item)); auto parsed = common_speculative_types_from_names(names); if (!parsed.empty()) { params.speculative.types = parsed;