diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile index ee164b9e4..d509ddacd 100644 --- a/backend/cpp/llama-cpp/Makefile +++ b/backend/cpp/llama-cpp/Makefile @@ -1,5 +1,5 @@ -LLAMA_VERSION?=665abc609740d397d30c0d8ef4157dbf900bd1a3 +LLAMA_VERSION?=d77599234ea6e498775aeadbce665eece5bd98cd LLAMA_REPO?=https://github.com/ggerganov/llama.cpp CMAKE_ARGS?= diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 6bf144e91..df3d075e7 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -442,7 +442,7 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt // Draft model for speculative decoding if (!request->draftmodel().empty()) { - params.speculative.mparams_dft.path = request->draftmodel(); + params.speculative.draft.mparams.path = request->draftmodel(); // Default to draft type if a draft model is set but no explicit type if (params.speculative.type == COMMON_SPECULATIVE_TYPE_NONE) { params.speculative.type = COMMON_SPECULATIVE_TYPE_DRAFT; @@ -679,39 +679,39 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt } } else if (!strcmp(optname, "spec_n_max") || !strcmp(optname, "draft_max")) { if (optval != NULL) { - try { params.speculative.n_max = std::stoi(optval_str); } catch (...) {} + try { params.speculative.draft.n_max = std::stoi(optval_str); } catch (...) {} } } else if (!strcmp(optname, "spec_n_min") || !strcmp(optname, "draft_min")) { if (optval != NULL) { - try { params.speculative.n_min = std::stoi(optval_str); } catch (...) {} + try { params.speculative.draft.n_min = std::stoi(optval_str); } catch (...) {} } } else if (!strcmp(optname, "spec_p_min") || !strcmp(optname, "draft_p_min")) { if (optval != NULL) { - try { params.speculative.p_min = std::stof(optval_str); } catch (...) {} + try { params.speculative.draft.p_min = std::stof(optval_str); } catch (...) {} } } else if (!strcmp(optname, "spec_p_split")) { if (optval != NULL) { - try { params.speculative.p_split = std::stof(optval_str); } catch (...) {} + try { params.speculative.draft.p_split = std::stof(optval_str); } catch (...) {} } } else if (!strcmp(optname, "spec_ngram_size_n") || !strcmp(optname, "ngram_size_n")) { if (optval != NULL) { - try { params.speculative.ngram_size_n = (uint16_t)std::stoi(optval_str); } catch (...) {} + try { params.speculative.ngram_simple.size_n = (uint16_t)std::stoi(optval_str); } catch (...) {} } } else if (!strcmp(optname, "spec_ngram_size_m") || !strcmp(optname, "ngram_size_m")) { if (optval != NULL) { - try { params.speculative.ngram_size_m = (uint16_t)std::stoi(optval_str); } catch (...) {} + try { params.speculative.ngram_simple.size_m = (uint16_t)std::stoi(optval_str); } catch (...) {} } } else if (!strcmp(optname, "spec_ngram_min_hits") || !strcmp(optname, "ngram_min_hits")) { if (optval != NULL) { - try { params.speculative.ngram_min_hits = (uint16_t)std::stoi(optval_str); } catch (...) {} + try { params.speculative.ngram_simple.min_hits = (uint16_t)std::stoi(optval_str); } catch (...) {} } } else if (!strcmp(optname, "draft_gpu_layers")) { if (optval != NULL) { - try { params.speculative.n_gpu_layers = std::stoi(optval_str); } catch (...) {} + try { params.speculative.draft.n_gpu_layers = std::stoi(optval_str); } catch (...) {} } } else if (!strcmp(optname, "draft_ctx_size")) { if (optval != NULL) { - try { params.speculative.n_ctx = std::stoi(optval_str); } catch (...) {} + try { params.speculative.draft.n_ctx = std::stoi(optval_str); } catch (...) {} } } } @@ -933,8 +933,8 @@ public: if (!params.mmproj.path.empty()) { error_msg += " (with mmproj: " + params.mmproj.path + ")"; } - if (params.speculative.has_dft() && !params.speculative.mparams_dft.path.empty()) { - error_msg += " (with draft model: " + params.speculative.mparams_dft.path + ")"; + if (params.speculative.has_dft() && !params.speculative.draft.mparams.path.empty()) { + error_msg += " (with draft model: " + params.speculative.draft.mparams.path + ")"; } // Add captured error details if available diff --git a/backend/cpp/turboquant/patch-grpc-server.sh b/backend/cpp/turboquant/patch-grpc-server.sh index c3dd967a0..a4c2df62c 100755 --- a/backend/cpp/turboquant/patch-grpc-server.sh +++ b/backend/cpp/turboquant/patch-grpc-server.sh @@ -1,6 +1,6 @@ #!/bin/bash # Patch the shared backend/cpp/llama-cpp/grpc-server.cpp *copy* used by the -# turboquant build to account for two gaps between upstream and the fork: +# turboquant build to account for the gaps between upstream and the fork: # # 1. Augment the kv_cache_types[] allow-list so `LoadModel` accepts the # fork-specific `turbo2` / `turbo3` / `turbo4` cache types. @@ -11,6 +11,14 @@ # "<__media__>", and Go-side tooling falls back to that sentinel when the # backend does not expose media_marker, so substituting the literal keeps # behavior identical on the turboquant path. +# 3. Revert the `common_params_speculative` field references to the +# pre-refactor flat layout. Upstream ggml-org/llama.cpp#22397 split the +# struct into nested `draft` / `ngram_simple` / `ngram_mod` / etc. members; +# the turboquant fork branched before that PR and still exposes the flat +# `n_max`, `mparams_dft`, `ngram_size_n`, ... fields. The substitutions +# below map the new nested paths back to the legacy flat names so the +# shared grpc-server.cpp keeps compiling against the fork's common.h. +# Drop this block once the fork rebases past #22397. # # We patch the *copy* sitting in turboquant--build/, never the original # under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps compiling @@ -77,4 +85,27 @@ else echo "==> $SRC has no get_media_marker() call, skipping media-marker patch" fi +if grep -q 'params\.speculative\.draft\.\|params\.speculative\.ngram_simple\.' "$SRC"; then + echo "==> patching $SRC to revert common_params_speculative refs to pre-#22397 flat layout" + # Each substitution is the exact post-refactor path → legacy flat field. + # Order doesn't matter because the source paths are disjoint, but we keep + # the most-specific (mparams.path) first for readability. + sed -E \ + -e 's/params\.speculative\.draft\.mparams\.path/params.speculative.mparams_dft.path/g' \ + -e 's/params\.speculative\.draft\.n_max/params.speculative.n_max/g' \ + -e 's/params\.speculative\.draft\.n_min/params.speculative.n_min/g' \ + -e 's/params\.speculative\.draft\.p_min/params.speculative.p_min/g' \ + -e 's/params\.speculative\.draft\.p_split/params.speculative.p_split/g' \ + -e 's/params\.speculative\.draft\.n_gpu_layers/params.speculative.n_gpu_layers/g' \ + -e 's/params\.speculative\.draft\.n_ctx/params.speculative.n_ctx/g' \ + -e 's/params\.speculative\.ngram_simple\.size_n/params.speculative.ngram_size_n/g' \ + -e 's/params\.speculative\.ngram_simple\.size_m/params.speculative.ngram_size_m/g' \ + -e 's/params\.speculative\.ngram_simple\.min_hits/params.speculative.ngram_min_hits/g' \ + "$SRC" > "$SRC.tmp" + mv "$SRC.tmp" "$SRC" + echo "==> speculative field rename OK" +else + echo "==> $SRC has no post-#22397 speculative field refs, skipping spec rename patch" +fi + echo "==> all patches applied"