From 1c92b00918898fdd94c9c79f4836b325c12cfa80 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Thu, 28 May 2026 17:37:54 +0200 Subject: [PATCH] fix(turboquant): guard upstream-only grpc-server fields for fork (#10043) fix(turboquant): guard upstream-only grpc-server fields for fork build backend/cpp/llama-cpp/grpc-server.cpp is reused by the turboquant build, which compiles against an older llama.cpp fork (TheTom/llama-cpp-turboquant). Two recent changes added references to upstream-only struct fields outside the existing LOCALAI_LEGACY_LLAMA_CPP_SPEC guards: - common_params::checkpoint_min_step (default + option handler), added with the ggml-org/llama.cpp 35c9b1f3 bump (#9998) - the common_params_speculative::draft tensor_buft_overrides sentinel termination (#9919), which sat after the guard's #endif The fork has neither field, so grpc-server.cpp failed to compile for every turboquant flavor. Wrap the three references in #ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC, matching the existing fork-compat guards, so the stock llama-cpp build is unchanged and the fork build skips them. Update patch-grpc-server.sh's doc comment to record what the macro now gates out. Verified by a local fallback-flavor turboquant build: grpc-server.cpp compiles against the fork and the backend image builds. Assisted-by: Claude:claude-opus-4-7 [Claude Code] Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- backend/cpp/llama-cpp/grpc-server.cpp | 20 +++++++++++++++++++- backend/cpp/turboquant/patch-grpc-server.sh | 7 +++++-- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 3681a21dc..f8dd48f5a 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -573,8 +573,12 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt // checkpoint_min_step: minimum spacing between context checkpoints in // tokens (0 disables the minimum). Match upstream's default (256). This // field was renamed from `checkpoint_every_nt` in llama.cpp; the semantics - // also shifted from a fixed cadence to a minimum spacing. + // also shifted from a fixed cadence to a minimum spacing. The turboquant + // fork branched before the field existed, so skip it on the legacy path + // (LOCALAI_LEGACY_LLAMA_CPP_SPEC is injected by patch-grpc-server.sh). +#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC params.checkpoint_min_step = 256; +#endif // decode options. Options are in form optname:optvale, or if booleans only optname. for (int i = 0; i < request->options_size(); i++) { @@ -748,11 +752,18 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt params.cache_idle_slots = false; } +#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC // --- minimum context-checkpoint spacing (upstream -cms / --checkpoint-min-step) --- // 0 disables the minimum-spacing gate. Old option names (`checkpoint_every_nt`, // `checkpoint_every_n_tokens`) are kept as aliases for backward compatibility // with existing user configs: upstream renamed the field and shifted its // semantics from a fixed cadence to a minimum spacing. + // + // Gated out for the turboquant fork, which lacks common_params:: + // checkpoint_min_step. The leading `}` closing the cache_idle_slots + // branch is removed with this block; the next `} else if` (n_ubatch) + // then closes cache_idle_slots, so braces stay balanced under both + // preprocessor branches. } else if (!strcmp(optname, "checkpoint_min_step") || !strcmp(optname, "checkpoint_min_spacing") || !strcmp(optname, "checkpoint_every_nt") || !strcmp(optname, "checkpoint_every_n_tokens")) { if (optval != NULL) { @@ -762,6 +773,7 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt // If conversion fails, keep default value (256) } } +#endif // --- physical batch size (upstream -ub / --ubatch-size) --- // Note: line ~482 already aliases n_ubatch to n_batch as a default; this @@ -1165,9 +1177,15 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt params.tensor_buft_overrides.push_back({nullptr, nullptr}); } } + // The draft tensor_buft_overrides are only populated under the modern + // (post-#22838) layout, whose population code is itself gated by + // LOCALAI_LEGACY_LLAMA_CPP_SPEC above. The turboquant fork lacks + // common_params_speculative::draft entirely, so skip the sentinel there too. +#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC if (!params.speculative.draft.tensor_buft_overrides.empty()) { params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr}); } +#endif // TODO: Add yarn diff --git a/backend/cpp/turboquant/patch-grpc-server.sh b/backend/cpp/turboquant/patch-grpc-server.sh index d071c6156..b1a62b215 100755 --- a/backend/cpp/turboquant/patch-grpc-server.sh +++ b/backend/cpp/turboquant/patch-grpc-server.sh @@ -124,8 +124,11 @@ fi # 5. Define LOCALAI_LEGACY_LLAMA_CPP_SPEC at the top of the file so the # grpc-server option parser skips the new option-handler blocks (ngram_mod, # ngram_map_k, ngram_map_k4v, ngram_cache, draft.cache_type_*, draft.cpuparams*, -# draft.tensor_buft_overrides) introduced for the post-#22838 layout. Those -# blocks reference struct fields that simply do not exist in the fork. +# draft.tensor_buft_overrides) introduced for the post-#22838 layout, the +# draft.tensor_buft_overrides sentinel termination, and the +# common_params::checkpoint_min_step default/option (added with the +# 35c9b1f3 bump). Those blocks reference struct fields that simply do not +# exist in the fork. if grep -q '^#define LOCALAI_LEGACY_LLAMA_CPP_SPEC' "$SRC"; then echo "==> $SRC already defines LOCALAI_LEGACY_LLAMA_CPP_SPEC, skipping" else