From d11a152ad3fe93d72efe00d104f7a49d8f2b3ab5 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 6 Jun 2026 18:25:39 +0000 Subject: [PATCH] fix(turboquant): drop obsolete legacy-spec shim after fork rebased The TheTom/llama-cpp-turboquant fork (pin c9aa86a) rebased past the upstream common_params_speculative refactor (ggml-org/llama.cpp #22397/#22838/#22964), the model_tgt rename (#22838) and get_media_marker (#21962). The old fork-compat shim forced now-wrong legacy code paths, breaking the build with errors like 'struct common_params_speculative has no member named mparams_dft / type' and 'server_context_impl has no member named model'. Remove the obsolete LOCALAI_LEGACY_LLAMA_CPP_SPEC branches from the shared grpc-server.cpp (stock llama-cpp and the modern fork both take the modern path now), and narrow the one remaining gap (the fork still lacks common_params::checkpoint_min_step) to a dedicated LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP guard injected by patch-grpc-server.sh. The patch script now only adds the turbo2/3/4 KV-cache types and injects that one macro. Signed-off-by: Ettore Di Giacinto Assisted-by: Claude:claude-opus-4-8 [Claude Code] --- backend/cpp/llama-cpp/grpc-server.cpp | 59 ++--------- backend/cpp/turboquant/patch-grpc-server.sh | 106 +++++--------------- 2 files changed, 35 insertions(+), 130 deletions(-) diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index ac5521bc4..90a5477a9 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -482,23 +482,13 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt if (!request->draftmodel().empty()) { params.speculative.draft.mparams.path = request->draftmodel(); // Default to draft type if a draft model is set but no explicit type. - // Upstream (post ggml-org/llama.cpp#22838) made the speculative type a - // vector; the turboquant fork still uses the legacy scalar. The - // LOCALAI_LEGACY_LLAMA_CPP_SPEC macro is injected by - // backend/cpp/turboquant/patch-grpc-server.sh for fork builds only. - // Upstream renamed COMMON_SPECULATIVE_TYPE_DRAFT -> ..._DRAFT_SIMPLE - // in ggml-org/llama.cpp#22964; the fork still uses the old name. -#ifdef LOCALAI_LEGACY_LLAMA_CPP_SPEC - if (params.speculative.type == COMMON_SPECULATIVE_TYPE_NONE) { - params.speculative.type = COMMON_SPECULATIVE_TYPE_DRAFT; - } -#else + // Upstream made the speculative type a vector (ggml-org/llama.cpp#22838) + // and renamed COMMON_SPECULATIVE_TYPE_DRAFT -> ..._DRAFT_SIMPLE (#22964). const bool no_spec_type = params.speculative.types.empty() || (params.speculative.types.size() == 1 && params.speculative.types[0] == COMMON_SPECULATIVE_TYPE_NONE); if (no_spec_type) { params.speculative.types = { COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE }; } -#endif } // params.model_alias ?? @@ -574,9 +564,10 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt // tokens (0 disables the minimum). Match upstream's default (256). This // field was renamed from `checkpoint_every_nt` in llama.cpp; the semantics // also shifted from a fixed cadence to a minimum spacing. The turboquant - // fork branched before the field existed, so skip it on the legacy path - // (LOCALAI_LEGACY_LLAMA_CPP_SPEC is injected by patch-grpc-server.sh). -#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC + // fork still lacks common_params::checkpoint_min_step, so skip it there + // (LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP is injected by + // backend/cpp/turboquant/patch-grpc-server.sh). +#ifndef LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP params.checkpoint_min_step = 256; #endif @@ -752,7 +743,7 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt params.cache_idle_slots = false; } -#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC +#ifndef LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP // --- minimum context-checkpoint spacing (upstream -cms / --checkpoint-min-step) --- // 0 disables the minimum-spacing gate. Old option names (`checkpoint_every_nt`, // `checkpoint_every_n_tokens`) are kept as aliases for backward compatibility @@ -906,17 +897,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt // Speculative decoding options } else if (!strcmp(optname, "spec_type") || !strcmp(optname, "speculative_type")) { -#ifdef LOCALAI_LEGACY_LLAMA_CPP_SPEC - // Fork only knows a single scalar `type`. Take the first comma- - // separated value and assign it via the singular helper. - std::string first = optval_str; - const auto comma = first.find(','); - if (comma != std::string::npos) first = first.substr(0, comma); - auto type = common_speculative_type_from_name(first); - if (type != COMMON_SPECULATIVE_TYPE_COUNT) { - params.speculative.type = type; - } -#else // Upstream switched to a vector of types (comma-separated for multi-type // chaining via common_speculative_types_from_names). We keep accepting a // single value here, but also tolerate comma-separated lists. @@ -945,7 +925,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt if (!parsed.empty()) { params.speculative.types = parsed; } -#endif } else if (!strcmp(optname, "spec_n_max") || !strcmp(optname, "draft_max")) { if (optval != NULL) { try { params.speculative.draft.n_max = std::stoi(optval_str); } catch (...) {} @@ -983,21 +962,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt // shares the target context size. Accept the option for backward // compatibility but silently ignore it. -// Everything below relies on struct shape introduced in ggml-org/llama.cpp#22838 -// (parallel drafting): `ngram_mod`, `ngram_map_k`, `ngram_map_k4v`, -// `ngram_cache`, and the `draft.{cache_type_*, cpuparams*, tensor_buft_overrides}` -// fields. The turboquant fork branched before that, so its build defines -// LOCALAI_LEGACY_LLAMA_CPP_SPEC via patch-grpc-server.sh and these option -// keys become unrecognized (silently dropped, like any unknown opt) for it. -// -// The `#ifdef LOCALAI_LEGACY_LLAMA_CPP_SPEC` / `#else` split below sits at the -// closing-brace position of the `draft_ctx_size` branch on purpose: in the -// legacy build the chain ends here (the brace closes draft_ctx_size), and in -// the modern build the chain continues with `} else if (...)` instead, so the -// brace count stays balanced under both branches of the preprocessor. -#ifdef LOCALAI_LEGACY_LLAMA_CPP_SPEC - } -#else // --- ngram_mod family (upstream --spec-ngram-mod-*) --- } else if (!strcmp(optname, "spec_ngram_mod_n_min")) { if (optval != NULL) { @@ -1127,7 +1091,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt } if (!cur.empty()) flush(cur); } -#endif // LOCALAI_LEGACY_LLAMA_CPP_SPEC — closes the `else`/`#ifdef` opened at draft_ctx_size } // Set params.n_parallel from environment variable if not set via options (fallback) @@ -1177,15 +1140,11 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt params.tensor_buft_overrides.push_back({nullptr, nullptr}); } } - // The draft tensor_buft_overrides are only populated under the modern - // (post-#22838) layout, whose population code is itself gated by - // LOCALAI_LEGACY_LLAMA_CPP_SPEC above. The turboquant fork lacks - // common_params_speculative::draft entirely, so skip the sentinel there too. -#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC + // Terminate the draft tensor_buft_overrides list with a sentinel, mirroring + // the main-model handling above. if (!params.speculative.draft.tensor_buft_overrides.empty()) { params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr}); } -#endif // TODO: Add yarn diff --git a/backend/cpp/turboquant/patch-grpc-server.sh b/backend/cpp/turboquant/patch-grpc-server.sh index b1a62b215..fa11897dd 100755 --- a/backend/cpp/turboquant/patch-grpc-server.sh +++ b/backend/cpp/turboquant/patch-grpc-server.sh @@ -4,21 +4,19 @@ # # 1. Augment the kv_cache_types[] allow-list so `LoadModel` accepts the # fork-specific `turbo2` / `turbo3` / `turbo4` cache types. -# 2. Replace `get_media_marker()` (added upstream in ggml-org/llama.cpp#21962, -# server-side random per-instance marker) with the legacy "<__media__>" -# literal. The fork branched before that PR, so server-common.cpp has no -# get_media_marker symbol. The fork's mtmd_default_marker() still returns -# "<__media__>", and Go-side tooling falls back to that sentinel when the -# backend does not expose media_marker, so substituting the literal keeps -# behavior identical on the turboquant path. -# 3. Revert the `common_params_speculative` field references to the -# pre-refactor flat layout. Upstream ggml-org/llama.cpp#22397 split the -# struct into nested `draft` / `ngram_simple` / `ngram_mod` / etc. members; -# the turboquant fork branched before that PR and still exposes the flat -# `n_max`, `mparams_dft`, `ngram_size_n`, ... fields. The substitutions -# below map the new nested paths back to the legacy flat names so the -# shared grpc-server.cpp keeps compiling against the fork's common.h. -# Drop this block once the fork rebases past #22397. +# 2. Define LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP at the top of the file +# so the grpc-server option parser skips the two references to +# common_params::checkpoint_min_step (the default and the option handler). +# That field does not exist in the fork yet; drop this once it does. +# +# The fork used to lag upstream on the whole common_params_speculative refactor +# (ggml-org/llama.cpp#22397/#22838/#22964), the model_tgt rename (#22838) and +# get_media_marker (#21962), which required a much larger compat shim here +# (flat-field sed renames + a coarse LOCALAI_LEGACY_LLAMA_CPP_SPEC define). The +# fork has since rebased past all of those, so the only remaining gap is +# checkpoint_min_step. If a future bump reintroduces a divergence, add a narrow +# guard in grpc-server.cpp keyed on a fork-specific macro and inject it here +# rather than resurrecting the coarse one. # # We patch the *copy* sitting in turboquant--build/, never the original # under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps compiling @@ -72,72 +70,20 @@ else echo "==> KV allow-list patch OK" fi -if grep -q 'get_media_marker()' "$SRC"; then - echo "==> patching $SRC to replace get_media_marker() with legacy \"<__media__>\" literal" - # Only one call site today (ModelMetadata), but replace all occurrences to - # stay robust if upstream adds more. Use a temp file to avoid relying on - # sed -i portability (the builder image uses GNU sed, but keeping this - # consistent with the awk block above). - sed 's/get_media_marker()/"<__media__>"/g' "$SRC" > "$SRC.tmp" - mv "$SRC.tmp" "$SRC" - echo "==> get_media_marker() substitution OK" +# 2. Define LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP at the top of the file so +# the grpc-server option parser skips the two references to +# common_params::checkpoint_min_step (the default assignment and the option +# handler). That field does not exist in the fork yet. Drop this block once +# the fork rebases past the bump that added checkpoint_min_step. +if grep -q '^#define LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP' "$SRC"; then + echo "==> $SRC already defines LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP, skipping" else - echo "==> $SRC has no get_media_marker() call, skipping media-marker patch" -fi - -if grep -q 'params\.speculative\.draft\.\|params\.speculative\.ngram_simple\.' "$SRC"; then - echo "==> patching $SRC to revert common_params_speculative refs to pre-#22397 flat layout" - # Each substitution is the exact post-refactor path → legacy flat field. - # Order doesn't matter because the source paths are disjoint, but we keep - # the most-specific (mparams.path) first for readability. - sed -E \ - -e 's/params\.speculative\.draft\.mparams\.path/params.speculative.mparams_dft.path/g' \ - -e 's/params\.speculative\.draft\.n_max/params.speculative.n_max/g' \ - -e 's/params\.speculative\.draft\.n_min/params.speculative.n_min/g' \ - -e 's/params\.speculative\.draft\.p_min/params.speculative.p_min/g' \ - -e 's/params\.speculative\.draft\.p_split/params.speculative.p_split/g' \ - -e 's/params\.speculative\.draft\.n_gpu_layers/params.speculative.n_gpu_layers/g' \ - -e 's/params\.speculative\.draft\.n_ctx/params.speculative.n_ctx/g' \ - -e 's/params\.speculative\.ngram_simple\.size_n/params.speculative.ngram_size_n/g' \ - -e 's/params\.speculative\.ngram_simple\.size_m/params.speculative.ngram_size_m/g' \ - -e 's/params\.speculative\.ngram_simple\.min_hits/params.speculative.ngram_min_hits/g' \ - "$SRC" > "$SRC.tmp" - mv "$SRC.tmp" "$SRC" - echo "==> speculative field rename OK" -else - echo "==> $SRC has no post-#22397 speculative field refs, skipping spec rename patch" -fi - -# 4. Revert the `ctx_server.impl->model_tgt` rename introduced by upstream -# ggml-org/llama.cpp#22838 (parallel drafting). The turboquant fork still -# exposes the field as `model` on `server_context_impl`. The two call sites -# are in the Rerank and ModelMetadata RPC handlers. -if grep -q 'ctx_server\.impl->model_tgt' "$SRC"; then - echo "==> patching $SRC to revert ctx_server.impl->model_tgt -> ctx_server.impl->model" - sed -E 's/ctx_server\.impl->model_tgt/ctx_server.impl->model/g' "$SRC" > "$SRC.tmp" - mv "$SRC.tmp" "$SRC" - echo "==> model_tgt rename OK" -else - echo "==> $SRC has no ctx_server.impl->model_tgt refs, skipping model_tgt rename patch" -fi - -# 5. Define LOCALAI_LEGACY_LLAMA_CPP_SPEC at the top of the file so the -# grpc-server option parser skips the new option-handler blocks (ngram_mod, -# ngram_map_k, ngram_map_k4v, ngram_cache, draft.cache_type_*, draft.cpuparams*, -# draft.tensor_buft_overrides) introduced for the post-#22838 layout, the -# draft.tensor_buft_overrides sentinel termination, and the -# common_params::checkpoint_min_step default/option (added with the -# 35c9b1f3 bump). Those blocks reference struct fields that simply do not -# exist in the fork. -if grep -q '^#define LOCALAI_LEGACY_LLAMA_CPP_SPEC' "$SRC"; then - echo "==> $SRC already defines LOCALAI_LEGACY_LLAMA_CPP_SPEC, skipping" -else - echo "==> patching $SRC to define LOCALAI_LEGACY_LLAMA_CPP_SPEC at the top" - # Insert the define before the very first `#include` so it precedes all the - # speculative-decoding code paths. + echo "==> patching $SRC to define LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP at the top" + # Insert the define before the very first `#include` so it precedes the + # checkpoint_min_step references. awk ' !done && /^#include/ { - print "#define LOCALAI_LEGACY_LLAMA_CPP_SPEC 1" + print "#define LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP 1" print "// ^ injected by backend/cpp/turboquant/patch-grpc-server.sh" print "" done = 1 @@ -145,13 +91,13 @@ else { print } END { if (!done) { - print "patch-grpc-server.sh: no #include anchor found to insert LOCALAI_LEGACY_LLAMA_CPP_SPEC" > "/dev/stderr" + print "patch-grpc-server.sh: no #include anchor found to insert LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP" > "/dev/stderr" exit 1 } } ' "$SRC" > "$SRC.tmp" mv "$SRC.tmp" "$SRC" - echo "==> LOCALAI_LEGACY_LLAMA_CPP_SPEC define OK" + echo "==> LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP define OK" fi echo "==> all patches applied"