fix(turboquant): drop obsolete legacy-spec shim after fork rebased

The TheTom/llama-cpp-turboquant fork (pin c9aa86a) rebased past the upstream common_params_speculative refactor (ggml-org/llama.cpp #22397/#22838/#22964), the model_tgt rename (#22838) and get_media_marker (#21962). The old fork-compat shim forced now-wrong legacy code paths, breaking the build with errors like 'struct common_params_speculative has no member named mparams_dft / type' and 'server_context_impl has no member named model'.

Remove the obsolete LOCALAI_LEGACY_LLAMA_CPP_SPEC branches from the shared grpc-server.cpp (stock llama-cpp and the modern fork both take the modern path now), and narrow the one remaining gap (the fork still lacks common_params::checkpoint_min_step) to a dedicated LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP guard injected by patch-grpc-server.sh. The patch script now only adds the turbo2/3/4 KV-cache types and injects that one macro.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Assisted-by: Claude:claude-opus-4-8 [Claude Code]
2026-06-07 08:16:53 -04:00 · 2026-06-06 18:25:39 +00:00
parent 3cdd6a8e63
commit d11a152ad3
2 changed files with 35 additions and 130 deletions
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -482,23 +482,13 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
    if (!request->draftmodel().empty()) {
        params.speculative.draft.mparams.path = request->draftmodel();
        // Default to draft type if a draft model is set but no explicit type.
-        // Upstream (post ggml-org/llama.cpp#22838) made the speculative type a
-        // vector; the turboquant fork still uses the legacy scalar. The
-        // LOCALAI_LEGACY_LLAMA_CPP_SPEC macro is injected by
-        // backend/cpp/turboquant/patch-grpc-server.sh for fork builds only.
-        // Upstream renamed COMMON_SPECULATIVE_TYPE_DRAFT -> ..._DRAFT_SIMPLE
-        // in ggml-org/llama.cpp#22964; the fork still uses the old name.
-#ifdef LOCALAI_LEGACY_LLAMA_CPP_SPEC
-        if (params.speculative.type == COMMON_SPECULATIVE_TYPE_NONE) {
-            params.speculative.type = COMMON_SPECULATIVE_TYPE_DRAFT;
-        }
-#else
+        // Upstream made the speculative type a vector (ggml-org/llama.cpp#22838)
+        // and renamed COMMON_SPECULATIVE_TYPE_DRAFT -> ..._DRAFT_SIMPLE (#22964).
        const bool no_spec_type = params.speculative.types.empty() ||
            (params.speculative.types.size() == 1 && params.speculative.types[0] == COMMON_SPECULATIVE_TYPE_NONE);
        if (no_spec_type) {
            params.speculative.types = { COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE };
        }
-#endif
    }

    //  params.model_alias ??
@@ -574,9 +564,10 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
    // tokens (0 disables the minimum). Match upstream's default (256). This
    // field was renamed from `checkpoint_every_nt` in llama.cpp; the semantics
    // also shifted from a fixed cadence to a minimum spacing. The turboquant
-    // fork branched before the field existed, so skip it on the legacy path
-    // (LOCALAI_LEGACY_LLAMA_CPP_SPEC is injected by patch-grpc-server.sh).
-#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC
+    // fork still lacks common_params::checkpoint_min_step, so skip it there
+    // (LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP is injected by
+    // backend/cpp/turboquant/patch-grpc-server.sh).
+#ifndef LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP
    params.checkpoint_min_step = 256;
 #endif

@@ -752,7 +743,7 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
                params.cache_idle_slots = false;
            }

-#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC
+#ifndef LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP
        // --- minimum context-checkpoint spacing (upstream -cms / --checkpoint-min-step) ---
        // 0 disables the minimum-spacing gate. Old option names (`checkpoint_every_nt`,
        // `checkpoint_every_n_tokens`) are kept as aliases for backward compatibility
@@ -906,17 +897,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt

        // Speculative decoding options
        } else if (!strcmp(optname, "spec_type") || !strcmp(optname, "speculative_type")) {
-#ifdef LOCALAI_LEGACY_LLAMA_CPP_SPEC
-            // Fork only knows a single scalar `type`. Take the first comma-
-            // separated value and assign it via the singular helper.
-            std::string first = optval_str;
-            const auto comma = first.find(',');
-            if (comma != std::string::npos) first = first.substr(0, comma);
-            auto type = common_speculative_type_from_name(first);
-            if (type != COMMON_SPECULATIVE_TYPE_COUNT) {
-                params.speculative.type = type;
-            }
-#else
            // Upstream switched to a vector of types (comma-separated for multi-type
            // chaining via common_speculative_types_from_names). We keep accepting a
            // single value here, but also tolerate comma-separated lists.
@@ -945,7 +925,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
            if (!parsed.empty()) {
                params.speculative.types = parsed;
            }
-#endif
        } else if (!strcmp(optname, "spec_n_max") || !strcmp(optname, "draft_max")) {
            if (optval != NULL) {
                try { params.speculative.draft.n_max = std::stoi(optval_str); } catch (...) {}
@@ -983,21 +962,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
            // shares the target context size. Accept the option for backward
            // compatibility but silently ignore it.

-// Everything below relies on struct shape introduced in ggml-org/llama.cpp#22838
-// (parallel drafting): `ngram_mod`, `ngram_map_k`, `ngram_map_k4v`,
-// `ngram_cache`, and the `draft.{cache_type_*, cpuparams*, tensor_buft_overrides}`
-// fields. The turboquant fork branched before that, so its build defines
-// LOCALAI_LEGACY_LLAMA_CPP_SPEC via patch-grpc-server.sh and these option
-// keys become unrecognized (silently dropped, like any unknown opt) for it.
-//
-// The `#ifdef LOCALAI_LEGACY_LLAMA_CPP_SPEC` / `#else` split below sits at the
-// closing-brace position of the `draft_ctx_size` branch on purpose: in the
-// legacy build the chain ends here (the brace closes draft_ctx_size), and in
-// the modern build the chain continues with `} else if (...)` instead, so the
-// brace count stays balanced under both branches of the preprocessor.
-#ifdef LOCALAI_LEGACY_LLAMA_CPP_SPEC
-        }
-#else
        // --- ngram_mod family (upstream --spec-ngram-mod-*) ---
        } else if (!strcmp(optname, "spec_ngram_mod_n_min")) {
            if (optval != NULL) {
@@ -1127,7 +1091,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
            }
            if (!cur.empty()) flush(cur);
        }
-#endif // LOCALAI_LEGACY_LLAMA_CPP_SPEC — closes the `else`/`#ifdef` opened at draft_ctx_size
    }

    // Set params.n_parallel from environment variable if not set via options (fallback)
@@ -1177,15 +1140,11 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
            params.tensor_buft_overrides.push_back({nullptr, nullptr});
        }
    }
-    // The draft tensor_buft_overrides are only populated under the modern
-    // (post-#22838) layout, whose population code is itself gated by
-    // LOCALAI_LEGACY_LLAMA_CPP_SPEC above. The turboquant fork lacks
-    // common_params_speculative::draft entirely, so skip the sentinel there too.
-#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC
+    // Terminate the draft tensor_buft_overrides list with a sentinel, mirroring
+    // the main-model handling above.
    if (!params.speculative.draft.tensor_buft_overrides.empty()) {
        params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr});
    }
-#endif

    // TODO: Add yarn