fix(turboquant): drop obsolete legacy-spec shim after fork rebased

The TheTom/llama-cpp-turboquant fork (pin c9aa86a) rebased past the upstream common_params_speculative refactor (ggml-org/llama.cpp #22397/#22838/#22964), the model_tgt rename (#22838) and get_media_marker (#21962). The old fork-compat shim forced now-wrong legacy code paths, breaking the build with errors like 'struct common_params_speculative has no member named mparams_dft / type' and 'server_context_impl has no member named model'. Remove the obsolete LOCALAI_LEGACY_LLAMA_CPP_SPEC branches from the shared grpc-server.cpp (stock llama-cpp and the modern fork both take the modern path now), and narrow the one remaining gap (the fork still lacks common_params::checkpoint_min_step) to a dedicated LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP guard injected by patch-grpc-server.sh. The patch script now only adds the turbo2/3/4 KV-cache types and injects that one macro. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Assisted-by: Claude:claude-opus-4-8 [Claude Code]
2026-06-07 08:16:53 -04:00 · 2026-06-06 18:25:39 +00:00
parent 3cdd6a8e63
commit d11a152ad3
2 changed files with 35 additions and 130 deletions
--- a/backend/cpp/turboquant/patch-grpc-server.sh
+++ b/backend/cpp/turboquant/patch-grpc-server.sh
@@ -4,21 +4,19 @@
 #
 #   1. Augment the kv_cache_types[] allow-list so `LoadModel` accepts the
 #      fork-specific `turbo2` / `turbo3` / `turbo4` cache types.
-#   2. Replace `get_media_marker()` (added upstream in ggml-org/llama.cpp#21962,
-#      server-side random per-instance marker) with the legacy "<__media__>"
-#      literal. The fork branched before that PR, so server-common.cpp has no
-#      get_media_marker symbol. The fork's mtmd_default_marker() still returns
-#      "<__media__>", and Go-side tooling falls back to that sentinel when the
-#      backend does not expose media_marker, so substituting the literal keeps
-#      behavior identical on the turboquant path.
-#   3. Revert the `common_params_speculative` field references to the
-#      pre-refactor flat layout. Upstream ggml-org/llama.cpp#22397 split the
-#      struct into nested `draft` / `ngram_simple` / `ngram_mod` / etc. members;
-#      the turboquant fork branched before that PR and still exposes the flat
-#      `n_max`, `mparams_dft`, `ngram_size_n`, ... fields. The substitutions
-#      below map the new nested paths back to the legacy flat names so the
-#      shared grpc-server.cpp keeps compiling against the fork's common.h.
-#      Drop this block once the fork rebases past #22397.
+#   2. Define LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP at the top of the file
+#      so the grpc-server option parser skips the two references to
+#      common_params::checkpoint_min_step (the default and the option handler).
+#      That field does not exist in the fork yet; drop this once it does.
+#
+# The fork used to lag upstream on the whole common_params_speculative refactor
+# (ggml-org/llama.cpp#22397/#22838/#22964), the model_tgt rename (#22838) and
+# get_media_marker (#21962), which required a much larger compat shim here
+# (flat-field sed renames + a coarse LOCALAI_LEGACY_LLAMA_CPP_SPEC define). The
+# fork has since rebased past all of those, so the only remaining gap is
+# checkpoint_min_step. If a future bump reintroduces a divergence, add a narrow
+# guard in grpc-server.cpp keyed on a fork-specific macro and inject it here
+# rather than resurrecting the coarse one.
 #
 # We patch the *copy* sitting in turboquant-<flavor>-build/, never the original
 # under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps compiling
@@ -72,72 +70,20 @@ else
    echo "==> KV allow-list patch OK"
 fi

-if grep -q 'get_media_marker()' "$SRC"; then
-    echo "==> patching $SRC to replace get_media_marker() with legacy \"<__media__>\" literal"
-    # Only one call site today (ModelMetadata), but replace all occurrences to
-    # stay robust if upstream adds more. Use a temp file to avoid relying on
-    # sed -i portability (the builder image uses GNU sed, but keeping this
-    # consistent with the awk block above).
-    sed 's/get_media_marker()/"<__media__>"/g' "$SRC" > "$SRC.tmp"
-    mv "$SRC.tmp" "$SRC"
-    echo "==> get_media_marker() substitution OK"
+# 2. Define LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP at the top of the file so
+#    the grpc-server option parser skips the two references to
+#    common_params::checkpoint_min_step (the default assignment and the option
+#    handler). That field does not exist in the fork yet. Drop this block once
+#    the fork rebases past the bump that added checkpoint_min_step.
+if grep -q '^#define LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP' "$SRC"; then
+    echo "==> $SRC already defines LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP, skipping"
 else
-    echo "==> $SRC has no get_media_marker() call, skipping media-marker patch"
-fi
-
-if grep -q 'params\.speculative\.draft\.\|params\.speculative\.ngram_simple\.' "$SRC"; then
-    echo "==> patching $SRC to revert common_params_speculative refs to pre-#22397 flat layout"
-    # Each substitution is the exact post-refactor path → legacy flat field.
-    # Order doesn't matter because the source paths are disjoint, but we keep
-    # the most-specific (mparams.path) first for readability.
-    sed -E \
-        -e 's/params\.speculative\.draft\.mparams\.path/params.speculative.mparams_dft.path/g' \
-        -e 's/params\.speculative\.draft\.n_max/params.speculative.n_max/g' \
-        -e 's/params\.speculative\.draft\.n_min/params.speculative.n_min/g' \
-        -e 's/params\.speculative\.draft\.p_min/params.speculative.p_min/g' \
-        -e 's/params\.speculative\.draft\.p_split/params.speculative.p_split/g' \
-        -e 's/params\.speculative\.draft\.n_gpu_layers/params.speculative.n_gpu_layers/g' \
-        -e 's/params\.speculative\.draft\.n_ctx/params.speculative.n_ctx/g' \
-        -e 's/params\.speculative\.ngram_simple\.size_n/params.speculative.ngram_size_n/g' \
-        -e 's/params\.speculative\.ngram_simple\.size_m/params.speculative.ngram_size_m/g' \
-        -e 's/params\.speculative\.ngram_simple\.min_hits/params.speculative.ngram_min_hits/g' \
-        "$SRC" > "$SRC.tmp"
-    mv "$SRC.tmp" "$SRC"
-    echo "==> speculative field rename OK"
-else
-    echo "==> $SRC has no post-#22397 speculative field refs, skipping spec rename patch"
-fi
-
-# 4. Revert the `ctx_server.impl->model_tgt` rename introduced by upstream
-#    ggml-org/llama.cpp#22838 (parallel drafting). The turboquant fork still
-#    exposes the field as `model` on `server_context_impl`. The two call sites
-#    are in the Rerank and ModelMetadata RPC handlers.
-if grep -q 'ctx_server\.impl->model_tgt' "$SRC"; then
-    echo "==> patching $SRC to revert ctx_server.impl->model_tgt -> ctx_server.impl->model"
-    sed -E 's/ctx_server\.impl->model_tgt/ctx_server.impl->model/g' "$SRC" > "$SRC.tmp"
-    mv "$SRC.tmp" "$SRC"
-    echo "==> model_tgt rename OK"
-else
-    echo "==> $SRC has no ctx_server.impl->model_tgt refs, skipping model_tgt rename patch"
-fi
-
-# 5. Define LOCALAI_LEGACY_LLAMA_CPP_SPEC at the top of the file so the
-#    grpc-server option parser skips the new option-handler blocks (ngram_mod,
-#    ngram_map_k, ngram_map_k4v, ngram_cache, draft.cache_type_*, draft.cpuparams*,
-#    draft.tensor_buft_overrides) introduced for the post-#22838 layout, the
-#    draft.tensor_buft_overrides sentinel termination, and the
-#    common_params::checkpoint_min_step default/option (added with the
-#    35c9b1f3 bump). Those blocks reference struct fields that simply do not
-#    exist in the fork.
-if grep -q '^#define LOCALAI_LEGACY_LLAMA_CPP_SPEC' "$SRC"; then
-    echo "==> $SRC already defines LOCALAI_LEGACY_LLAMA_CPP_SPEC, skipping"
-else
-    echo "==> patching $SRC to define LOCALAI_LEGACY_LLAMA_CPP_SPEC at the top"
-    # Insert the define before the very first `#include` so it precedes all the
-    # speculative-decoding code paths.
+    echo "==> patching $SRC to define LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP at the top"
+    # Insert the define before the very first `#include` so it precedes the
+    # checkpoint_min_step references.
    awk '
        !done && /^#include/ {
-            print "#define LOCALAI_LEGACY_LLAMA_CPP_SPEC 1"
+            print "#define LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP 1"
            print "// ^ injected by backend/cpp/turboquant/patch-grpc-server.sh"
            print ""
            done = 1
@@ -145,13 +91,13 @@ else
        { print }
        END {
            if (!done) {
-                print "patch-grpc-server.sh: no #include anchor found to insert LOCALAI_LEGACY_LLAMA_CPP_SPEC" > "/dev/stderr"
+                print "patch-grpc-server.sh: no #include anchor found to insert LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP" > "/dev/stderr"
                exit 1
            }
        }
    ' "$SRC" > "$SRC.tmp"
    mv "$SRC.tmp" "$SRC"
-    echo "==> LOCALAI_LEGACY_LLAMA_CPP_SPEC define OK"
+    echo "==> LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP define OK"
 fi

 echo "==> all patches applied"