mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-30 11:36:31 -04:00
spiritbuun/buun-llama-cpp is a fork of TheTom/llama-cpp-turboquant that adds two independent features on top: DFlash block-diffusion speculative decoding (via a dedicated DFlashDraftModel GGUF arch) and two extra TCQ KV-cache variants (turbo2_tcq, turbo3_tcq) on top of TurboQuant's turbo2/turbo3/turbo4. Follows the turboquant thin-wrapper pattern — reuses backend/cpp/llama-cpp grpc-server sources verbatim, patches only the build copy to extend the KV allow-list and wire up buun-exclusive tree_budget / draft_topk options. DraftModel is already wired end-to-end (proto field 39 → params.speculative), so DFlash activation only needs the existing options passthrough (spec_type:dflash) plus the drafter path in draft_model. CacheTypeOptions now surfaces the five turbo* values so the React UI dropdown shows them — benefits turboquant too (previously users had to type them in YAML manually). Assisted-by: Claude:Opus-4.7 [Read] [Edit] [Bash] [WebFetch] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
142 lines
6.1 KiB
Bash
Executable File
142 lines
6.1 KiB
Bash
Executable File
#!/bin/bash
# Patch the shared backend/cpp/llama-cpp/grpc-server.cpp *copy* used by the
# buun-llama-cpp build to account for three gaps between upstream and the fork:
#
#  1. Augment the kv_cache_types[] allow-list so `LoadModel` accepts the
#     fork-specific `turbo2` / `turbo3` / `turbo4` cache types plus the buun
#     additions `turbo2_tcq` / `turbo3_tcq`.
#
#  2. Wire up buun-exclusive speculative-decoding option handlers
#     (tree_budget / draft_topk) alongside the existing spec_* handlers.
#     These reference struct fields (common_params.speculative.tree_budget
#     and .draft_topk) that only exist in buun's common/common.h — adding
#     them to the shared backend/cpp/llama-cpp/grpc-server.cpp would break
#     the stock llama-cpp build, so we inject them only into the buun copy.
#
#  3. Replace `get_media_marker()` (added upstream in ggml-org/llama.cpp#21962,
#     server-side random per-instance marker) with the legacy "<__media__>"
#     literal. The fork branched before that PR, so server-common.cpp has no
#     get_media_marker symbol. The fork's mtmd_default_marker() still returns
#     "<__media__>", and Go-side tooling falls back to that sentinel when the
#     backend does not expose media_marker, so substituting the literal keeps
#     behavior identical on the buun path.
#
# We patch the *copy* sitting in buun-llama-cpp-<flavor>-build/, never the
# original under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps
# compiling against vanilla upstream.
#
# Idempotent: skips each insertion if its marker is already present (so re-runs
# of the same build dir don't double-insert).
#
# Usage: patch-grpc-server.sh <grpc-server.cpp>
# Exit status: 2 on bad invocation or missing file; non-zero if a patch fails.

set -euo pipefail

# Exactly one positional argument is accepted: the file to patch in place.
(( $# == 1 )) || {
    echo "usage: $0 <grpc-server.cpp>" >&2
    exit 2
}

SRC=$1

# Bail out early when the target does not exist as a regular file.
[[ -f "$SRC" ]] || {
    echo "grpc-server.cpp not found at $SRC" >&2
    exit 2
}

#######################################
# Extend the kv_cache_types[] allow-list with the fork-only TURBO cache types.
#
# The five entries are inserted right after the first line that consists of
# `GGML_TYPE_Q5_1,` (any leading indentation). The inserted lines reuse the
# anchor line's indentation so the result lines up with the existing list.
# awk is used because the builder image does not ship python3, and GNU sed's
# multi-line `a\` quoting is awkward.
#
# Arguments: $1 - path of the grpc-server.cpp copy to patch in place
# Outputs:   progress messages on stdout, diagnostics on stderr
# Returns:   0 when patched or already patched, 1 when the anchor is missing
#            (file left untouched, temp file removed), 2 when $1 is not a file
#######################################
patch_kv_allow_list() {
    local src=$1
    local tmp="$src.tmp"

    if [[ ! -f "$src" ]]; then
        echo "patch-grpc-server.sh: $src is not a regular file" >&2
        return 2
    fi

    # Idempotency marker: one of the identifiers this patch inserts.
    if grep -q 'GGML_TYPE_TURBO2_TCQ' "$src"; then
        echo "==> $src already has buun cache types, skipping KV allow-list patch"
        return 0
    fi

    echo "==> patching $src to allow turbo2/turbo3/turbo4/turbo2_tcq/turbo3_tcq KV-cache types"

    if ! awk '
        /^[ \t]*GGML_TYPE_Q5_1,[ \t]*$/ && !done {
            print
            # Reuse whatever indentation the anchor line carries.
            match($0, /^[ \t]*/)
            indent = substr($0, 1, RLENGTH)
            print indent "// buun-llama-cpp fork extras — added by patch-grpc-server.sh"
            print indent "GGML_TYPE_TURBO2_0,"
            print indent "GGML_TYPE_TURBO3_0,"
            print indent "GGML_TYPE_TURBO4_0,"
            print indent "GGML_TYPE_TURBO2_TCQ,"
            print indent "GGML_TYPE_TURBO3_TCQ,"
            done = 1
            next
        }
        { print }
        END {
            if (!done) {
                print "patch-grpc-server.sh: anchor ` GGML_TYPE_Q5_1,` not found" > "/dev/stderr"
                exit 1
            }
        }
    ' "$src" > "$tmp"; then
        # Do not leave a half-written temp file next to the source.
        rm -f -- "$tmp"
        return 1
    fi

    mv -- "$tmp" "$src"
    echo "==> KV allow-list patch OK"
}

patch_kv_allow_list "$SRC"

#######################################
# Add the buun-only tree_budget / draft_topk option handlers.
#
# Two new `else if` handlers are inserted between the inner close-brace of the
# `spec_p_split` block and the line that follows it. Upstream writes each
# `} else if` as a single physical line, so no outer `}` is emitted here: the
# existing next line provides both the close of the `draft_topk` block and the
# open of the next handler.
#
# The anchor is the three-line body of spec_p_split (handler line, `if` line,
# `try` line) matched with any indentation, and the line after it must be a
# lone `}`; otherwise the patch is refused instead of emitting unbalanced
# braces. Inserted lines copy their indentation from the matched lines.
#
# Arguments: $1 - path of the grpc-server.cpp copy to patch in place
# Outputs:   progress messages on stdout, diagnostics on stderr
# Returns:   0 when patched or already patched, 1 when the anchor is missing
#            or malformed (file left untouched, temp file removed), 2 when $1
#            is not a file
#######################################
patch_dflash_options() {
    local src=$1
    local tmp="$src.tmp"

    if [[ ! -f "$src" ]]; then
        echo "patch-grpc-server.sh: $src is not a regular file" >&2
        return 2
    fi

    # Idempotency marker: the strcmp argument of the first inserted handler.
    if grep -q 'optname, "tree_budget"' "$src"; then
        echo "==> $src already has DFlash option handlers, skipping"
        return 0
    fi

    echo "==> patching $src to add tree_budget / draft_topk option handlers"

    if ! awk '
        # Leading blanks/tabs of s.
        function lead(s) {
            match(s, /^[ \t]*/)
            return substr(s, 1, RLENGTH)
        }

        !done &&
        prev2 ~ /^[ \t]*\} else if \(!strcmp\(optname, "spec_p_split"\)\) \{$/ &&
        prev1 ~ /^[ \t]+if \(optval != NULL\) \{$/ &&
        $0 ~ /^[ \t]+try \{ params\.speculative\.p_split = std::stof\(optval_str\); \} catch \(\.\.\.\) \{\}$/ {
            try_line = $0
            # The next line must close the inner `if`; anything else (or EOF)
            # means the source drifted and inserting would break the braces.
            if ((getline inner_close) <= 0 || inner_close !~ /^[ \t]*\}[ \t]*$/) {
                print "patch-grpc-server.sh: spec_p_split body is not closed by a lone `}`" > "/dev/stderr"
                failed = 1
                exit 1
            }
            outer = lead(prev2)
            inner = lead(prev1)
            body  = lead(try_line)

            print try_line
            print inner_close      # closes the spec_p_split body
            print outer "// buun-llama-cpp DFlash options — added by patch-grpc-server.sh"
            print outer "} else if (!strcmp(optname, \"tree_budget\")) {"
            print inner "if (optval != NULL) {"
            print body  "try { params.speculative.tree_budget = std::stoi(optval_str); } catch (...) {}"
            print inner "}"
            print outer "} else if (!strcmp(optname, \"draft_topk\")) {"
            print inner "if (optval != NULL) {"
            print body  "try { params.speculative.draft_topk = std::stoi(optval_str); } catch (...) {}"
            print inner "}"
            # The following source line closes the draft_topk block and
            # continues the chain; the default rule below emits it.
            done = 1
            prev2 = try_line
            prev1 = inner_close
            next
        }

        { print; prev2 = prev1; prev1 = $0 }

        END {
            if (failed) {
                exit 1
            }
            if (!done) {
                print "patch-grpc-server.sh: spec_p_split anchor not found" > "/dev/stderr"
                exit 1
            }
        }
    ' "$src" > "$tmp"; then
        # Do not leave a half-written temp file next to the source.
        rm -f -- "$tmp"
        return 1
    fi

    mv -- "$tmp" "$src"
    echo "==> DFlash option-handler patch OK"
}

patch_dflash_options "$SRC"

# Media-marker patch: the fork has no get_media_marker symbol, so every call
# is replaced by the legacy "<__media__>" string literal. When no call is
# present (already patched, or the source never had one) this is a no-op.
if ! grep -q 'get_media_marker()' "$SRC"; then
    echo "==> $SRC has no get_media_marker() call, skipping media-marker patch"
else
    echo "==> patching $SRC to replace get_media_marker() with legacy \"<__media__>\" literal"
    # Only one call site today (ModelMetadata), but replace all occurrences to
    # stay robust if upstream adds more. Write through a temp file rather than
    # relying on sed -i portability (the builder image uses GNU sed, but this
    # keeps the step consistent with the awk-based patches).
    media_tmp="$SRC.tmp"
    sed 's/get_media_marker()/"<__media__>"/g' "$SRC" > "$media_tmp"
    mv "$media_tmp" "$SRC"
    echo "==> get_media_marker() substitution OK"
fi

echo "==> all patches applied"