diff --git a/.github/backend-matrix.yml b/.github/backend-matrix.yml index 9d7e36259..464ffc36c 100644 --- a/.github/backend-matrix.yml +++ b/.github/backend-matrix.yml @@ -1766,20 +1766,6 @@ include: dockerfile: "./backend/Dockerfile.llama-cpp" context: "./" ubuntu-version: '2404' - - build-type: 'hipblas' - cuda-major-version: "" - cuda-minor-version: "" - platforms: 'linux/amd64' - tag-latest: 'auto' - tag-suffix: '-gpu-rocm-hipblas-turboquant' - builder-base-image: 'quay.io/go-skynet/ci-cache:base-grpc-rocm-amd64' - runs-on: 'ubuntu-latest' - base-image: "rocm/dev-ubuntu-24.04:7.2.1" - skip-drivers: 'false' - backend: "turboquant" - dockerfile: "./backend/Dockerfile.turboquant" - context: "./" - ubuntu-version: '2404' - build-type: 'hipblas' cuda-major-version: "" cuda-minor-version: "" diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index ac5521bc4..90a5477a9 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -482,23 +482,13 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt if (!request->draftmodel().empty()) { params.speculative.draft.mparams.path = request->draftmodel(); // Default to draft type if a draft model is set but no explicit type. - // Upstream (post ggml-org/llama.cpp#22838) made the speculative type a - // vector; the turboquant fork still uses the legacy scalar. The - // LOCALAI_LEGACY_LLAMA_CPP_SPEC macro is injected by - // backend/cpp/turboquant/patch-grpc-server.sh for fork builds only. - // Upstream renamed COMMON_SPECULATIVE_TYPE_DRAFT -> ..._DRAFT_SIMPLE - // in ggml-org/llama.cpp#22964; the fork still uses the old name. -#ifdef LOCALAI_LEGACY_LLAMA_CPP_SPEC - if (params.speculative.type == COMMON_SPECULATIVE_TYPE_NONE) { - params.speculative.type = COMMON_SPECULATIVE_TYPE_DRAFT; - } -#else + // Upstream made the speculative type a vector (ggml-org/llama.cpp#22838) + // and renamed COMMON_SPECULATIVE_TYPE_DRAFT -> ..._DRAFT_SIMPLE (#22964). const bool no_spec_type = params.speculative.types.empty() || (params.speculative.types.size() == 1 && params.speculative.types[0] == COMMON_SPECULATIVE_TYPE_NONE); if (no_spec_type) { params.speculative.types = { COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE }; } -#endif } // params.model_alias ?? @@ -574,9 +564,10 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt // tokens (0 disables the minimum). Match upstream's default (256). This // field was renamed from `checkpoint_every_nt` in llama.cpp; the semantics // also shifted from a fixed cadence to a minimum spacing. The turboquant - // fork branched before the field existed, so skip it on the legacy path - // (LOCALAI_LEGACY_LLAMA_CPP_SPEC is injected by patch-grpc-server.sh). -#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC + // fork still lacks common_params::checkpoint_min_step, so skip it there + // (LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP is injected by + // backend/cpp/turboquant/patch-grpc-server.sh). +#ifndef LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP params.checkpoint_min_step = 256; #endif @@ -752,7 +743,7 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt params.cache_idle_slots = false; } -#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC +#ifndef LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP // --- minimum context-checkpoint spacing (upstream -cms / --checkpoint-min-step) --- // 0 disables the minimum-spacing gate. Old option names (`checkpoint_every_nt`, // `checkpoint_every_n_tokens`) are kept as aliases for backward compatibility @@ -906,17 +897,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt // Speculative decoding options } else if (!strcmp(optname, "spec_type") || !strcmp(optname, "speculative_type")) { -#ifdef LOCALAI_LEGACY_LLAMA_CPP_SPEC - // Fork only knows a single scalar `type`. Take the first comma- - // separated value and assign it via the singular helper. - std::string first = optval_str; - const auto comma = first.find(','); - if (comma != std::string::npos) first = first.substr(0, comma); - auto type = common_speculative_type_from_name(first); - if (type != COMMON_SPECULATIVE_TYPE_COUNT) { - params.speculative.type = type; - } -#else // Upstream switched to a vector of types (comma-separated for multi-type // chaining via common_speculative_types_from_names). We keep accepting a // single value here, but also tolerate comma-separated lists. @@ -945,7 +925,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt if (!parsed.empty()) { params.speculative.types = parsed; } -#endif } else if (!strcmp(optname, "spec_n_max") || !strcmp(optname, "draft_max")) { if (optval != NULL) { try { params.speculative.draft.n_max = std::stoi(optval_str); } catch (...) {} @@ -983,21 +962,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt // shares the target context size. Accept the option for backward // compatibility but silently ignore it. -// Everything below relies on struct shape introduced in ggml-org/llama.cpp#22838 -// (parallel drafting): `ngram_mod`, `ngram_map_k`, `ngram_map_k4v`, -// `ngram_cache`, and the `draft.{cache_type_*, cpuparams*, tensor_buft_overrides}` -// fields. The turboquant fork branched before that, so its build defines -// LOCALAI_LEGACY_LLAMA_CPP_SPEC via patch-grpc-server.sh and these option -// keys become unrecognized (silently dropped, like any unknown opt) for it. -// -// The `#ifdef LOCALAI_LEGACY_LLAMA_CPP_SPEC` / `#else` split below sits at the -// closing-brace position of the `draft_ctx_size` branch on purpose: in the -// legacy build the chain ends here (the brace closes draft_ctx_size), and in -// the modern build the chain continues with `} else if (...)` instead, so the -// brace count stays balanced under both branches of the preprocessor. -#ifdef LOCALAI_LEGACY_LLAMA_CPP_SPEC - } -#else // --- ngram_mod family (upstream --spec-ngram-mod-*) --- } else if (!strcmp(optname, "spec_ngram_mod_n_min")) { if (optval != NULL) { @@ -1127,7 +1091,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt } if (!cur.empty()) flush(cur); } -#endif // LOCALAI_LEGACY_LLAMA_CPP_SPEC — closes the `else`/`#ifdef` opened at draft_ctx_size } // Set params.n_parallel from environment variable if not set via options (fallback) @@ -1177,15 +1140,11 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt params.tensor_buft_overrides.push_back({nullptr, nullptr}); } } - // The draft tensor_buft_overrides are only populated under the modern - // (post-#22838) layout, whose population code is itself gated by - // LOCALAI_LEGACY_LLAMA_CPP_SPEC above. The turboquant fork lacks - // common_params_speculative::draft entirely, so skip the sentinel there too. -#ifndef LOCALAI_LEGACY_LLAMA_CPP_SPEC + // Terminate the draft tensor_buft_overrides list with a sentinel, mirroring + // the main-model handling above. if (!params.speculative.draft.tensor_buft_overrides.empty()) { params.speculative.draft.tensor_buft_overrides.push_back({nullptr, nullptr}); } -#endif // TODO: Add yarn diff --git a/backend/cpp/turboquant/Makefile b/backend/cpp/turboquant/Makefile index 901ca6471..98f5e4978 100644 --- a/backend/cpp/turboquant/Makefile +++ b/backend/cpp/turboquant/Makefile @@ -1,7 +1,7 @@ # Pinned to the HEAD of feature/turboquant-kv-cache on https://github.com/TheTom/llama-cpp-turboquant. # Auto-bumped nightly by .github/workflows/bump_deps.yaml. -TURBOQUANT_VERSION?=5aeb2fdbe26cd4c534c6fa15de73cb5749bd0403 +TURBOQUANT_VERSION?=7d9715f1f071fa07c7b2ad3dbfd320b314139e65 LLAMA_REPO?=https://github.com/TheTom/llama-cpp-turboquant CMAKE_ARGS?= diff --git a/backend/cpp/turboquant/patch-grpc-server.sh b/backend/cpp/turboquant/patch-grpc-server.sh index b1a62b215..fa11897dd 100755 --- a/backend/cpp/turboquant/patch-grpc-server.sh +++ b/backend/cpp/turboquant/patch-grpc-server.sh @@ -4,21 +4,19 @@ # # 1. Augment the kv_cache_types[] allow-list so `LoadModel` accepts the # fork-specific `turbo2` / `turbo3` / `turbo4` cache types. -# 2. Replace `get_media_marker()` (added upstream in ggml-org/llama.cpp#21962, -# server-side random per-instance marker) with the legacy "<__media__>" -# literal. The fork branched before that PR, so server-common.cpp has no -# get_media_marker symbol. The fork's mtmd_default_marker() still returns -# "<__media__>", and Go-side tooling falls back to that sentinel when the -# backend does not expose media_marker, so substituting the literal keeps -# behavior identical on the turboquant path. -# 3. Revert the `common_params_speculative` field references to the -# pre-refactor flat layout. Upstream ggml-org/llama.cpp#22397 split the -# struct into nested `draft` / `ngram_simple` / `ngram_mod` / etc. members; -# the turboquant fork branched before that PR and still exposes the flat -# `n_max`, `mparams_dft`, `ngram_size_n`, ... fields. The substitutions -# below map the new nested paths back to the legacy flat names so the -# shared grpc-server.cpp keeps compiling against the fork's common.h. -# Drop this block once the fork rebases past #22397. +# 2. Define LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP at the top of the file +# so the grpc-server option parser skips the two references to +# common_params::checkpoint_min_step (the default and the option handler). +# That field does not exist in the fork yet; drop this once it does. +# +# The fork used to lag upstream on the whole common_params_speculative refactor +# (ggml-org/llama.cpp#22397/#22838/#22964), the model_tgt rename (#22838) and +# get_media_marker (#21962), which required a much larger compat shim here +# (flat-field sed renames + a coarse LOCALAI_LEGACY_LLAMA_CPP_SPEC define). The +# fork has since rebased past all of those, so the only remaining gap is +# checkpoint_min_step. If a future bump reintroduces a divergence, add a narrow +# guard in grpc-server.cpp keyed on a fork-specific macro and inject it here +# rather than resurrecting the coarse one. # # We patch the *copy* sitting in turboquant--build/, never the original # under backend/cpp/llama-cpp/, so the stock llama-cpp build keeps compiling @@ -72,72 +70,20 @@ else echo "==> KV allow-list patch OK" fi -if grep -q 'get_media_marker()' "$SRC"; then - echo "==> patching $SRC to replace get_media_marker() with legacy \"<__media__>\" literal" - # Only one call site today (ModelMetadata), but replace all occurrences to - # stay robust if upstream adds more. Use a temp file to avoid relying on - # sed -i portability (the builder image uses GNU sed, but keeping this - # consistent with the awk block above). - sed 's/get_media_marker()/"<__media__>"/g' "$SRC" > "$SRC.tmp" - mv "$SRC.tmp" "$SRC" - echo "==> get_media_marker() substitution OK" +# 2. Define LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP at the top of the file so +# the grpc-server option parser skips the two references to +# common_params::checkpoint_min_step (the default assignment and the option +# handler). That field does not exist in the fork yet. Drop this block once +# the fork rebases past the bump that added checkpoint_min_step. +if grep -q '^#define LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP' "$SRC"; then + echo "==> $SRC already defines LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP, skipping" else - echo "==> $SRC has no get_media_marker() call, skipping media-marker patch" -fi - -if grep -q 'params\.speculative\.draft\.\|params\.speculative\.ngram_simple\.' "$SRC"; then - echo "==> patching $SRC to revert common_params_speculative refs to pre-#22397 flat layout" - # Each substitution is the exact post-refactor path → legacy flat field. - # Order doesn't matter because the source paths are disjoint, but we keep - # the most-specific (mparams.path) first for readability. - sed -E \ - -e 's/params\.speculative\.draft\.mparams\.path/params.speculative.mparams_dft.path/g' \ - -e 's/params\.speculative\.draft\.n_max/params.speculative.n_max/g' \ - -e 's/params\.speculative\.draft\.n_min/params.speculative.n_min/g' \ - -e 's/params\.speculative\.draft\.p_min/params.speculative.p_min/g' \ - -e 's/params\.speculative\.draft\.p_split/params.speculative.p_split/g' \ - -e 's/params\.speculative\.draft\.n_gpu_layers/params.speculative.n_gpu_layers/g' \ - -e 's/params\.speculative\.draft\.n_ctx/params.speculative.n_ctx/g' \ - -e 's/params\.speculative\.ngram_simple\.size_n/params.speculative.ngram_size_n/g' \ - -e 's/params\.speculative\.ngram_simple\.size_m/params.speculative.ngram_size_m/g' \ - -e 's/params\.speculative\.ngram_simple\.min_hits/params.speculative.ngram_min_hits/g' \ - "$SRC" > "$SRC.tmp" - mv "$SRC.tmp" "$SRC" - echo "==> speculative field rename OK" -else - echo "==> $SRC has no post-#22397 speculative field refs, skipping spec rename patch" -fi - -# 4. Revert the `ctx_server.impl->model_tgt` rename introduced by upstream -# ggml-org/llama.cpp#22838 (parallel drafting). The turboquant fork still -# exposes the field as `model` on `server_context_impl`. The two call sites -# are in the Rerank and ModelMetadata RPC handlers. -if grep -q 'ctx_server\.impl->model_tgt' "$SRC"; then - echo "==> patching $SRC to revert ctx_server.impl->model_tgt -> ctx_server.impl->model" - sed -E 's/ctx_server\.impl->model_tgt/ctx_server.impl->model/g' "$SRC" > "$SRC.tmp" - mv "$SRC.tmp" "$SRC" - echo "==> model_tgt rename OK" -else - echo "==> $SRC has no ctx_server.impl->model_tgt refs, skipping model_tgt rename patch" -fi - -# 5. Define LOCALAI_LEGACY_LLAMA_CPP_SPEC at the top of the file so the -# grpc-server option parser skips the new option-handler blocks (ngram_mod, -# ngram_map_k, ngram_map_k4v, ngram_cache, draft.cache_type_*, draft.cpuparams*, -# draft.tensor_buft_overrides) introduced for the post-#22838 layout, the -# draft.tensor_buft_overrides sentinel termination, and the -# common_params::checkpoint_min_step default/option (added with the -# 35c9b1f3 bump). Those blocks reference struct fields that simply do not -# exist in the fork. -if grep -q '^#define LOCALAI_LEGACY_LLAMA_CPP_SPEC' "$SRC"; then - echo "==> $SRC already defines LOCALAI_LEGACY_LLAMA_CPP_SPEC, skipping" -else - echo "==> patching $SRC to define LOCALAI_LEGACY_LLAMA_CPP_SPEC at the top" - # Insert the define before the very first `#include` so it precedes all the - # speculative-decoding code paths. + echo "==> patching $SRC to define LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP at the top" + # Insert the define before the very first `#include` so it precedes the + # checkpoint_min_step references. awk ' !done && /^#include/ { - print "#define LOCALAI_LEGACY_LLAMA_CPP_SPEC 1" + print "#define LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP 1" print "// ^ injected by backend/cpp/turboquant/patch-grpc-server.sh" print "" done = 1 @@ -145,13 +91,13 @@ else { print } END { if (!done) { - print "patch-grpc-server.sh: no #include anchor found to insert LOCALAI_LEGACY_LLAMA_CPP_SPEC" > "/dev/stderr" + print "patch-grpc-server.sh: no #include anchor found to insert LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP" > "/dev/stderr" exit 1 } } ' "$SRC" > "$SRC.tmp" mv "$SRC.tmp" "$SRC" - echo "==> LOCALAI_LEGACY_LLAMA_CPP_SPEC define OK" + echo "==> LOCALAI_TURBOQUANT_NO_CHECKPOINT_MIN_STEP define OK" fi echo "==> all patches applied" diff --git a/backend/cpp/turboquant/patches/0001-hip-guard-copy2d-peer-fastpath.patch b/backend/cpp/turboquant/patches/0001-hip-guard-copy2d-peer-fastpath.patch new file mode 100644 index 000000000..71e55f621 --- /dev/null +++ b/backend/cpp/turboquant/patches/0001-hip-guard-copy2d-peer-fastpath.patch @@ -0,0 +1,55 @@ +hip: port the turboquant CUDA additions that ggml's HIP shim doesn't cover + +The turboquant fork adds/modifies a few ggml-cuda.cu spots with CUDA APIs +that ggml's HIP (and MUSA) compatibility layer does not provide, breaking +the -gpu-rocm-hipblas-turboquant build: + + 1. ggml_cuda_copy2d_across_devices() (host-staged cross-device copy for + split mul_mat output) uses the CUDA 3D-peer copy APIs + cudaMemcpy3DPeerParms / make_cudaPitchedPtr / make_cudaExtent / + cudaMemcpy3DPeerAsync. HIP genuinely does not support these (see the + fork's own comment "HIP does not support cudaMemcpy3DPeerAsync"), so + guard the peer fast path with #if !defined(GGML_USE_HIP) && + !defined(GGML_USE_MUSA) -- matching how the fork already guards the + same API for the sibling 2D copy -- and fall through to the existing + cudaMemcpyAsync staging fallback below (functionally identical, + slightly slower on multi-GPU ROCm). + + 2. ggml_backend_cuda_device_event_new() creates its event with plain + cudaEventCreate, which ggml's HIP shim does not alias (it only aliases + cudaEventCreateWithFlags). Use cudaEventCreateWithFlags(..., + cudaEventDisableTiming) -- exactly what the rest of this file already + does (cf. lines ~1034, ~3461) and HIP-safe. + +CUDA builds are unaffected. Drop the relevant hunk once the fork HIP-ports +these; apply-patches.sh fails fast if an anchor goes stale. + +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index 0427e6b..6352e6a 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -1933,6 +1933,7 @@ static cudaError_t ggml_cuda_copy2d_across_devices( + size_t width, size_t height, cudaStream_t dst_stream, cudaStream_t src_stream) { + + const auto & info = ggml_cuda_info(); ++#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) // 3D-peer copy types unmapped by ggml's HIP/MUSA shim; use staging fallback below + if (info.peer_access[src_device][dst_device]) { + cudaMemcpy3DPeerParms p = {}; + p.dstDevice = dst_device; +@@ -1942,6 +1943,7 @@ static cudaError_t ggml_cuda_copy2d_across_devices( + p.extent = make_cudaExtent(width, height, 1); + return cudaMemcpy3DPeerAsync(&p, dst_stream); + } ++#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) + + // Fallback: stage all rows through a single contiguous pinned buffer + int prev_device = ggml_cuda_get_device(); +@@ -5714,7 +5716,7 @@ static ggml_backend_event_t ggml_backend_cuda_device_event_new(ggml_backend_dev_ + ggml_cuda_set_device(dev_ctx->device); + + cudaEvent_t event; +- CUDA_CHECK(cudaEventCreate(&event)); ++ CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); + + return new ggml_backend_event { + /* .device = */ dev,