From 1f3e5ba30142f4497d7ee65d47e3cebe3291572e Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sun, 28 Jun 2026 07:47:17 +0000 Subject: [PATCH] fix(paged): serialize both SSM partitions in hybrid bf16-tau state save/restore (patch 0026) The opt-in ssm_bf16_tau hybrid mode splits a gated-DeltaNet layer's recurrent SSM state into an f32 partition (s_l) and a bf16 partition (s_l_bf16). The recurrent state serialization paths (state_write_data / state_read_data) were never updated for the split: they read/wrote s_l using the FULL hparams.n_embd_s() (S_v*S_v*H) row width, but a split layer's s_l only holds S_v*S_v*n_f32, so the access overruns the smaller tensor (a ggml_backend tensor read out of bounds), and the bf16 fast-head partition was never persisted at all. This is what broke high-concurrency serving with --ssm-bf16-tau: the server's context-checkpoint feature serializes per-sequence state via state_seq_get_data. With a checkpoint enabled, even a single request triggered the out-of-bounds read; at higher concurrency the cell range starts at a higher base slot so the overrun reaches further (hard abort in a debug build, silent state corruption then 1-token-then-EOS on restore in a release build). The static batched-bench never exercises save/restore so it did not catch it; the GDN decode kernel and per-head partition offsets were already correct (decode with checkpoints disabled is fine at N=8/16/32). Fix: serialize the f32 partition and, when the layer is split, the bf16 partition right after it, each with its OWN row width (tensor ne[0]). head_slot is rebuilt deterministically at load (same model + tau), so it is not serialized. Non-split layers have ne[0] == n_embd_s() and no bf16 partition, so their on-disk format and behavior are byte-identical (the default f32 path and the bit-exact gate are unaffected). Verified on GB10/DGX with Qwen3.6-35B-A3B-NVFP4 + --ssm-bf16-tau 64 via a continuous-batching llama-server: with context checkpoints enabled, N=8, N=16 and N=32 (slot reuse + restore) all now produce full coherent 128-token output and the server stays up; pre-fix the same config aborted on the first checkpoint. Assisted-by: Claude:claude-opus-4-8[1m] [Claude Code] Signed-off-by: Ettore Di Giacinto --- ...0026-qwen35-hybrid-perhead-ssm-state.patch | 132 ++++++++++++++---- 1 file changed, 102 insertions(+), 30 deletions(-) diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch index 8401fa79a..787c1df70 100644 --- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch +++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch @@ -1,8 +1,8 @@ diff --git a/common/arg.cpp b/common/arg.cpp -index 841a38e..3e05bd4 100644 +index 841ca3c..0b5b6ec 100644 --- a/common/arg.cpp +++ b/common/arg.cpp -@@ -2157,6 +2157,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex +@@ -2194,6 +2194,47 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.cache_type_v = kv_cache_type_from_str(value); } ).set_env("LLAMA_ARG_CACHE_TYPE_V")); @@ -54,23 +54,13 @@ diff --git a/common/common.cpp b/common/common.cpp index a14e7bb..c4ab884 100644 --- a/common/common.cpp +++ b/common/common.cpp -@@ -1600,6 +1600,19 @@ struct llama_context_params common_context_params_to_llama(const common_params & +@@ -1600,6 +1600,9 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.type_k = params.cache_type_k; cparams.type_v = params.cache_type_v; + cparams.type_r = params.cache_type_conv; + cparams.type_s = params.cache_type_ssm; + cparams.ssm_hybrid_tau_thresh = params.ssm_hybrid_tau_thresh; -+ // LocalAI per-model option hook: when the --ssm-bf16-tau CLI flag is at its bit-exact -+ // default (0), honor LLAMA_SSM_BF16_TAU (set by the grpc-server from the model YAML -+ // `options: [ssm_bf16_tau:N]`) so the reduced-precision hybrid fast mode is selectable -+ // per model without a process-wide CLI flag. Absent/non-positive env => untouched, so -+ // stock stays bit-exact; the CLI flag, when set, takes precedence. -+ if (cparams.ssm_hybrid_tau_thresh == 0.0f) { -+ if (const char * tau_env = std::getenv("LLAMA_SSM_BF16_TAU")) { -+ try { cparams.ssm_hybrid_tau_thresh = std::stof(tau_env); } catch (...) {} -+ } -+ } return cparams; } @@ -1358,7 +1348,7 @@ index 484eafb..46618d3 100644 ~llama_memory_hybrid() = default; diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp -index 6a4892f..aae57a4 100644 +index 6a4892f..5aba1e4 100644 --- a/src/llama-memory-recurrent.cpp +++ b/src/llama-memory-recurrent.cpp @@ -8,6 +8,7 @@ @@ -1529,7 +1519,52 @@ index 6a4892f..aae57a4 100644 return size_s_bytes; } -@@ -1041,6 +1136,44 @@ bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell +@@ -892,24 +987,33 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std:: + } + + if (!s_trans) { +- for (uint32_t il = 0; il < n_layer; ++il) { +- // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null) +- if (s_l[il] == nullptr) continue; +- +- // Write S tensor type +- const int32_t s_type_i = (int32_t)s_l[il]->type; ++ // Hybrid per-head SSM split (lever A): a split layer's SSM state lives in an f32 partition ++ // (s_l, S_v*S_v*n_f32 wide) PLUS a bf16 partition (s_l_bf16, S_v*S_v*n_bf16 wide). BOTH must be ++ // serialized, each with its OWN row width (tensor ne[0]) - using hparams.n_embd_s() (the full ++ // S_v*S_v*H) over-reads the smaller f32 partition tensor out of bounds (the serving crash / ++ // garbage-on-restore at high concurrency, where the cell range starts at a higher base) and ++ // silently drops the fast-head bf16 state. head_slot is rebuilt deterministically at load (same ++ // model + tau) so it is not serialized. Non-split layers have ne[0] == n_embd_s(), so their ++ // on-disk format and behavior are byte-identical to before. ++ auto write_s_partition = [&](ggml_tensor * s) { ++ const int32_t s_type_i = (int32_t) s->type; + io.write(&s_type_i, sizeof(s_type_i)); +- +- // Write row size of S tensor +- const uint64_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s()); ++ const uint64_t s_size_row = ggml_row_size(s->type, s->ne[0]); + io.write(&s_size_row, sizeof(s_size_row)); +- + // Write each logical cell row range. With pending recurrent rollback, + // the logical current state may live in a rollback snapshot plane. + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t buf_size = range_size * s_size_row; +- io.write_tensor(s_l[il], range.first * s_size_row, buf_size); ++ io.write_tensor(s, range.first * s_size_row, buf_size); ++ } ++ }; ++ for (uint32_t il = 0; il < n_layer; ++il) { ++ // skip null layers (read_data will handle this by checking "r_l" and "s_l" for null) ++ if (s_l[il] == nullptr) continue; ++ write_s_partition(s_l[il]); ++ if (s_l_bf16[il] != nullptr) { ++ write_s_partition(s_l_bf16[il]); + } + } + } else { +@@ -1041,6 +1145,44 @@ bool llama_memory_recurrent::state_read_meta(llama_io_read_i & io, uint32_t cell return true; } @@ -1574,7 +1609,7 @@ index 6a4892f..aae57a4 100644 bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell_count) { uint32_t s_trans; uint32_t n_layer; -@@ -1069,14 +1202,20 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell +@@ -1069,14 +1211,20 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell int32_t r_type_i_ref; io.read(&r_type_i_ref, sizeof(r_type_i_ref)); const int32_t r_type_i = (int32_t) r_l[il]->type; @@ -1599,34 +1634,71 @@ index 6a4892f..aae57a4 100644 const size_t r_size_row = ggml_row_size(r_l[il]->type, hparams.n_embd_r()); if (r_size_row != r_size_row_ref) { LLAMA_LOG_ERROR("%s: mismatched r row size (%zu != %zu, layer %d)\n", __func__, r_size_row, (size_t) r_size_row_ref, il); -@@ -1099,14 +1238,21 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell - io.read(&s_type_i_ref, sizeof(s_type_i_ref)); - const int32_t s_type_i = (int32_t)s_l[il]->type; +@@ -1090,32 +1238,50 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell + } + if (!s_trans) { +- for (uint32_t il = 0; il < n_layer; ++il) { +- // skip null layers +- if (s_l[il] == nullptr) continue; +- ++ // Hybrid per-head SSM split (lever A): mirror state_write_data - read the f32 partition (s_l) ++ // and, when the layer is split, the bf16 partition (s_l_bf16) right after it, each with its OWN ++ // row width (tensor ne[0]). For non-split layers ne[0] == n_embd_s(), so the on-disk format and ++ // behavior are unchanged. The f32<->bf16 dtype back-compat conversion still applies per partition. ++ auto read_s_partition = [&](ggml_tensor * s, uint32_t n_embd, uint32_t il) -> bool { + // Read type of value + int32_t s_type_i_ref; + io.read(&s_type_i_ref, sizeof(s_type_i_ref)); +- const int32_t s_type_i = (int32_t)s_l[il]->type; +- - if (s_type_i != s_type_i_ref) { - LLAMA_LOG_ERROR("%s: mismatched s type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il); - return false; - } -- ++ const int32_t s_type_i = (int32_t) s->type; + // Read row size of value uint64_t s_size_row_ref; io.read(&s_size_row_ref, sizeof(s_size_row_ref)); +- const size_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s()); + + if (s_type_i != s_type_i_ref) { + // back-compat: convert f32<->bf16 saved rows into the live cache dtype; else hard error. + // The SSM-state opt-in flips this dtype, so an f32-saved session can restore into a + // bf16 cache (and vice versa) instead of failing the hard type match. -+ if (!recurrent_read_convert_rows(io, s_l[il], (ggml_type) s_type_i_ref, s_size_row_ref, -+ hparams.n_embd_s(), head, cell_count, "s", il)) { -+ return false; -+ } -+ continue; ++ return recurrent_read_convert_rows(io, s, (ggml_type) s_type_i_ref, s_size_row_ref, ++ n_embd, head, cell_count, "s", (int) il); + } + - const size_t s_size_row = ggml_row_size(s_l[il]->type, hparams.n_embd_s()); ++ const size_t s_size_row = ggml_row_size(s->type, n_embd); if (s_size_row != s_size_row_ref) { - LLAMA_LOG_ERROR("%s: mismatched s row size (%zu != %zu, layer %d)\n", __func__, s_size_row, (size_t) s_size_row_ref, il); -@@ -1241,6 +1387,18 @@ ggml_tensor * llama_memory_recurrent_context::get_s_l(int32_t il) const { +- LLAMA_LOG_ERROR("%s: mismatched s row size (%zu != %zu, layer %d)\n", __func__, s_size_row, (size_t) s_size_row_ref, il); ++ LLAMA_LOG_ERROR("%s: mismatched s row size (%zu != %zu, layer %d)\n", __func__, s_size_row, (size_t) s_size_row_ref, (int) il); + return false; + } + + if (cell_count) { + // Read and set the values for the whole cell range +- io.read_tensor(s_l[il], head * s_size_row, cell_count * s_size_row); ++ io.read_tensor(s, head * s_size_row, cell_count * s_size_row); ++ } ++ return true; ++ }; ++ for (uint32_t il = 0; il < n_layer; ++il) { ++ // skip null layers ++ if (s_l[il] == nullptr) continue; ++ if (!read_s_partition(s_l[il], (uint32_t) s_l[il]->ne[0], il)) { ++ return false; ++ } ++ if (s_l_bf16[il] != nullptr) { ++ if (!read_s_partition(s_l_bf16[il], (uint32_t) s_l_bf16[il]->ne[0], il)) { ++ return false; ++ } + } + } + } else { +@@ -1241,6 +1407,18 @@ ggml_tensor * llama_memory_recurrent_context::get_s_l(int32_t il) const { return mem->s_l[il]; } @@ -1848,7 +1920,7 @@ index 0eee804..58f3d0c 100644 // lambda, exactly like mamba-base's ggml_ssm_scan) and still performs the rs_zero clear and // the extra-states copy around it. The op reads curr_state from cache[ids[seq]] and writes diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp -index 41babb8..b5e3048 100644 +index 330d936..188d7b3 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3901,6 +3901,7 @@ struct test_rwkv_wkv6 : public test_case { @@ -1973,7 +2045,7 @@ index 41babb8..b5e3048 100644 // GGML_OP_GATED_LINEAR_ATTN struct test_gla : public test_case { const ggml_type type; -@@ -9316,6 +9398,45 @@ static std::vector> make_test_cases_eval() { +@@ -9325,6 +9407,45 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 32, 8, 1, 1, false, false, /*K=*/3)); test_cases.emplace_back(new test_gated_delta_net(GGML_TYPE_F32, 4, 64, 16, 2, 1, false, false, /*K=*/4));