diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-feat-paged-P1-bf16-stream-residual-segment-executor-.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-feat-paged-P1-bf16-stream-residual-segment-executor-.patch new file mode 100644 index 000000000..7f78a8b70 --- /dev/null +++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-feat-paged-P1-bf16-stream-residual-segment-executor-.patch @@ -0,0 +1,922 @@ +From 1271488fc47d7db2319163d0b34601dd30d49250 Mon Sep 17 00:00:00 2001 +From: Ettore Di Giacinto +Date: Thu, 2 Jul 2026 16:29:07 +0200 +Subject: [PATCH 53/55] feat(paged): P1 bf16-stream residual-segment executor + + norm-bf16 kernels + +Additive, default-off (LLAMA_BF16_STREAM=1) bf16-resident activation stream +for the q36 residual path, targeting prefill bucket 3 (the convert/glue tax). + +- norm-bf16.cu/.cuh: rms_norm, the 0042 pre_add_mul, and the 0044 gate_mul + norms templated on output dtype, bit-faithful to the f32 kernels up to the + __float2bfloat16 store. +- One additive clause in ggml_cuda_try_fuse detects a residual-stream + norm-producer whose consumers are all large-M cuBLAS-bf16 projections, runs + the norm into a bf16 pool buffer, executes the owned span inline through a + bf16 view, then skips it. Strict all-consumers-are-ours guard keeps the f32 + norm un-materialised and bails to the stock f32 path otherwise (small-M, + decode, MMQ, native-FP4, multi-consumer). +- LLAMA_BF16_CUBLAS_F32_OUT plank: owned projections write f32 directly from + bf16 tensor-core compute, skipping the bf16 dst pool + bf16->f32 convert; + the F32_OUT else-branch is byte-identical to the original cuBLAS path. + +Default md5 stays canonical with the code present-but-off and env-on (small-M +prompts bail): MoE 8cb0ce23777bf55f92f63d0292c756b0, dense 5951a5b4d624ce891e22ab5fca9bc439. + +Assisted-by: Claude:opus-4.8 [Claude Code] +Signed-off-by: Ettore Di Giacinto +--- + ggml/src/ggml-cuda/ggml-cuda.cu | 297 +++++++++++++++++-- + ggml/src/ggml-cuda/norm-bf16.cu | 483 +++++++++++++++++++++++++++++++ + ggml/src/ggml-cuda/norm-bf16.cuh | 37 +++ + 3 files changed, 793 insertions(+), 24 deletions(-) + create mode 100644 ggml/src/ggml-cuda/norm-bf16.cu + create mode 100644 ggml/src/ggml-cuda/norm-bf16.cuh + +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index ef1bdc3b4..5626ccebb 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -36,6 +36,7 @@ + #include "ggml-cuda/mmvf.cuh" + #include "ggml-cuda/mmvq.cuh" + #include "ggml-cuda/norm.cuh" ++#include "ggml-cuda/norm-bf16.cuh" + #include "ggml-cuda/opt-step-adamw.cuh" + #include "ggml-cuda/opt-step-sgd.cuh" + #include "ggml-cuda/out-prod.cuh" +@@ -1628,12 +1629,29 @@ static const cublas_force_compute_type & ggml_cuda_cublas_get_force_compute_type + return compute_type; + } + ++// [P1 bf16-stream] LLAMA_BF16_CUBLAS_F32_OUT plank. When set (by the bf16-stream ++// segment executor around an owned projection, or globally via the env), the cuBLAS ++// bf16/nvfp4 GEMM writes f32 directly from the bf16 tensor-core compute, skipping the ++// bf16 dst pool buffer + the bf16->f32 output convert_dtype. The result is the full ++// f32 GEMM accumulation (the current path rounds it to bf16 then widens back), so this ++// is a strictly-more-precise dtype change gated on the opt-in KL path, never md5. ++static thread_local bool g_bf16_stream_f32_out = false; ++static bool ggml_cuda_bf16_cublas_f32_out_env() { ++ static const bool e = [] { ++ const char * s = getenv("LLAMA_BF16_CUBLAS_F32_OUT"); ++ return s != nullptr && atoi(s) != 0; ++ }(); ++ return e; ++} ++ + static void ggml_cuda_op_mul_mat_cublas( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream) { + ++ const bool bf16_stream_f32_out = g_bf16_stream_f32_out || ggml_cuda_bf16_cublas_f32_out_env(); ++ + GGML_ASSERT(src0_dd_i != nullptr); + GGML_ASSERT(src1_ddf_i != nullptr); + GGML_ASSERT(dst_dd_i != nullptr); +@@ -1686,23 +1704,34 @@ static void ggml_cuda_op_mul_mat_cublas( + } + const nv_bfloat16 * src1_ptr = src1->type == GGML_TYPE_BF16 ? (const nv_bfloat16 *) src1_ddf_i : src1_as_bf16.get(); + const nv_bfloat16 * src0_ptr = src0_as_bf16.get(); +- ggml_cuda_pool_alloc dst_bf16(ctx.pool(id), row_diff*src1_ncols); + + const float alpha_f32 = 1.0f; + const float beta_f32 = 0.0f; + + CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream)); +- CUBLAS_CHECK( +- cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N, +- row_diff, src1_ncols, ne10, +- &alpha_f32, src0_ptr, CUDA_R_16BF, ne00, +- src1_ptr, CUDA_R_16BF, ne10, +- &beta_f32, dst_bf16.get(), CUDA_R_16BF, ldc, +- CUBLAS_COMPUTE_32F, +- CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +- +- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16); +- to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream); ++ if (bf16_stream_f32_out) { ++ // [P1 bf16-stream] write f32 directly, skip the bf16 dst pool + convert. ++ CUBLAS_CHECK( ++ cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N, ++ row_diff, src1_ncols, ne10, ++ &alpha_f32, src0_ptr, CUDA_R_16BF, ne00, ++ src1_ptr, CUDA_R_16BF, ne10, ++ &beta_f32, dst_dd_i, CUDA_R_32F, ldc, ++ CUBLAS_COMPUTE_32F, ++ CUBLAS_GEMM_DEFAULT_TENSOR_OP)); ++ } else { ++ ggml_cuda_pool_alloc dst_bf16(ctx.pool(id), row_diff*src1_ncols); ++ CUBLAS_CHECK( ++ cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N, ++ row_diff, src1_ncols, ne10, ++ &alpha_f32, src0_ptr, CUDA_R_16BF, ne00, ++ src1_ptr, CUDA_R_16BF, ne10, ++ &beta_f32, dst_bf16.get(), CUDA_R_16BF, ldc, ++ CUBLAS_COMPUTE_32F, ++ CUBLAS_GEMM_DEFAULT_TENSOR_OP)); ++ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16); ++ to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream); ++ } + } else if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) { + ggml_cuda_pool_alloc src1_as_bf16(ctx.pool(id)); + if (src1->type != GGML_TYPE_BF16) { +@@ -1714,23 +1743,34 @@ static void ggml_cuda_op_mul_mat_cublas( + } + const nv_bfloat16 * src1_ptr = src1->type == GGML_TYPE_BF16 ? (const nv_bfloat16 *) src1_ddf_i : src1_as_bf16.get(); + const nv_bfloat16 * src0_ptr = (const nv_bfloat16 *)src0_dd_i; +- ggml_cuda_pool_alloc dst_bf16(ctx.pool(id), row_diff*src1_ncols); + + const float alpha_f32 = 1.0f; + const float beta_f32 = 0.0f; + + CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream)); +- CUBLAS_CHECK( +- cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N, +- row_diff, src1_ncols, ne10, +- &alpha_f32, src0_ptr, CUDA_R_16BF, ne00, +- src1_ptr, CUDA_R_16BF, ne10, +- &beta_f32, dst_bf16.get(), CUDA_R_16BF, ldc, +- CUBLAS_COMPUTE_32F, +- CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +- +- const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16); +- to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream); ++ if (bf16_stream_f32_out) { ++ // [P1 bf16-stream] write f32 directly, skip the bf16 dst pool + convert. ++ CUBLAS_CHECK( ++ cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N, ++ row_diff, src1_ncols, ne10, ++ &alpha_f32, src0_ptr, CUDA_R_16BF, ne00, ++ src1_ptr, CUDA_R_16BF, ne10, ++ &beta_f32, dst_dd_i, CUDA_R_32F, ldc, ++ CUBLAS_COMPUTE_32F, ++ CUBLAS_GEMM_DEFAULT_TENSOR_OP)); ++ } else { ++ ggml_cuda_pool_alloc dst_bf16(ctx.pool(id), row_diff*src1_ncols); ++ CUBLAS_CHECK( ++ cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N, ++ row_diff, src1_ncols, ne10, ++ &alpha_f32, src0_ptr, CUDA_R_16BF, ne00, ++ src1_ptr, CUDA_R_16BF, ne10, ++ &beta_f32, dst_bf16.get(), CUDA_R_16BF, ldc, ++ CUBLAS_COMPUTE_32F, ++ CUBLAS_GEMM_DEFAULT_TENSOR_OP)); ++ const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16); ++ to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream); ++ } + } else if (fast_fp16_hardware_available(cc) && use_fp16) { + // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 + ggml_cuda_pool_alloc src0_as_f16(ctx.pool(id)); +@@ -4706,6 +4746,215 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph + return 2; + } + ++ // [P1 bf16-stream] Generalized additive segment executor (LLAMA_BF16_STREAM=1, ++ // default off). ONE clause; the residual-stream segment is detected inside it. ++ // Owns any norm-producer whose consumers are ALL large-M cuBLAS-bf16 projections and ++ // runs that norm into a bf16 pool buffer so every projection reads the bf16 ++ // activation directly - no per-op f32->bf16 convert_dtype glue. Two live q36 kinds: ++ // * plain rms_norm+mul {RMS_NORM,MUL} -> BF16 q/k/v / GDN in_proj (may be ++ // multi-consumer: q,k,v share it) ++ // * 0044 gated-DeltaNet output norm {SILU,RMS_NORM,MUL,MUL} -> ssm_out (the P0 seg) ++ // (The 0042 {ADD,RMS_NORM,MUL} residual-fused norm is handled by its f32 clause below ++ // and, on q36, feeds the NVFP4-MMQ experts, so a bf16 stream there would bail; its ++ // bf16 variant lives in norm-bf16.cu for op-set completeness.) ++ // ++ // Correctness: strict all-consumers-are-ours guard - the f32 norm output is never ++ // materialised, so every node that transitively reads it must be one of our owned ++ // projections (as src1); any other reader, or an unrelated compute node inside the ++ // skipped span, bails and the f32 fused-norm path runs unchanged. Each projection is ++ // executed inline through a bf16 view of the shared buffer; the whole owned span ++ // (norm nodes + intervening pure-view no-ops + the projections) is then skipped. The ++ // LLAMA_BF16_CUBLAS_F32_OUT plank additionally makes the owned projections write f32 ++ // directly (skipping the dst convert). Env-off path and decode/small-M md5 untouched. ++ static const bool bf16_stream = [] { ++ const char * e = getenv("LLAMA_BF16_STREAM"); ++ return e != nullptr && atoi(e) != 0; ++ }(); ++ static const int bf16_stream_trace = [] { ++ const char * e = getenv("LLAMA_BF16_STREAM_TRACE"); ++ return e != nullptr ? atoi(e) : 0; ++ }(); ++ static const bool bf16_stream_f32_out_default = [] { ++ const char * e = getenv("LLAMA_BF16_CUBLAS_F32_OUT"); ++ return e == nullptr || atoi(e) != 0; // plank ON by default when a segment engages ++ }(); ++ if (bf16_stream) { ++ // ---- detect the norm-producer kind + the f32 activation tensor + node span ---- ++ int kind = 0; // 1=plain rms+mul, 2=gated-DeltaNet output norm ++ int norm_span = 0; ++ const char * seg_kind = nullptr; ++ ggml_tensor * k_rms = nullptr, * k_mul = nullptr, * k_silu = nullptr; ++ ggml_tensor * norm_out = nullptr; ++ if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_MUL }, { GGML_UNARY_OP_SILU })) { ++ kind = 2; k_silu = cgraph->nodes[i]; k_rms = cgraph->nodes[i + 1]; k_mul = cgraph->nodes[i + 2]; ++ norm_out = cgraph->nodes[i + 3]; norm_span = 4; seg_kind = "gate_norm"; ++ } else if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) { ++ kind = 1; k_rms = cgraph->nodes[i]; k_mul = cgraph->nodes[i + 1]; ++ norm_out = cgraph->nodes[i + 1]; norm_span = 2; seg_kind = "rms_norm"; ++ } ++ ++ if (kind != 0) { ++ const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; ++ const int norm_end = i + norm_span; ++ ++ // follow a pure view/reshape chain up to norm_out ++ auto roots_at = [](const ggml_tensor * t, const ggml_tensor * root) -> bool { ++ const ggml_tensor * c = t; ++ for (int d = 0; d < 8 && c != nullptr; ++d) { ++ if (c == root) return true; ++ if (c->view_src) { c = c->view_src; continue; } ++ if ((c->op == GGML_OP_RESHAPE || c->op == GGML_OP_VIEW || c->op == GGML_OP_PERMUTE || ++ c->op == GGML_OP_TRANSPOSE || c->op == GGML_OP_CONT) && c->src[0]) { c = c->src[0]; continue; } ++ break; ++ } ++ return false; ++ }; ++ // Metadata-only no-ops (match the stock capture loop's skip set). CONT is ++ // NOT here: it materializes a contiguous copy, so a CONT of norm_out must fall ++ // through to the roots_at check below and bail (it would need the f32 norm). ++ auto is_pure_view = [](const ggml_tensor * t) -> bool { ++ return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_PERMUTE || ++ t->op == GGML_OP_TRANSPOSE || t->op == GGML_OP_NONE; ++ }; ++ // ownable large-M cuBLAS-bf16 projection whose src1 is the FULL norm output ++ auto is_owned_proj = [&](const ggml_tensor * p) -> bool { ++ if (p->op != GGML_OP_MUL_MAT) return false; ++ const ggml_tensor * w = p->src[0]; ++ const ggml_tensor * x1 = p->src[1]; ++ if (!w || !x1) return false; ++ if (!(x1 == norm_out || x1->view_src == norm_out || ++ (x1->op == GGML_OP_RESHAPE && x1->src[0] == norm_out))) return false; ++ if (ggml_nelements(x1) != ggml_nelements(norm_out)) return false; // full, offset 0 ++ return (w->type == GGML_TYPE_BF16 || w->type == GGML_TYPE_NVFP4) && ggml_is_contiguous(w) && ++ p->type == GGML_TYPE_F32 && ++ x1->ne[2] == 1 && x1->ne[3] == 1 && ++ x1->ne[1] >= 128 && ++ !ggml_cuda_fp4_prefill_should_engage(w, x1, const_cast(p), cc) && ++ !ggml_cuda_should_use_mmq(w->type, cc, x1->ne[1], /*n_experts=*/0); ++ }; ++ ++ // Scan the rest of the graph: collect our projections, enforce that every ++ // consumer of norm_out is one of them, and that the skipped span holds only ++ // pure views / our projections. ++ bool ok = true; ++ int n_proj = 0; ++ int max_proj_idx = -1; ++ const char * miss_reason = "unknown"; ++ int miss_node = -1; ++ const char * miss_op = ""; ++ ggml_tensor * projs[16]; ++ for (int j = norm_end; j < cgraph->n_nodes && ok; ++j) { ++ ggml_tensor * nj = cgraph->nodes[j]; ++ // Pure view/reshape no-ops are part of the src1 view chain (or unrelated ++ // metadata ops): they carry no kernel and are re-expressed by the inline ++ // bf16 src1, so they never force f32 materialization. A *real* downstream ++ // consumer that reads norm_out through such a view is still caught below, ++ // because roots_at() climbs the view chain to norm_out. ++ if (is_pure_view(nj)) { ++ continue; ++ } ++ if (is_owned_proj(nj)) { ++ if (n_proj < 16) { projs[n_proj] = nj; } ++ n_proj++; ++ max_proj_idx = j; ++ continue; ++ } ++ // any (non-view, non-projection) reader of norm_out disqualifies the segment ++ for (int s = 0; s < GGML_MAX_SRC; ++s) { ++ if (nj->src[s] && roots_at(nj->src[s], norm_out)) { ++ ok = false; miss_reason = "nonproj_consumer"; miss_node = j; miss_op = ggml_op_name(nj->op); break; ++ } ++ } ++ } ++ // require projections, room in the fixed buffer, and a bounded span. The ++ // span [norm_end, max_proj_idx] may hold non-projection compute (q36 QK-norm / ++ // scale on the projection outputs); those never read norm_out (enforced above) ++ // so the whole span is executed inline in graph order below - owned projections ++ // through the bf16 buffer, everything else via the stock per-node executor - ++ // and then skipped as one unit. ++ const int span_len = max_proj_idx - norm_end; ++ if (!(n_proj >= 1 && n_proj <= 16 && span_len <= 96)) { ++ if (ok) { miss_reason = (n_proj == 0) ? "no_owned_proj" : (n_proj > 16 ? "too_many_proj" : "span_too_long"); } ++ ok = false; ++ } ++ ++ if (ok) { ++ const int64_t ne_tot = ggml_nelements(norm_out); ++ ggml_cuda_pool_alloc norm_bf16(cuda_ctx->pool(), ne_tot); ++ if (kind == 2) { ++ ggml_cuda_rms_norm_gate_mul_bf16out(*cuda_ctx, k_rms, k_mul, k_silu, norm_out, norm_bf16.get()); ++ } else { ++ ggml_cuda_rms_norm_mul_bf16out(*cuda_ctx, k_rms, k_mul, norm_bf16.get()); ++ } ++ ++ // Execute the whole owned span inline, in graph order (mirrors the stock ++ // capture loop's per-node handling for the non-owned nodes). ++ for (int j = norm_end; j <= max_proj_idx; ++j) { ++ ggml_tensor * nj = cgraph->nodes[j]; ++ ++ bool mine = false; ++ for (int p = 0; p < n_proj; ++p) { if (projs[p] == nj) { mine = true; break; } } ++ ++ if (mine) { ++ ggml_tensor * proj_src1 = nj->src[1]; ++ ggml_tensor src1_bf16 = *proj_src1; ++ src1_bf16.type = GGML_TYPE_BF16; ++ src1_bf16.data = norm_bf16.get(); ++ src1_bf16.view_src = nullptr; ++ src1_bf16.view_offs = 0; ++ src1_bf16.nb[0] = sizeof(nv_bfloat16); ++ src1_bf16.nb[1] = src1_bf16.nb[0] * src1_bf16.ne[0]; ++ src1_bf16.nb[2] = src1_bf16.nb[1] * src1_bf16.ne[1]; ++ src1_bf16.nb[3] = src1_bf16.nb[2] * src1_bf16.ne[2]; ++ ++ ggml_tensor * saved_src1 = nj->src[1]; ++ nj->src[1] = &src1_bf16; ++ g_bf16_stream_f32_out = bf16_stream_f32_out_default; ++ const bool okc = ggml_cuda_compute_forward(*cuda_ctx, nj); ++ g_bf16_stream_f32_out = false; ++ nj->src[1] = saved_src1; ++ GGML_ASSERT(okc); ++ continue; ++ } ++ ++ // non-owned span node: mirror the stock loop (skip metadata no-ops, ++ // run the rest through the per-node executor) ++ if (ggml_is_empty(nj) || nj->op == GGML_OP_RESHAPE || nj->op == GGML_OP_TRANSPOSE || ++ nj->op == GGML_OP_VIEW || nj->op == GGML_OP_PERMUTE || nj->op == GGML_OP_NONE) { ++ continue; ++ } ++ if ((nj->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) { ++ continue; ++ } ++ const bool okn = ggml_cuda_compute_forward(*cuda_ctx, nj); ++ GGML_ASSERT(okn); ++ } ++ ++ static std::atomic bf16_stream_engage_count{0}; ++ const int ec = bf16_stream_engage_count.fetch_add(1, std::memory_order_relaxed); ++ if (bf16_stream_trace > 0 && ec < bf16_stream_trace) { ++ const ggml_tensor * w0 = projs[0]->src[0]; ++ fprintf(stderr, ++ "[LLAMA_BF16_STREAM] engaged seg=%s node=%d n_proj=%d last_proj=%d " ++ "M=%" PRId64 " N=%" PRId64 " K=%" PRId64 " f32out=%d skip=%d\n", ++ seg_kind, i, n_proj, max_proj_idx, projs[0]->src[1]->ne[1], w0->ne[1], w0->ne[0], ++ bf16_stream_f32_out_default ? 1 : 0, max_proj_idx - i); ++ } ++ return max_proj_idx - i; // skip norm nodes + intervening views + all owned projections ++ } ++ ++ if (bf16_stream_trace > 0) { ++ static std::atomic bf16_stream_miss_count{0}; ++ const int mc = bf16_stream_miss_count.fetch_add(1, std::memory_order_relaxed); ++ if (mc < bf16_stream_trace) { ++ fprintf(stderr, ++ "[LLAMA_BF16_STREAM] miss seg=%s node=%d n_proj=%d reason=%s miss_node=%d miss_op=%s\n", ++ seg_kind, i, n_proj, miss_reason, miss_node, miss_op); ++ } ++ } ++ } ++ } ++ + // Fused gated RMS norm: RMS norm + weight multiply + SiLU-gated multiply + // (bit-exact). The Qwen3.6 gated-DeltaNet output norm. Default ON; set + // LLAMA_FUSE_GATE_RMSNORM=0 for a clean A/B against the unfused path. +diff --git a/ggml/src/ggml-cuda/norm-bf16.cu b/ggml/src/ggml-cuda/norm-bf16.cu +new file mode 100644 +index 000000000..77ccbf80e +--- /dev/null ++++ b/ggml/src/ggml-cuda/norm-bf16.cu +@@ -0,0 +1,483 @@ ++#include "norm-bf16.cuh" ++ ++#include ++ ++// [P1 bf16-stream] bf16-output variants of the residual-stream norms. Same reduction ++// and FP order as the f32 kernels in norm.cu; only the final store is narrowed. Kept ++// bit-faithful to the f32 norms up to the __float2bfloat16 store so the opt-in stream ++// stays a pure dtype (KL-benign) change, not an algorithmic one. ++ ++// Output-store policy: identity for float, round-to-nearest bf16 for nv_bfloat16. ++template struct bf16stream_store; ++template <> struct bf16stream_store { ++ static __device__ __forceinline__ float store(float v) { return v; } ++}; ++template <> struct bf16stream_store { ++ static __device__ __forceinline__ nv_bfloat16 store(float v) { return __float2bfloat16(v); } ++}; ++ ++// --------------------------------------------------------------------------- ++// plain rms_norm + weight multiply -> Tdst (mirrors rms_norm_f32) ++// --------------------------------------------------------------------------- ++template ++static __global__ void rms_norm_mul_out(const float * x, ++ Tdst * dst, ++ const int ncols, ++ const int64_t stride_row, ++ const int64_t stride_channel, ++ const int64_t stride_sample, ++ const float eps, ++ const float * mul, ++ const int64_t mul_stride_row, ++ const int64_t mul_stride_channel, ++ const int64_t mul_stride_sample, ++ const uint3 mul_ncols_packed, ++ const uint3 mul_nrows_packed, ++ const uint3 mul_nchannels_packed, ++ const uint3 mul_nsamples_packed) { ++ ggml_cuda_pdl_lc(); ++ const int nrows = gridDim.x; ++ const int nchannels = gridDim.y; ++ ++ const int row = blockIdx.x; ++ const int channel = blockIdx.y; ++ const int sample = blockIdx.z; ++ const int tid = threadIdx.x; ++ ++ x += sample*stride_sample + channel*stride_channel + row*stride_row; ++ dst += ((sample*nchannels + channel)*nrows + row)*ncols; ++ ++ { ++ const uint32_t mul_row = fastmodulo(row, mul_nrows_packed); ++ const uint32_t mul_channel = fastmodulo(channel, mul_nchannels_packed); ++ const uint32_t mul_sample = fastmodulo(sample, mul_nsamples_packed); ++ mul += mul_sample * mul_stride_sample + mul_channel * mul_stride_channel + mul_row * mul_stride_row; ++ } ++ ++ float tmp = 0.0f; ++ ++ ggml_cuda_pdl_sync(); ++ for (int col = tid; col < ncols; col += block_size) { ++ const float xi = x[col]; ++ tmp += xi * xi; ++ } ++ ++ extern __shared__ float s_sum[]; ++ tmp = block_reduce(tmp, s_sum); ++ ++ const float mean = tmp / ncols; ++ const float scale = rsqrtf(mean + eps); ++ ++ for (int col = tid; col < ncols; col += block_size) { ++ const int mul_col = fastmodulo(col, mul_ncols_packed); ++ dst[col] = bf16stream_store::store(scale * x[col] * mul[mul_col]); ++ } ++} ++ ++// --------------------------------------------------------------------------- ++// 0042 residual-add + rms_norm + weight multiply -> f32 h_out + Tdst dst ++// (mirrors rms_norm_pre_add_mul_f32; h_out stays f32 so the next ++// residual add reads the same f32 residual stream) ++// --------------------------------------------------------------------------- ++template ++static __global__ void rms_norm_pre_add_mul_out(const float * a, ++ const float * b, ++ float * h_out, ++ Tdst * dst, ++ const int ncols, ++ const int64_t stride_row, ++ const int64_t stride_channel, ++ const int64_t stride_sample, ++ const float eps, ++ const float * mul, ++ const int64_t mul_stride_row, ++ const int64_t mul_stride_channel, ++ const int64_t mul_stride_sample, ++ const uint3 mul_ncols_packed, ++ const uint3 mul_nrows_packed, ++ const uint3 mul_nchannels_packed, ++ const uint3 mul_nsamples_packed) { ++ ggml_cuda_pdl_lc(); ++ const int nrows = gridDim.x; ++ const int nchannels = gridDim.y; ++ ++ const int row = blockIdx.x; ++ const int channel = blockIdx.y; ++ const int sample = blockIdx.z; ++ const int tid = threadIdx.x; ++ ++ const int64_t row_offset = sample*stride_sample + channel*stride_channel + row*stride_row; ++ a += row_offset; ++ b += row_offset; ++ h_out += row_offset; ++ dst += ((sample*nchannels + channel)*nrows + row)*ncols; ++ ++ { ++ const uint32_t mul_row = fastmodulo(row, mul_nrows_packed); ++ const uint32_t mul_channel = fastmodulo(channel, mul_nchannels_packed); ++ const uint32_t mul_sample = fastmodulo(sample, mul_nsamples_packed); ++ mul += mul_sample * mul_stride_sample + mul_channel * mul_stride_channel + mul_row * mul_stride_row; ++ } ++ ++ float tmp = 0.0f; ++ ++ ggml_cuda_pdl_sync(); ++ for (int col = tid; col < ncols; col += block_size) { ++ const float hi = a[col] + b[col]; ++ h_out[col] = hi; // publish the f32 residual stream for the next add ++ tmp += hi * hi; ++ } ++ ++ extern __shared__ float s_sum[]; ++ tmp = block_reduce(tmp, s_sum); ++ ++ const float mean = tmp / ncols; ++ const float scale = rsqrtf(mean + eps); ++ ++ for (int col = tid; col < ncols; col += block_size) { ++ const float hi = h_out[col]; ++ const int mul_col = fastmodulo(col, mul_ncols_packed); ++ dst[col] = bf16stream_store::store(scale * hi * mul[mul_col]); ++ } ++} ++ ++// --------------------------------------------------------------------------- ++// 0044 gated-DeltaNet output norm scale*x*w*silu(z) -> Tdst (the P0 segment) ++// --------------------------------------------------------------------------- ++template ++static __global__ void rms_norm_gate_mul_out(const float * x, ++ Tdst * dst, ++ const int ncols, ++ const int64_t stride_row, ++ const int64_t stride_channel, ++ const int64_t stride_sample, ++ const float eps, ++ const float * mul, ++ const int64_t mul_stride_row, ++ const int64_t mul_stride_channel, ++ const int64_t mul_stride_sample, ++ const uint3 mul_ncols_packed, ++ const uint3 mul_nrows_packed, ++ const uint3 mul_nchannels_packed, ++ const uint3 mul_nsamples_packed, ++ const float * gate, ++ const int64_t gate_stride_row, ++ const int64_t gate_stride_channel, ++ const int64_t gate_stride_sample, ++ const uint3 gate_ncols_packed, ++ const uint3 gate_nrows_packed, ++ const uint3 gate_nchannels_packed, ++ const uint3 gate_nsamples_packed) { ++ ggml_cuda_pdl_lc(); ++ const int nrows = gridDim.x; ++ const int nchannels = gridDim.y; ++ ++ const int row = blockIdx.x; ++ const int channel = blockIdx.y; ++ const int sample = blockIdx.z; ++ const int tid = threadIdx.x; ++ ++ x += sample*stride_sample + channel*stride_channel + row*stride_row; ++ dst += ((sample*nchannels + channel)*nrows + row)*ncols; ++ ++ { ++ const uint32_t mul_row = fastmodulo(row, mul_nrows_packed); ++ const uint32_t mul_channel = fastmodulo(channel, mul_nchannels_packed); ++ const uint32_t mul_sample = fastmodulo(sample, mul_nsamples_packed); ++ mul += mul_sample * mul_stride_sample + mul_channel * mul_stride_channel + mul_row * mul_stride_row; ++ } ++ { ++ const uint32_t gate_row = fastmodulo(row, gate_nrows_packed); ++ const uint32_t gate_channel = fastmodulo(channel, gate_nchannels_packed); ++ const uint32_t gate_sample = fastmodulo(sample, gate_nsamples_packed); ++ gate += gate_sample * gate_stride_sample + gate_channel * gate_stride_channel + gate_row * gate_stride_row; ++ } ++ ++ float tmp = 0.0f; ++ ++ ggml_cuda_pdl_sync(); ++ for (int col = tid; col < ncols; col += block_size) { ++ const float xi = x[col]; ++ tmp += xi * xi; ++ } ++ ++ extern __shared__ float s_sum[]; ++ tmp = block_reduce(tmp, s_sum); ++ ++ const float mean = tmp / ncols; ++ const float scale = rsqrtf(mean + eps); ++ ++ for (int col = tid; col < ncols; col += block_size) { ++ const int mul_col = fastmodulo(col, mul_ncols_packed); ++ const int gate_col = fastmodulo(col, gate_ncols_packed); ++ const float zi = gate[gate_col]; ++ const float silu_z = zi / (1.0f + expf(-zi)); ++ dst[col] = bf16stream_store::store(scale * x[col] * mul[mul_col] * silu_z); ++ } ++} ++ ++// =========================================================================== ++// launchers ++// =========================================================================== ++template ++static void rms_norm_mul_out_cuda(const float * x, Tdst * dst, ++ const int ncols, const int nrows, const int nchannels, const int nsamples, ++ const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, ++ const float * mul, ++ const int64_t mul_stride_row, const int64_t mul_stride_channel, const int64_t mul_stride_sample, ++ const uint32_t mul_ncols, const uint32_t mul_nrows, const uint32_t mul_nchannels, const uint32_t mul_nsamples, ++ const float eps, cudaStream_t stream) { ++ const dim3 blocks_num(nrows, nchannels, nsamples); ++ GGML_ASSERT(mul != nullptr); ++ const uint3 mc = init_fastdiv_values(mul_ncols); ++ const uint3 mr = init_fastdiv_values(mul_nrows); ++ const uint3 mch = init_fastdiv_values(mul_nchannels); ++ const uint3 ms = init_fastdiv_values(mul_nsamples); ++ if (ncols < 1024) { ++ const dim3 block_dims(256, 1, 1); ++ const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream}; ++ ggml_cuda_kernel_launch(rms_norm_mul_out<256, Tdst>, lp, ++ x, dst, ncols, stride_row, stride_channel, stride_sample, eps, ++ mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms); ++ } else { ++ const dim3 block_dims(1024, 1, 1); ++ const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream}; ++ ggml_cuda_kernel_launch(rms_norm_mul_out<1024, Tdst>, lp, ++ x, dst, ncols, stride_row, stride_channel, stride_sample, eps, ++ mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms); ++ } ++} ++ ++template ++static void rms_norm_pre_add_mul_out_cuda(const float * a, const float * b, float * h_out, Tdst * dst, ++ const int ncols, const int nrows, const int nchannels, const int nsamples, ++ const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, ++ const float * mul, ++ const int64_t mul_stride_row, const int64_t mul_stride_channel, const int64_t mul_stride_sample, ++ const uint32_t mul_ncols, const uint32_t mul_nrows, const uint32_t mul_nchannels, const uint32_t mul_nsamples, ++ const float eps, cudaStream_t stream) { ++ const dim3 blocks_num(nrows, nchannels, nsamples); ++ GGML_ASSERT(mul != nullptr); ++ const uint3 mc = init_fastdiv_values(mul_ncols); ++ const uint3 mr = init_fastdiv_values(mul_nrows); ++ const uint3 mch = init_fastdiv_values(mul_nchannels); ++ const uint3 ms = init_fastdiv_values(mul_nsamples); ++ if (ncols < 1024) { ++ const dim3 block_dims(256, 1, 1); ++ const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream}; ++ ggml_cuda_kernel_launch(rms_norm_pre_add_mul_out<256, Tdst>, lp, ++ a, b, h_out, dst, ncols, stride_row, stride_channel, stride_sample, eps, ++ mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms); ++ } else { ++ const dim3 block_dims(1024, 1, 1); ++ const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream}; ++ ggml_cuda_kernel_launch(rms_norm_pre_add_mul_out<1024, Tdst>, lp, ++ a, b, h_out, dst, ncols, stride_row, stride_channel, stride_sample, eps, ++ mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms); ++ } ++} ++ ++template ++static void rms_norm_gate_mul_out_cuda(const float * x, Tdst * dst, ++ const int ncols, const int nrows, const int nchannels, const int nsamples, ++ const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, ++ const float * mul, ++ const int64_t mul_stride_row, const int64_t mul_stride_channel, const int64_t mul_stride_sample, ++ const uint32_t mul_ncols, const uint32_t mul_nrows, const uint32_t mul_nchannels, const uint32_t mul_nsamples, ++ const float * gate, ++ const int64_t gate_stride_row, const int64_t gate_stride_channel, const int64_t gate_stride_sample, ++ const uint32_t gate_ncols, const uint32_t gate_nrows, const uint32_t gate_nchannels, const uint32_t gate_nsamples, ++ const float eps, cudaStream_t stream) { ++ const dim3 blocks_num(nrows, nchannels, nsamples); ++ GGML_ASSERT(mul != nullptr); ++ GGML_ASSERT(gate != nullptr); ++ const uint3 mc = init_fastdiv_values(mul_ncols); ++ const uint3 mr = init_fastdiv_values(mul_nrows); ++ const uint3 mch = init_fastdiv_values(mul_nchannels); ++ const uint3 ms = init_fastdiv_values(mul_nsamples); ++ const uint3 gc = init_fastdiv_values(gate_ncols); ++ const uint3 gr = init_fastdiv_values(gate_nrows); ++ const uint3 gch = init_fastdiv_values(gate_nchannels); ++ const uint3 gs = init_fastdiv_values(gate_nsamples); ++ if (ncols < 1024) { ++ const dim3 block_dims(256, 1, 1); ++ const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream}; ++ ggml_cuda_kernel_launch(rms_norm_gate_mul_out<256, Tdst>, lp, ++ x, dst, ncols, stride_row, stride_channel, stride_sample, eps, ++ mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms, ++ gate, gate_stride_row, gate_stride_channel, gate_stride_sample, gc, gr, gch, gs); ++ } else { ++ const dim3 block_dims(1024, 1, 1); ++ const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream}; ++ ggml_cuda_kernel_launch(rms_norm_gate_mul_out<1024, Tdst>, lp, ++ x, dst, ncols, stride_row, stride_channel, stride_sample, eps, ++ mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms, ++ gate, gate_stride_row, gate_stride_channel, gate_stride_sample, gc, gr, gch, gs); ++ } ++} ++ ++// =========================================================================== ++// host entries ++// =========================================================================== ++void ggml_cuda_rms_norm_mul_bf16out(ggml_backend_cuda_context & ctx, ++ const ggml_tensor * rms_norm_tensor, ++ const ggml_tensor * mul_tensor, ++ void * dst_bf16) { ++ const ggml_tensor * x_src = rms_norm_tensor->src[0]; ++ const ggml_tensor * mul_src = (mul_tensor->src[0] == rms_norm_tensor) ? mul_tensor->src[1] : mul_tensor->src[0]; ++ GGML_ASSERT(mul_tensor->src[0] == rms_norm_tensor || mul_tensor->src[1] == rms_norm_tensor); ++ ++ float eps = 0.0f; ++ memcpy(&eps, rms_norm_tensor->op_params, sizeof(float)); ++ GGML_ASSERT(eps >= 0.0f); ++ ++ GGML_ASSERT(x_src->type == GGML_TYPE_F32); ++ GGML_ASSERT(mul_src->type == GGML_TYPE_F32); ++ GGML_ASSERT(rms_norm_tensor->type == GGML_TYPE_F32); ++ GGML_ASSERT(mul_tensor->type == GGML_TYPE_F32); ++ ++ const float * x_d = (const float *) x_src->data; ++ const float * mul_d = (const float *) mul_src->data; ++ nv_bfloat16 * dst_d = (nv_bfloat16 *) dst_bf16; ++ cudaStream_t stream = ctx.stream(); ++ ++ const int64_t ne00 = rms_norm_tensor->ne[0]; ++ const int64_t ne01 = rms_norm_tensor->ne[1]; ++ const int64_t ne02 = rms_norm_tensor->ne[2]; ++ const int64_t ne03 = rms_norm_tensor->ne[3]; ++ ++ const size_t ts0 = ggml_type_size(x_src->type); ++ GGML_ASSERT(x_src->nb[0] == ts0); ++ const int64_t s01 = x_src->nb[1] / ts0; ++ const int64_t s02 = x_src->nb[2] / ts0; ++ const int64_t s03 = x_src->nb[3] / ts0; ++ ++ const size_t ts_mul = ggml_type_size(mul_src->type); ++ GGML_ASSERT(mul_src->nb[0] == ts_mul); ++ const int64_t mul_s01 = mul_src->nb[1] / ts_mul; ++ const int64_t mul_s02 = mul_src->nb[2] / ts_mul; ++ const int64_t mul_s03 = mul_src->nb[3] / ts_mul; ++ ++ rms_norm_mul_out_cuda(x_d, dst_d, ++ ne00, ne01, ne02, ne03, s01, s02, s03, ++ mul_d, mul_s01, mul_s02, mul_s03, ++ mul_src->ne[0], mul_src->ne[1], mul_src->ne[2], mul_src->ne[3], ++ eps, stream); ++} ++ ++void ggml_cuda_rms_norm_pre_add_mul_bf16out(ggml_backend_cuda_context & ctx, ++ const ggml_tensor * add_tensor, ++ const ggml_tensor * rms_norm_tensor, ++ const ggml_tensor * mul_tensor, ++ void * dst_bf16) { ++ GGML_ASSERT(rms_norm_tensor->src[0] == add_tensor); ++ ++ const ggml_tensor * a_src = add_tensor->src[0]; ++ const ggml_tensor * b_src = add_tensor->src[1]; ++ ++ float eps = 0.0f; ++ memcpy(&eps, rms_norm_tensor->op_params, sizeof(float)); ++ GGML_ASSERT(eps >= 0.0f); ++ ++ const ggml_tensor * mul_src = (mul_tensor->src[0] == rms_norm_tensor) ? mul_tensor->src[1] : mul_tensor->src[0]; ++ GGML_ASSERT(mul_tensor->src[0] == rms_norm_tensor || mul_tensor->src[1] == rms_norm_tensor); ++ ++ GGML_ASSERT(a_src->type == GGML_TYPE_F32); ++ GGML_ASSERT(b_src->type == GGML_TYPE_F32); ++ GGML_ASSERT(add_tensor->type == GGML_TYPE_F32); ++ GGML_ASSERT(rms_norm_tensor->type == GGML_TYPE_F32); ++ GGML_ASSERT(mul_tensor->type == GGML_TYPE_F32); ++ GGML_ASSERT(ggml_are_same_shape(a_src, b_src)); ++ ++ const float * a_d = (const float *) a_src->data; ++ const float * b_d = (const float *) b_src->data; ++ float * h_d = (float *) add_tensor->data; // f32 residual stream ++ const float * mul_d = (const float *) mul_src->data; ++ nv_bfloat16 * dst_d = (nv_bfloat16 *) dst_bf16; ++ cudaStream_t stream = ctx.stream(); ++ ++ const int64_t ne00 = add_tensor->ne[0]; ++ const int64_t ne01 = add_tensor->ne[1]; ++ const int64_t ne02 = add_tensor->ne[2]; ++ const int64_t ne03 = add_tensor->ne[3]; ++ ++ const size_t ts0 = ggml_type_size(a_src->type); ++ GGML_ASSERT(a_src->nb[0] == ts0 && b_src->nb[0] == ts0); ++ const int64_t s01 = a_src->nb[1] / ts0; ++ const int64_t s02 = a_src->nb[2] / ts0; ++ const int64_t s03 = a_src->nb[3] / ts0; ++ ++ const size_t ts_mul = ggml_type_size(mul_src->type); ++ GGML_ASSERT(mul_src->nb[0] == ts_mul); ++ const int64_t mul_s01 = mul_src->nb[1] / ts_mul; ++ const int64_t mul_s02 = mul_src->nb[2] / ts_mul; ++ const int64_t mul_s03 = mul_src->nb[3] / ts_mul; ++ ++ rms_norm_pre_add_mul_out_cuda(a_d, b_d, h_d, dst_d, ++ ne00, ne01, ne02, ne03, s01, s02, s03, ++ mul_d, mul_s01, mul_s02, mul_s03, ++ mul_src->ne[0], mul_src->ne[1], mul_src->ne[2], mul_src->ne[3], ++ eps, stream); ++} ++ ++void ggml_cuda_rms_norm_gate_mul_bf16out(ggml_backend_cuda_context & ctx, ++ const ggml_tensor * rms_norm_tensor, ++ const ggml_tensor * mul_tensor, ++ const ggml_tensor * silu_tensor, ++ const ggml_tensor * gate_mul_tensor, ++ void * dst_bf16) { ++ GGML_ASSERT(mul_tensor->src[0] == rms_norm_tensor || mul_tensor->src[1] == rms_norm_tensor); ++ GGML_ASSERT(gate_mul_tensor->src[0] == silu_tensor || gate_mul_tensor->src[1] == silu_tensor); ++ ++ const ggml_tensor * x_src = rms_norm_tensor->src[0]; ++ const ggml_tensor * w_src = (mul_tensor->src[0] == rms_norm_tensor) ? mul_tensor->src[1] : mul_tensor->src[0]; ++ const ggml_tensor * gate_src = silu_tensor->src[0]; ++ ++ float eps = 0.0f; ++ memcpy(&eps, rms_norm_tensor->op_params, sizeof(float)); ++ GGML_ASSERT(eps >= 0.0f); ++ ++ const float * x_d = (const float *) x_src->data; ++ const float * w_d = (const float *) w_src->data; ++ const float * gate_d = (const float *) gate_src->data; ++ nv_bfloat16 * dst_d = (nv_bfloat16 *) dst_bf16; ++ cudaStream_t stream = ctx.stream(); ++ ++ GGML_ASSERT(x_src->type == GGML_TYPE_F32); ++ GGML_ASSERT(w_src->type == GGML_TYPE_F32); ++ GGML_ASSERT(gate_src->type == GGML_TYPE_F32); ++ GGML_ASSERT(rms_norm_tensor->type == GGML_TYPE_F32); ++ GGML_ASSERT(mul_tensor->type == GGML_TYPE_F32); ++ GGML_ASSERT(silu_tensor->type == GGML_TYPE_F32); ++ ++ const int64_t ne00 = rms_norm_tensor->ne[0]; ++ const int64_t ne01 = rms_norm_tensor->ne[1]; ++ const int64_t ne02 = rms_norm_tensor->ne[2]; ++ const int64_t ne03 = rms_norm_tensor->ne[3]; ++ ++ const size_t ts0 = ggml_type_size(x_src->type); ++ GGML_ASSERT(x_src->nb[0] == ts0); ++ const int64_t s01 = x_src->nb[1] / ts0; ++ const int64_t s02 = x_src->nb[2] / ts0; ++ const int64_t s03 = x_src->nb[3] / ts0; ++ ++ const size_t ts_mul = ggml_type_size(w_src->type); ++ GGML_ASSERT(w_src->nb[0] == ts_mul); ++ const int64_t mul_s01 = w_src->nb[1] / ts_mul; ++ const int64_t mul_s02 = w_src->nb[2] / ts_mul; ++ const int64_t mul_s03 = w_src->nb[3] / ts_mul; ++ ++ const size_t ts_gate = ggml_type_size(gate_src->type); ++ GGML_ASSERT(gate_src->nb[0] == ts_gate); ++ const int64_t gate_s01 = gate_src->nb[1] / ts_gate; ++ const int64_t gate_s02 = gate_src->nb[2] / ts_gate; ++ const int64_t gate_s03 = gate_src->nb[3] / ts_gate; ++ ++ rms_norm_gate_mul_out_cuda(x_d, dst_d, ++ ne00, ne01, ne02, ne03, s01, s02, s03, ++ w_d, mul_s01, mul_s02, mul_s03, ++ w_src->ne[0], w_src->ne[1], w_src->ne[2], w_src->ne[3], ++ gate_d, gate_s01, gate_s02, gate_s03, ++ gate_src->ne[0], gate_src->ne[1], gate_src->ne[2], gate_src->ne[3], ++ eps, stream); ++} +diff --git a/ggml/src/ggml-cuda/norm-bf16.cuh b/ggml/src/ggml-cuda/norm-bf16.cuh +new file mode 100644 +index 000000000..3b52757b8 +--- /dev/null ++++ b/ggml/src/ggml-cuda/norm-bf16.cuh +@@ -0,0 +1,37 @@ ++#include "common.cuh" ++ ++// [P1 bf16-stream] bf16-resident execution-pass helpers (default-off, gated by ++// LLAMA_BF16_STREAM at the ggml_cuda_try_fuse call site). Siblings of the plain ++// rms_norm and the 0042/0044 fused norms in norm.cu; these variants write a bf16 ++// output so the consuming projection GEMM reads the activation directly (no f32->bf16 ++// convert_dtype glue). Each kernel is templated on the output dtype (float or ++// nv_bfloat16) so the file carries the full op-variant set the P1 contract names; ++// the live q36 engage path instantiates the bf16 output only. ++// ++// All three keep the same reduction and FP order as their f32 originals; only the ++// final store is narrowed to bf16 via __float2bfloat16, so the opt-in path stays a ++// pure dtype (KL-benign) change, not an algorithmic one. ++ ++// plain rms_norm + weight multiply -> bf16 (attention input norm, GDN input norm) ++void ggml_cuda_rms_norm_mul_bf16out(ggml_backend_cuda_context & ctx, ++ const ggml_tensor * rms_norm_tensor, ++ const ggml_tensor * mul_tensor, ++ void * dst_bf16); ++ ++// 0042 residual-add + rms_norm + weight multiply -> f32 residual (h_out) + bf16 normed ++// (op-set completeness; on q36 the ffn/moe-input norm feeds MMQ experts so the engage ++// path bails, but the kernel + entry keep the op-variant set whole and the sentinel ++// exercises it). ++void ggml_cuda_rms_norm_pre_add_mul_bf16out(ggml_backend_cuda_context & ctx, ++ const ggml_tensor * add_tensor, ++ const ggml_tensor * rms_norm_tensor, ++ const ggml_tensor * mul_tensor, ++ void * dst_bf16); ++ ++// 0044 gated-DeltaNet output norm scale*x*w*silu(z) -> bf16 (ssm_out; the P0 segment) ++void ggml_cuda_rms_norm_gate_mul_bf16out(ggml_backend_cuda_context & ctx, ++ const ggml_tensor * rms_norm_tensor, ++ const ggml_tensor * mul_tensor, ++ const ggml_tensor * silu_tensor, ++ const ggml_tensor * gate_mul_tensor, ++ void * dst_bf16); +-- +2.43.0 + diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0054-feat-paged-P1-bf16-stream-bf16-residual-add-rope-op-.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0054-feat-paged-P1-bf16-stream-bf16-residual-add-rope-op-.patch new file mode 100644 index 000000000..c60f1c8ec --- /dev/null +++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0054-feat-paged-P1-bf16-stream-bf16-residual-add-rope-op-.patch @@ -0,0 +1,107 @@ +From 91373e1b9ab290eb9df63ce26e7cd17da81970fe Mon Sep 17 00:00:00 2001 +From: Ettore Di Giacinto +Date: Thu, 2 Jul 2026 16:29:20 +0200 +Subject: [PATCH 54/55] feat(paged): P1 bf16-stream bf16 residual-add + rope + op-variants + +Round out the op-variant set for the bf16-resident stream: bf16 branches in +binbcast.cu (residual add) and bf16 rope instantiations (asserts widened +only). Standing infra; Option-A keeps f32 at segment boundaries so these are +not on the current measured path. Existing f32 paths untouched. + +Assisted-by: Claude:opus-4.8 [Claude Code] +Signed-off-by: Ettore Di Giacinto +--- + ggml/src/ggml-cuda/binbcast.cu | 10 +++++++++- + ggml/src/ggml-cuda/rope.cu | 31 ++++++++++++++++++++++++++++--- + 2 files changed, 37 insertions(+), 4 deletions(-) + +diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu +index 2e38077bf..135becedb 100644 +--- a/ggml/src/ggml-cuda/binbcast.cu ++++ b/ggml/src/ggml-cuda/binbcast.cu +@@ -413,7 +413,7 @@ static void ggml_cuda_op_bin_bcast( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) { + +- GGML_ASSERT(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); ++ GGML_ASSERT(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_BF16); + + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream); +@@ -423,6 +423,14 @@ static void ggml_cuda_op_bin_bcast( + op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream); ++ // [P1 bf16-stream] bf16 residual-add variants, so a bf16-resident segment can keep ++ // its residual add in bf16 (half the memory traffic) rather than widening to f32. ++ } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { ++ op()(src0, src1, dst, (const nv_bfloat16 *) src0_dd, (const nv_bfloat16 *) src1_dd, (nv_bfloat16 *) dst_dd, stream); ++ } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_BF16) { ++ op()(src0, src1, dst, (const nv_bfloat16 *) src0_dd, (const float *) src1_dd, (nv_bfloat16 *) dst_dd, stream); ++ } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) { ++ op()(src0, src1, dst, (const nv_bfloat16 *) src0_dd, (const float *) src1_dd, (float *) dst_dd, stream); + } else { + fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, + ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); +diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu +index e20a5cb6b..923032327 100644 +--- a/ggml/src/ggml-cuda/rope.cu ++++ b/ggml/src/ggml-cuda/rope.cu +@@ -528,11 +528,16 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, + } + cudaStream_t stream = ctx.stream(); + +- GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); +- GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); ++ // [P1 bf16-stream] bf16 is accepted so a bf16-resident attention segment can rope ++ // its Q/K in bf16 (the norm/neox kernels are float-internal, so the bf16 arms just ++ // add T/D = nv_bfloat16 instantiations below). ++ GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16); ++ GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_BF16); + // When not fused, src0 and dst types must match + // When fused (ROPE+VIEW+SET_ROWS), src0 may be F32 and dst may be F16 +- GGML_ASSERT(src0->type == dst->type || (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16)); ++ GGML_ASSERT(src0->type == dst->type || ++ (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) || ++ (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_BF16)); + + const int64_t ne00 = src0->ne[0]; // head dims + const int64_t ne01 = src0->ne[1]; // num heads +@@ -610,6 +615,16 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, + s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, + ext_factor, attn_factor, corr_dims, freq_factors, row_indices, + set_rows_stride, stream); ++ } else if (src0->type == GGML_TYPE_BF16 && dst_type == GGML_TYPE_BF16) { ++ rope_neox_cuda((const nv_bfloat16 *) src0_d, (nv_bfloat16 *) dst_d, ne00, ne01, ne02, s01, s02, ++ s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, ++ ext_factor, attn_factor, corr_dims, freq_factors, row_indices, ++ set_rows_stride, stream); ++ } else if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_BF16) { ++ rope_neox_cuda((const float *) src0_d, (nv_bfloat16 *) dst_d, ne00, ne01, ne02, s01, s02, ++ s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, ++ ext_factor, attn_factor, corr_dims, freq_factors, row_indices, ++ set_rows_stride, stream); + } else { + GGML_ABORT("fatal error"); + } +@@ -653,6 +668,16 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx, + s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, + ext_factor, attn_factor, corr_dims, freq_factors, row_indices, + set_rows_stride, stream); ++ } else if (src0->type == GGML_TYPE_BF16 && dst_type == GGML_TYPE_BF16) { ++ rope_norm_cuda((const nv_bfloat16 *) src0_d, (nv_bfloat16 *) dst_d, ne00, ne01, ne02, s01, s02, ++ s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, ++ ext_factor, attn_factor, corr_dims, freq_factors, row_indices, ++ set_rows_stride, stream); ++ } else if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_BF16) { ++ rope_norm_cuda((const float *) src0_d, (nv_bfloat16 *) dst_d, ne00, ne01, ne02, s01, s02, ++ s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base, ++ ext_factor, attn_factor, corr_dims, freq_factors, row_indices, ++ set_rows_stride, stream); + } else { + GGML_ABORT("fatal error"); + } +-- +2.43.0 + diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0055-test-paged-P1-bf16-stream-BF16_STREAM_SEGMENT-sentin.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0055-test-paged-P1-bf16-stream-BF16_STREAM_SEGMENT-sentin.patch new file mode 100644 index 000000000..133365eb1 --- /dev/null +++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0055-test-paged-P1-bf16-stream-BF16_STREAM_SEGMENT-sentin.patch @@ -0,0 +1,116 @@ +From 653bb2f3d5914872010eac29287863bff67de943 Mon Sep 17 00:00:00 2001 +From: Ettore Di Giacinto +Date: Thu, 2 Jul 2026 16:29:20 +0200 +Subject: [PATCH 55/55] test(paged): P1 bf16-stream BF16_STREAM_SEGMENT + sentinel + +Whole-graph test-backend-ops case (MOE_SWIGLU_DOWN style) that engages both +segment kinds (plain rms_norm multi-consumer and the 0044 gate_mul ssm_out) +under LLAMA_BF16_STREAM. Green default and opt-in (4/4). + +Assisted-by: Claude:opus-4.8 [Claude Code] +Signed-off-by: Ettore Di Giacinto +--- + tests/test-backend-ops.cpp | 79 ++++++++++++++++++++++++++++++++++++++ + 1 file changed, 79 insertions(+) + +diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp +index 8c41ae56a..d0c56521c 100644 +--- a/tests/test-backend-ops.cpp ++++ b/tests/test-backend-ops.cpp +@@ -4532,6 +4532,78 @@ struct test_moe_swiglu_down : public test_case { + } + }; + ++// [P1 bf16-stream] Standing coverage for the bf16-native residual-stream segment ++// executor (LLAMA_BF16_STREAM). Builds a residual-stream norm feeding a large-M ++// cuBLAS-bf16 projection (the exact shape ggml_cuda_try_fuse owns): kind 1 = plain ++// rms_norm+mul -> proj (attention / GDN input norm), kind 2 = 0044 gated-DeltaNet ++// output norm silu(z)*rms(x)*w -> proj (ssm_out). Whole-graph so the fusion pass runs; ++// with the env off it validates the f32 path, and with LLAMA_BF16_STREAM=1 it validates ++// the bf16-activation path (the projection reads the bf16 norm output directly). The ++// weight is BF16 so the projection deterministically routes to the cuBLAS-bf16 branch. ++struct test_bf16_stream_segment : public test_case { ++ const int kind; // 1 = plain rms+mul, 2 = gated-DeltaNet output norm ++ const int64_t n_embd; // K (contraction) ++ const int64_t n_out; // N (projection rows) ++ const int64_t n_tokens; // M (>= 128 so the executor engages) ++ ++ std::string vars() override { ++ return VARS_TO_STR4(kind, n_embd, n_out, n_tokens); ++ } ++ ++ double max_nmse_err() override { ++ // bf16 activation rounding on both the default and opt-in paths (the BF16 weight ++ // GEMM already rounds the activation); generous but tight enough to catch a bug. ++ return 1e-2; ++ } ++ ++ uint64_t op_flops(ggml_tensor * t) override { ++ GGML_UNUSED(t); ++ return 2 * n_embd * n_out * n_tokens; ++ } ++ ++ test_bf16_stream_segment(int kind = 1, int64_t n_embd = 4096, int64_t n_out = 2048, int64_t n_tokens = 256) ++ : kind(kind), n_embd(n_embd), n_out(n_out), n_tokens(n_tokens) {} ++ ++ ggml_tensor * build_graph(ggml_context * ctx) override { ++ const float eps = 1e-5f; ++ ++ ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); ++ ggml_set_name(x, "x"); ++ ggml_tensor * w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); ++ ggml_set_name(w, "rms_w"); ++ ggml_tensor * proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_BF16, n_embd, n_out); ++ ggml_set_name(proj_w, "proj_w"); ++ ++ ggml_tensor * rms = ggml_rms_norm(ctx, x, eps); ++ ggml_set_name(rms, "rms"); ++ ggml_tensor * mul = ggml_mul(ctx, rms, w); ++ ggml_set_name(mul, "rms_mul"); ++ ++ ggml_tensor * norm_out = mul; ++ if (kind == 2) { ++ ggml_tensor * z = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); ++ ggml_set_name(z, "gate_z"); ++ ggml_tensor * silu = ggml_silu(ctx, z); ++ ggml_set_name(silu, "silu"); ++ // silu as src0 so the topological order is {SILU, RMS_NORM, MUL, MUL}, ++ // matching the fusion-pass pattern (and the real q36 gated-output norm). ++ norm_out = ggml_mul(ctx, silu, mul); ++ ggml_set_name(norm_out, "gate_mul"); ++ } ++ ++ ggml_tensor * out = ggml_mul_mat(ctx, proj_w, norm_out); ++ ggml_set_name(out, "proj"); ++ return out; ++ } ++ ++ bool run_whole_graph() override { return true; } ++ ++ std::string op_desc(ggml_tensor * t) override { ++ GGML_UNUSED(t); ++ return "BF16_STREAM_SEGMENT"; ++ } ++}; ++ + // MoE down projection -> router-weight multiply -> rank-ordered expert add. + struct test_moe_weighted_combine : public test_case { + const ggml_type type_a; +@@ -9043,6 +9115,13 @@ static std::vector> make_test_cases_eval() { + test_cases.emplace_back(new test_moe_swiglu_down(GGML_TYPE_NVFP4, 128, 8, 768, n, 2048)); + } + ++ // [P1 bf16-stream] bf16-native residual-stream segment executor coverage. Small-M ++ // (bails, f32 path) + large-M (engages under LLAMA_BF16_STREAM=1) for both norm kinds. ++ for (int kind : {1, 2}) { ++ test_cases.emplace_back(new test_bf16_stream_segment(kind, 4096, 2048, 64)); ++ test_cases.emplace_back(new test_bf16_stream_segment(kind, 4096, 2048, 256)); ++ } ++ + // [paged Phase 7] MoE down projection -> router-weight multiply -> rank-ordered + // expert add gate for the weighted-combine fusion candidate. + test_cases.emplace_back(new test_moe_weighted_combine(GGML_TYPE_F32, 8, 2, 32, 8, 64)); +-- +2.43.0 +