feat(paged): regenerate patch series 0053-0055 (P1 bf16-stream)

Additive regen mirroring fork mudler/llama.cpp:localai-paged HEAD 653bb2f3d (base 1edddc8fe + 3 P1 commits). Patches 0001-0052 are untouched. - 0053 residual-segment executor + norm-bf16.{cu,cuh} + LLAMA_BF16_CUBLAS_F32_OUT - 0054 bf16 residual-add + rope op-variants - 0055 BF16_STREAM_SEGMENT test-backend-ops sentinel Kill-gate: a fresh detached worktree at pin 0ed235ea2c17a19fc8238668653946721ed136fd applied all 46 on-disk patches in numeric order (strict git apply) and staged tree 6cf1523047e0e38679baff20844bdc9e6829eb22, byte-for-byte == fork HEAD tree. All default-off (LLAMA_BF16_STREAM); default md5 canonical both models (MoE 8cb0ce23777bf55f92f63d0292c756b0, dense 5951a5b4d624ce891e22ab5fca9bc439). Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-07-03 12:57:02 -04:00 · 2026-07-02 14:34:14 +00:00
parent b2784ccbca
commit 500d653bfa
3 changed files with 1145 additions and 0 deletions
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-feat-paged-P1-bf16-stream-residual-segment-executor-.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-feat-paged-P1-bf16-stream-residual-segment-executor-.patch
@@ -0,0 +1,922 @@
+From 1271488fc47d7db2319163d0b34601dd30d49250 Mon Sep 17 00:00:00 2001
+From: Ettore Di Giacinto <mudler@localai.io>
+Date: Thu, 2 Jul 2026 16:29:07 +0200
+Subject: [PATCH 53/55] feat(paged): P1 bf16-stream residual-segment executor +
+ norm-bf16 kernels
+
+Additive, default-off (LLAMA_BF16_STREAM=1) bf16-resident activation stream
+for the q36 residual path, targeting prefill bucket 3 (the convert/glue tax).
+
+- norm-bf16.cu/.cuh: rms_norm, the 0042 pre_add_mul, and the 0044 gate_mul
+  norms templated on output dtype, bit-faithful to the f32 kernels up to the
+  __float2bfloat16 store.
+- One additive clause in ggml_cuda_try_fuse detects a residual-stream
+  norm-producer whose consumers are all large-M cuBLAS-bf16 projections, runs
+  the norm into a bf16 pool buffer, executes the owned span inline through a
+  bf16 view, then skips it. Strict all-consumers-are-ours guard keeps the f32
+  norm un-materialised and bails to the stock f32 path otherwise (small-M,
+  decode, MMQ, native-FP4, multi-consumer).
+- LLAMA_BF16_CUBLAS_F32_OUT plank: owned projections write f32 directly from
+  bf16 tensor-core compute, skipping the bf16 dst pool + bf16->f32 convert;
+  the F32_OUT else-branch is byte-identical to the original cuBLAS path.
+
+Default md5 stays canonical with the code present-but-off and env-on (small-M
+prompts bail): MoE 8cb0ce23777bf55f92f63d0292c756b0, dense 5951a5b4d624ce891e22ab5fca9bc439.
+
+Assisted-by: Claude:opus-4.8 [Claude Code]
+Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
+---
+ ggml/src/ggml-cuda/ggml-cuda.cu  | 297 +++++++++++++++++--
+ ggml/src/ggml-cuda/norm-bf16.cu  | 483 +++++++++++++++++++++++++++++++
+ ggml/src/ggml-cuda/norm-bf16.cuh |  37 +++
+ 3 files changed, 793 insertions(+), 24 deletions(-)
+ create mode 100644 ggml/src/ggml-cuda/norm-bf16.cu
+ create mode 100644 ggml/src/ggml-cuda/norm-bf16.cuh
+
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index ef1bdc3b4..5626ccebb 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -36,6 +36,7 @@
+ #include "ggml-cuda/mmvf.cuh"
+ #include "ggml-cuda/mmvq.cuh"
+ #include "ggml-cuda/norm.cuh"
+#include "ggml-cuda/norm-bf16.cuh"
+ #include "ggml-cuda/opt-step-adamw.cuh"
+ #include "ggml-cuda/opt-step-sgd.cuh"
+ #include "ggml-cuda/out-prod.cuh"
+@@ -1628,12 +1629,29 @@ static const cublas_force_compute_type & ggml_cuda_cublas_get_force_compute_type
+     return compute_type;
+ }
+ 
+// [P1 bf16-stream] LLAMA_BF16_CUBLAS_F32_OUT plank. When set (by the bf16-stream
+// segment executor around an owned projection, or globally via the env), the cuBLAS
+// bf16/nvfp4 GEMM writes f32 directly from the bf16 tensor-core compute, skipping the
+// bf16 dst pool buffer + the bf16->f32 output convert_dtype. The result is the full
+// f32 GEMM accumulation (the current path rounds it to bf16 then widens back), so this
+// is a strictly-more-precise dtype change gated on the opt-in KL path, never md5.
+static thread_local bool g_bf16_stream_f32_out = false;
+static bool ggml_cuda_bf16_cublas_f32_out_env() {
+    static const bool e = [] {
+        const char * s = getenv("LLAMA_BF16_CUBLAS_F32_OUT");
+        return s != nullptr && atoi(s) != 0;
+    }();
+    return e;
+}
+
+ static void ggml_cuda_op_mul_mat_cublas(
+     ggml_backend_cuda_context & ctx,
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+     const int64_t src1_padded_row_size, cudaStream_t stream) {
+ 
+    const bool bf16_stream_f32_out = g_bf16_stream_f32_out || ggml_cuda_bf16_cublas_f32_out_env();
+
+     GGML_ASSERT(src0_dd_i  != nullptr);
+     GGML_ASSERT(src1_ddf_i != nullptr);
+     GGML_ASSERT(dst_dd_i   != nullptr);
+@@ -1686,23 +1704,34 @@ static void ggml_cuda_op_mul_mat_cublas(
+         }
+         const nv_bfloat16 * src1_ptr = src1->type == GGML_TYPE_BF16 ? (const nv_bfloat16 *) src1_ddf_i : src1_as_bf16.get();
+         const nv_bfloat16 * src0_ptr = src0_as_bf16.get();
+-        ggml_cuda_pool_alloc<nv_bfloat16> dst_bf16(ctx.pool(id), row_diff*src1_ncols);
+ 
+         const float alpha_f32 = 1.0f;
+         const float beta_f32  = 0.0f;
+ 
+         CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
+-        CUBLAS_CHECK(
+-            cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
+-                    row_diff, src1_ncols, ne10,
+-                    &alpha_f32,  src0_ptr,       CUDA_R_16BF, ne00,
+-                                 src1_ptr,       CUDA_R_16BF, ne10,
+-                    &beta_f32,   dst_bf16.get(), CUDA_R_16BF, ldc,
+-                    CUBLAS_COMPUTE_32F,
+-                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+-
+-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
+-        to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+        if (bf16_stream_f32_out) {
+            // [P1 bf16-stream] write f32 directly, skip the bf16 dst pool + convert.
+            CUBLAS_CHECK(
+                cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
+                        row_diff, src1_ncols, ne10,
+                        &alpha_f32,  src0_ptr,  CUDA_R_16BF, ne00,
+                                     src1_ptr,  CUDA_R_16BF, ne10,
+                        &beta_f32,   dst_dd_i,  CUDA_R_32F,  ldc,
+                        CUBLAS_COMPUTE_32F,
+                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+        } else {
+            ggml_cuda_pool_alloc<nv_bfloat16> dst_bf16(ctx.pool(id), row_diff*src1_ncols);
+            CUBLAS_CHECK(
+                cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
+                        row_diff, src1_ncols, ne10,
+                        &alpha_f32,  src0_ptr,       CUDA_R_16BF, ne00,
+                                     src1_ptr,       CUDA_R_16BF, ne10,
+                        &beta_f32,   dst_bf16.get(), CUDA_R_16BF, ldc,
+                        CUBLAS_COMPUTE_32F,
+                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
+            to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+        }
+     } else if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+         ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
+         if (src1->type != GGML_TYPE_BF16) {
+@@ -1714,23 +1743,34 @@ static void ggml_cuda_op_mul_mat_cublas(
+         }
+         const nv_bfloat16 * src1_ptr = src1->type == GGML_TYPE_BF16 ? (const nv_bfloat16 *) src1_ddf_i : src1_as_bf16.get();
+         const nv_bfloat16 * src0_ptr = (const nv_bfloat16 *)src0_dd_i;
+-        ggml_cuda_pool_alloc<nv_bfloat16> dst_bf16(ctx.pool(id), row_diff*src1_ncols);
+ 
+         const float alpha_f32 = 1.0f;
+         const float beta_f32  = 0.0f;
+ 
+         CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
+-        CUBLAS_CHECK(
+-            cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
+-                    row_diff, src1_ncols, ne10,
+-                    &alpha_f32,  src0_ptr,       CUDA_R_16BF, ne00,
+-                                 src1_ptr,       CUDA_R_16BF, ne10,
+-                    &beta_f32,   dst_bf16.get(), CUDA_R_16BF, ldc,
+-                    CUBLAS_COMPUTE_32F,
+-                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+-
+-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
+-        to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+        if (bf16_stream_f32_out) {
+            // [P1 bf16-stream] write f32 directly, skip the bf16 dst pool + convert.
+            CUBLAS_CHECK(
+                cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
+                        row_diff, src1_ncols, ne10,
+                        &alpha_f32,  src0_ptr,  CUDA_R_16BF, ne00,
+                                     src1_ptr,  CUDA_R_16BF, ne10,
+                        &beta_f32,   dst_dd_i,  CUDA_R_32F,  ldc,
+                        CUBLAS_COMPUTE_32F,
+                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+        } else {
+            ggml_cuda_pool_alloc<nv_bfloat16> dst_bf16(ctx.pool(id), row_diff*src1_ncols);
+            CUBLAS_CHECK(
+                cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
+                        row_diff, src1_ncols, ne10,
+                        &alpha_f32,  src0_ptr,       CUDA_R_16BF, ne00,
+                                     src1_ptr,       CUDA_R_16BF, ne10,
+                        &beta_f32,   dst_bf16.get(), CUDA_R_16BF, ldc,
+                        CUBLAS_COMPUTE_32F,
+                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
+            to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+        }
+     } else if (fast_fp16_hardware_available(cc) && use_fp16) {
+         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
+         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
+@@ -4706,6 +4746,215 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph
+         return 2;
+     }
+ 
+    // [P1 bf16-stream] Generalized additive segment executor (LLAMA_BF16_STREAM=1,
+    // default off). ONE clause; the residual-stream segment is detected inside it.
+    // Owns any norm-producer whose consumers are ALL large-M cuBLAS-bf16 projections and
+    // runs that norm into a bf16 pool buffer so every projection reads the bf16
+    // activation directly - no per-op f32->bf16 convert_dtype glue. Two live q36 kinds:
+    //   * plain rms_norm+mul  {RMS_NORM,MUL}        -> BF16 q/k/v / GDN in_proj (may be
+    //                                                  multi-consumer: q,k,v share it)
+    //   * 0044 gated-DeltaNet output norm {SILU,RMS_NORM,MUL,MUL} -> ssm_out (the P0 seg)
+    // (The 0042 {ADD,RMS_NORM,MUL} residual-fused norm is handled by its f32 clause below
+    //  and, on q36, feeds the NVFP4-MMQ experts, so a bf16 stream there would bail; its
+    //  bf16 variant lives in norm-bf16.cu for op-set completeness.)
+    //
+    // Correctness: strict all-consumers-are-ours guard - the f32 norm output is never
+    // materialised, so every node that transitively reads it must be one of our owned
+    // projections (as src1); any other reader, or an unrelated compute node inside the
+    // skipped span, bails and the f32 fused-norm path runs unchanged. Each projection is
+    // executed inline through a bf16 view of the shared buffer; the whole owned span
+    // (norm nodes + intervening pure-view no-ops + the projections) is then skipped. The
+    // LLAMA_BF16_CUBLAS_F32_OUT plank additionally makes the owned projections write f32
+    // directly (skipping the dst convert). Env-off path and decode/small-M md5 untouched.
+    static const bool bf16_stream = [] {
+        const char * e = getenv("LLAMA_BF16_STREAM");
+        return e != nullptr && atoi(e) != 0;
+    }();
+    static const int bf16_stream_trace = [] {
+        const char * e = getenv("LLAMA_BF16_STREAM_TRACE");
+        return e != nullptr ? atoi(e) : 0;
+    }();
+    static const bool bf16_stream_f32_out_default = [] {
+        const char * e = getenv("LLAMA_BF16_CUBLAS_F32_OUT");
+        return e == nullptr || atoi(e) != 0;   // plank ON by default when a segment engages
+    }();
+    if (bf16_stream) {
+        // ---- detect the norm-producer kind + the f32 activation tensor + node span ----
+        int           kind      = 0;     // 1=plain rms+mul, 2=gated-DeltaNet output norm
+        int           norm_span = 0;
+        const char *  seg_kind  = nullptr;
+        ggml_tensor * k_rms  = nullptr, * k_mul = nullptr, * k_silu = nullptr;
+        ggml_tensor * norm_out = nullptr;
+        if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_MUL }, { GGML_UNARY_OP_SILU })) {
+            kind = 2; k_silu = cgraph->nodes[i]; k_rms = cgraph->nodes[i + 1]; k_mul = cgraph->nodes[i + 2];
+            norm_out = cgraph->nodes[i + 3]; norm_span = 4; seg_kind = "gate_norm";
+        } else if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
+            kind = 1; k_rms = cgraph->nodes[i]; k_mul = cgraph->nodes[i + 1];
+            norm_out = cgraph->nodes[i + 1]; norm_span = 2; seg_kind = "rms_norm";
+        }
+
+        if (kind != 0) {
+            const int cc        = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+            const int norm_end  = i + norm_span;
+
+            // follow a pure view/reshape chain up to norm_out
+            auto roots_at = [](const ggml_tensor * t, const ggml_tensor * root) -> bool {
+                const ggml_tensor * c = t;
+                for (int d = 0; d < 8 && c != nullptr; ++d) {
+                    if (c == root) return true;
+                    if (c->view_src) { c = c->view_src; continue; }
+                    if ((c->op == GGML_OP_RESHAPE || c->op == GGML_OP_VIEW || c->op == GGML_OP_PERMUTE ||
+                         c->op == GGML_OP_TRANSPOSE || c->op == GGML_OP_CONT) && c->src[0]) { c = c->src[0]; continue; }
+                    break;
+                }
+                return false;
+            };
+            // Metadata-only no-ops (match the stock capture loop's skip set). CONT is
+            // NOT here: it materializes a contiguous copy, so a CONT of norm_out must fall
+            // through to the roots_at check below and bail (it would need the f32 norm).
+            auto is_pure_view = [](const ggml_tensor * t) -> bool {
+                return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_PERMUTE ||
+                       t->op == GGML_OP_TRANSPOSE || t->op == GGML_OP_NONE;
+            };
+            // ownable large-M cuBLAS-bf16 projection whose src1 is the FULL norm output
+            auto is_owned_proj = [&](const ggml_tensor * p) -> bool {
+                if (p->op != GGML_OP_MUL_MAT) return false;
+                const ggml_tensor * w  = p->src[0];
+                const ggml_tensor * x1 = p->src[1];
+                if (!w || !x1) return false;
+                if (!(x1 == norm_out || x1->view_src == norm_out ||
+                      (x1->op == GGML_OP_RESHAPE && x1->src[0] == norm_out))) return false;
+                if (ggml_nelements(x1) != ggml_nelements(norm_out)) return false;   // full, offset 0
+                return (w->type == GGML_TYPE_BF16 || w->type == GGML_TYPE_NVFP4) && ggml_is_contiguous(w) &&
+                       p->type == GGML_TYPE_F32 &&
+                       x1->ne[2] == 1 && x1->ne[3] == 1 &&
+                       x1->ne[1] >= 128 &&
+                       !ggml_cuda_fp4_prefill_should_engage(w, x1, const_cast<ggml_tensor *>(p), cc) &&
+                       !ggml_cuda_should_use_mmq(w->type, cc, x1->ne[1], /*n_experts=*/0);
+            };
+
+            // Scan the rest of the graph: collect our projections, enforce that every
+            // consumer of norm_out is one of them, and that the skipped span holds only
+            // pure views / our projections.
+            bool          ok           = true;
+            int           n_proj       = 0;
+            int           max_proj_idx = -1;
+            const char *  miss_reason  = "unknown";
+            int           miss_node    = -1;
+            const char *  miss_op      = "";
+            ggml_tensor * projs[16];
+            for (int j = norm_end; j < cgraph->n_nodes && ok; ++j) {
+                ggml_tensor * nj = cgraph->nodes[j];
+                // Pure view/reshape no-ops are part of the src1 view chain (or unrelated
+                // metadata ops): they carry no kernel and are re-expressed by the inline
+                // bf16 src1, so they never force f32 materialization. A *real* downstream
+                // consumer that reads norm_out through such a view is still caught below,
+                // because roots_at() climbs the view chain to norm_out.
+                if (is_pure_view(nj)) {
+                    continue;
+                }
+                if (is_owned_proj(nj)) {
+                    if (n_proj < 16) { projs[n_proj] = nj; }
+                    n_proj++;
+                    max_proj_idx = j;
+                    continue;
+                }
+                // any (non-view, non-projection) reader of norm_out disqualifies the segment
+                for (int s = 0; s < GGML_MAX_SRC; ++s) {
+                    if (nj->src[s] && roots_at(nj->src[s], norm_out)) {
+                        ok = false; miss_reason = "nonproj_consumer"; miss_node = j; miss_op = ggml_op_name(nj->op); break;
+                    }
+                }
+            }
+            // require projections, room in the fixed buffer, and a bounded span. The
+            // span [norm_end, max_proj_idx] may hold non-projection compute (q36 QK-norm /
+            // scale on the projection outputs); those never read norm_out (enforced above)
+            // so the whole span is executed inline in graph order below - owned projections
+            // through the bf16 buffer, everything else via the stock per-node executor -
+            // and then skipped as one unit.
+            const int span_len = max_proj_idx - norm_end;
+            if (!(n_proj >= 1 && n_proj <= 16 && span_len <= 96)) {
+                if (ok) { miss_reason = (n_proj == 0) ? "no_owned_proj" : (n_proj > 16 ? "too_many_proj" : "span_too_long"); }
+                ok = false;
+            }
+
+            if (ok) {
+                const int64_t ne_tot = ggml_nelements(norm_out);
+                ggml_cuda_pool_alloc<nv_bfloat16> norm_bf16(cuda_ctx->pool(), ne_tot);
+                if (kind == 2) {
+                    ggml_cuda_rms_norm_gate_mul_bf16out(*cuda_ctx, k_rms, k_mul, k_silu, norm_out, norm_bf16.get());
+                } else {
+                    ggml_cuda_rms_norm_mul_bf16out(*cuda_ctx, k_rms, k_mul, norm_bf16.get());
+                }
+
+                // Execute the whole owned span inline, in graph order (mirrors the stock
+                // capture loop's per-node handling for the non-owned nodes).
+                for (int j = norm_end; j <= max_proj_idx; ++j) {
+                    ggml_tensor * nj = cgraph->nodes[j];
+
+                    bool mine = false;
+                    for (int p = 0; p < n_proj; ++p) { if (projs[p] == nj) { mine = true; break; } }
+
+                    if (mine) {
+                        ggml_tensor * proj_src1 = nj->src[1];
+                        ggml_tensor   src1_bf16 = *proj_src1;
+                        src1_bf16.type      = GGML_TYPE_BF16;
+                        src1_bf16.data      = norm_bf16.get();
+                        src1_bf16.view_src  = nullptr;
+                        src1_bf16.view_offs = 0;
+                        src1_bf16.nb[0]     = sizeof(nv_bfloat16);
+                        src1_bf16.nb[1]     = src1_bf16.nb[0] * src1_bf16.ne[0];
+                        src1_bf16.nb[2]     = src1_bf16.nb[1] * src1_bf16.ne[1];
+                        src1_bf16.nb[3]     = src1_bf16.nb[2] * src1_bf16.ne[2];
+
+                        ggml_tensor * saved_src1 = nj->src[1];
+                        nj->src[1] = &src1_bf16;
+                        g_bf16_stream_f32_out = bf16_stream_f32_out_default;
+                        const bool okc = ggml_cuda_compute_forward(*cuda_ctx, nj);
+                        g_bf16_stream_f32_out = false;
+                        nj->src[1] = saved_src1;
+                        GGML_ASSERT(okc);
+                        continue;
+                    }
+
+                    // non-owned span node: mirror the stock loop (skip metadata no-ops,
+                    // run the rest through the per-node executor)
+                    if (ggml_is_empty(nj) || nj->op == GGML_OP_RESHAPE || nj->op == GGML_OP_TRANSPOSE ||
+                        nj->op == GGML_OP_VIEW || nj->op == GGML_OP_PERMUTE || nj->op == GGML_OP_NONE) {
+                        continue;
+                    }
+                    if ((nj->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
+                        continue;
+                    }
+                    const bool okn = ggml_cuda_compute_forward(*cuda_ctx, nj);
+                    GGML_ASSERT(okn);
+                }
+
+                static std::atomic<int> bf16_stream_engage_count{0};
+                const int ec = bf16_stream_engage_count.fetch_add(1, std::memory_order_relaxed);
+                if (bf16_stream_trace > 0 && ec < bf16_stream_trace) {
+                    const ggml_tensor * w0 = projs[0]->src[0];
+                    fprintf(stderr,
+                        "[LLAMA_BF16_STREAM] engaged seg=%s node=%d n_proj=%d last_proj=%d "
+                        "M=%" PRId64 " N=%" PRId64 " K=%" PRId64 " f32out=%d skip=%d\n",
+                        seg_kind, i, n_proj, max_proj_idx, projs[0]->src[1]->ne[1], w0->ne[1], w0->ne[0],
+                        bf16_stream_f32_out_default ? 1 : 0, max_proj_idx - i);
+                }
+                return max_proj_idx - i;   // skip norm nodes + intervening views + all owned projections
+            }
+
+            if (bf16_stream_trace > 0) {
+                static std::atomic<int> bf16_stream_miss_count{0};
+                const int mc = bf16_stream_miss_count.fetch_add(1, std::memory_order_relaxed);
+                if (mc < bf16_stream_trace) {
+                    fprintf(stderr,
+                        "[LLAMA_BF16_STREAM] miss seg=%s node=%d n_proj=%d reason=%s miss_node=%d miss_op=%s\n",
+                        seg_kind, i, n_proj, miss_reason, miss_node, miss_op);
+                }
+            }
+        }
+    }
+
+     // Fused gated RMS norm: RMS norm + weight multiply + SiLU-gated multiply
+     // (bit-exact). The Qwen3.6 gated-DeltaNet output norm. Default ON; set
+     // LLAMA_FUSE_GATE_RMSNORM=0 for a clean A/B against the unfused path.
+diff --git a/ggml/src/ggml-cuda/norm-bf16.cu b/ggml/src/ggml-cuda/norm-bf16.cu
+new file mode 100644
+index 000000000..77ccbf80e
+--- /dev/null
+++ b/ggml/src/ggml-cuda/norm-bf16.cu
+@@ -0,0 +1,483 @@
+#include "norm-bf16.cuh"
+
+#include <cstring>
+
+// [P1 bf16-stream] bf16-output variants of the residual-stream norms. Same reduction
+// and FP order as the f32 kernels in norm.cu; only the final store is narrowed. Kept
+// bit-faithful to the f32 norms up to the __float2bfloat16 store so the opt-in stream
+// stays a pure dtype (KL-benign) change, not an algorithmic one.
+
+// Output-store policy: identity for float, round-to-nearest bf16 for nv_bfloat16.
+template <typename Tdst> struct bf16stream_store;
+template <> struct bf16stream_store<float> {
+    static __device__ __forceinline__ float store(float v) { return v; }
+};
+template <> struct bf16stream_store<nv_bfloat16> {
+    static __device__ __forceinline__ nv_bfloat16 store(float v) { return __float2bfloat16(v); }
+};
+
+// ---------------------------------------------------------------------------
+// plain rms_norm + weight multiply -> Tdst    (mirrors rms_norm_f32<do_multiply=true>)
+// ---------------------------------------------------------------------------
+template <int block_size, typename Tdst>
+static __global__ void rms_norm_mul_out(const float * x,
+                                        Tdst *        dst,
+                                        const int     ncols,
+                                        const int64_t stride_row,
+                                        const int64_t stride_channel,
+                                        const int64_t stride_sample,
+                                        const float   eps,
+                                        const float * mul,
+                                        const int64_t mul_stride_row,
+                                        const int64_t mul_stride_channel,
+                                        const int64_t mul_stride_sample,
+                                        const uint3   mul_ncols_packed,
+                                        const uint3   mul_nrows_packed,
+                                        const uint3   mul_nchannels_packed,
+                                        const uint3   mul_nsamples_packed) {
+    ggml_cuda_pdl_lc();
+    const int nrows     = gridDim.x;
+    const int nchannels = gridDim.y;
+
+    const int row     = blockIdx.x;
+    const int channel = blockIdx.y;
+    const int sample  = blockIdx.z;
+    const int tid     = threadIdx.x;
+
+    x   += sample*stride_sample + channel*stride_channel + row*stride_row;
+    dst += ((sample*nchannels + channel)*nrows + row)*ncols;
+
+    {
+        const uint32_t mul_row     = fastmodulo(row, mul_nrows_packed);
+        const uint32_t mul_channel = fastmodulo(channel, mul_nchannels_packed);
+        const uint32_t mul_sample  = fastmodulo(sample, mul_nsamples_packed);
+        mul += mul_sample * mul_stride_sample + mul_channel * mul_stride_channel + mul_row * mul_stride_row;
+    }
+
+    float tmp = 0.0f;
+
+    ggml_cuda_pdl_sync();
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[col];
+        tmp += xi * xi;
+    }
+
+    extern __shared__ float s_sum[];
+    tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
+
+    const float mean  = tmp / ncols;
+    const float scale = rsqrtf(mean + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const int mul_col = fastmodulo(col, mul_ncols_packed);
+        dst[col]          = bf16stream_store<Tdst>::store(scale * x[col] * mul[mul_col]);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// 0042 residual-add + rms_norm + weight multiply -> f32 h_out + Tdst dst
+// (mirrors rms_norm_pre_add_mul_f32<do_multiply=true>; h_out stays f32 so the next
+//  residual add reads the same f32 residual stream)
+// ---------------------------------------------------------------------------
+template <int block_size, typename Tdst>
+static __global__ void rms_norm_pre_add_mul_out(const float * a,
+                                                const float * b,
+                                                float *       h_out,
+                                                Tdst *        dst,
+                                                const int     ncols,
+                                                const int64_t stride_row,
+                                                const int64_t stride_channel,
+                                                const int64_t stride_sample,
+                                                const float   eps,
+                                                const float * mul,
+                                                const int64_t mul_stride_row,
+                                                const int64_t mul_stride_channel,
+                                                const int64_t mul_stride_sample,
+                                                const uint3   mul_ncols_packed,
+                                                const uint3   mul_nrows_packed,
+                                                const uint3   mul_nchannels_packed,
+                                                const uint3   mul_nsamples_packed) {
+    ggml_cuda_pdl_lc();
+    const int nrows     = gridDim.x;
+    const int nchannels = gridDim.y;
+
+    const int row     = blockIdx.x;
+    const int channel = blockIdx.y;
+    const int sample  = blockIdx.z;
+    const int tid     = threadIdx.x;
+
+    const int64_t row_offset = sample*stride_sample + channel*stride_channel + row*stride_row;
+    a     += row_offset;
+    b     += row_offset;
+    h_out += row_offset;
+    dst   += ((sample*nchannels + channel)*nrows + row)*ncols;
+
+    {
+        const uint32_t mul_row     = fastmodulo(row, mul_nrows_packed);
+        const uint32_t mul_channel = fastmodulo(channel, mul_nchannels_packed);
+        const uint32_t mul_sample  = fastmodulo(sample, mul_nsamples_packed);
+        mul += mul_sample * mul_stride_sample + mul_channel * mul_stride_channel + mul_row * mul_stride_row;
+    }
+
+    float tmp = 0.0f;
+
+    ggml_cuda_pdl_sync();
+    for (int col = tid; col < ncols; col += block_size) {
+        const float hi = a[col] + b[col];
+        h_out[col] = hi;   // publish the f32 residual stream for the next add
+        tmp += hi * hi;
+    }
+
+    extern __shared__ float s_sum[];
+    tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
+
+    const float mean  = tmp / ncols;
+    const float scale = rsqrtf(mean + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float hi      = h_out[col];
+        const int   mul_col = fastmodulo(col, mul_ncols_packed);
+        dst[col]            = bf16stream_store<Tdst>::store(scale * hi * mul[mul_col]);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// 0044 gated-DeltaNet output norm  scale*x*w*silu(z) -> Tdst  (the P0 segment)
+// ---------------------------------------------------------------------------
+template <int block_size, typename Tdst>
+static __global__ void rms_norm_gate_mul_out(const float * x,
+                                             Tdst *        dst,
+                                             const int     ncols,
+                                             const int64_t stride_row,
+                                             const int64_t stride_channel,
+                                             const int64_t stride_sample,
+                                             const float   eps,
+                                             const float * mul,
+                                             const int64_t mul_stride_row,
+                                             const int64_t mul_stride_channel,
+                                             const int64_t mul_stride_sample,
+                                             const uint3   mul_ncols_packed,
+                                             const uint3   mul_nrows_packed,
+                                             const uint3   mul_nchannels_packed,
+                                             const uint3   mul_nsamples_packed,
+                                             const float * gate,
+                                             const int64_t gate_stride_row,
+                                             const int64_t gate_stride_channel,
+                                             const int64_t gate_stride_sample,
+                                             const uint3   gate_ncols_packed,
+                                             const uint3   gate_nrows_packed,
+                                             const uint3   gate_nchannels_packed,
+                                             const uint3   gate_nsamples_packed) {
+    ggml_cuda_pdl_lc();
+    const int nrows     = gridDim.x;
+    const int nchannels = gridDim.y;
+
+    const int row     = blockIdx.x;
+    const int channel = blockIdx.y;
+    const int sample  = blockIdx.z;
+    const int tid     = threadIdx.x;
+
+    x   += sample*stride_sample + channel*stride_channel + row*stride_row;
+    dst += ((sample*nchannels + channel)*nrows + row)*ncols;
+
+    {
+        const uint32_t mul_row     = fastmodulo(row, mul_nrows_packed);
+        const uint32_t mul_channel = fastmodulo(channel, mul_nchannels_packed);
+        const uint32_t mul_sample  = fastmodulo(sample, mul_nsamples_packed);
+        mul += mul_sample * mul_stride_sample + mul_channel * mul_stride_channel + mul_row * mul_stride_row;
+    }
+    {
+        const uint32_t gate_row     = fastmodulo(row, gate_nrows_packed);
+        const uint32_t gate_channel = fastmodulo(channel, gate_nchannels_packed);
+        const uint32_t gate_sample  = fastmodulo(sample, gate_nsamples_packed);
+        gate += gate_sample * gate_stride_sample + gate_channel * gate_stride_channel + gate_row * gate_stride_row;
+    }
+
+    float tmp = 0.0f;
+
+    ggml_cuda_pdl_sync();
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[col];
+        tmp += xi * xi;
+    }
+
+    extern __shared__ float s_sum[];
+    tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
+
+    const float mean  = tmp / ncols;
+    const float scale = rsqrtf(mean + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const int   mul_col  = fastmodulo(col, mul_ncols_packed);
+        const int   gate_col = fastmodulo(col, gate_ncols_packed);
+        const float zi       = gate[gate_col];
+        const float silu_z   = zi / (1.0f + expf(-zi));
+        dst[col]             = bf16stream_store<Tdst>::store(scale * x[col] * mul[mul_col] * silu_z);
+    }
+}
+
+// ===========================================================================
+// launchers
+// ===========================================================================
+template <typename Tdst>
+static void rms_norm_mul_out_cuda(const float * x, Tdst * dst,
+                                  const int ncols, const int nrows, const int nchannels, const int nsamples,
+                                  const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
+                                  const float * mul,
+                                  const int64_t mul_stride_row, const int64_t mul_stride_channel, const int64_t mul_stride_sample,
+                                  const uint32_t mul_ncols, const uint32_t mul_nrows, const uint32_t mul_nchannels, const uint32_t mul_nsamples,
+                                  const float eps, cudaStream_t stream) {
+    const dim3 blocks_num(nrows, nchannels, nsamples);
+    GGML_ASSERT(mul != nullptr);
+    const uint3 mc = init_fastdiv_values(mul_ncols);
+    const uint3 mr = init_fastdiv_values(mul_nrows);
+    const uint3 mch = init_fastdiv_values(mul_nchannels);
+    const uint3 ms = init_fastdiv_values(mul_nsamples);
+    if (ncols < 1024) {
+        const dim3 block_dims(256, 1, 1);
+        const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
+        ggml_cuda_kernel_launch(rms_norm_mul_out<256, Tdst>, lp,
+            x, dst, ncols, stride_row, stride_channel, stride_sample, eps,
+            mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
+        ggml_cuda_kernel_launch(rms_norm_mul_out<1024, Tdst>, lp,
+            x, dst, ncols, stride_row, stride_channel, stride_sample, eps,
+            mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms);
+    }
+}
+
+template <typename Tdst>
+static void rms_norm_pre_add_mul_out_cuda(const float * a, const float * b, float * h_out, Tdst * dst,
+                                          const int ncols, const int nrows, const int nchannels, const int nsamples,
+                                          const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
+                                          const float * mul,
+                                          const int64_t mul_stride_row, const int64_t mul_stride_channel, const int64_t mul_stride_sample,
+                                          const uint32_t mul_ncols, const uint32_t mul_nrows, const uint32_t mul_nchannels, const uint32_t mul_nsamples,
+                                          const float eps, cudaStream_t stream) {
+    const dim3 blocks_num(nrows, nchannels, nsamples);
+    GGML_ASSERT(mul != nullptr);
+    const uint3 mc = init_fastdiv_values(mul_ncols);
+    const uint3 mr = init_fastdiv_values(mul_nrows);
+    const uint3 mch = init_fastdiv_values(mul_nchannels);
+    const uint3 ms = init_fastdiv_values(mul_nsamples);
+    if (ncols < 1024) {
+        const dim3 block_dims(256, 1, 1);
+        const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
+        ggml_cuda_kernel_launch(rms_norm_pre_add_mul_out<256, Tdst>, lp,
+            a, b, h_out, dst, ncols, stride_row, stride_channel, stride_sample, eps,
+            mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
+        ggml_cuda_kernel_launch(rms_norm_pre_add_mul_out<1024, Tdst>, lp,
+            a, b, h_out, dst, ncols, stride_row, stride_channel, stride_sample, eps,
+            mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms);
+    }
+}
+
+template <typename Tdst>
+static void rms_norm_gate_mul_out_cuda(const float * x, Tdst * dst,
+                                       const int ncols, const int nrows, const int nchannels, const int nsamples,
+                                       const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
+                                       const float * mul,
+                                       const int64_t mul_stride_row, const int64_t mul_stride_channel, const int64_t mul_stride_sample,
+                                       const uint32_t mul_ncols, const uint32_t mul_nrows, const uint32_t mul_nchannels, const uint32_t mul_nsamples,
+                                       const float * gate,
+                                       const int64_t gate_stride_row, const int64_t gate_stride_channel, const int64_t gate_stride_sample,
+                                       const uint32_t gate_ncols, const uint32_t gate_nrows, const uint32_t gate_nchannels, const uint32_t gate_nsamples,
+                                       const float eps, cudaStream_t stream) {
+    const dim3 blocks_num(nrows, nchannels, nsamples);
+    GGML_ASSERT(mul  != nullptr);
+    GGML_ASSERT(gate != nullptr);
+    const uint3 mc = init_fastdiv_values(mul_ncols);
+    const uint3 mr = init_fastdiv_values(mul_nrows);
+    const uint3 mch = init_fastdiv_values(mul_nchannels);
+    const uint3 ms = init_fastdiv_values(mul_nsamples);
+    const uint3 gc = init_fastdiv_values(gate_ncols);
+    const uint3 gr = init_fastdiv_values(gate_nrows);
+    const uint3 gch = init_fastdiv_values(gate_nchannels);
+    const uint3 gs = init_fastdiv_values(gate_nsamples);
+    if (ncols < 1024) {
+        const dim3 block_dims(256, 1, 1);
+        const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
+        ggml_cuda_kernel_launch(rms_norm_gate_mul_out<256, Tdst>, lp,
+            x, dst, ncols, stride_row, stride_channel, stride_sample, eps,
+            mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms,
+            gate, gate_stride_row, gate_stride_channel, gate_stride_sample, gc, gr, gch, gs);
+    } else {
+        const dim3 block_dims(1024, 1, 1);
+        const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
+        ggml_cuda_kernel_launch(rms_norm_gate_mul_out<1024, Tdst>, lp,
+            x, dst, ncols, stride_row, stride_channel, stride_sample, eps,
+            mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms,
+            gate, gate_stride_row, gate_stride_channel, gate_stride_sample, gc, gr, gch, gs);
+    }
+}
+
+// ===========================================================================
+// host entries
+// ===========================================================================
+void ggml_cuda_rms_norm_mul_bf16out(ggml_backend_cuda_context & ctx,
+                                    const ggml_tensor *         rms_norm_tensor,
+                                    const ggml_tensor *         mul_tensor,
+                                    void *                      dst_bf16) {
+    const ggml_tensor * x_src   = rms_norm_tensor->src[0];
+    const ggml_tensor * mul_src = (mul_tensor->src[0] == rms_norm_tensor) ? mul_tensor->src[1] : mul_tensor->src[0];
+    GGML_ASSERT(mul_tensor->src[0] == rms_norm_tensor || mul_tensor->src[1] == rms_norm_tensor);
+
+    float eps = 0.0f;
+    memcpy(&eps, rms_norm_tensor->op_params, sizeof(float));
+    GGML_ASSERT(eps >= 0.0f);
+
+    GGML_ASSERT(x_src->type           == GGML_TYPE_F32);
+    GGML_ASSERT(mul_src->type         == GGML_TYPE_F32);
+    GGML_ASSERT(rms_norm_tensor->type == GGML_TYPE_F32);
+    GGML_ASSERT(mul_tensor->type      == GGML_TYPE_F32);
+
+    const float * x_d   = (const float *) x_src->data;
+    const float * mul_d = (const float *) mul_src->data;
+    nv_bfloat16 * dst_d = (nv_bfloat16 *)  dst_bf16;
+    cudaStream_t  stream = ctx.stream();
+
+    const int64_t ne00 = rms_norm_tensor->ne[0];
+    const int64_t ne01 = rms_norm_tensor->ne[1];
+    const int64_t ne02 = rms_norm_tensor->ne[2];
+    const int64_t ne03 = rms_norm_tensor->ne[3];
+
+    const size_t ts0 = ggml_type_size(x_src->type);
+    GGML_ASSERT(x_src->nb[0] == ts0);
+    const int64_t s01 = x_src->nb[1] / ts0;
+    const int64_t s02 = x_src->nb[2] / ts0;
+    const int64_t s03 = x_src->nb[3] / ts0;
+
+    const size_t ts_mul = ggml_type_size(mul_src->type);
+    GGML_ASSERT(mul_src->nb[0] == ts_mul);
+    const int64_t mul_s01 = mul_src->nb[1] / ts_mul;
+    const int64_t mul_s02 = mul_src->nb[2] / ts_mul;
+    const int64_t mul_s03 = mul_src->nb[3] / ts_mul;
+
+    rms_norm_mul_out_cuda<nv_bfloat16>(x_d, dst_d,
+        ne00, ne01, ne02, ne03, s01, s02, s03,
+        mul_d, mul_s01, mul_s02, mul_s03,
+        mul_src->ne[0], mul_src->ne[1], mul_src->ne[2], mul_src->ne[3],
+        eps, stream);
+}
+
+void ggml_cuda_rms_norm_pre_add_mul_bf16out(ggml_backend_cuda_context & ctx,
+                                            const ggml_tensor *         add_tensor,
+                                            const ggml_tensor *         rms_norm_tensor,
+                                            const ggml_tensor *         mul_tensor,
+                                            void *                      dst_bf16) {
+    GGML_ASSERT(rms_norm_tensor->src[0] == add_tensor);
+
+    const ggml_tensor * a_src = add_tensor->src[0];
+    const ggml_tensor * b_src = add_tensor->src[1];
+
+    float eps = 0.0f;
+    memcpy(&eps, rms_norm_tensor->op_params, sizeof(float));
+    GGML_ASSERT(eps >= 0.0f);
+
+    const ggml_tensor * mul_src = (mul_tensor->src[0] == rms_norm_tensor) ? mul_tensor->src[1] : mul_tensor->src[0];
+    GGML_ASSERT(mul_tensor->src[0] == rms_norm_tensor || mul_tensor->src[1] == rms_norm_tensor);
+
+    GGML_ASSERT(a_src->type           == GGML_TYPE_F32);
+    GGML_ASSERT(b_src->type           == GGML_TYPE_F32);
+    GGML_ASSERT(add_tensor->type      == GGML_TYPE_F32);
+    GGML_ASSERT(rms_norm_tensor->type == GGML_TYPE_F32);
+    GGML_ASSERT(mul_tensor->type      == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_are_same_shape(a_src, b_src));
+
+    const float * a_d   = (const float *) a_src->data;
+    const float * b_d   = (const float *) b_src->data;
+    float *       h_d   = (float *)       add_tensor->data;   // f32 residual stream
+    const float * mul_d = (const float *) mul_src->data;
+    nv_bfloat16 * dst_d = (nv_bfloat16 *)  dst_bf16;
+    cudaStream_t  stream = ctx.stream();
+
+    const int64_t ne00 = add_tensor->ne[0];
+    const int64_t ne01 = add_tensor->ne[1];
+    const int64_t ne02 = add_tensor->ne[2];
+    const int64_t ne03 = add_tensor->ne[3];
+
+    const size_t ts0 = ggml_type_size(a_src->type);
+    GGML_ASSERT(a_src->nb[0] == ts0 && b_src->nb[0] == ts0);
+    const int64_t s01 = a_src->nb[1] / ts0;
+    const int64_t s02 = a_src->nb[2] / ts0;
+    const int64_t s03 = a_src->nb[3] / ts0;
+
+    const size_t ts_mul = ggml_type_size(mul_src->type);
+    GGML_ASSERT(mul_src->nb[0] == ts_mul);
+    const int64_t mul_s01 = mul_src->nb[1] / ts_mul;
+    const int64_t mul_s02 = mul_src->nb[2] / ts_mul;
+    const int64_t mul_s03 = mul_src->nb[3] / ts_mul;
+
+    rms_norm_pre_add_mul_out_cuda<nv_bfloat16>(a_d, b_d, h_d, dst_d,
+        ne00, ne01, ne02, ne03, s01, s02, s03,
+        mul_d, mul_s01, mul_s02, mul_s03,
+        mul_src->ne[0], mul_src->ne[1], mul_src->ne[2], mul_src->ne[3],
+        eps, stream);
+}
+
+void ggml_cuda_rms_norm_gate_mul_bf16out(ggml_backend_cuda_context & ctx,
+                                         const ggml_tensor *         rms_norm_tensor,
+                                         const ggml_tensor *         mul_tensor,
+                                         const ggml_tensor *         silu_tensor,
+                                         const ggml_tensor *         gate_mul_tensor,
+                                         void *                      dst_bf16) {
+    GGML_ASSERT(mul_tensor->src[0] == rms_norm_tensor || mul_tensor->src[1] == rms_norm_tensor);
+    GGML_ASSERT(gate_mul_tensor->src[0] == silu_tensor || gate_mul_tensor->src[1] == silu_tensor);
+
+    const ggml_tensor * x_src    = rms_norm_tensor->src[0];
+    const ggml_tensor * w_src    = (mul_tensor->src[0] == rms_norm_tensor) ? mul_tensor->src[1] : mul_tensor->src[0];
+    const ggml_tensor * gate_src = silu_tensor->src[0];
+
+    float eps = 0.0f;
+    memcpy(&eps, rms_norm_tensor->op_params, sizeof(float));
+    GGML_ASSERT(eps >= 0.0f);
+
+    const float * x_d    = (const float *) x_src->data;
+    const float * w_d    = (const float *) w_src->data;
+    const float * gate_d = (const float *) gate_src->data;
+    nv_bfloat16 * dst_d  = (nv_bfloat16 *)  dst_bf16;
+    cudaStream_t  stream = ctx.stream();
+
+    GGML_ASSERT(x_src->type    == GGML_TYPE_F32);
+    GGML_ASSERT(w_src->type    == GGML_TYPE_F32);
+    GGML_ASSERT(gate_src->type == GGML_TYPE_F32);
+    GGML_ASSERT(rms_norm_tensor->type == GGML_TYPE_F32);
+    GGML_ASSERT(mul_tensor->type      == GGML_TYPE_F32);
+    GGML_ASSERT(silu_tensor->type     == GGML_TYPE_F32);
+
+    const int64_t ne00 = rms_norm_tensor->ne[0];
+    const int64_t ne01 = rms_norm_tensor->ne[1];
+    const int64_t ne02 = rms_norm_tensor->ne[2];
+    const int64_t ne03 = rms_norm_tensor->ne[3];
+
+    const size_t ts0 = ggml_type_size(x_src->type);
+    GGML_ASSERT(x_src->nb[0] == ts0);
+    const int64_t s01 = x_src->nb[1] / ts0;
+    const int64_t s02 = x_src->nb[2] / ts0;
+    const int64_t s03 = x_src->nb[3] / ts0;
+
+    const size_t ts_mul = ggml_type_size(w_src->type);
+    GGML_ASSERT(w_src->nb[0] == ts_mul);
+    const int64_t mul_s01 = w_src->nb[1] / ts_mul;
+    const int64_t mul_s02 = w_src->nb[2] / ts_mul;
+    const int64_t mul_s03 = w_src->nb[3] / ts_mul;
+
+    const size_t ts_gate = ggml_type_size(gate_src->type);
+    GGML_ASSERT(gate_src->nb[0] == ts_gate);
+    const int64_t gate_s01 = gate_src->nb[1] / ts_gate;
+    const int64_t gate_s02 = gate_src->nb[2] / ts_gate;
+    const int64_t gate_s03 = gate_src->nb[3] / ts_gate;
+
+    rms_norm_gate_mul_out_cuda<nv_bfloat16>(x_d, dst_d,
+        ne00, ne01, ne02, ne03, s01, s02, s03,
+        w_d, mul_s01, mul_s02, mul_s03,
+        w_src->ne[0], w_src->ne[1], w_src->ne[2], w_src->ne[3],
+        gate_d, gate_s01, gate_s02, gate_s03,
+        gate_src->ne[0], gate_src->ne[1], gate_src->ne[2], gate_src->ne[3],
+        eps, stream);
+}
+diff --git a/ggml/src/ggml-cuda/norm-bf16.cuh b/ggml/src/ggml-cuda/norm-bf16.cuh
+new file mode 100644
+index 000000000..3b52757b8
+--- /dev/null
+++ b/ggml/src/ggml-cuda/norm-bf16.cuh
+@@ -0,0 +1,37 @@
+#include "common.cuh"
+
+// [P1 bf16-stream] bf16-resident execution-pass helpers (default-off, gated by
+// LLAMA_BF16_STREAM at the ggml_cuda_try_fuse call site). Siblings of the plain
+// rms_norm and the 0042/0044 fused norms in norm.cu; these variants write a bf16
+// output so the consuming projection GEMM reads the activation directly (no f32->bf16
+// convert_dtype glue). Each kernel is templated on the output dtype (float or
+// nv_bfloat16) so the file carries the full op-variant set the P1 contract names;
+// the live q36 engage path instantiates the bf16 output only.
+//
+// All three keep the same reduction and FP order as their f32 originals; only the
+// final store is narrowed to bf16 via __float2bfloat16, so the opt-in path stays a
+// pure dtype (KL-benign) change, not an algorithmic one.
+
+// plain rms_norm + weight multiply -> bf16  (attention input norm, GDN input norm)
+void ggml_cuda_rms_norm_mul_bf16out(ggml_backend_cuda_context & ctx,
+                                    const ggml_tensor *         rms_norm_tensor,
+                                    const ggml_tensor *         mul_tensor,
+                                    void *                      dst_bf16);
+
+// 0042 residual-add + rms_norm + weight multiply -> f32 residual (h_out) + bf16 normed
+// (op-set completeness; on q36 the ffn/moe-input norm feeds MMQ experts so the engage
+// path bails, but the kernel + entry keep the op-variant set whole and the sentinel
+// exercises it).
+void ggml_cuda_rms_norm_pre_add_mul_bf16out(ggml_backend_cuda_context & ctx,
+                                            const ggml_tensor *         add_tensor,
+                                            const ggml_tensor *         rms_norm_tensor,
+                                            const ggml_tensor *         mul_tensor,
+                                            void *                      dst_bf16);
+
+// 0044 gated-DeltaNet output norm  scale*x*w*silu(z)  -> bf16  (ssm_out; the P0 segment)
+void ggml_cuda_rms_norm_gate_mul_bf16out(ggml_backend_cuda_context & ctx,
+                                         const ggml_tensor *         rms_norm_tensor,
+                                         const ggml_tensor *         mul_tensor,
+                                         const ggml_tensor *         silu_tensor,
+                                         const ggml_tensor *         gate_mul_tensor,
+                                         void *                      dst_bf16);
+-- 
+2.43.0
+
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0054-feat-paged-P1-bf16-stream-bf16-residual-add-rope-op-.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0054-feat-paged-P1-bf16-stream-bf16-residual-add-rope-op-.patch
@@ -0,0 +1,107 @@
+From 91373e1b9ab290eb9df63ce26e7cd17da81970fe Mon Sep 17 00:00:00 2001
+From: Ettore Di Giacinto <mudler@localai.io>
+Date: Thu, 2 Jul 2026 16:29:20 +0200
+Subject: [PATCH 54/55] feat(paged): P1 bf16-stream bf16 residual-add + rope
+ op-variants
+
+Round out the op-variant set for the bf16-resident stream: bf16 branches in
+binbcast.cu (residual add) and bf16 rope instantiations (asserts widened
+only). Standing infra; Option-A keeps f32 at segment boundaries so these are
+not on the current measured path. Existing f32 paths untouched.
+
+Assisted-by: Claude:opus-4.8 [Claude Code]
+Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
+---
+ ggml/src/ggml-cuda/binbcast.cu | 10 +++++++++-
+ ggml/src/ggml-cuda/rope.cu     | 31 ++++++++++++++++++++++++++++---
+ 2 files changed, 37 insertions(+), 4 deletions(-)
+
+diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu
+index 2e38077bf..135becedb 100644
+--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
+@@ -413,7 +413,7 @@ static void ggml_cuda_op_bin_bcast(
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+     const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
+ 
+-    GGML_ASSERT(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_BF16);
+ 
+     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+         op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
+@@ -423,6 +423,14 @@ static void ggml_cuda_op_bin_bcast(
+         op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
+     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+         op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
+    // [P1 bf16-stream] bf16 residual-add variants, so a bf16-resident segment can keep
+    // its residual add in bf16 (half the memory traffic) rather than widening to f32.
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) {
+        op()(src0, src1, dst, (const nv_bfloat16 *) src0_dd, (const nv_bfloat16 *) src1_dd, (nv_bfloat16 *) dst_dd, stream);
+    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_BF16) {
+        op()(src0, src1, dst, (const nv_bfloat16 *) src0_dd, (const float *) src1_dd, (nv_bfloat16 *) dst_dd, stream);
+    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
+        op()(src0, src1, dst, (const nv_bfloat16 *) src0_dd, (const float *) src1_dd, (float *) dst_dd, stream);
+     } else {
+         fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+             ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
+index e20a5cb6b..923032327 100644
+--- a/ggml/src/ggml-cuda/rope.cu
+++ b/ggml/src/ggml-cuda/rope.cu
+@@ -528,11 +528,16 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx,
+     }
+     cudaStream_t stream = ctx.stream();
+ 
+-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+-    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
+    // [P1 bf16-stream] bf16 is accepted so a bf16-resident attention segment can rope
+    // its Q/K in bf16 (the norm/neox kernels are float-internal, so the bf16 arms just
+    // add T/D = nv_bfloat16 instantiations below).
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16 ||  dst->type == GGML_TYPE_BF16);
+     // When not fused, src0 and dst types must match
+     // When fused (ROPE+VIEW+SET_ROWS), src0 may be F32 and dst may be F16
+-    GGML_ASSERT(src0->type == dst->type || (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16));
+    GGML_ASSERT(src0->type == dst->type ||
+                (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) ||
+                (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_BF16));
+ 
+     const int64_t ne00 = src0->ne[0]; // head dims
+     const int64_t ne01 = src0->ne[1]; // num heads
+@@ -610,6 +615,16 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx,
+                                                 s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
+                                                 ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
+                                                 set_rows_stride, stream);
+        } else if (src0->type == GGML_TYPE_BF16 && dst_type == GGML_TYPE_BF16) {
+            rope_neox_cuda<forward, nv_bfloat16, nv_bfloat16>((const nv_bfloat16 *) src0_d, (nv_bfloat16 *) dst_d, ne00, ne01, ne02, s01, s02,
+                                                s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
+                                                ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
+                                                set_rows_stride, stream);
+        } else if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_BF16) {
+            rope_neox_cuda<forward, float, nv_bfloat16>((const float *) src0_d, (nv_bfloat16 *) dst_d, ne00, ne01, ne02, s01, s02,
+                                                s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
+                                                ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
+                                                set_rows_stride, stream);
+         } else {
+             GGML_ABORT("fatal error");
+         }
+@@ -653,6 +668,16 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx,
+                                                 s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
+                                                 ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
+                                                 set_rows_stride, stream);
+        } else if (src0->type == GGML_TYPE_BF16 && dst_type == GGML_TYPE_BF16) {
+            rope_norm_cuda<forward, nv_bfloat16, nv_bfloat16>((const nv_bfloat16 *) src0_d, (nv_bfloat16 *) dst_d, ne00, ne01, ne02, s01, s02,
+                                                s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
+                                                ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
+                                                set_rows_stride, stream);
+        } else if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_BF16) {
+            rope_norm_cuda<forward, float, nv_bfloat16>((const float *) src0_d, (nv_bfloat16 *) dst_d, ne00, ne01, ne02, s01, s02,
+                                                s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
+                                                ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
+                                                set_rows_stride, stream);
+         } else {
+             GGML_ABORT("fatal error");
+         }
+-- 
+2.43.0
+
--- a/backend/cpp/llama-cpp-localai-paged/patches/paged/0055-test-paged-P1-bf16-stream-BF16_STREAM_SEGMENT-sentin.patch
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0055-test-paged-P1-bf16-stream-BF16_STREAM_SEGMENT-sentin.patch
@@ -0,0 +1,116 @@
+From 653bb2f3d5914872010eac29287863bff67de943 Mon Sep 17 00:00:00 2001
+From: Ettore Di Giacinto <mudler@localai.io>
+Date: Thu, 2 Jul 2026 16:29:20 +0200
+Subject: [PATCH 55/55] test(paged): P1 bf16-stream BF16_STREAM_SEGMENT
+ sentinel
+
+Whole-graph test-backend-ops case (MOE_SWIGLU_DOWN style) that engages both
+segment kinds (plain rms_norm multi-consumer and the 0044 gate_mul ssm_out)
+under LLAMA_BF16_STREAM. Green default and opt-in (4/4).
+
+Assisted-by: Claude:opus-4.8 [Claude Code]
+Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
+---
+ tests/test-backend-ops.cpp | 79 ++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 79 insertions(+)
+
+diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
+index 8c41ae56a..d0c56521c 100644
+--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
+@@ -4532,6 +4532,78 @@ struct test_moe_swiglu_down : public test_case {
+     }
+ };
+ 
+// [P1 bf16-stream] Standing coverage for the bf16-native residual-stream segment
+// executor (LLAMA_BF16_STREAM). Builds a residual-stream norm feeding a large-M
+// cuBLAS-bf16 projection (the exact shape ggml_cuda_try_fuse owns): kind 1 = plain
+// rms_norm+mul -> proj (attention / GDN input norm), kind 2 = 0044 gated-DeltaNet
+// output norm silu(z)*rms(x)*w -> proj (ssm_out). Whole-graph so the fusion pass runs;
+// with the env off it validates the f32 path, and with LLAMA_BF16_STREAM=1 it validates
+// the bf16-activation path (the projection reads the bf16 norm output directly). The
+// weight is BF16 so the projection deterministically routes to the cuBLAS-bf16 branch.
+struct test_bf16_stream_segment : public test_case {
+    const int     kind;      // 1 = plain rms+mul, 2 = gated-DeltaNet output norm
+    const int64_t n_embd;    // K (contraction)
+    const int64_t n_out;     // N (projection rows)
+    const int64_t n_tokens;  // M (>= 128 so the executor engages)
+
+    std::string vars() override {
+        return VARS_TO_STR4(kind, n_embd, n_out, n_tokens);
+    }
+
+    double max_nmse_err() override {
+        // bf16 activation rounding on both the default and opt-in paths (the BF16 weight
+        // GEMM already rounds the activation); generous but tight enough to catch a bug.
+        return 1e-2;
+    }
+
+    uint64_t op_flops(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return 2 * n_embd * n_out * n_tokens;
+    }
+
+    test_bf16_stream_segment(int kind = 1, int64_t n_embd = 4096, int64_t n_out = 2048, int64_t n_tokens = 256)
+        : kind(kind), n_embd(n_embd), n_out(n_out), n_tokens(n_tokens) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        const float eps = 1e-5f;
+
+        ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
+        ggml_set_name(x, "x");
+        ggml_tensor * w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+        ggml_set_name(w, "rms_w");
+        ggml_tensor * proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_BF16, n_embd, n_out);
+        ggml_set_name(proj_w, "proj_w");
+
+        ggml_tensor * rms = ggml_rms_norm(ctx, x, eps);
+        ggml_set_name(rms, "rms");
+        ggml_tensor * mul = ggml_mul(ctx, rms, w);
+        ggml_set_name(mul, "rms_mul");
+
+        ggml_tensor * norm_out = mul;
+        if (kind == 2) {
+            ggml_tensor * z = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
+            ggml_set_name(z, "gate_z");
+            ggml_tensor * silu = ggml_silu(ctx, z);
+            ggml_set_name(silu, "silu");
+            // silu as src0 so the topological order is {SILU, RMS_NORM, MUL, MUL},
+            // matching the fusion-pass pattern (and the real q36 gated-output norm).
+            norm_out = ggml_mul(ctx, silu, mul);
+            ggml_set_name(norm_out, "gate_mul");
+        }
+
+        ggml_tensor * out = ggml_mul_mat(ctx, proj_w, norm_out);
+        ggml_set_name(out, "proj");
+        return out;
+    }
+
+    bool run_whole_graph() override { return true; }
+
+    std::string op_desc(ggml_tensor * t) override {
+        GGML_UNUSED(t);
+        return "BF16_STREAM_SEGMENT";
+    }
+};
+
+ // MoE down projection -> router-weight multiply -> rank-ordered expert add.
+ struct test_moe_weighted_combine : public test_case {
+     const ggml_type type_a;
+@@ -9043,6 +9115,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
+         test_cases.emplace_back(new test_moe_swiglu_down(GGML_TYPE_NVFP4, 128, 8, 768, n, 2048));
+     }
+ 
+    // [P1 bf16-stream] bf16-native residual-stream segment executor coverage. Small-M
+    // (bails, f32 path) + large-M (engages under LLAMA_BF16_STREAM=1) for both norm kinds.
+    for (int kind : {1, 2}) {
+        test_cases.emplace_back(new test_bf16_stream_segment(kind, 4096, 2048, 64));
+        test_cases.emplace_back(new test_bf16_stream_segment(kind, 4096, 2048, 256));
+    }
+
+     // [paged Phase 7] MoE down projection -> router-weight multiply -> rank-ordered
+     // expert add gate for the weighted-combine fusion candidate.
+     test_cases.emplace_back(new test_moe_weighted_combine(GGML_TYPE_F32, 8, 2, 32, 8, 64));
+-- 
+2.43.0
+