diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-feat-paged-P1-bf16-stream-residual-segment-executor-.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-feat-paged-P1-bf16-stream-residual-segment-executor-.patch
new file mode 100644
index 000000000..7f78a8b70
--- /dev/null
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0053-feat-paged-P1-bf16-stream-residual-segment-executor-.patch
@@ -0,0 +1,922 @@
+From 1271488fc47d7db2319163d0b34601dd30d49250 Mon Sep 17 00:00:00 2001
+From: Ettore Di Giacinto <mudler@localai.io>
+Date: Thu, 2 Jul 2026 16:29:07 +0200
+Subject: [PATCH 53/55] feat(paged): P1 bf16-stream residual-segment executor +
+ norm-bf16 kernels
+
+Additive, default-off (LLAMA_BF16_STREAM=1) bf16-resident activation stream
+for the q36 residual path, targeting prefill bucket 3 (the convert/glue tax).
+
+- norm-bf16.cu/.cuh: rms_norm, the 0042 pre_add_mul, and the 0044 gate_mul
+  norms templated on output dtype, bit-faithful to the f32 kernels up to the
+  __float2bfloat16 store.
+- One additive clause in ggml_cuda_try_fuse detects a residual-stream
+  norm-producer whose consumers are all large-M cuBLAS-bf16 projections, runs
+  the norm into a bf16 pool buffer, executes the owned span inline through a
+  bf16 view, then skips it. Strict all-consumers-are-ours guard keeps the f32
+  norm un-materialised and bails to the stock f32 path otherwise (small-M,
+  decode, MMQ, native-FP4, multi-consumer).
+- LLAMA_BF16_CUBLAS_F32_OUT plank: owned projections write f32 directly from
+  bf16 tensor-core compute, skipping the bf16 dst pool + bf16->f32 convert;
+  the F32_OUT else-branch is byte-identical to the original cuBLAS path.
+
+Default md5 stays canonical with the code present-but-off and env-on (small-M
+prompts bail): MoE 8cb0ce23777bf55f92f63d0292c756b0, dense 5951a5b4d624ce891e22ab5fca9bc439.
+
+Assisted-by: Claude:opus-4.8 [Claude Code]
+Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
+---
+ ggml/src/ggml-cuda/ggml-cuda.cu  | 297 +++++++++++++++++--
+ ggml/src/ggml-cuda/norm-bf16.cu  | 483 +++++++++++++++++++++++++++++++
+ ggml/src/ggml-cuda/norm-bf16.cuh |  37 +++
+ 3 files changed, 793 insertions(+), 24 deletions(-)
+ create mode 100644 ggml/src/ggml-cuda/norm-bf16.cu
+ create mode 100644 ggml/src/ggml-cuda/norm-bf16.cuh
+
+diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
+index ef1bdc3b4..5626ccebb 100644
+--- a/ggml/src/ggml-cuda/ggml-cuda.cu
++++ b/ggml/src/ggml-cuda/ggml-cuda.cu
+@@ -36,6 +36,7 @@
+ #include "ggml-cuda/mmvf.cuh"
+ #include "ggml-cuda/mmvq.cuh"
+ #include "ggml-cuda/norm.cuh"
++#include "ggml-cuda/norm-bf16.cuh"
+ #include "ggml-cuda/opt-step-adamw.cuh"
+ #include "ggml-cuda/opt-step-sgd.cuh"
+ #include "ggml-cuda/out-prod.cuh"
+@@ -1628,12 +1629,29 @@ static const cublas_force_compute_type & ggml_cuda_cublas_get_force_compute_type
+     return compute_type;
+ }
+ 
++// [P1 bf16-stream] LLAMA_BF16_CUBLAS_F32_OUT plank. When set (by the bf16-stream
++// segment executor around an owned projection, or globally via the env), the cuBLAS
++// bf16/nvfp4 GEMM writes f32 directly from the bf16 tensor-core compute, skipping the
++// bf16 dst pool buffer + the bf16->f32 output convert_dtype. The result is the full
++// f32 GEMM accumulation (the current path rounds it to bf16 then widens back), so this
++// is a strictly-more-precise dtype change gated on the opt-in KL path, never md5.
++static thread_local bool g_bf16_stream_f32_out = false;
++static bool ggml_cuda_bf16_cublas_f32_out_env() {
++    static const bool e = [] {
++        const char * s = getenv("LLAMA_BF16_CUBLAS_F32_OUT");
++        return s != nullptr && atoi(s) != 0;
++    }();
++    return e;
++}
++
+ static void ggml_cuda_op_mul_mat_cublas(
+     ggml_backend_cuda_context & ctx,
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
+     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
+     const int64_t src1_padded_row_size, cudaStream_t stream) {
+ 
++    const bool bf16_stream_f32_out = g_bf16_stream_f32_out || ggml_cuda_bf16_cublas_f32_out_env();
++
+     GGML_ASSERT(src0_dd_i  != nullptr);
+     GGML_ASSERT(src1_ddf_i != nullptr);
+     GGML_ASSERT(dst_dd_i   != nullptr);
+@@ -1686,23 +1704,34 @@ static void ggml_cuda_op_mul_mat_cublas(
+         }
+         const nv_bfloat16 * src1_ptr = src1->type == GGML_TYPE_BF16 ? (const nv_bfloat16 *) src1_ddf_i : src1_as_bf16.get();
+         const nv_bfloat16 * src0_ptr = src0_as_bf16.get();
+-        ggml_cuda_pool_alloc<nv_bfloat16> dst_bf16(ctx.pool(id), row_diff*src1_ncols);
+ 
+         const float alpha_f32 = 1.0f;
+         const float beta_f32  = 0.0f;
+ 
+         CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
+-        CUBLAS_CHECK(
+-            cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
+-                    row_diff, src1_ncols, ne10,
+-                    &alpha_f32,  src0_ptr,       CUDA_R_16BF, ne00,
+-                                 src1_ptr,       CUDA_R_16BF, ne10,
+-                    &beta_f32,   dst_bf16.get(), CUDA_R_16BF, ldc,
+-                    CUBLAS_COMPUTE_32F,
+-                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+-
+-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
+-        to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
++        if (bf16_stream_f32_out) {
++            // [P1 bf16-stream] write f32 directly, skip the bf16 dst pool + convert.
++            CUBLAS_CHECK(
++                cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
++                        row_diff, src1_ncols, ne10,
++                        &alpha_f32,  src0_ptr,  CUDA_R_16BF, ne00,
++                                     src1_ptr,  CUDA_R_16BF, ne10,
++                        &beta_f32,   dst_dd_i,  CUDA_R_32F,  ldc,
++                        CUBLAS_COMPUTE_32F,
++                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
++        } else {
++            ggml_cuda_pool_alloc<nv_bfloat16> dst_bf16(ctx.pool(id), row_diff*src1_ncols);
++            CUBLAS_CHECK(
++                cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
++                        row_diff, src1_ncols, ne10,
++                        &alpha_f32,  src0_ptr,       CUDA_R_16BF, ne00,
++                                     src1_ptr,       CUDA_R_16BF, ne10,
++                        &beta_f32,   dst_bf16.get(), CUDA_R_16BF, ldc,
++                        CUBLAS_COMPUTE_32F,
++                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
++            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
++            to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
++        }
+     } else if (supports_bf16 && src0->type == GGML_TYPE_BF16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+         ggml_cuda_pool_alloc<nv_bfloat16> src1_as_bf16(ctx.pool(id));
+         if (src1->type != GGML_TYPE_BF16) {
+@@ -1714,23 +1743,34 @@ static void ggml_cuda_op_mul_mat_cublas(
+         }
+         const nv_bfloat16 * src1_ptr = src1->type == GGML_TYPE_BF16 ? (const nv_bfloat16 *) src1_ddf_i : src1_as_bf16.get();
+         const nv_bfloat16 * src0_ptr = (const nv_bfloat16 *)src0_dd_i;
+-        ggml_cuda_pool_alloc<nv_bfloat16> dst_bf16(ctx.pool(id), row_diff*src1_ncols);
+ 
+         const float alpha_f32 = 1.0f;
+         const float beta_f32  = 0.0f;
+ 
+         CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
+-        CUBLAS_CHECK(
+-            cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
+-                    row_diff, src1_ncols, ne10,
+-                    &alpha_f32,  src0_ptr,       CUDA_R_16BF, ne00,
+-                                 src1_ptr,       CUDA_R_16BF, ne10,
+-                    &beta_f32,   dst_bf16.get(), CUDA_R_16BF, ldc,
+-                    CUBLAS_COMPUTE_32F,
+-                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+-
+-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
+-        to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
++        if (bf16_stream_f32_out) {
++            // [P1 bf16-stream] write f32 directly, skip the bf16 dst pool + convert.
++            CUBLAS_CHECK(
++                cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
++                        row_diff, src1_ncols, ne10,
++                        &alpha_f32,  src0_ptr,  CUDA_R_16BF, ne00,
++                                     src1_ptr,  CUDA_R_16BF, ne10,
++                        &beta_f32,   dst_dd_i,  CUDA_R_32F,  ldc,
++                        CUBLAS_COMPUTE_32F,
++                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
++        } else {
++            ggml_cuda_pool_alloc<nv_bfloat16> dst_bf16(ctx.pool(id), row_diff*src1_ncols);
++            CUBLAS_CHECK(
++                cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N,
++                        row_diff, src1_ncols, ne10,
++                        &alpha_f32,  src0_ptr,       CUDA_R_16BF, ne00,
++                                     src1_ptr,       CUDA_R_16BF, ne10,
++                        &beta_f32,   dst_bf16.get(), CUDA_R_16BF, ldc,
++                        CUBLAS_COMPUTE_32F,
++                        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
++            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_BF16);
++            to_fp32_cuda(dst_bf16.get(), dst_dd_i, row_diff*src1_ncols, stream);
++        }
+     } else if (fast_fp16_hardware_available(cc) && use_fp16) {
+         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
+         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
+@@ -4706,6 +4746,215 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph
+         return 2;
+     }
+ 
++    // [P1 bf16-stream] Generalized additive segment executor (LLAMA_BF16_STREAM=1,
++    // default off). ONE clause; the residual-stream segment is detected inside it.
++    // Owns any norm-producer whose consumers are ALL large-M cuBLAS-bf16 projections and
++    // runs that norm into a bf16 pool buffer so every projection reads the bf16
++    // activation directly - no per-op f32->bf16 convert_dtype glue. Two live q36 kinds:
++    //   * plain rms_norm+mul  {RMS_NORM,MUL}        -> BF16 q/k/v / GDN in_proj (may be
++    //                                                  multi-consumer: q,k,v share it)
++    //   * 0044 gated-DeltaNet output norm {SILU,RMS_NORM,MUL,MUL} -> ssm_out (the P0 seg)
++    // (The 0042 {ADD,RMS_NORM,MUL} residual-fused norm is handled by its f32 clause below
++    //  and, on q36, feeds the NVFP4-MMQ experts, so a bf16 stream there would bail; its
++    //  bf16 variant lives in norm-bf16.cu for op-set completeness.)
++    //
++    // Correctness: strict all-consumers-are-ours guard - the f32 norm output is never
++    // materialised, so every node that transitively reads it must be one of our owned
++    // projections (as src1); any other reader, or an unrelated compute node inside the
++    // skipped span, bails and the f32 fused-norm path runs unchanged. Each projection is
++    // executed inline through a bf16 view of the shared buffer; the whole owned span
++    // (norm nodes + intervening pure-view no-ops + the projections) is then skipped. The
++    // LLAMA_BF16_CUBLAS_F32_OUT plank additionally makes the owned projections write f32
++    // directly (skipping the dst convert). Env-off path and decode/small-M md5 untouched.
++    static const bool bf16_stream = [] {
++        const char * e = getenv("LLAMA_BF16_STREAM");
++        return e != nullptr && atoi(e) != 0;
++    }();
++    static const int bf16_stream_trace = [] {
++        const char * e = getenv("LLAMA_BF16_STREAM_TRACE");
++        return e != nullptr ? atoi(e) : 0;
++    }();
++    static const bool bf16_stream_f32_out_default = [] {
++        const char * e = getenv("LLAMA_BF16_CUBLAS_F32_OUT");
++        return e == nullptr || atoi(e) != 0;   // plank ON by default when a segment engages
++    }();
++    if (bf16_stream) {
++        // ---- detect the norm-producer kind + the f32 activation tensor + node span ----
++        int           kind      = 0;     // 1=plain rms+mul, 2=gated-DeltaNet output norm
++        int           norm_span = 0;
++        const char *  seg_kind  = nullptr;
++        ggml_tensor * k_rms  = nullptr, * k_mul = nullptr, * k_silu = nullptr;
++        ggml_tensor * norm_out = nullptr;
++        if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_MUL }, { GGML_UNARY_OP_SILU })) {
++            kind = 2; k_silu = cgraph->nodes[i]; k_rms = cgraph->nodes[i + 1]; k_mul = cgraph->nodes[i + 2];
++            norm_out = cgraph->nodes[i + 3]; norm_span = 4; seg_kind = "gate_norm";
++        } else if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL }, {})) {
++            kind = 1; k_rms = cgraph->nodes[i]; k_mul = cgraph->nodes[i + 1];
++            norm_out = cgraph->nodes[i + 1]; norm_span = 2; seg_kind = "rms_norm";
++        }
++
++        if (kind != 0) {
++            const int cc        = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
++            const int norm_end  = i + norm_span;
++
++            // follow a pure view/reshape chain up to norm_out
++            auto roots_at = [](const ggml_tensor * t, const ggml_tensor * root) -> bool {
++                const ggml_tensor * c = t;
++                for (int d = 0; d < 8 && c != nullptr; ++d) {
++                    if (c == root) return true;
++                    if (c->view_src) { c = c->view_src; continue; }
++                    if ((c->op == GGML_OP_RESHAPE || c->op == GGML_OP_VIEW || c->op == GGML_OP_PERMUTE ||
++                         c->op == GGML_OP_TRANSPOSE || c->op == GGML_OP_CONT) && c->src[0]) { c = c->src[0]; continue; }
++                    break;
++                }
++                return false;
++            };
++            // Metadata-only no-ops (match the stock capture loop's skip set). CONT is
++            // NOT here: it materializes a contiguous copy, so a CONT of norm_out must fall
++            // through to the roots_at check below and bail (it would need the f32 norm).
++            auto is_pure_view = [](const ggml_tensor * t) -> bool {
++                return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_PERMUTE ||
++                       t->op == GGML_OP_TRANSPOSE || t->op == GGML_OP_NONE;
++            };
++            // ownable large-M cuBLAS-bf16 projection whose src1 is the FULL norm output
++            auto is_owned_proj = [&](const ggml_tensor * p) -> bool {
++                if (p->op != GGML_OP_MUL_MAT) return false;
++                const ggml_tensor * w  = p->src[0];
++                const ggml_tensor * x1 = p->src[1];
++                if (!w || !x1) return false;
++                if (!(x1 == norm_out || x1->view_src == norm_out ||
++                      (x1->op == GGML_OP_RESHAPE && x1->src[0] == norm_out))) return false;
++                if (ggml_nelements(x1) != ggml_nelements(norm_out)) return false;   // full, offset 0
++                return (w->type == GGML_TYPE_BF16 || w->type == GGML_TYPE_NVFP4) && ggml_is_contiguous(w) &&
++                       p->type == GGML_TYPE_F32 &&
++                       x1->ne[2] == 1 && x1->ne[3] == 1 &&
++                       x1->ne[1] >= 128 &&
++                       !ggml_cuda_fp4_prefill_should_engage(w, x1, const_cast<ggml_tensor *>(p), cc) &&
++                       !ggml_cuda_should_use_mmq(w->type, cc, x1->ne[1], /*n_experts=*/0);
++            };
++
++            // Scan the rest of the graph: collect our projections, enforce that every
++            // consumer of norm_out is one of them, and that the skipped span holds only
++            // pure views / our projections.
++            bool          ok           = true;
++            int           n_proj       = 0;
++            int           max_proj_idx = -1;
++            const char *  miss_reason  = "unknown";
++            int           miss_node    = -1;
++            const char *  miss_op      = "";
++            ggml_tensor * projs[16];
++            for (int j = norm_end; j < cgraph->n_nodes && ok; ++j) {
++                ggml_tensor * nj = cgraph->nodes[j];
++                // Pure view/reshape no-ops are part of the src1 view chain (or unrelated
++                // metadata ops): they carry no kernel and are re-expressed by the inline
++                // bf16 src1, so they never force f32 materialization. A *real* downstream
++                // consumer that reads norm_out through such a view is still caught below,
++                // because roots_at() climbs the view chain to norm_out.
++                if (is_pure_view(nj)) {
++                    continue;
++                }
++                if (is_owned_proj(nj)) {
++                    if (n_proj < 16) { projs[n_proj] = nj; }
++                    n_proj++;
++                    max_proj_idx = j;
++                    continue;
++                }
++                // any (non-view, non-projection) reader of norm_out disqualifies the segment
++                for (int s = 0; s < GGML_MAX_SRC; ++s) {
++                    if (nj->src[s] && roots_at(nj->src[s], norm_out)) {
++                        ok = false; miss_reason = "nonproj_consumer"; miss_node = j; miss_op = ggml_op_name(nj->op); break;
++                    }
++                }
++            }
++            // require projections, room in the fixed buffer, and a bounded span. The
++            // span [norm_end, max_proj_idx] may hold non-projection compute (q36 QK-norm /
++            // scale on the projection outputs); those never read norm_out (enforced above)
++            // so the whole span is executed inline in graph order below - owned projections
++            // through the bf16 buffer, everything else via the stock per-node executor -
++            // and then skipped as one unit.
++            const int span_len = max_proj_idx - norm_end;
++            if (!(n_proj >= 1 && n_proj <= 16 && span_len <= 96)) {
++                if (ok) { miss_reason = (n_proj == 0) ? "no_owned_proj" : (n_proj > 16 ? "too_many_proj" : "span_too_long"); }
++                ok = false;
++            }
++
++            if (ok) {
++                const int64_t ne_tot = ggml_nelements(norm_out);
++                ggml_cuda_pool_alloc<nv_bfloat16> norm_bf16(cuda_ctx->pool(), ne_tot);
++                if (kind == 2) {
++                    ggml_cuda_rms_norm_gate_mul_bf16out(*cuda_ctx, k_rms, k_mul, k_silu, norm_out, norm_bf16.get());
++                } else {
++                    ggml_cuda_rms_norm_mul_bf16out(*cuda_ctx, k_rms, k_mul, norm_bf16.get());
++                }
++
++                // Execute the whole owned span inline, in graph order (mirrors the stock
++                // capture loop's per-node handling for the non-owned nodes).
++                for (int j = norm_end; j <= max_proj_idx; ++j) {
++                    ggml_tensor * nj = cgraph->nodes[j];
++
++                    bool mine = false;
++                    for (int p = 0; p < n_proj; ++p) { if (projs[p] == nj) { mine = true; break; } }
++
++                    if (mine) {
++                        ggml_tensor * proj_src1 = nj->src[1];
++                        ggml_tensor   src1_bf16 = *proj_src1;
++                        src1_bf16.type      = GGML_TYPE_BF16;
++                        src1_bf16.data      = norm_bf16.get();
++                        src1_bf16.view_src  = nullptr;
++                        src1_bf16.view_offs = 0;
++                        src1_bf16.nb[0]     = sizeof(nv_bfloat16);
++                        src1_bf16.nb[1]     = src1_bf16.nb[0] * src1_bf16.ne[0];
++                        src1_bf16.nb[2]     = src1_bf16.nb[1] * src1_bf16.ne[1];
++                        src1_bf16.nb[3]     = src1_bf16.nb[2] * src1_bf16.ne[2];
++
++                        ggml_tensor * saved_src1 = nj->src[1];
++                        nj->src[1] = &src1_bf16;
++                        g_bf16_stream_f32_out = bf16_stream_f32_out_default;
++                        const bool okc = ggml_cuda_compute_forward(*cuda_ctx, nj);
++                        g_bf16_stream_f32_out = false;
++                        nj->src[1] = saved_src1;
++                        GGML_ASSERT(okc);
++                        continue;
++                    }
++
++                    // non-owned span node: mirror the stock loop (skip metadata no-ops,
++                    // run the rest through the per-node executor)
++                    if (ggml_is_empty(nj) || nj->op == GGML_OP_RESHAPE || nj->op == GGML_OP_TRANSPOSE ||
++                        nj->op == GGML_OP_VIEW || nj->op == GGML_OP_PERMUTE || nj->op == GGML_OP_NONE) {
++                        continue;
++                    }
++                    if ((nj->flags & GGML_TENSOR_FLAG_COMPUTE) == 0) {
++                        continue;
++                    }
++                    const bool okn = ggml_cuda_compute_forward(*cuda_ctx, nj);
++                    GGML_ASSERT(okn);
++                }
++
++                static std::atomic<int> bf16_stream_engage_count{0};
++                const int ec = bf16_stream_engage_count.fetch_add(1, std::memory_order_relaxed);
++                if (bf16_stream_trace > 0 && ec < bf16_stream_trace) {
++                    const ggml_tensor * w0 = projs[0]->src[0];
++                    fprintf(stderr,
++                        "[LLAMA_BF16_STREAM] engaged seg=%s node=%d n_proj=%d last_proj=%d "
++                        "M=%" PRId64 " N=%" PRId64 " K=%" PRId64 " f32out=%d skip=%d\n",
++                        seg_kind, i, n_proj, max_proj_idx, projs[0]->src[1]->ne[1], w0->ne[1], w0->ne[0],
++                        bf16_stream_f32_out_default ? 1 : 0, max_proj_idx - i);
++                }
++                return max_proj_idx - i;   // skip norm nodes + intervening views + all owned projections
++            }
++
++            if (bf16_stream_trace > 0) {
++                static std::atomic<int> bf16_stream_miss_count{0};
++                const int mc = bf16_stream_miss_count.fetch_add(1, std::memory_order_relaxed);
++                if (mc < bf16_stream_trace) {
++                    fprintf(stderr,
++                        "[LLAMA_BF16_STREAM] miss seg=%s node=%d n_proj=%d reason=%s miss_node=%d miss_op=%s\n",
++                        seg_kind, i, n_proj, miss_reason, miss_node, miss_op);
++                }
++            }
++        }
++    }
++
+     // Fused gated RMS norm: RMS norm + weight multiply + SiLU-gated multiply
+     // (bit-exact). The Qwen3.6 gated-DeltaNet output norm. Default ON; set
+     // LLAMA_FUSE_GATE_RMSNORM=0 for a clean A/B against the unfused path.
+diff --git a/ggml/src/ggml-cuda/norm-bf16.cu b/ggml/src/ggml-cuda/norm-bf16.cu
+new file mode 100644
+index 000000000..77ccbf80e
+--- /dev/null
++++ b/ggml/src/ggml-cuda/norm-bf16.cu
+@@ -0,0 +1,483 @@
++#include "norm-bf16.cuh"
++
++#include <cstring>
++
++// [P1 bf16-stream] bf16-output variants of the residual-stream norms. Same reduction
++// and FP order as the f32 kernels in norm.cu; only the final store is narrowed. Kept
++// bit-faithful to the f32 norms up to the __float2bfloat16 store so the opt-in stream
++// stays a pure dtype (KL-benign) change, not an algorithmic one.
++
++// Output-store policy: identity for float, round-to-nearest bf16 for nv_bfloat16.
++template <typename Tdst> struct bf16stream_store;
++template <> struct bf16stream_store<float> {
++    static __device__ __forceinline__ float store(float v) { return v; }
++};
++template <> struct bf16stream_store<nv_bfloat16> {
++    static __device__ __forceinline__ nv_bfloat16 store(float v) { return __float2bfloat16(v); }
++};
++
++// ---------------------------------------------------------------------------
++// plain rms_norm + weight multiply -> Tdst    (mirrors rms_norm_f32<do_multiply=true>)
++// ---------------------------------------------------------------------------
++template <int block_size, typename Tdst>
++static __global__ void rms_norm_mul_out(const float * x,
++                                        Tdst *        dst,
++                                        const int     ncols,
++                                        const int64_t stride_row,
++                                        const int64_t stride_channel,
++                                        const int64_t stride_sample,
++                                        const float   eps,
++                                        const float * mul,
++                                        const int64_t mul_stride_row,
++                                        const int64_t mul_stride_channel,
++                                        const int64_t mul_stride_sample,
++                                        const uint3   mul_ncols_packed,
++                                        const uint3   mul_nrows_packed,
++                                        const uint3   mul_nchannels_packed,
++                                        const uint3   mul_nsamples_packed) {
++    ggml_cuda_pdl_lc();
++    const int nrows     = gridDim.x;
++    const int nchannels = gridDim.y;
++
++    const int row     = blockIdx.x;
++    const int channel = blockIdx.y;
++    const int sample  = blockIdx.z;
++    const int tid     = threadIdx.x;
++
++    x   += sample*stride_sample + channel*stride_channel + row*stride_row;
++    dst += ((sample*nchannels + channel)*nrows + row)*ncols;
++
++    {
++        const uint32_t mul_row     = fastmodulo(row, mul_nrows_packed);
++        const uint32_t mul_channel = fastmodulo(channel, mul_nchannels_packed);
++        const uint32_t mul_sample  = fastmodulo(sample, mul_nsamples_packed);
++        mul += mul_sample * mul_stride_sample + mul_channel * mul_stride_channel + mul_row * mul_stride_row;
++    }
++
++    float tmp = 0.0f;
++
++    ggml_cuda_pdl_sync();
++    for (int col = tid; col < ncols; col += block_size) {
++        const float xi = x[col];
++        tmp += xi * xi;
++    }
++
++    extern __shared__ float s_sum[];
++    tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
++
++    const float mean  = tmp / ncols;
++    const float scale = rsqrtf(mean + eps);
++
++    for (int col = tid; col < ncols; col += block_size) {
++        const int mul_col = fastmodulo(col, mul_ncols_packed);
++        dst[col]          = bf16stream_store<Tdst>::store(scale * x[col] * mul[mul_col]);
++    }
++}
++
++// ---------------------------------------------------------------------------
++// 0042 residual-add + rms_norm + weight multiply -> f32 h_out + Tdst dst
++// (mirrors rms_norm_pre_add_mul_f32<do_multiply=true>; h_out stays f32 so the next
++//  residual add reads the same f32 residual stream)
++// ---------------------------------------------------------------------------
++template <int block_size, typename Tdst>
++static __global__ void rms_norm_pre_add_mul_out(const float * a,
++                                                const float * b,
++                                                float *       h_out,
++                                                Tdst *        dst,
++                                                const int     ncols,
++                                                const int64_t stride_row,
++                                                const int64_t stride_channel,
++                                                const int64_t stride_sample,
++                                                const float   eps,
++                                                const float * mul,
++                                                const int64_t mul_stride_row,
++                                                const int64_t mul_stride_channel,
++                                                const int64_t mul_stride_sample,
++                                                const uint3   mul_ncols_packed,
++                                                const uint3   mul_nrows_packed,
++                                                const uint3   mul_nchannels_packed,
++                                                const uint3   mul_nsamples_packed) {
++    ggml_cuda_pdl_lc();
++    const int nrows     = gridDim.x;
++    const int nchannels = gridDim.y;
++
++    const int row     = blockIdx.x;
++    const int channel = blockIdx.y;
++    const int sample  = blockIdx.z;
++    const int tid     = threadIdx.x;
++
++    const int64_t row_offset = sample*stride_sample + channel*stride_channel + row*stride_row;
++    a     += row_offset;
++    b     += row_offset;
++    h_out += row_offset;
++    dst   += ((sample*nchannels + channel)*nrows + row)*ncols;
++
++    {
++        const uint32_t mul_row     = fastmodulo(row, mul_nrows_packed);
++        const uint32_t mul_channel = fastmodulo(channel, mul_nchannels_packed);
++        const uint32_t mul_sample  = fastmodulo(sample, mul_nsamples_packed);
++        mul += mul_sample * mul_stride_sample + mul_channel * mul_stride_channel + mul_row * mul_stride_row;
++    }
++
++    float tmp = 0.0f;
++
++    ggml_cuda_pdl_sync();
++    for (int col = tid; col < ncols; col += block_size) {
++        const float hi = a[col] + b[col];
++        h_out[col] = hi;   // publish the f32 residual stream for the next add
++        tmp += hi * hi;
++    }
++
++    extern __shared__ float s_sum[];
++    tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
++
++    const float mean  = tmp / ncols;
++    const float scale = rsqrtf(mean + eps);
++
++    for (int col = tid; col < ncols; col += block_size) {
++        const float hi      = h_out[col];
++        const int   mul_col = fastmodulo(col, mul_ncols_packed);
++        dst[col]            = bf16stream_store<Tdst>::store(scale * hi * mul[mul_col]);
++    }
++}
++
++// ---------------------------------------------------------------------------
++// 0044 gated-DeltaNet output norm  scale*x*w*silu(z) -> Tdst  (the P0 segment)
++// ---------------------------------------------------------------------------
++template <int block_size, typename Tdst>
++static __global__ void rms_norm_gate_mul_out(const float * x,
++                                             Tdst *        dst,
++                                             const int     ncols,
++                                             const int64_t stride_row,
++                                             const int64_t stride_channel,
++                                             const int64_t stride_sample,
++                                             const float   eps,
++                                             const float * mul,
++                                             const int64_t mul_stride_row,
++                                             const int64_t mul_stride_channel,
++                                             const int64_t mul_stride_sample,
++                                             const uint3   mul_ncols_packed,
++                                             const uint3   mul_nrows_packed,
++                                             const uint3   mul_nchannels_packed,
++                                             const uint3   mul_nsamples_packed,
++                                             const float * gate,
++                                             const int64_t gate_stride_row,
++                                             const int64_t gate_stride_channel,
++                                             const int64_t gate_stride_sample,
++                                             const uint3   gate_ncols_packed,
++                                             const uint3   gate_nrows_packed,
++                                             const uint3   gate_nchannels_packed,
++                                             const uint3   gate_nsamples_packed) {
++    ggml_cuda_pdl_lc();
++    const int nrows     = gridDim.x;
++    const int nchannels = gridDim.y;
++
++    const int row     = blockIdx.x;
++    const int channel = blockIdx.y;
++    const int sample  = blockIdx.z;
++    const int tid     = threadIdx.x;
++
++    x   += sample*stride_sample + channel*stride_channel + row*stride_row;
++    dst += ((sample*nchannels + channel)*nrows + row)*ncols;
++
++    {
++        const uint32_t mul_row     = fastmodulo(row, mul_nrows_packed);
++        const uint32_t mul_channel = fastmodulo(channel, mul_nchannels_packed);
++        const uint32_t mul_sample  = fastmodulo(sample, mul_nsamples_packed);
++        mul += mul_sample * mul_stride_sample + mul_channel * mul_stride_channel + mul_row * mul_stride_row;
++    }
++    {
++        const uint32_t gate_row     = fastmodulo(row, gate_nrows_packed);
++        const uint32_t gate_channel = fastmodulo(channel, gate_nchannels_packed);
++        const uint32_t gate_sample  = fastmodulo(sample, gate_nsamples_packed);
++        gate += gate_sample * gate_stride_sample + gate_channel * gate_stride_channel + gate_row * gate_stride_row;
++    }
++
++    float tmp = 0.0f;
++
++    ggml_cuda_pdl_sync();
++    for (int col = tid; col < ncols; col += block_size) {
++        const float xi = x[col];
++        tmp += xi * xi;
++    }
++
++    extern __shared__ float s_sum[];
++    tmp = block_reduce<block_reduce_method::SUM, block_size>(tmp, s_sum);
++
++    const float mean  = tmp / ncols;
++    const float scale = rsqrtf(mean + eps);
++
++    for (int col = tid; col < ncols; col += block_size) {
++        const int   mul_col  = fastmodulo(col, mul_ncols_packed);
++        const int   gate_col = fastmodulo(col, gate_ncols_packed);
++        const float zi       = gate[gate_col];
++        const float silu_z   = zi / (1.0f + expf(-zi));
++        dst[col]             = bf16stream_store<Tdst>::store(scale * x[col] * mul[mul_col] * silu_z);
++    }
++}
++
++// ===========================================================================
++// launchers
++// ===========================================================================
++template <typename Tdst>
++static void rms_norm_mul_out_cuda(const float * x, Tdst * dst,
++                                  const int ncols, const int nrows, const int nchannels, const int nsamples,
++                                  const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
++                                  const float * mul,
++                                  const int64_t mul_stride_row, const int64_t mul_stride_channel, const int64_t mul_stride_sample,
++                                  const uint32_t mul_ncols, const uint32_t mul_nrows, const uint32_t mul_nchannels, const uint32_t mul_nsamples,
++                                  const float eps, cudaStream_t stream) {
++    const dim3 blocks_num(nrows, nchannels, nsamples);
++    GGML_ASSERT(mul != nullptr);
++    const uint3 mc = init_fastdiv_values(mul_ncols);
++    const uint3 mr = init_fastdiv_values(mul_nrows);
++    const uint3 mch = init_fastdiv_values(mul_nchannels);
++    const uint3 ms = init_fastdiv_values(mul_nsamples);
++    if (ncols < 1024) {
++        const dim3 block_dims(256, 1, 1);
++        const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
++        ggml_cuda_kernel_launch(rms_norm_mul_out<256, Tdst>, lp,
++            x, dst, ncols, stride_row, stride_channel, stride_sample, eps,
++            mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms);
++    } else {
++        const dim3 block_dims(1024, 1, 1);
++        const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
++        ggml_cuda_kernel_launch(rms_norm_mul_out<1024, Tdst>, lp,
++            x, dst, ncols, stride_row, stride_channel, stride_sample, eps,
++            mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms);
++    }
++}
++
++template <typename Tdst>
++static void rms_norm_pre_add_mul_out_cuda(const float * a, const float * b, float * h_out, Tdst * dst,
++                                          const int ncols, const int nrows, const int nchannels, const int nsamples,
++                                          const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
++                                          const float * mul,
++                                          const int64_t mul_stride_row, const int64_t mul_stride_channel, const int64_t mul_stride_sample,
++                                          const uint32_t mul_ncols, const uint32_t mul_nrows, const uint32_t mul_nchannels, const uint32_t mul_nsamples,
++                                          const float eps, cudaStream_t stream) {
++    const dim3 blocks_num(nrows, nchannels, nsamples);
++    GGML_ASSERT(mul != nullptr);
++    const uint3 mc = init_fastdiv_values(mul_ncols);
++    const uint3 mr = init_fastdiv_values(mul_nrows);
++    const uint3 mch = init_fastdiv_values(mul_nchannels);
++    const uint3 ms = init_fastdiv_values(mul_nsamples);
++    if (ncols < 1024) {
++        const dim3 block_dims(256, 1, 1);
++        const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
++        ggml_cuda_kernel_launch(rms_norm_pre_add_mul_out<256, Tdst>, lp,
++            a, b, h_out, dst, ncols, stride_row, stride_channel, stride_sample, eps,
++            mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms);
++    } else {
++        const dim3 block_dims(1024, 1, 1);
++        const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
++        ggml_cuda_kernel_launch(rms_norm_pre_add_mul_out<1024, Tdst>, lp,
++            a, b, h_out, dst, ncols, stride_row, stride_channel, stride_sample, eps,
++            mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms);
++    }
++}
++
++template <typename Tdst>
++static void rms_norm_gate_mul_out_cuda(const float * x, Tdst * dst,
++                                       const int ncols, const int nrows, const int nchannels, const int nsamples,
++                                       const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample,
++                                       const float * mul,
++                                       const int64_t mul_stride_row, const int64_t mul_stride_channel, const int64_t mul_stride_sample,
++                                       const uint32_t mul_ncols, const uint32_t mul_nrows, const uint32_t mul_nchannels, const uint32_t mul_nsamples,
++                                       const float * gate,
++                                       const int64_t gate_stride_row, const int64_t gate_stride_channel, const int64_t gate_stride_sample,
++                                       const uint32_t gate_ncols, const uint32_t gate_nrows, const uint32_t gate_nchannels, const uint32_t gate_nsamples,
++                                       const float eps, cudaStream_t stream) {
++    const dim3 blocks_num(nrows, nchannels, nsamples);
++    GGML_ASSERT(mul  != nullptr);
++    GGML_ASSERT(gate != nullptr);
++    const uint3 mc = init_fastdiv_values(mul_ncols);
++    const uint3 mr = init_fastdiv_values(mul_nrows);
++    const uint3 mch = init_fastdiv_values(mul_nchannels);
++    const uint3 ms = init_fastdiv_values(mul_nsamples);
++    const uint3 gc = init_fastdiv_values(gate_ncols);
++    const uint3 gr = init_fastdiv_values(gate_nrows);
++    const uint3 gch = init_fastdiv_values(gate_nchannels);
++    const uint3 gs = init_fastdiv_values(gate_nsamples);
++    if (ncols < 1024) {
++        const dim3 block_dims(256, 1, 1);
++        const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
++        ggml_cuda_kernel_launch(rms_norm_gate_mul_out<256, Tdst>, lp,
++            x, dst, ncols, stride_row, stride_channel, stride_sample, eps,
++            mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms,
++            gate, gate_stride_row, gate_stride_channel, gate_stride_sample, gc, gr, gch, gs);
++    } else {
++        const dim3 block_dims(1024, 1, 1);
++        const ggml_cuda_kernel_launch_params lp{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream};
++        ggml_cuda_kernel_launch(rms_norm_gate_mul_out<1024, Tdst>, lp,
++            x, dst, ncols, stride_row, stride_channel, stride_sample, eps,
++            mul, mul_stride_row, mul_stride_channel, mul_stride_sample, mc, mr, mch, ms,
++            gate, gate_stride_row, gate_stride_channel, gate_stride_sample, gc, gr, gch, gs);
++    }
++}
++
++// ===========================================================================
++// host entries
++// ===========================================================================
++void ggml_cuda_rms_norm_mul_bf16out(ggml_backend_cuda_context & ctx,
++                                    const ggml_tensor *         rms_norm_tensor,
++                                    const ggml_tensor *         mul_tensor,
++                                    void *                      dst_bf16) {
++    const ggml_tensor * x_src   = rms_norm_tensor->src[0];
++    const ggml_tensor * mul_src = (mul_tensor->src[0] == rms_norm_tensor) ? mul_tensor->src[1] : mul_tensor->src[0];
++    GGML_ASSERT(mul_tensor->src[0] == rms_norm_tensor || mul_tensor->src[1] == rms_norm_tensor);
++
++    float eps = 0.0f;
++    memcpy(&eps, rms_norm_tensor->op_params, sizeof(float));
++    GGML_ASSERT(eps >= 0.0f);
++
++    GGML_ASSERT(x_src->type           == GGML_TYPE_F32);
++    GGML_ASSERT(mul_src->type         == GGML_TYPE_F32);
++    GGML_ASSERT(rms_norm_tensor->type == GGML_TYPE_F32);
++    GGML_ASSERT(mul_tensor->type      == GGML_TYPE_F32);
++
++    const float * x_d   = (const float *) x_src->data;
++    const float * mul_d = (const float *) mul_src->data;
++    nv_bfloat16 * dst_d = (nv_bfloat16 *)  dst_bf16;
++    cudaStream_t  stream = ctx.stream();
++
++    const int64_t ne00 = rms_norm_tensor->ne[0];
++    const int64_t ne01 = rms_norm_tensor->ne[1];
++    const int64_t ne02 = rms_norm_tensor->ne[2];
++    const int64_t ne03 = rms_norm_tensor->ne[3];
++
++    const size_t ts0 = ggml_type_size(x_src->type);
++    GGML_ASSERT(x_src->nb[0] == ts0);
++    const int64_t s01 = x_src->nb[1] / ts0;
++    const int64_t s02 = x_src->nb[2] / ts0;
++    const int64_t s03 = x_src->nb[3] / ts0;
++
++    const size_t ts_mul = ggml_type_size(mul_src->type);
++    GGML_ASSERT(mul_src->nb[0] == ts_mul);
++    const int64_t mul_s01 = mul_src->nb[1] / ts_mul;
++    const int64_t mul_s02 = mul_src->nb[2] / ts_mul;
++    const int64_t mul_s03 = mul_src->nb[3] / ts_mul;
++
++    rms_norm_mul_out_cuda<nv_bfloat16>(x_d, dst_d,
++        ne00, ne01, ne02, ne03, s01, s02, s03,
++        mul_d, mul_s01, mul_s02, mul_s03,
++        mul_src->ne[0], mul_src->ne[1], mul_src->ne[2], mul_src->ne[3],
++        eps, stream);
++}
++
++void ggml_cuda_rms_norm_pre_add_mul_bf16out(ggml_backend_cuda_context & ctx,
++                                            const ggml_tensor *         add_tensor,
++                                            const ggml_tensor *         rms_norm_tensor,
++                                            const ggml_tensor *         mul_tensor,
++                                            void *                      dst_bf16) {
++    GGML_ASSERT(rms_norm_tensor->src[0] == add_tensor);
++
++    const ggml_tensor * a_src = add_tensor->src[0];
++    const ggml_tensor * b_src = add_tensor->src[1];
++
++    float eps = 0.0f;
++    memcpy(&eps, rms_norm_tensor->op_params, sizeof(float));
++    GGML_ASSERT(eps >= 0.0f);
++
++    const ggml_tensor * mul_src = (mul_tensor->src[0] == rms_norm_tensor) ? mul_tensor->src[1] : mul_tensor->src[0];
++    GGML_ASSERT(mul_tensor->src[0] == rms_norm_tensor || mul_tensor->src[1] == rms_norm_tensor);
++
++    GGML_ASSERT(a_src->type           == GGML_TYPE_F32);
++    GGML_ASSERT(b_src->type           == GGML_TYPE_F32);
++    GGML_ASSERT(add_tensor->type      == GGML_TYPE_F32);
++    GGML_ASSERT(rms_norm_tensor->type == GGML_TYPE_F32);
++    GGML_ASSERT(mul_tensor->type      == GGML_TYPE_F32);
++    GGML_ASSERT(ggml_are_same_shape(a_src, b_src));
++
++    const float * a_d   = (const float *) a_src->data;
++    const float * b_d   = (const float *) b_src->data;
++    float *       h_d   = (float *)       add_tensor->data;   // f32 residual stream
++    const float * mul_d = (const float *) mul_src->data;
++    nv_bfloat16 * dst_d = (nv_bfloat16 *)  dst_bf16;
++    cudaStream_t  stream = ctx.stream();
++
++    const int64_t ne00 = add_tensor->ne[0];
++    const int64_t ne01 = add_tensor->ne[1];
++    const int64_t ne02 = add_tensor->ne[2];
++    const int64_t ne03 = add_tensor->ne[3];
++
++    const size_t ts0 = ggml_type_size(a_src->type);
++    GGML_ASSERT(a_src->nb[0] == ts0 && b_src->nb[0] == ts0);
++    const int64_t s01 = a_src->nb[1] / ts0;
++    const int64_t s02 = a_src->nb[2] / ts0;
++    const int64_t s03 = a_src->nb[3] / ts0;
++
++    const size_t ts_mul = ggml_type_size(mul_src->type);
++    GGML_ASSERT(mul_src->nb[0] == ts_mul);
++    const int64_t mul_s01 = mul_src->nb[1] / ts_mul;
++    const int64_t mul_s02 = mul_src->nb[2] / ts_mul;
++    const int64_t mul_s03 = mul_src->nb[3] / ts_mul;
++
++    rms_norm_pre_add_mul_out_cuda<nv_bfloat16>(a_d, b_d, h_d, dst_d,
++        ne00, ne01, ne02, ne03, s01, s02, s03,
++        mul_d, mul_s01, mul_s02, mul_s03,
++        mul_src->ne[0], mul_src->ne[1], mul_src->ne[2], mul_src->ne[3],
++        eps, stream);
++}
++
++void ggml_cuda_rms_norm_gate_mul_bf16out(ggml_backend_cuda_context & ctx,
++                                         const ggml_tensor *         rms_norm_tensor,
++                                         const ggml_tensor *         mul_tensor,
++                                         const ggml_tensor *         silu_tensor,
++                                         const ggml_tensor *         gate_mul_tensor,
++                                         void *                      dst_bf16) {
++    GGML_ASSERT(mul_tensor->src[0] == rms_norm_tensor || mul_tensor->src[1] == rms_norm_tensor);
++    GGML_ASSERT(gate_mul_tensor->src[0] == silu_tensor || gate_mul_tensor->src[1] == silu_tensor);
++
++    const ggml_tensor * x_src    = rms_norm_tensor->src[0];
++    const ggml_tensor * w_src    = (mul_tensor->src[0] == rms_norm_tensor) ? mul_tensor->src[1] : mul_tensor->src[0];
++    const ggml_tensor * gate_src = silu_tensor->src[0];
++
++    float eps = 0.0f;
++    memcpy(&eps, rms_norm_tensor->op_params, sizeof(float));
++    GGML_ASSERT(eps >= 0.0f);
++
++    const float * x_d    = (const float *) x_src->data;
++    const float * w_d    = (const float *) w_src->data;
++    const float * gate_d = (const float *) gate_src->data;
++    nv_bfloat16 * dst_d  = (nv_bfloat16 *)  dst_bf16;
++    cudaStream_t  stream = ctx.stream();
++
++    GGML_ASSERT(x_src->type    == GGML_TYPE_F32);
++    GGML_ASSERT(w_src->type    == GGML_TYPE_F32);
++    GGML_ASSERT(gate_src->type == GGML_TYPE_F32);
++    GGML_ASSERT(rms_norm_tensor->type == GGML_TYPE_F32);
++    GGML_ASSERT(mul_tensor->type      == GGML_TYPE_F32);
++    GGML_ASSERT(silu_tensor->type     == GGML_TYPE_F32);
++
++    const int64_t ne00 = rms_norm_tensor->ne[0];
++    const int64_t ne01 = rms_norm_tensor->ne[1];
++    const int64_t ne02 = rms_norm_tensor->ne[2];
++    const int64_t ne03 = rms_norm_tensor->ne[3];
++
++    const size_t ts0 = ggml_type_size(x_src->type);
++    GGML_ASSERT(x_src->nb[0] == ts0);
++    const int64_t s01 = x_src->nb[1] / ts0;
++    const int64_t s02 = x_src->nb[2] / ts0;
++    const int64_t s03 = x_src->nb[3] / ts0;
++
++    const size_t ts_mul = ggml_type_size(w_src->type);
++    GGML_ASSERT(w_src->nb[0] == ts_mul);
++    const int64_t mul_s01 = w_src->nb[1] / ts_mul;
++    const int64_t mul_s02 = w_src->nb[2] / ts_mul;
++    const int64_t mul_s03 = w_src->nb[3] / ts_mul;
++
++    const size_t ts_gate = ggml_type_size(gate_src->type);
++    GGML_ASSERT(gate_src->nb[0] == ts_gate);
++    const int64_t gate_s01 = gate_src->nb[1] / ts_gate;
++    const int64_t gate_s02 = gate_src->nb[2] / ts_gate;
++    const int64_t gate_s03 = gate_src->nb[3] / ts_gate;
++
++    rms_norm_gate_mul_out_cuda<nv_bfloat16>(x_d, dst_d,
++        ne00, ne01, ne02, ne03, s01, s02, s03,
++        w_d, mul_s01, mul_s02, mul_s03,
++        w_src->ne[0], w_src->ne[1], w_src->ne[2], w_src->ne[3],
++        gate_d, gate_s01, gate_s02, gate_s03,
++        gate_src->ne[0], gate_src->ne[1], gate_src->ne[2], gate_src->ne[3],
++        eps, stream);
++}
+diff --git a/ggml/src/ggml-cuda/norm-bf16.cuh b/ggml/src/ggml-cuda/norm-bf16.cuh
+new file mode 100644
+index 000000000..3b52757b8
+--- /dev/null
++++ b/ggml/src/ggml-cuda/norm-bf16.cuh
+@@ -0,0 +1,37 @@
++#include "common.cuh"
++
++// [P1 bf16-stream] bf16-resident execution-pass helpers (default-off, gated by
++// LLAMA_BF16_STREAM at the ggml_cuda_try_fuse call site). Siblings of the plain
++// rms_norm and the 0042/0044 fused norms in norm.cu; these variants write a bf16
++// output so the consuming projection GEMM reads the activation directly (no f32->bf16
++// convert_dtype glue). Each kernel is templated on the output dtype (float or
++// nv_bfloat16) so the file carries the full op-variant set the P1 contract names;
++// the live q36 engage path instantiates the bf16 output only.
++//
++// All three keep the same reduction and FP order as their f32 originals; only the
++// final store is narrowed to bf16 via __float2bfloat16, so the opt-in path stays a
++// pure dtype (KL-benign) change, not an algorithmic one.
++
++// plain rms_norm + weight multiply -> bf16  (attention input norm, GDN input norm)
++void ggml_cuda_rms_norm_mul_bf16out(ggml_backend_cuda_context & ctx,
++                                    const ggml_tensor *         rms_norm_tensor,
++                                    const ggml_tensor *         mul_tensor,
++                                    void *                      dst_bf16);
++
++// 0042 residual-add + rms_norm + weight multiply -> f32 residual (h_out) + bf16 normed
++// (op-set completeness; on q36 the ffn/moe-input norm feeds MMQ experts so the engage
++// path bails, but the kernel + entry keep the op-variant set whole and the sentinel
++// exercises it).
++void ggml_cuda_rms_norm_pre_add_mul_bf16out(ggml_backend_cuda_context & ctx,
++                                            const ggml_tensor *         add_tensor,
++                                            const ggml_tensor *         rms_norm_tensor,
++                                            const ggml_tensor *         mul_tensor,
++                                            void *                      dst_bf16);
++
++// 0044 gated-DeltaNet output norm  scale*x*w*silu(z)  -> bf16  (ssm_out; the P0 segment)
++void ggml_cuda_rms_norm_gate_mul_bf16out(ggml_backend_cuda_context & ctx,
++                                         const ggml_tensor *         rms_norm_tensor,
++                                         const ggml_tensor *         mul_tensor,
++                                         const ggml_tensor *         silu_tensor,
++                                         const ggml_tensor *         gate_mul_tensor,
++                                         void *                      dst_bf16);
+-- 
+2.43.0
+
diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0054-feat-paged-P1-bf16-stream-bf16-residual-add-rope-op-.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0054-feat-paged-P1-bf16-stream-bf16-residual-add-rope-op-.patch
new file mode 100644
index 000000000..c60f1c8ec
--- /dev/null
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0054-feat-paged-P1-bf16-stream-bf16-residual-add-rope-op-.patch
@@ -0,0 +1,107 @@
+From 91373e1b9ab290eb9df63ce26e7cd17da81970fe Mon Sep 17 00:00:00 2001
+From: Ettore Di Giacinto <mudler@localai.io>
+Date: Thu, 2 Jul 2026 16:29:20 +0200
+Subject: [PATCH 54/55] feat(paged): P1 bf16-stream bf16 residual-add + rope
+ op-variants
+
+Round out the op-variant set for the bf16-resident stream: bf16 branches in
+binbcast.cu (residual add) and bf16 rope instantiations (asserts widened
+only). Standing infra; Option-A keeps f32 at segment boundaries so these are
+not on the current measured path. Existing f32 paths untouched.
+
+Assisted-by: Claude:opus-4.8 [Claude Code]
+Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
+---
+ ggml/src/ggml-cuda/binbcast.cu | 10 +++++++++-
+ ggml/src/ggml-cuda/rope.cu     | 31 ++++++++++++++++++++++++++++---
+ 2 files changed, 37 insertions(+), 4 deletions(-)
+
+diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu
+index 2e38077bf..135becedb 100644
+--- a/ggml/src/ggml-cuda/binbcast.cu
++++ b/ggml/src/ggml-cuda/binbcast.cu
+@@ -413,7 +413,7 @@ static void ggml_cuda_op_bin_bcast(
+     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+     const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) {
+ 
+-    GGML_ASSERT(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16);
++    GGML_ASSERT(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_BF16);
+ 
+     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+         op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
+@@ -423,6 +423,14 @@ static void ggml_cuda_op_bin_bcast(
+         op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream);
+     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+         op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream);
++    // [P1 bf16-stream] bf16 residual-add variants, so a bf16-resident segment can keep
++    // its residual add in bf16 (half the memory traffic) rather than widening to f32.
++    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) {
++        op()(src0, src1, dst, (const nv_bfloat16 *) src0_dd, (const nv_bfloat16 *) src1_dd, (nv_bfloat16 *) dst_dd, stream);
++    } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_BF16) {
++        op()(src0, src1, dst, (const nv_bfloat16 *) src0_dd, (const float *) src1_dd, (nv_bfloat16 *) dst_dd, stream);
++    } else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
++        op()(src0, src1, dst, (const nv_bfloat16 *) src0_dd, (const float *) src1_dd, (float *) dst_dd, stream);
+     } else {
+         fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+             ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
+index e20a5cb6b..923032327 100644
+--- a/ggml/src/ggml-cuda/rope.cu
++++ b/ggml/src/ggml-cuda/rope.cu
+@@ -528,11 +528,16 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx,
+     }
+     cudaStream_t stream = ctx.stream();
+ 
+-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+-    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
++    // [P1 bf16-stream] bf16 is accepted so a bf16-resident attention segment can rope
++    // its Q/K in bf16 (the norm/neox kernels are float-internal, so the bf16 arms just
++    // add T/D = nv_bfloat16 instantiations below).
++    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16);
++    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16 ||  dst->type == GGML_TYPE_BF16);
+     // When not fused, src0 and dst types must match
+     // When fused (ROPE+VIEW+SET_ROWS), src0 may be F32 and dst may be F16
+-    GGML_ASSERT(src0->type == dst->type || (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16));
++    GGML_ASSERT(src0->type == dst->type ||
++                (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) ||
++                (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_BF16));
+ 
+     const int64_t ne00 = src0->ne[0]; // head dims
+     const int64_t ne01 = src0->ne[1]; // num heads
+@@ -610,6 +615,16 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx,
+                                                 s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
+                                                 ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
+                                                 set_rows_stride, stream);
++        } else if (src0->type == GGML_TYPE_BF16 && dst_type == GGML_TYPE_BF16) {
++            rope_neox_cuda<forward, nv_bfloat16, nv_bfloat16>((const nv_bfloat16 *) src0_d, (nv_bfloat16 *) dst_d, ne00, ne01, ne02, s01, s02,
++                                                s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
++                                                ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
++                                                set_rows_stride, stream);
++        } else if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_BF16) {
++            rope_neox_cuda<forward, float, nv_bfloat16>((const float *) src0_d, (nv_bfloat16 *) dst_d, ne00, ne01, ne02, s01, s02,
++                                                s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
++                                                ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
++                                                set_rows_stride, stream);
+         } else {
+             GGML_ABORT("fatal error");
+         }
+@@ -653,6 +668,16 @@ void ggml_cuda_op_rope_impl(ggml_backend_cuda_context & ctx,
+                                                 s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
+                                                 ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
+                                                 set_rows_stride, stream);
++        } else if (src0->type == GGML_TYPE_BF16 && dst_type == GGML_TYPE_BF16) {
++            rope_norm_cuda<forward, nv_bfloat16, nv_bfloat16>((const nv_bfloat16 *) src0_d, (nv_bfloat16 *) dst_d, ne00, ne01, ne02, s01, s02,
++                                                s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
++                                                ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
++                                                set_rows_stride, stream);
++        } else if (src0->type == GGML_TYPE_F32 && dst_type == GGML_TYPE_BF16) {
++            rope_norm_cuda<forward, float, nv_bfloat16>((const float *) src0_d, (nv_bfloat16 *) dst_d, ne00, ne01, ne02, s01, s02,
++                                                s03, s1, s2, s3, n_dims, nr, pos, freq_scale, freq_base,
++                                                ext_factor, attn_factor, corr_dims, freq_factors, row_indices,
++                                                set_rows_stride, stream);
+         } else {
+             GGML_ABORT("fatal error");
+         }
+-- 
+2.43.0
+
diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0055-test-paged-P1-bf16-stream-BF16_STREAM_SEGMENT-sentin.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0055-test-paged-P1-bf16-stream-BF16_STREAM_SEGMENT-sentin.patch
new file mode 100644
index 000000000..133365eb1
--- /dev/null
+++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0055-test-paged-P1-bf16-stream-BF16_STREAM_SEGMENT-sentin.patch
@@ -0,0 +1,116 @@
+From 653bb2f3d5914872010eac29287863bff67de943 Mon Sep 17 00:00:00 2001
+From: Ettore Di Giacinto <mudler@localai.io>
+Date: Thu, 2 Jul 2026 16:29:20 +0200
+Subject: [PATCH 55/55] test(paged): P1 bf16-stream BF16_STREAM_SEGMENT
+ sentinel
+
+Whole-graph test-backend-ops case (MOE_SWIGLU_DOWN style) that engages both
+segment kinds (plain rms_norm multi-consumer and the 0044 gate_mul ssm_out)
+under LLAMA_BF16_STREAM. Green default and opt-in (4/4).
+
+Assisted-by: Claude:opus-4.8 [Claude Code]
+Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
+---
+ tests/test-backend-ops.cpp | 79 ++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 79 insertions(+)
+
+diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
+index 8c41ae56a..d0c56521c 100644
+--- a/tests/test-backend-ops.cpp
++++ b/tests/test-backend-ops.cpp
+@@ -4532,6 +4532,78 @@ struct test_moe_swiglu_down : public test_case {
+     }
+ };
+ 
++// [P1 bf16-stream] Standing coverage for the bf16-native residual-stream segment
++// executor (LLAMA_BF16_STREAM). Builds a residual-stream norm feeding a large-M
++// cuBLAS-bf16 projection (the exact shape ggml_cuda_try_fuse owns): kind 1 = plain
++// rms_norm+mul -> proj (attention / GDN input norm), kind 2 = 0044 gated-DeltaNet
++// output norm silu(z)*rms(x)*w -> proj (ssm_out). Whole-graph so the fusion pass runs;
++// with the env off it validates the f32 path, and with LLAMA_BF16_STREAM=1 it validates
++// the bf16-activation path (the projection reads the bf16 norm output directly). The
++// weight is BF16 so the projection deterministically routes to the cuBLAS-bf16 branch.
++struct test_bf16_stream_segment : public test_case {
++    const int     kind;      // 1 = plain rms+mul, 2 = gated-DeltaNet output norm
++    const int64_t n_embd;    // K (contraction)
++    const int64_t n_out;     // N (projection rows)
++    const int64_t n_tokens;  // M (>= 128 so the executor engages)
++
++    std::string vars() override {
++        return VARS_TO_STR4(kind, n_embd, n_out, n_tokens);
++    }
++
++    double max_nmse_err() override {
++        // bf16 activation rounding on both the default and opt-in paths (the BF16 weight
++        // GEMM already rounds the activation); generous but tight enough to catch a bug.
++        return 1e-2;
++    }
++
++    uint64_t op_flops(ggml_tensor * t) override {
++        GGML_UNUSED(t);
++        return 2 * n_embd * n_out * n_tokens;
++    }
++
++    test_bf16_stream_segment(int kind = 1, int64_t n_embd = 4096, int64_t n_out = 2048, int64_t n_tokens = 256)
++        : kind(kind), n_embd(n_embd), n_out(n_out), n_tokens(n_tokens) {}
++
++    ggml_tensor * build_graph(ggml_context * ctx) override {
++        const float eps = 1e-5f;
++
++        ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
++        ggml_set_name(x, "x");
++        ggml_tensor * w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
++        ggml_set_name(w, "rms_w");
++        ggml_tensor * proj_w = ggml_new_tensor_2d(ctx, GGML_TYPE_BF16, n_embd, n_out);
++        ggml_set_name(proj_w, "proj_w");
++
++        ggml_tensor * rms = ggml_rms_norm(ctx, x, eps);
++        ggml_set_name(rms, "rms");
++        ggml_tensor * mul = ggml_mul(ctx, rms, w);
++        ggml_set_name(mul, "rms_mul");
++
++        ggml_tensor * norm_out = mul;
++        if (kind == 2) {
++            ggml_tensor * z = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
++            ggml_set_name(z, "gate_z");
++            ggml_tensor * silu = ggml_silu(ctx, z);
++            ggml_set_name(silu, "silu");
++            // silu as src0 so the topological order is {SILU, RMS_NORM, MUL, MUL},
++            // matching the fusion-pass pattern (and the real q36 gated-output norm).
++            norm_out = ggml_mul(ctx, silu, mul);
++            ggml_set_name(norm_out, "gate_mul");
++        }
++
++        ggml_tensor * out = ggml_mul_mat(ctx, proj_w, norm_out);
++        ggml_set_name(out, "proj");
++        return out;
++    }
++
++    bool run_whole_graph() override { return true; }
++
++    std::string op_desc(ggml_tensor * t) override {
++        GGML_UNUSED(t);
++        return "BF16_STREAM_SEGMENT";
++    }
++};
++
+ // MoE down projection -> router-weight multiply -> rank-ordered expert add.
+ struct test_moe_weighted_combine : public test_case {
+     const ggml_type type_a;
+@@ -9043,6 +9115,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
+         test_cases.emplace_back(new test_moe_swiglu_down(GGML_TYPE_NVFP4, 128, 8, 768, n, 2048));
+     }
+ 
++    // [P1 bf16-stream] bf16-native residual-stream segment executor coverage. Small-M
++    // (bails, f32 path) + large-M (engages under LLAMA_BF16_STREAM=1) for both norm kinds.
++    for (int kind : {1, 2}) {
++        test_cases.emplace_back(new test_bf16_stream_segment(kind, 4096, 2048, 64));
++        test_cases.emplace_back(new test_bf16_stream_segment(kind, 4096, 2048, 256));
++    }
++
+     // [paged Phase 7] MoE down projection -> router-weight multiply -> rank-ordered
+     // expert add gate for the weighted-combine fusion candidate.
+     test_cases.emplace_back(new test_moe_weighted_combine(GGML_TYPE_F32, 8, 2, 32, 8, 64));
+-- 
+2.43.0
+