From 2033086f609a3daf1deb68a7a76281d58b53a595 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 30 Jun 2026 15:10:13 +0000 Subject: [PATCH] patches(paged): track 0044 GatedRMSNorm patch, sync LocalAI series to fork 51168c5 The fork mudler/llama.cpp branch localai-paged is the canonical source of truth for the paged-backend patch series. This file is the git format-patch of fork commit 51168c5ee ("feat(paged): fused gated RMSNorm + SiLU gate-mul CUDA op (patch 0044)"), verified byte-identical to that commit's format-patch output. The full on-disk series applies clean in numeric order on the pinned base and the resulting tree is byte-identical to the fork commit tree (tree hash a73d759350277532a14e853e1fe78f08bbb74ce8), so the LocalAI series is a drift-free 1:1 mirror of the fork branch. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto --- ...ed-fused-gated-RMSNorm-SiLU-gate-mul.patch | 470 ++++++++++++++++++ 1 file changed, 470 insertions(+) create mode 100644 backend/cpp/llama-cpp-localai-paged/patches/paged/0044-feat-paged-fused-gated-RMSNorm-SiLU-gate-mul.patch diff --git a/backend/cpp/llama-cpp-localai-paged/patches/paged/0044-feat-paged-fused-gated-RMSNorm-SiLU-gate-mul.patch b/backend/cpp/llama-cpp-localai-paged/patches/paged/0044-feat-paged-fused-gated-RMSNorm-SiLU-gate-mul.patch new file mode 100644 index 000000000..3b2aa9b98 --- /dev/null +++ b/backend/cpp/llama-cpp-localai-paged/patches/paged/0044-feat-paged-fused-gated-RMSNorm-SiLU-gate-mul.patch @@ -0,0 +1,470 @@ +From 51168c5eee2e35348d9006f0b2fab3dc6e7c01cc Mon Sep 17 00:00:00 2001 +From: Ettore Di Giacinto +Date: Tue, 30 Jun 2026 10:57:05 +0200 +Subject: [PATCH] feat(paged): fused gated RMSNorm + SiLU gate-mul CUDA op + (patch 0044) + +The Qwen3.6 gated-DeltaNet output norm self.norm(core_attn_out, z) +(qwen35 / qwen35moe build_norm_gated) runs as (rms_norm(x) * w) * silu(z): +on CUDA that was rms_norm_mul + silu_mul, two fused launches with the +normalized intermediate round-tripping through HBM. Fuse the whole chain +into one kernel so it stays in registers. This is the gated-RMSNorm fusion +the vLLM decode-gap analysis ranked #1 (the easy, bit-exact prefill win), +a direct sibling of patch 0042 (add-RMSNorm). + +The chain is NOT naturally consecutive in the graph: the gate z-projection +(a MUL_MAT) is scheduled between the weight MUL and the SILU, so the default +mul(normalized, silu(z)) order leaves a GEMM between them and cannot be +fused. build_norm_gated now emits the gate multiply as mul(silu(z), +normalized) (commutative, so bit-exact), which lays the chain out as the +consecutive subgraph { SILU, RMS_NORM, MUL, MUL } that ggml-cuda can fuse. + +- New kernel rms_norm_gate_mul_f32 (ggml/src/ggml-cuda/norm.cu): same + block_reduce over x^2, same 256/1024 block-size thresholds and + rsqrtf(mean+eps) as rms_norm / patch 0042; the final write computes + dst = scale * x * w * silu(z) with silu(z) = z/(1+expf(-z)) (the exact + ggml_cuda_op_silu_single form). w (the RMS weight) and z (the gate) both + broadcast via the packed-modulo helper. +- ggml_cuda_can_fuse recognizes { GGML_OP_UNARY(SILU), RMS_NORM, MUL, MUL } + via ggml_can_fuse_subgraph with the final MUL as the only output (the SILU + reads an external gate; RMS_NORM and the weight MUL are single-use within). +- Gated by LLAMA_FUSE_GATE_RMSNORM (default ON) for a clean single-build A/B; + OFF keeps the original operand order AND the unfused kernels, so OFF is + byte- and kernel-identical to the pre-patch path. + +BIT-EXACT (per-path canonical greedy md5, n=48 --temp 0 --seed 1): + dense q36-27b-nvfp4 : 5951a5b4d624ce891e22ab5fca9bc439 (ON == OFF == canonical, paged and non-paged) + MoE q36-35b-a3b : 8cb0ce23777bf55f92f63d0292c756b0 (ON == OFF == canonical, paged) +Multiply is commutative, so ((scale*x)*w)*silu(z) is byte-identical to the +unfused silu(z)*((scale*x)*w); the sum(x^2) reduction and rsqrt scale are +unchanged. test-backend-ops 12979/12979 (CUDA0 vs CPU). + +PROFILE (dense prefill, nsys --cuda-graph-trace=node, npp512 ntg4 npl8): + rms_norm_f32<256,1,0> 560 -> 224 launches + unary_gated_op_kernel 784 -> 448 launches + rms_norm_gate_mul_f32 (new) 336 launches / 69.7M ns + -> the 336 gated-norm rms_norm_mul + 336 silu_mul launches (672) fold into + 336 fused launches, removing the normalized HBM round-trip. +S_PP (npp512 ntg4 npl32, 3x interleaved A/B, every ON beats every OFF): + dense q36-27b : 1002.5 -> 1013.4 t/s (+1.1%, ~+10 us/tok) + MoE q36-35b : 2626.9 -> 2651.8 t/s (+0.9%) + +Assisted-by: Claude:opus-4.8 [Claude Code] +Signed-off-by: Ettore Di Giacinto +--- + ggml/src/ggml-cuda/ggml-cuda.cu | 67 ++++++++++ + ggml/src/ggml-cuda/norm.cu | 215 ++++++++++++++++++++++++++++++++ + ggml/src/ggml-cuda/norm.cuh | 6 + + src/models/qwen35.cpp | 16 +++ + src/models/qwen35moe.cpp | 16 +++ + 5 files changed, 320 insertions(+) + +diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu +index 42bcd4a77..374949f25 100644 +--- a/ggml/src/ggml-cuda/ggml-cuda.cu ++++ b/ggml/src/ggml-cuda/ggml-cuda.cu +@@ -3816,6 +3816,60 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, + return true; + } + ++ // Fused gated RMS norm: SiLU gate multiply over (RMS norm * weight), the ++ // gated-DeltaNet output norm `out = (rms_norm(x) * w) * silu(z)` of the Qwen3.6 ++ // hybrid models (qwen35 / qwen35moe build_norm_gated). The model emits the gate ++ // multiply as mul(silu(z), normalized) (default; see LLAMA_FUSE_GATE_RMSNORM in ++ // build_norm_gated) so the chain forms the consecutive subgraph ++ // { SILU, RMS_NORM, MUL, MUL } - the gate z-projection is scheduled before the ++ // SILU, so the natural mul(normalized, silu) order leaves a GEMM between the ++ // weight MUL and the SILU and cannot be fused. The SILU (node_idx) reads an ++ // external gate and the final gate MUL (node_idx + 3) feeds the o_proj, so mark ++ // node_idx + 3 as the only output; the RMS_NORM (node_idx + 1) and weight MUL ++ // (node_idx + 2) are single-use within the subgraph. ++ std::initializer_list rms_norm_gate_mul_ops = { GGML_OP_UNARY, GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_MUL }; ++ if (is_equal(rms_norm_gate_mul_ops, ops) && unary_ops.size() == 1 && unary_ops.begin()[0] == GGML_UNARY_OP_SILU && ++ ggml_can_fuse_subgraph(cgraph, node_idx, ops, { node_idx + 3 })) { ++ const ggml_tensor * silu = cgraph->nodes[node_idx]; ++ const ggml_tensor * rms_norm = cgraph->nodes[node_idx + 1]; ++ const ggml_tensor * mul = cgraph->nodes[node_idx + 2]; ++ const ggml_tensor * gate_mul = cgraph->nodes[node_idx + 3]; ++ ++ if (ggml_get_unary_op(silu) != GGML_UNARY_OP_SILU) { ++ return false; ++ } ++ // The weight MUL must consume the RMS norm output; the gate MUL must ++ // consume both the weight MUL and the SILU output. ++ if (mul->src[0] != rms_norm && mul->src[1] != rms_norm) { ++ return false; ++ } ++ if ((gate_mul->src[0] != mul && gate_mul->src[1] != mul) || ++ (gate_mul->src[0] != silu && gate_mul->src[1] != silu)) { ++ return false; ++ } ++ // All operands F32 (rms norm / fused mul / silu kernel only support F32). ++ if (rms_norm->src[0]->type != GGML_TYPE_F32 || rms_norm->type != GGML_TYPE_F32 || ++ mul->src[0]->type != GGML_TYPE_F32 || mul->src[1]->type != GGML_TYPE_F32 || mul->type != GGML_TYPE_F32 || ++ silu->src[0]->type != GGML_TYPE_F32 || silu->type != GGML_TYPE_F32 || ++ gate_mul->src[0]->type != GGML_TYPE_F32 || gate_mul->src[1]->type != GGML_TYPE_F32 || ++ gate_mul->type != GGML_TYPE_F32) { ++ return false; ++ } ++ // If rms_norm is the B operand of the weight mul, broadcast of A is unsupported. ++ if (rms_norm == mul->src[1] && !ggml_are_same_shape(mul->src[0], rms_norm)) { ++ return false; ++ } ++ // The fused kernel reads contiguous rows for the norm input, the weight, ++ // and the gate, and writes a contiguous output. ++ if (!ggml_is_contiguous_rows(rms_norm->src[0]) || ++ !ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1]) || ++ !ggml_is_contiguous_rows(silu->src[0]) || ++ !ggml_is_contiguous_rows(gate_mul->src[0]) || !ggml_is_contiguous_rows(gate_mul->src[1])) { ++ return false; ++ } ++ return true; ++ } ++ + if (!ggml_can_fuse(cgraph, node_idx, ops)) { + return false; + } +@@ -4350,6 +4404,19 @@ static int ggml_cuda_try_fuse(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph + return 2; + } + ++ // Fused gated RMS norm: RMS norm + weight multiply + SiLU-gated multiply ++ // (bit-exact). The Qwen3.6 gated-DeltaNet output norm. Default ON; set ++ // LLAMA_FUSE_GATE_RMSNORM=0 for a clean A/B against the unfused path. ++ static const bool fuse_gate_rmsnorm = [] { ++ const char * e = getenv("LLAMA_FUSE_GATE_RMSNORM"); ++ return e == nullptr || atoi(e) != 0; ++ }(); ++ if (fuse_gate_rmsnorm && ++ ggml_cuda_can_fuse(cgraph, i, { GGML_OP_UNARY, GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_MUL }, { GGML_UNARY_OP_SILU })) { ++ ggml_cuda_op_rms_norm_gate_mul(*cuda_ctx, cgraph->nodes[i + 1], cgraph->nodes[i + 2], node, cgraph->nodes[i + 3]); ++ return 3; ++ } ++ + if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD }, {})) { + ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i + 1], cgraph->nodes[i + 2]); + return 2; +diff --git a/ggml/src/ggml-cuda/norm.cu b/ggml/src/ggml-cuda/norm.cu +index a07d02276..e776c67d2 100644 +--- a/ggml/src/ggml-cuda/norm.cu ++++ b/ggml/src/ggml-cuda/norm.cu +@@ -235,6 +235,95 @@ static __global__ void rms_norm_pre_add_mul_f32(const float * a, + } + } + ++// Fused gated RMS norm: RMS norm + weight multiply + SiLU gate multiply. ++// dst = (rsqrt(mean(x^2)+eps) * x * w) * silu(z) with silu(z) = z/(1+expf(-z)) ++// This is the gated-DeltaNet output norm `self.norm(core_attn_out, z)` of the ++// Qwen3.6 hybrid models (build_norm_gated): rms_norm(x) scaled by the per-head ++// ssm_norm weight `w`, then gated by silu of the gate activation `z`. Unfused it ++// runs as rms_norm_mul (scale*x*w) -> silu(z) -> mul; fusing it keeps the ++// normalized intermediate in registers so it never round-trips to HBM. ++// ++// Bit-exactness: the sum(x^2) reduction uses the same block_reduce with the ++// same 256/1024 block-size thresholds and the same rsqrtf(mean+eps) as rms_norm, ++// the weight multiply reproduces rms_norm_mul's `scale*x[col]*w[col]` order, and ++// silu reuses the exact `z/(1+expf(-z))` of ggml_cuda_op_silu_single. Float ++// multiply is commutative, so `(scale*x*w) * silu(z)` is byte-identical to the ++// unfused `mul(rms_norm_mul, silu(z))` (whether or not silu+mul was itself fused). ++// `w` (the RMS weight) and `z` (the gate) both broadcast via the packed-modulo path. ++template ++static __global__ void rms_norm_gate_mul_f32(const float * x, ++ float * dst, ++ const int ncols, ++ const int64_t stride_row, ++ const int64_t stride_channel, ++ const int64_t stride_sample, ++ const float eps, ++ const float * mul, ++ const int64_t mul_stride_row, ++ const int64_t mul_stride_channel, ++ const int64_t mul_stride_sample, ++ const uint3 mul_ncols_packed, ++ const uint3 mul_nrows_packed, ++ const uint3 mul_nchannels_packed, ++ const uint3 mul_nsamples_packed, ++ const float * gate, ++ const int64_t gate_stride_row, ++ const int64_t gate_stride_channel, ++ const int64_t gate_stride_sample, ++ const uint3 gate_ncols_packed, ++ const uint3 gate_nrows_packed, ++ const uint3 gate_nchannels_packed, ++ const uint3 gate_nsamples_packed) { ++ ggml_cuda_pdl_lc(); ++ const int nrows = gridDim.x; ++ const int nchannels = gridDim.y; ++ ++ const int row = blockIdx.x; ++ const int channel = blockIdx.y; ++ const int sample = blockIdx.z; ++ const int tid = threadIdx.x; ++ ++ x += sample*stride_sample + channel*stride_channel + row*stride_row; ++ // dst is laid out contiguously by the scheduler for the (final) MUL output ++ dst += ((sample*nchannels + channel)*nrows + row)*ncols; ++ ++ { ++ const uint32_t mul_row = fastmodulo(row, mul_nrows_packed); ++ const uint32_t mul_channel = fastmodulo(channel, mul_nchannels_packed); ++ const uint32_t mul_sample = fastmodulo(sample, mul_nsamples_packed); ++ mul += mul_sample * mul_stride_sample + mul_channel * mul_stride_channel + mul_row * mul_stride_row; ++ } ++ { ++ const uint32_t gate_row = fastmodulo(row, gate_nrows_packed); ++ const uint32_t gate_channel = fastmodulo(channel, gate_nchannels_packed); ++ const uint32_t gate_sample = fastmodulo(sample, gate_nsamples_packed); ++ gate += gate_sample * gate_stride_sample + gate_channel * gate_stride_channel + gate_row * gate_stride_row; ++ } ++ ++ float tmp = 0.0f; // partial sum for thread in warp ++ ++ ggml_cuda_pdl_sync(); ++ for (int col = tid; col < ncols; col += block_size) { ++ const float xi = x[col]; ++ tmp += xi * xi; ++ } ++ ++ // sum up partial sums ++ extern __shared__ float s_sum[]; ++ tmp = block_reduce(tmp, s_sum); ++ ++ const float mean = tmp / ncols; ++ const float scale = rsqrtf(mean + eps); ++ ++ for (int col = tid; col < ncols; col += block_size) { ++ const int mul_col = fastmodulo(col, mul_ncols_packed); ++ const int gate_col = fastmodulo(col, gate_ncols_packed); ++ const float zi = gate[gate_col]; ++ const float silu_z = zi / (1.0f + expf(-zi)); ++ dst[col] = scale * x[col] * mul[mul_col] * silu_z; ++ } ++} ++ + template + static __global__ void rms_norm_back_f32( + const float * grad, const float * xf, float * dst, const int ncols, const float eps) { +@@ -532,6 +621,65 @@ static void rms_norm_pre_add_mul_f32_cuda(const float * a, + } + } + ++static void rms_norm_gate_mul_f32_cuda(const float * x, ++ float * dst, ++ const int ncols, ++ const int nrows, ++ const int nchannels, ++ const int nsamples, ++ const int64_t stride_row, ++ const int64_t stride_channel, ++ const int64_t stride_sample, ++ const float * mul, ++ const int64_t mul_stride_row, ++ const int64_t mul_stride_channel, ++ const int64_t mul_stride_sample, ++ const uint32_t mul_ncols, ++ const uint32_t mul_nrows, ++ const uint32_t mul_nchannels, ++ const uint32_t mul_nsamples, ++ const float * gate, ++ const int64_t gate_stride_row, ++ const int64_t gate_stride_channel, ++ const int64_t gate_stride_sample, ++ const uint32_t gate_ncols, ++ const uint32_t gate_nrows, ++ const uint32_t gate_nchannels, ++ const uint32_t gate_nsamples, ++ const float eps, ++ cudaStream_t stream) { ++ const dim3 blocks_num(nrows, nchannels, nsamples); ++ GGML_ASSERT(mul != nullptr); ++ GGML_ASSERT(gate != nullptr); ++ const uint3 mul_ncols_packed = init_fastdiv_values(mul_ncols); ++ const uint3 mul_nrows_packed = init_fastdiv_values(mul_nrows); ++ const uint3 mul_nchannels_packed = init_fastdiv_values(mul_nchannels); ++ const uint3 mul_nsamples_packed = init_fastdiv_values(mul_nsamples); ++ const uint3 gate_ncols_packed = init_fastdiv_values(gate_ncols); ++ const uint3 gate_nrows_packed = init_fastdiv_values(gate_nrows); ++ const uint3 gate_nchannels_packed = init_fastdiv_values(gate_nchannels); ++ const uint3 gate_nsamples_packed = init_fastdiv_values(gate_nsamples); ++ if (ncols < 1024) { ++ const dim3 block_dims(256, 1, 1); ++ const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream}; ++ ggml_cuda_kernel_launch(rms_norm_gate_mul_f32<256>, launch_params, ++ x, dst, ncols, stride_row, stride_channel, stride_sample, eps, ++ mul, mul_stride_row, mul_stride_channel, mul_stride_sample, ++ mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, ++ gate, gate_stride_row, gate_stride_channel, gate_stride_sample, ++ gate_ncols_packed, gate_nrows_packed, gate_nchannels_packed, gate_nsamples_packed); ++ } else { ++ const dim3 block_dims(1024, 1, 1); ++ const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params{blocks_num, block_dims, block_dims.x > WARP_SIZE ? 32 * sizeof(float) : 0, stream}; ++ ggml_cuda_kernel_launch(rms_norm_gate_mul_f32<1024>, launch_params, ++ x, dst, ncols, stride_row, stride_channel, stride_sample, eps, ++ mul, mul_stride_row, mul_stride_channel, mul_stride_sample, ++ mul_ncols_packed, mul_nrows_packed, mul_nchannels_packed, mul_nsamples_packed, ++ gate, gate_stride_row, gate_stride_channel, gate_stride_sample, ++ gate_ncols_packed, gate_nrows_packed, gate_nchannels_packed, gate_nsamples_packed); ++ } ++} ++ + static void rms_norm_back_f32_cuda(const float * grad, const float * xf, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) { + if (ncols < 1024) { + const dim3 block_dims(WARP_SIZE, 1, 1); +@@ -843,6 +991,73 @@ void ggml_cuda_op_rms_norm_pre_add_mul(ggml_backend_cuda_context & ctx, + eps, stream); + } + ++void ggml_cuda_op_rms_norm_gate_mul(ggml_backend_cuda_context & ctx, ++ ggml_tensor * rms_norm_tensor, ++ ggml_tensor * mul_tensor, ++ ggml_tensor * silu_tensor, ++ ggml_tensor * gate_mul_tensor) { ++ // mul = rms_norm(x) * w ; silu = silu(z) ; gate_mul = mul * silu ++ GGML_ASSERT(mul_tensor->src[0] == rms_norm_tensor || mul_tensor->src[1] == rms_norm_tensor); ++ GGML_ASSERT(gate_mul_tensor->src[0] == silu_tensor || gate_mul_tensor->src[1] == silu_tensor); ++ ++ const ggml_tensor * x_src = rms_norm_tensor->src[0]; ++ const ggml_tensor * w_src = (mul_tensor->src[0] == rms_norm_tensor) ? mul_tensor->src[1] : mul_tensor->src[0]; ++ const ggml_tensor * gate_src = silu_tensor->src[0]; ++ ++ float eps = 0.0f; ++ memcpy(&eps, rms_norm_tensor->op_params, sizeof(float)); ++ GGML_ASSERT(eps >= 0.0f); ++ ++ const float * x_d = (const float *) x_src->data; ++ const float * w_d = (const float *) w_src->data; ++ const float * gate_d = (const float *) gate_src->data; ++ float * dst_d = (float *) gate_mul_tensor->data; ++ cudaStream_t stream = ctx.stream(); ++ ++ GGML_ASSERT(x_src->type == GGML_TYPE_F32); ++ GGML_ASSERT(w_src->type == GGML_TYPE_F32); ++ GGML_ASSERT(gate_src->type == GGML_TYPE_F32); ++ GGML_ASSERT(rms_norm_tensor->type == GGML_TYPE_F32); ++ GGML_ASSERT(mul_tensor->type == GGML_TYPE_F32); ++ GGML_ASSERT(silu_tensor->type == GGML_TYPE_F32); ++ GGML_ASSERT(gate_mul_tensor->type == GGML_TYPE_F32); ++ ++ const int64_t ne00 = rms_norm_tensor->ne[0]; ++ const int64_t ne01 = rms_norm_tensor->ne[1]; ++ const int64_t ne02 = rms_norm_tensor->ne[2]; ++ const int64_t ne03 = rms_norm_tensor->ne[3]; ++ ++ // x (the rms-norm input) strides; cols must be contiguous ++ const size_t ts0 = ggml_type_size(x_src->type); ++ GGML_ASSERT(x_src->nb[0] == ts0); ++ const int64_t s01 = x_src->nb[1] / ts0; ++ const int64_t s02 = x_src->nb[2] / ts0; ++ const int64_t s03 = x_src->nb[3] / ts0; ++ ++ // weight (the RMS scale) strides + broadcast extents ++ const size_t ts_mul = ggml_type_size(w_src->type); ++ GGML_ASSERT(w_src->nb[0] == ts_mul); ++ const int64_t mul_s01 = w_src->nb[1] / ts_mul; ++ const int64_t mul_s02 = w_src->nb[2] / ts_mul; ++ const int64_t mul_s03 = w_src->nb[3] / ts_mul; ++ ++ // gate (the silu activation) strides + broadcast extents ++ const size_t ts_gate = ggml_type_size(gate_src->type); ++ GGML_ASSERT(gate_src->nb[0] == ts_gate); ++ const int64_t gate_s01 = gate_src->nb[1] / ts_gate; ++ const int64_t gate_s02 = gate_src->nb[2] / ts_gate; ++ const int64_t gate_s03 = gate_src->nb[3] / ts_gate; ++ ++ rms_norm_gate_mul_f32_cuda(x_d, dst_d, ++ ne00, ne01, ne02, ne03, ++ /*s00*/ s01, s02, s03, ++ w_d, /*mul_s00*/ mul_s01, mul_s02, mul_s03, ++ w_src->ne[0], w_src->ne[1], w_src->ne[2], w_src->ne[3], ++ gate_d, /*gate_s00*/ gate_s01, gate_s02, gate_s03, ++ gate_src->ne[0], gate_src->ne[1], gate_src->ne[2], gate_src->ne[3], ++ eps, stream); ++} ++ + void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * grad = dst->src[0]; // gradients + const ggml_tensor * src0f = dst->src[1]; // src0 from forward pass +diff --git a/ggml/src/ggml-cuda/norm.cuh b/ggml/src/ggml-cuda/norm.cuh +index 05396cdf0..4d6dba6fa 100644 +--- a/ggml/src/ggml-cuda/norm.cuh ++++ b/ggml/src/ggml-cuda/norm.cuh +@@ -17,6 +17,12 @@ void ggml_cuda_op_rms_norm_pre_add_mul(ggml_backend_cuda_context & ctx, + ggml_tensor * add_tensor, + ggml_tensor * rms_norm_tensor, + ggml_tensor * mul_tensor); ++void ggml_cuda_op_rms_norm_gate_mul(ggml_backend_cuda_context & ctx, ++ ggml_tensor * rms_norm_tensor, ++ ggml_tensor * mul_tensor, ++ ggml_tensor * silu_tensor, ++ ggml_tensor * gate_mul_tensor); ++ + + void ggml_cuda_op_rms_norm_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp +index 66064869e..98751f7cc 100644 +--- a/src/models/qwen35.cpp ++++ b/src/models/qwen35.cpp +@@ -1,4 +1,5 @@ + #include "models.h" ++#include + #include "llama-memory-recurrent.h" + + void llama_model_qwen35::load_arch_hparams(llama_model_loader & ml) { +@@ -251,6 +252,21 @@ ggml_tensor * llama_model_qwen35::graph::build_norm_gated( + ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer); + ggml_tensor * gated_silu = ggml_silu(ctx0, gate); + ++ // Emit the gate multiply as mul(silu(z), normalized) so the gated-DeltaNet ++ // output-norm chain forms the consecutive subgraph { SILU, RMS_NORM, MUL, MUL } ++ // that the CUDA backend fuses into one rms_norm_gate_mul kernel (the normalized ++ // intermediate then never round-trips to HBM). The gate z-projection is scheduled ++ // before the SILU, so the natural mul(normalized, silu) order leaves a GEMM ++ // between the weight MUL and the SILU and is not fusable. Multiplication is ++ // commutative, so this is bit-exact vs mul(normalized, silu). ++ // LLAMA_FUSE_GATE_RMSNORM=0 keeps the original operand order (kernel fusion off). ++ static const bool fuse_gate_rmsnorm = [] { ++ const char * e = getenv("LLAMA_FUSE_GATE_RMSNORM"); ++ return e == nullptr || atoi(e) != 0; ++ }(); ++ if (fuse_gate_rmsnorm) { ++ return ggml_mul(ctx0, gated_silu, normalized); ++ } + return ggml_mul(ctx0, normalized, gated_silu); + } + +diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp +index a79917628..071b88daa 100644 +--- a/src/models/qwen35moe.cpp ++++ b/src/models/qwen35moe.cpp +@@ -1,4 +1,5 @@ + #include "models.h" ++#include + #include "llama-memory-recurrent.h" + + void llama_model_qwen35moe::load_arch_hparams(llama_model_loader & ml) { +@@ -275,6 +276,21 @@ ggml_tensor * llama_model_qwen35moe::graph::build_norm_gated( + ggml_tensor * normalized = build_norm(input, weights, nullptr, LLM_NORM_RMS, layer); + ggml_tensor * gated_silu = ggml_silu(ctx0, gate); + ++ // Emit the gate multiply as mul(silu(z), normalized) so the gated-DeltaNet ++ // output-norm chain forms the consecutive subgraph { SILU, RMS_NORM, MUL, MUL } ++ // that the CUDA backend fuses into one rms_norm_gate_mul kernel (the normalized ++ // intermediate then never round-trips to HBM). The gate z-projection is scheduled ++ // before the SILU, so the natural mul(normalized, silu) order leaves a GEMM ++ // between the weight MUL and the SILU and is not fusable. Multiplication is ++ // commutative, so this is bit-exact vs mul(normalized, silu). ++ // LLAMA_FUSE_GATE_RMSNORM=0 keeps the original operand order (kernel fusion off). ++ static const bool fuse_gate_rmsnorm = [] { ++ const char * e = getenv("LLAMA_FUSE_GATE_RMSNORM"); ++ return e == nullptr || atoi(e) != 0; ++ }(); ++ if (fuse_gate_rmsnorm) { ++ return ggml_mul(ctx0, gated_silu, normalized); ++ } + return ggml_mul(ctx0, normalized, gated_silu); + } + +-- +2.43.0 +