diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index e2b1f7940..3e0eeb503 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -840,6 +840,27 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
                     // If conversion fails, leave the per-slot cap unset (engine default)
                 }
             }
+        // --- hybrid per-head bf16 SSM-state precision (patch 0026, qwen3.5 gated-DeltaNet decode) ---
+        // Opt-in reduced-precision fast mode for the recurrent SSM state: a gated-DeltaNet head whose
+        // memory length tau_h = 1/(|ssm_a|*softplus(ssm_dt)) tokens exceeds this threshold stays f32;
+        // faster-decaying heads persist their state as bf16, halving that head's dominant recurrence
+        // byte stream on decode. The value is the tau threshold in tokens (e.g. 32 / 64); 0 keeps every
+        // head f32 (the bit-exact default). Set BEFORE context init via LLAMA_SSM_BF16_TAU, consumed in
+        // common_context_params_to_llama (patch 0026) only when the --ssm-bf16-tau CLI flag is unset.
+        // Unset / non-positive => env untouched, so stock stays byte-identical and bit-exact (an
+        // externally exported LLAMA_SSM_BF16_TAU still works as an escape hatch). NOTE: this mode is
+        // NOT bit-exact (~91% same-top-p ceiling); see patches/paged/A_HYBRID_SSM_RESULTS.md.
+        } else if (!strcmp(optname, "ssm_bf16_tau") || !strcmp(optname, "ssm_hybrid_tau")) {
+            if (optval != NULL) {
+                try {
+                    float tau = std::stof(optval_str);
+                    if (tau > 0.0f) {
+                        setenv("LLAMA_SSM_BF16_TAU", std::to_string(tau).c_str(), 1);
+                    }
+                } catch (const std::exception& e) {
+                    // If conversion fails, leave the threshold unset (bit-exact f32 default)
+                }
+            }
         } else if (!strcmp(optname, "n_ctx_checkpoints") || !strcmp(optname, "ctx_checkpoints")) {
             if (optval != NULL) {
                 try {
diff --git a/backend/cpp/llama-cpp/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch b/backend/cpp/llama-cpp/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch
index 1528123e2..8401fa79a 100644
--- a/backend/cpp/llama-cpp/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch
+++ b/backend/cpp/llama-cpp/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch
@@ -54,13 +54,23 @@ diff --git a/common/common.cpp b/common/common.cpp
 index a14e7bb..c4ab884 100644
 --- a/common/common.cpp
 +++ b/common/common.cpp
-@@ -1600,6 +1600,9 @@ struct llama_context_params common_context_params_to_llama(const common_params &
+@@ -1600,6 +1600,19 @@ struct llama_context_params common_context_params_to_llama(const common_params &
  
      cparams.type_k = params.cache_type_k;
      cparams.type_v = params.cache_type_v;
 +    cparams.type_r = params.cache_type_conv;
 +    cparams.type_s = params.cache_type_ssm;
 +    cparams.ssm_hybrid_tau_thresh = params.ssm_hybrid_tau_thresh;
++    // LocalAI per-model option hook: when the --ssm-bf16-tau CLI flag is at its bit-exact
++    // default (0), honor LLAMA_SSM_BF16_TAU (set by the grpc-server from the model YAML
++    // `options: [ssm_bf16_tau:N]`) so the reduced-precision hybrid fast mode is selectable
++    // per model without a process-wide CLI flag. Absent/unparsable env => untouched, so stock
++    // stays bit-exact (any parsed value is applied as-is); the CLI flag, when set, takes precedence.
++    if (cparams.ssm_hybrid_tau_thresh == 0.0f) {
++        if (const char * tau_env = std::getenv("LLAMA_SSM_BF16_TAU")) {
++            try { cparams.ssm_hybrid_tau_thresh = std::stof(tau_env); } catch (...) {}
++        }
++    }
  
      return cparams;
  }
diff --git a/backend/index.yaml b/backend/index.yaml
index 09eb32116..2df4a5920 100644
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -83,7 +83,10 @@
     stock llama-cpp backend, with the LocalAI paged patch series applied
     (LLAMA_PAGED=on). Tuned for NVFP4 dense / MoE on Blackwell / GB10. Reuses the
     llama-cpp gRPC server sources; the paged engine is gated at runtime by the
-    paged_kv / max_batch_tokens model options.
+    paged_kv / max_batch_tokens model options. Qwen3.5 gated-DeltaNet models can
+    additionally opt into the reduced-precision hybrid SSM-state fast mode with
+    the ssm_bf16_tau:<tokens> option (default off = bit-exact f32; non-bit-exact
+    when enabled).
   urls:
     - https://github.com/ggerganov/llama.cpp
   tags:
diff --git a/docs/content/features/backends.md b/docs/content/features/backends.md
index 84a6650db..9efd30b97 100644
--- a/docs/content/features/backends.md
+++ b/docs/content/features/backends.md
@@ -125,7 +125,7 @@ For getting started, see the available backends in LocalAI here: https://github.
 LocalAI supports various types of backends:
 
 - **LLM Backends**: For running language models (e.g., llama.cpp, vLLM, SGLang, transformers, MLX)
-  - **`llama-cpp-localai-paged`**: LocalAI's paged-attention llama.cpp variant - on-demand paged KV cache plus a decode-first prefill budget, tuned for NVFP4 dense/MoE on Blackwell/GB10. Same upstream llama.cpp pin as the stock `llama-cpp` backend, reusing its gRPC server; the paged engine is enabled per-model via the `paged_kv` / `max_batch_tokens` options.
+  - **`llama-cpp-localai-paged`**: LocalAI's paged-attention llama.cpp variant - on-demand paged KV cache plus a decode-first prefill budget, tuned for NVFP4 dense/MoE on Blackwell/GB10. Same upstream llama.cpp pin as the stock `llama-cpp` backend, reusing its gRPC server; the paged engine is enabled per-model via the `paged_kv` / `max_batch_tokens` options. For Qwen3.5 gated-DeltaNet (hybrid SSM) models you can additionally set `options: [ssm_bf16_tau:<tokens>]` to enable the reduced-precision hybrid SSM-state fast mode: fast-decaying recurrent heads (memory length tau below the threshold, e.g. `32` / `64`) persist their state as bf16, halving that head's decode byte stream. Default off (`0`) keeps every head f32 and is bit-exact; when enabled the mode is **not** bit-exact (~91% same-top-p ceiling - see `backend/cpp/llama-cpp/patches/paged/A_HYBRID_SSM_RESULTS.md` for the quality/throughput profile).
 - **Speech-to-Text Backends**: For transcription (e.g., whisper.cpp, parakeet.cpp, faster-whisper, NeMo)
 - **Text-to-Speech Backends**: For speech synthesis (e.g., piper, Kokoro, VibeVoice, Qwen3-TTS)
 - **Sound Generation Backends**: For music and audio generation (e.g., ACE-Step)
diff --git a/gallery/index.yaml b/gallery/index.yaml
index d85f2ba31..f6c40c220 100644
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -12,6 +12,15 @@
 # TODO(NVFP4 read gating): NVFP4 GGUF tensor types require a llama.cpp new enough
 # to read them. Confirm the paged backend's pinned LLAMA_VERSION supports NVFP4
 # on a GPU box before relying on these (plan section 3.4 / 4 blocker #1).
+#
+# NOTE(ssm_bf16_tau): Qwen3.5 gated-DeltaNet (hybrid SSM) models can opt into the
+# reduced-precision hybrid SSM-state fast mode by adding `ssm_bf16_tau:<tokens>`
+# (e.g. 32 / 64) to a model's `options:` list - fast-decaying recurrent heads then
+# persist their state as bf16 (LLAMA_SSM_BF16_TAU), halving that head's decode byte
+# stream. Default off (0) = every head f32 = bit-exact; when enabled the mode is NOT
+# bit-exact (~91% same-top-p, beats vLLM dense) - see
+# backend/cpp/llama-cpp/patches/paged/A_HYBRID_SSM_RESULTS.md for the quality profile.
+# The two NVFP4 entries below intentionally stay bit-exact (no ssm_bf16_tau).
 # =============================================================================
 - name: "qwen3.6-27b-nvfp4"
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"