diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index e2b1f7940..3e0eeb503 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -840,6 +840,27 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt // If conversion fails, leave the per-slot cap unset (engine default) } } + // --- hybrid per-head bf16 SSM-state precision (patch 0026, qwen3.5 gated-DeltaNet decode) --- + // Opt-in reduced-precision fast mode for the recurrent SSM state: a gated-DeltaNet head whose + // memory length tau_h = 1/(|ssm_a|*softplus(ssm_dt)) tokens exceeds this threshold stays f32; + // faster-decaying heads persist their state as bf16, halving that head's dominant recurrence + // byte stream on decode. The value is the tau threshold in tokens (e.g. 32 / 64); 0 keeps every + // head f32 (the bit-exact default). Set BEFORE context init via LLAMA_SSM_BF16_TAU, consumed in + // common_context_params_to_llama (patch 0026) only when the --ssm-bf16-tau CLI flag is unset. + // Unset / non-positive => env untouched, so stock stays byte-identical and bit-exact (an + // externally exported LLAMA_SSM_BF16_TAU still works as an escape hatch). NOTE: this mode is + // NOT bit-exact (~91% same-top-p ceiling); see patches/paged/A_HYBRID_SSM_RESULTS.md. 
+ } else if (!strcmp(optname, "ssm_bf16_tau") || !strcmp(optname, "ssm_hybrid_tau")) { + if (optval != NULL) { + try { + float tau = std::stof(optval_str); + if (tau > 0.0f) { + setenv("LLAMA_SSM_BF16_TAU", std::to_string(tau).c_str(), 1); + } + } catch (const std::exception& e) { + // If conversion fails, leave the threshold unset (bit-exact f32 default) + } + } } else if (!strcmp(optname, "n_ctx_checkpoints") || !strcmp(optname, "ctx_checkpoints")) { if (optval != NULL) { try { diff --git a/backend/cpp/llama-cpp/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch b/backend/cpp/llama-cpp/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch index 1528123e2..8401fa79a 100644 --- a/backend/cpp/llama-cpp/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch +++ b/backend/cpp/llama-cpp/patches/paged/0026-qwen35-hybrid-perhead-ssm-state.patch @@ -54,13 +54,23 @@ diff --git a/common/common.cpp b/common/common.cpp index a14e7bb..c4ab884 100644 --- a/common/common.cpp +++ b/common/common.cpp -@@ -1600,6 +1600,9 @@ struct llama_context_params common_context_params_to_llama(const common_params & +@@ -1600,6 +1600,19 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.type_k = params.cache_type_k; cparams.type_v = params.cache_type_v; + cparams.type_r = params.cache_type_conv; + cparams.type_s = params.cache_type_ssm; + cparams.ssm_hybrid_tau_thresh = params.ssm_hybrid_tau_thresh; ++ // LocalAI per-model option hook: when the --ssm-bf16-tau CLI flag is at its bit-exact ++ // default (0), honor LLAMA_SSM_BF16_TAU (set by the grpc-server from the model YAML ++ // `options: [ssm_bf16_tau:N]`) so the reduced-precision hybrid fast mode is selectable ++ // per model without a process-wide CLI flag. Absent/non-positive env => untouched, so ++ // stock stays bit-exact; the CLI flag, when set, takes precedence. 
++ if (cparams.ssm_hybrid_tau_thresh == 0.0f) { ++ if (const char * tau_env = std::getenv("LLAMA_SSM_BF16_TAU")) { ++ try { cparams.ssm_hybrid_tau_thresh = std::stof(tau_env); } catch (...) {} ++ } ++ } return cparams; } diff --git a/backend/index.yaml b/backend/index.yaml index 09eb32116..2df4a5920 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -83,7 +83,10 @@ stock llama-cpp backend, with the LocalAI paged patch series applied (LLAMA_PAGED=on). Tuned for NVFP4 dense / MoE on Blackwell / GB10. Reuses the llama-cpp gRPC server sources; the paged engine is gated at runtime by the - paged_kv / max_batch_tokens model options. + paged_kv / max_batch_tokens model options. Qwen3.5 gated-DeltaNet models can + additionally opt into the reduced-precision hybrid SSM-state fast mode with + the ssm_bf16_tau:N option (default off = bit-exact f32; non-bit-exact + when enabled). urls: - https://github.com/ggerganov/llama.cpp tags: diff --git a/docs/content/features/backends.md b/docs/content/features/backends.md index 84a6650db..9efd30b97 100644 --- a/docs/content/features/backends.md +++ b/docs/content/features/backends.md @@ -125,7 +125,7 @@ For getting started, see the available backends in LocalAI here: https://github. LocalAI supports various types of backends: - **LLM Backends**: For running language models (e.g., llama.cpp, vLLM, SGLang, transformers, MLX) - - **`llama-cpp-localai-paged`**: LocalAI's paged-attention llama.cpp variant - on-demand paged KV cache plus a decode-first prefill budget, tuned for NVFP4 dense/MoE on Blackwell/GB10. Same upstream llama.cpp pin as the stock `llama-cpp` backend, reusing its gRPC server; the paged engine is enabled per-model via the `paged_kv` / `max_batch_tokens` options. + - **`llama-cpp-localai-paged`**: LocalAI's paged-attention llama.cpp variant - on-demand paged KV cache plus a decode-first prefill budget, tuned for NVFP4 dense/MoE on Blackwell/GB10. 
Same upstream llama.cpp pin as the stock `llama-cpp` backend, reusing its gRPC server; the paged engine is enabled per-model via the `paged_kv` / `max_batch_tokens` options. For Qwen3.5 gated-DeltaNet (hybrid SSM) models you can additionally set `options: [ssm_bf16_tau:N]` to enable the reduced-precision hybrid SSM-state fast mode: fast-decaying recurrent heads (memory length tau below the threshold, e.g. `32` / `64`) persist their state as bf16, halving that head's decode byte stream. Default off (`0`) keeps every head f32 and is bit-exact; when enabled the mode is **not** bit-exact (~91% same-top-p ceiling - see `backend/cpp/llama-cpp/patches/paged/A_HYBRID_SSM_RESULTS.md` for the quality/throughput profile). - **Speech-to-Text Backends**: For transcription (e.g., whisper.cpp, parakeet.cpp, faster-whisper, NeMo) - **Text-to-Speech Backends**: For speech synthesis (e.g., piper, Kokoro, VibeVoice, Qwen3-TTS) - **Sound Generation Backends**: For music and audio generation (e.g., ACE-Step) diff --git a/gallery/index.yaml b/gallery/index.yaml index d85f2ba31..f6c40c220 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -12,6 +12,15 @@ # TODO(NVFP4 read gating): NVFP4 GGUF tensor types require a llama.cpp new enough # to read them. Confirm the paged backend's pinned LLAMA_VERSION supports NVFP4 # on a GPU box before relying on these (plan section 3.4 / 4 blocker #1). +# +# NOTE(ssm_bf16_tau): Qwen3.5 gated-DeltaNet (hybrid SSM) models can opt into the +# reduced-precision hybrid SSM-state fast mode by adding `ssm_bf16_tau:N` +# (e.g. 32 / 64) to a model's `options:` list - fast-decaying recurrent heads then +# persist their state as bf16 (LLAMA_SSM_BF16_TAU), halving that head's decode byte +# stream. Default off (0) = every head f32 = bit-exact; when enabled the mode is NOT +# bit-exact (~91% same-top-p, beats vLLM dense) - see +# backend/cpp/llama-cpp/patches/paged/A_HYBRID_SSM_RESULTS.md for the quality profile. 
+# The two NVFP4 entries below intentionally stay bit-exact (no ssm_bf16_tau). # ============================================================================= - name: "qwen3.6-27b-nvfp4" url: "github:mudler/LocalAI/gallery/virtual.yaml@master"