diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 2ae599ded..5cfc81e70 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -284,6 +284,12 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const
     data["ignore_eos"] = predict->ignoreeos();
     data["embeddings"] = predict->embeddings();
 
+    // Speculative decoding per-request overrides
+    // NDraft maps to speculative.n_max (maximum draft tokens per speculation step)
+    if (predict->ndraft() > 0) {
+        data["speculative.n_max"] = predict->ndraft();
+    }
+
     // Add the correlationid to json data
     data["correlation_id"] = predict->correlationid();
 
@@ -402,6 +408,16 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
     if (!request->mmproj().empty()) {
         params.mmproj.path = request->mmproj();
     }
+
+    // Draft model for speculative decoding
+    if (!request->draftmodel().empty()) {
+        params.speculative.mparams_dft.path = request->draftmodel();
+        // Default to draft type if a draft model is set but no explicit type
+        if (params.speculative.type == COMMON_SPECULATIVE_TYPE_NONE) {
+            params.speculative.type = COMMON_SPECULATIVE_TYPE_DRAFT;
+        }
+    }
+
     // params.model_alias ??
     params.model_alias.insert(request->modelfile());
     if (!request->cachetypekey().empty()) {
@@ -609,6 +625,48 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
                 // If conversion fails, keep default value (8)
             }
         }
+        // Speculative decoding options
+        } else if (!strcmp(optname, "spec_type") || !strcmp(optname, "speculative_type")) {
+            auto type = common_speculative_type_from_name(optval_str);
+            if (type != COMMON_SPECULATIVE_TYPE_COUNT) {
+                params.speculative.type = type;
+            }
+        } else if (!strcmp(optname, "spec_n_max") || !strcmp(optname, "draft_max")) {
+            if (optval != NULL) {
+                try { params.speculative.n_max = std::stoi(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "spec_n_min") || !strcmp(optname, "draft_min")) {
+            if (optval != NULL) {
+                try { params.speculative.n_min = std::stoi(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "spec_p_min") || !strcmp(optname, "draft_p_min")) {
+            if (optval != NULL) {
+                try { params.speculative.p_min = std::stof(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "spec_p_split")) {
+            if (optval != NULL) {
+                try { params.speculative.p_split = std::stof(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "spec_ngram_size_n") || !strcmp(optname, "ngram_size_n")) {
+            if (optval != NULL) {
+                try { params.speculative.ngram_size_n = (uint16_t)std::stoi(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "spec_ngram_size_m") || !strcmp(optname, "ngram_size_m")) {
+            if (optval != NULL) {
+                try { params.speculative.ngram_size_m = (uint16_t)std::stoi(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "spec_ngram_min_hits") || !strcmp(optname, "ngram_min_hits")) {
+            if (optval != NULL) {
+                try { params.speculative.ngram_min_hits = (uint16_t)std::stoi(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "draft_gpu_layers")) {
+            if (optval != NULL) {
+                try { params.speculative.n_gpu_layers = std::stoi(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "draft_ctx_size")) {
+            if (optval != NULL) {
+                try { params.speculative.n_ctx = std::stoi(optval_str); } catch (...) {}
+            }
         }
     }
 }
diff --git a/docs/content/advanced/model-configuration.md b/docs/content/advanced/model-configuration.md
index dafd9f0da..57e36322a 100644
--- a/docs/content/advanced/model-configuration.md
+++ b/docs/content/advanced/model-configuration.md
@@ -189,8 +189,8 @@ These settings apply to most LLM backends (llama.cpp, vLLM, etc.):
 | Field | Type | Description |
 |-------|------|-------------|
 | `no_mulmatq` | bool | Disable matrix multiplication queuing |
-| `draft_model` | string | Draft model for speculative decoding |
-| `n_draft` | int32 | Number of draft tokens |
+| `draft_model` | string | Draft model GGUF file for speculative decoding (see [Speculative Decoding](#speculative-decoding)) |
+| `n_draft` | int32 | Maximum number of draft tokens per speculative step (default: 16) |
 | `quantization` | string | Quantization format |
 | `load_format` | string | Model load format |
 | `numa` | bool | Enable NUMA (Non-Uniform Memory Access) |
@@ -211,6 +211,76 @@ YARN (Yet Another RoPE extensioN) settings for context extension:
 | `yarn_beta_fast` | float32 | YARN beta fast parameter |
 | `yarn_beta_slow` | float32 | YARN beta slow parameter |
 
+### Speculative Decoding
+
+Speculative decoding speeds up text generation by drafting several tokens ahead and verifying them in a single forward pass of the main model. Every drafted token is checked before it is accepted, so output quality is preserved and generation simply gets faster. This feature is only available with the `llama-cpp` backend.
+
+There are two approaches:
+
+#### Draft Model Speculative Decoding
+
+Uses a smaller, faster model from the same model family to draft candidate tokens, which the main model then verifies. Requires a separate GGUF file for the draft model.
+
+```yaml
+name: my-model
+backend: llama-cpp
+parameters:
+  model: large-model.gguf
+draft_model: small-draft-model.gguf
+n_draft: 8
+options:
+  - spec_p_min:0.8
+  - draft_gpu_layers:99
+```
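+
+An EAGLE3 draft model is configured the same way; the speculative type can be selected explicitly through the `spec_type` option documented below. A minimal sketch (the file names are placeholders, not shipped models):
+
+```yaml
+# placeholder file names: substitute any EAGLE3-compatible main/draft pair
+name: my-model
+backend: llama-cpp
+parameters:
+  model: large-model.gguf
+draft_model: eagle3-draft.gguf
+options:
+  - spec_type:eagle3
+```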
+
+#### N-gram Self-Speculative Decoding
+
+Uses patterns from the token history to predict future tokens, with no extra model required. Works well for repetitive or structured output such as code, JSON, or lists.
+
+```yaml
+name: my-model
+backend: llama-cpp
+parameters:
+  model: my-model.gguf
+options:
+  - spec_type:ngram_simple
+  - spec_n_max:16
+```
+
+#### Speculative Decoding Options
+
+These options are set via the `options:` array in the model configuration (format: `key:value`):
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `spec_type` | string | `none` | Speculative decoding type (see table below) |
+| `spec_n_max` / `draft_max` | int | 16 | Maximum number of tokens to draft per step |
+| `spec_n_min` / `draft_min` | int | 0 | Minimum number of drafted tokens required to use speculation |
+| `spec_p_min` / `draft_p_min` | float | 0.75 | Minimum probability threshold for greedy acceptance |
+| `spec_p_split` | float | 0.1 | Split probability for tree-based branching |
+| `spec_ngram_size_n` / `ngram_size_n` | int | 12 | N-gram lookup size |
+| `spec_ngram_size_m` / `ngram_size_m` | int | 48 | M-gram proposal size |
+| `spec_ngram_min_hits` / `ngram_min_hits` | int | 1 | Minimum hits required to accept an n-gram proposal |
+| `draft_gpu_layers` | int | -1 | GPU layers for the draft model (-1 = use default) |
+| `draft_ctx_size` | int | 0 | Context size for the draft model (0 = auto) |
+
+#### Speculative Type Values
+
+| Type | Description |
+|------|-------------|
+| `none` | No speculative decoding (default) |
+| `draft` | Draft model-based speculation (set automatically when `draft_model` is configured) |
+| `eagle3` | EAGLE3 draft model architecture |
+| `ngram_simple` | Simple self-speculation using the token history |
+| `ngram_map_k` | N-gram with key-only map |
+| `ngram_map_k4v` | N-gram with keys and 4 m-gram values |
+| `ngram_mod` | Modified n-gram speculation |
+| `ngram_cache` | 3-level n-gram cache |
+
+{{% notice note %}}
+Speculative decoding is automatically disabled when multimodal models (with `mmproj`) are active. The `n_draft` parameter can also be overridden per request.
+{{% /notice %}}
+
 ### Prompt Caching
 
 | Field | Type | Description |