diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 2ae599ded..5cfc81e70 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -284,6 +284,12 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const
     data["ignore_eos"] = predict->ignoreeos();
     data["embeddings"] = predict->embeddings();
 
+    // Speculative decoding per-request overrides
+    // NDraft maps to speculative.n_max (maximum draft tokens per speculation step)
+    if (predict->ndraft() > 0) {
+        data["speculative.n_max"] = predict->ndraft();
+    }
+
     // Add the correlationid to json data
     data["correlation_id"] = predict->correlationid();
 
@@ -402,6 +408,16 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
     if (!request->mmproj().empty()) {
         params.mmproj.path = request->mmproj();
     }
+
+    // Draft model for speculative decoding
+    if (!request->draftmodel().empty()) {
+        params.speculative.mparams_dft.path = request->draftmodel();
+        // Default to draft type if a draft model is set but no explicit type
+        if (params.speculative.type == COMMON_SPECULATIVE_TYPE_NONE) {
+            params.speculative.type = COMMON_SPECULATIVE_TYPE_DRAFT;
+        }
+    }
+
     // params.model_alias ??
     params.model_alias.insert(request->modelfile());
     if (!request->cachetypekey().empty()) {
@@ -609,6 +625,48 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
                 // If conversion fails, keep default value (8)
             }
         }
+        // Speculative decoding options
+        } else if (!strcmp(optname, "spec_type") || !strcmp(optname, "speculative_type")) {
+            auto type = common_speculative_type_from_name(optval_str);
+            if (type != COMMON_SPECULATIVE_TYPE_COUNT) {
+                params.speculative.type = type;
+            }
+        } else if (!strcmp(optname, "spec_n_max") || !strcmp(optname, "draft_max")) {
+            if (optval != NULL) {
+                try { params.speculative.n_max = std::stoi(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "spec_n_min") || !strcmp(optname, "draft_min")) {
+            if (optval != NULL) {
+                try { params.speculative.n_min = std::stoi(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "spec_p_min") || !strcmp(optname, "draft_p_min")) {
+            if (optval != NULL) {
+                try { params.speculative.p_min = std::stof(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "spec_p_split")) {
+            if (optval != NULL) {
+                try { params.speculative.p_split = std::stof(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "spec_ngram_size_n") || !strcmp(optname, "ngram_size_n")) {
+            if (optval != NULL) {
+                try { params.speculative.ngram_size_n = (uint16_t)std::stoi(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "spec_ngram_size_m") || !strcmp(optname, "ngram_size_m")) {
+            if (optval != NULL) {
+                try { params.speculative.ngram_size_m = (uint16_t)std::stoi(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "spec_ngram_min_hits") || !strcmp(optname, "ngram_min_hits")) {
+            if (optval != NULL) {
+                try { params.speculative.ngram_min_hits = (uint16_t)std::stoi(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "draft_gpu_layers")) {
+            if (optval != NULL) {
+                try { params.speculative.n_gpu_layers = std::stoi(optval_str); } catch (...) {}
+            }
+        } else if (!strcmp(optname, "draft_ctx_size")) {
+            if (optval != NULL) {
+                try { params.speculative.n_ctx = std::stoi(optval_str); } catch (...) {}
+            }
         }
     }
 }
diff --git a/docs/content/advanced/model-configuration.md b/docs/content/advanced/model-configuration.md
index dafd9f0da..57e36322a 100644
--- a/docs/content/advanced/model-configuration.md
+++ b/docs/content/advanced/model-configuration.md
@@ -189,8 +189,8 @@ These settings apply to most LLM backends (llama.cpp, vLLM, etc.):
 | Field | Type | Description |
 |-------|------|-------------|
 | `no_mulmatq` | bool | Disable matrix multiplication queuing |
-| `draft_model` | string | Draft model for speculative decoding |
-| `n_draft` | int32 | Number of draft tokens |
+| `draft_model` | string | Draft model GGUF file for speculative decoding (see [Speculative Decoding](#speculative-decoding)) |
+| `n_draft` | int32 | Maximum number of draft tokens per speculative step (default: 16) |
 | `quantization` | string | Quantization format |
 | `load_format` | string | Model load format |
 | `numa` | bool | Enable NUMA (Non-Uniform Memory Access) |
@@ -211,6 +211,76 @@ YARN (Yet Another RoPE extensioN) settings for context extension:
 | `yarn_beta_fast` | float32 | YARN beta fast parameter |
 | `yarn_beta_slow` | float32 | YARN beta slow parameter |
 
+### Speculative Decoding
+
+Speculative decoding speeds up text generation by drafting several tokens ahead and verifying them in a single forward pass of the main model. Every drafted token is checked before it is accepted, so output quality is preserved and generation simply gets faster. This feature is only available with the `llama-cpp` backend.
+
+There are two approaches:
+
+#### Draft Model Speculative Decoding
+
+Uses a smaller, faster model from the same model family to draft candidate tokens, which the main model then verifies. Requires a separate GGUF file for the draft model.
+
+```yaml
+name: my-model
+backend: llama-cpp
+parameters:
+  model: large-model.gguf
+draft_model: small-draft-model.gguf
+n_draft: 8
+options:
+  - spec_p_min:0.8
+  - draft_gpu_layers:99
+```
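+
+An EAGLE3 draft model is configured the same way; the speculative type can be selected explicitly through the `spec_type` option documented below. A minimal sketch (the file names are placeholders, not shipped models):
+
+```yaml
+# placeholder file names: substitute any EAGLE3-compatible main/draft pair
+name: my-model
+backend: llama-cpp
+parameters:
+  model: large-model.gguf
+draft_model: eagle3-draft.gguf
+options:
+  - spec_type:eagle3
+```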
+
+#### N-gram Self-Speculative Decoding
+
+Uses patterns from the token history to predict future tokens, with no extra model required. Works well for repetitive or structured output such as code, JSON, or lists.
+
+```yaml
+name: my-model
+backend: llama-cpp
+parameters:
+  model: my-model.gguf
+options:
+  - spec_type:ngram_simple
+  - spec_n_max:16
+```
+
+#### Speculative Decoding Options
+
+These options are set via the `options:` array in the model configuration (format: `key:value`):
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `spec_type` | string | `none` | Speculative decoding type (see table below) |
+| `spec_n_max` / `draft_max` | int | 16 | Maximum number of tokens to draft per step |
+| `spec_n_min` / `draft_min` | int | 0 | Minimum number of drafted tokens required to use speculation |
+| `spec_p_min` / `draft_p_min` | float | 0.75 | Minimum probability threshold for greedy acceptance |
+| `spec_p_split` | float | 0.1 | Split probability for tree-based branching |
+| `spec_ngram_size_n` / `ngram_size_n` | int | 12 | N-gram lookup size |
+| `spec_ngram_size_m` / `ngram_size_m` | int | 48 | M-gram proposal size |
+| `spec_ngram_min_hits` / `ngram_min_hits` | int | 1 | Minimum hits required to accept an n-gram proposal |
+| `draft_gpu_layers` | int | -1 | GPU layers for the draft model (-1 = use default) |
+| `draft_ctx_size` | int | 0 | Context size for the draft model (0 = auto) |
+
+#### Speculative Type Values
+
+| Type | Description |
+|------|-------------|
+| `none` | No speculative decoding (default) |
+| `draft` | Draft model-based speculation (set automatically when `draft_model` is configured) |
+| `eagle3` | EAGLE3 draft model architecture |
+| `ngram_simple` | Simple self-speculation using the token history |
+| `ngram_map_k` | N-gram with key-only map |
+| `ngram_map_k4v` | N-gram with keys and 4 m-gram values |
+| `ngram_mod` | Modified n-gram speculation |
+| `ngram_cache` | 3-level n-gram cache |
+
+{{% notice note %}}
+Speculative decoding is automatically disabled when multimodal models (with `mmproj`) are active. The `n_draft` parameter can also be overridden per request.
+{{% /notice %}}
+
 ### Prompt Caching
 
 | Field | Type | Description |