diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp
index 6c4fa6946..ce6b47740 100644
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -688,6 +688,136 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
                 // If conversion fails, keep default value (8)
             }
         }
+
+    // --- physical batch size (upstream -ub / --ubatch-size) ---
+    // Note: line ~482 already aliases n_ubatch to n_batch as a default; this
+    // option lets users decouple the two (useful for embeddings/rerank).
+    } else if (!strcmp(optname, "n_ubatch") || !strcmp(optname, "ubatch")) {
+        if (optval != NULL) {
+            try { params.n_ubatch = std::stoi(optval_str); } catch (...) {}
+        }
+
+    // --- main-model batch threads (upstream -tb / --threads-batch) ---
+    } else if (!strcmp(optname, "threads_batch") || !strcmp(optname, "n_threads_batch")) {
+        if (optval != NULL) {
+            try {
+                int n = std::stoi(optval_str);
+                if (n <= 0) n = (int)std::thread::hardware_concurrency();
+                params.cpuparams_batch.n_threads = n;
+            } catch (...) {}
+        }
+
+    // --- pooling type for embeddings (upstream --pooling) ---
+    } else if (!strcmp(optname, "pooling_type") || !strcmp(optname, "pooling")) {
+        if (optval != NULL) {
+            if (optval_str == "none") params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            else if (optval_str == "mean") params.pooling_type = LLAMA_POOLING_TYPE_MEAN;
+            else if (optval_str == "cls") params.pooling_type = LLAMA_POOLING_TYPE_CLS;
+            else if (optval_str == "last") params.pooling_type = LLAMA_POOLING_TYPE_LAST;
+            else if (optval_str == "rank") params.pooling_type = LLAMA_POOLING_TYPE_RANK;
+            // unknown values silently leave UNSPECIFIED (auto-detect)
+        }
+
+    // --- llama log verbosity threshold (upstream -lv / --verbosity) ---
+    } else if (!strcmp(optname, "verbosity")) {
+        if (optval != NULL) {
+            try { params.verbosity = std::stoi(optval_str); } catch (...) {}
+        }
+
+    // --- O_DIRECT model loading (upstream --direct-io) ---
+    } else if (!strcmp(optname, "direct_io") || !strcmp(optname, "use_direct_io")) {
+        if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+            params.use_direct_io = true;
+        } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+            params.use_direct_io = false;
+        }
+
+    // --- embedding normalization (upstream --embd-normalize) ---
+    // -1 none, 0 max-abs, 1 taxicab, 2 L2 (default), >2 p-norm
+    } else if (!strcmp(optname, "embd_normalize") || !strcmp(optname, "embedding_normalize")) {
+        if (optval != NULL) {
+            try { params.embd_normalize = std::stoi(optval_str); } catch (...) {}
+        }
+
+    // --- reasoning parser (upstream --reasoning-format) ---
+    // Picks the parser for `<think>` blocks emitted by reasoning models.
+    // none / auto / deepseek / deepseek-legacy
+    } else if (!strcmp(optname, "reasoning_format")) {
+        if (optval != NULL) {
+            if (optval_str == "none") params.reasoning_format = COMMON_REASONING_FORMAT_NONE;
+            else if (optval_str == "auto") params.reasoning_format = COMMON_REASONING_FORMAT_AUTO;
+            else if (optval_str == "deepseek") params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+            else if (optval_str == "deepseek-legacy" || optval_str == "deepseek_legacy")
+                params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
+            // unknown values silently keep the upstream default (DEEPSEEK)
+        }
+
+    // --- reasoning budget (upstream --reasoning-budget) ---
+    // -1 unlimited, 0 disabled, >0 token budget for thinking blocks.
+    // Distinct from per-request `enable_thinking` (chat_template_kwargs).
+    } else if (!strcmp(optname, "enable_reasoning") || !strcmp(optname, "reasoning_budget")) {
+        if (optval != NULL) {
+            try { params.reasoning_budget = std::stoi(optval_str); } catch (...) {}
+        }
+
+    // --- prefill assistant turn (upstream --no-prefill-assistant) ---
+    } else if (!strcmp(optname, "prefill_assistant")) {
+        if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+            params.prefill_assistant = true;
+        } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+            params.prefill_assistant = false;
+        }
+
+    // --- mmproj GPU offload (upstream --no-mmproj-offload, inverted) ---
+    } else if (!strcmp(optname, "mmproj_use_gpu") || !strcmp(optname, "mmproj_offload")) {
+        if (optval_str == "true" || optval_str == "1" || optval_str == "yes" || optval_str == "on" || optval_str == "enabled") {
+            params.mmproj_use_gpu = true;
+        } else if (optval_str == "false" || optval_str == "0" || optval_str == "no" || optval_str == "off" || optval_str == "disabled") {
+            params.mmproj_use_gpu = false;
+        }
+
+    // --- per-image vision token budget (upstream --image-min/max-tokens) ---
+    } else if (!strcmp(optname, "image_min_tokens")) {
+        if (optval != NULL) {
+            try { params.image_min_tokens = std::stoi(optval_str); } catch (...) {}
+        }
+    } else if (!strcmp(optname, "image_max_tokens")) {
+        if (optval != NULL) {
+            try { params.image_max_tokens = std::stoi(optval_str); } catch (...) {}
+        }
+
+    // --- main-model tensor buffer overrides (upstream --override-tensor) ---
+    // Format: <tensor-name-pattern>=<buffer-type>,<tensor-name-pattern>=<buffer-type>,...
+    // Mirrors the existing `draft_override_tensor` parser below.
+    } else if (!strcmp(optname, "override_tensor") || !strcmp(optname, "tensor_buft_overrides")) {
+        ggml_backend_load_all();
+        // map buffer-type name -> buffer type, over all loaded backends
+        std::map<std::string, ggml_backend_buffer_type_t> buft_list;
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            auto * dev = ggml_backend_dev_get(i);
+            auto * buft = ggml_backend_dev_buffer_type(dev);
+            if (buft) {
+                buft_list[ggml_backend_buft_name(buft)] = buft;
+            }
+        }
+        // static std::list: the override struct stores a raw `const char *`
+        // pattern, and list nodes never move, so the pointers stay valid.
+        static std::list<std::string> override_names;
+        std::string cur;
+        auto flush = [&](const std::string & spec) {
+            auto pos = spec.find('=');
+            if (pos == std::string::npos) return;
+            const std::string name = spec.substr(0, pos);
+            const std::string type = spec.substr(pos + 1);
+            auto it = buft_list.find(type);
+            if (it == buft_list.end()) return; // unknown buffer type: ignore
+            override_names.push_back(name);
+            params.tensor_buft_overrides.push_back(
+                {override_names.back().c_str(), it->second});
+        };
+        for (char c : optval_str) {
+            if (c == ',') { if (!cur.empty()) { flush(cur); cur.clear(); } }
+            else { cur.push_back(c); }
+        }
+        if (!cur.empty()) flush(cur);
+
     // Speculative decoding options
     } else if (!strcmp(optname, "spec_type") || !strcmp(optname, "speculative_type")) {
 #ifdef LOCALAI_LEGACY_LLAMA_CPP_SPEC
@@ -2808,7 +2938,9 @@ public:
             }
         }
 
-        int embd_normalize = 2; // default to Euclidean/L2 norm
+        // Honor the load-time embd_normalize set via options:embd_normalize.
+        // -1 none, 0 max-abs, 1 taxicab, 2 L2 (default), >2 p-norm.
+        int embd_normalize = params_base.embd_normalize;
         // create and queue the task
         auto rd = ctx_server.get_response_reader();
         {
diff --git a/docs/content/advanced/model-configuration.md b/docs/content/advanced/model-configuration.md
index 02aa555ce..876c9e062 100644
--- a/docs/content/advanced/model-configuration.md
+++ b/docs/content/advanced/model-configuration.md
@@ -316,23 +316,66 @@ These are set via the `options:` array in the model configuration (format: `key: value`
 
 #### Speculative Type Values
 
-| Type | Description |
-|------|-------------|
-| `none` | No speculative decoding (default) |
-| `draft` | Draft model-based speculation (auto-set when `draft_model` is configured) |
-| `eagle3` | EAGLE3 draft model architecture |
-| `ngram_simple` | Simple self-speculative using token history |
-| `ngram_map_k` | N-gram with key-only map |
-| `ngram_map_k4v` | N-gram with keys and 4 m-gram values |
-| `ngram_mod` | Modified n-gram speculation |
-| `ngram_cache` | 3-level n-gram cache |
+The canonical names match upstream llama.cpp (dash-separated). For backward compatibility LocalAI also accepts the underscore-separated forms and the bare `draft` / `eagle3` aliases.
 
-Multiple types can be chained by passing a comma-separated list to `spec_type` (e.g. `spec_type:ngram_simple,ngram_mod`). The runtime tries them in order and accepts the first proposal that meets the acceptance criteria.
+| Type | Aliases accepted | Description |
+|------|------------------|-------------|
+| `none` | | No speculative decoding (default) |
+| `draft-simple` | `draft`, `draft_simple` | Draft model-based speculation (auto-set when `draft_model` is configured) |
+| `draft-eagle3` | `eagle3`, `draft_eagle3` | EAGLE3 draft model architecture |
+| `ngram-simple` | `ngram_simple` | Simple self-speculative using token history |
+| `ngram-map-k` | `ngram_map_k` | N-gram with key-only map |
+| `ngram-map-k4v` | `ngram_map_k4v` | N-gram with keys and 4 m-gram values |
+| `ngram-mod` | `ngram_mod` | Modified n-gram speculation |
+| `ngram-cache` | `ngram_cache` | 3-level n-gram cache |
+
+Multiple types can be chained by passing a comma-separated list to `spec_type` (e.g. `spec_type:ngram-simple,ngram-mod`). The runtime tries them in order and accepts the first proposal that meets the acceptance criteria.
 
 {{% notice note %}}
 Speculative decoding is automatically disabled when multimodal models (with `mmproj`) are active. The `n_draft` parameter can also be overridden per-request.
 {{% /notice %}}
 
+### Reasoning Models (DeepSeek-R1, Qwen3, etc.)
+
+These load-time options control how the backend parses `<think>` reasoning blocks and how much budget the model is allowed for thinking. They are set per model via the `options:` array.
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `reasoning_format` | string | `deepseek` | Parser for reasoning/thinking blocks. One of `none`, `auto`, `deepseek`, `deepseek-legacy` (alias `deepseek_legacy`). |
+| `enable_reasoning` / `reasoning_budget` | int | `-1` | Reasoning budget in tokens: `-1` unlimited, `0` disabled, `>0` token cap for the thinking section. |
+| `prefill_assistant` | bool | `true` | When `false`, the trailing assistant message is not pre-filled by the chat template. |
+
+{{% notice note %}}
+This is the load-time reasoning configuration. The orthogonal per-request `enable_thinking` chat-template kwarg (set via the YAML `reasoning.disable` field) toggles thinking on/off per call without restarting the model.
+{{% /notice %}}
+
+### Multimodal Backend Options
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `mmproj_use_gpu` / `mmproj_offload` | bool | `true` | Set `false` to keep the multimodal projector on the CPU (saves VRAM at the cost of speed). |
+| `image_min_tokens` | int | `-1` | Minimum vision tokens per image. `-1` keeps the model default. |
+| `image_max_tokens` | int | `-1` | Maximum vision tokens per image. `-1` keeps the model default. |
+
+### Embedding & Reranking Backend Options
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `pooling_type` / `pooling` | string | auto | Pooling strategy for embeddings: `none`, `mean`, `cls`, `last`, `rank`. Reranking automatically uses `rank`. |
+| `embd_normalize` / `embedding_normalize` | int | `2` | Normalization: `-1` none, `0` max-abs, `1` taxicab, `2` Euclidean (L2), `>2` p-norm. |
+
+### Other Backend Tuning Options
+
+These llama.cpp options are passed through the `options:` array.
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `n_ubatch` / `ubatch` | int | same as `batch` | Physical batch size. Decouple from `n_batch` when an embedding/rerank workload needs a different value. |
+| `threads_batch` / `n_threads_batch` | int | same as `threads` | Threads used during prompt processing. `<= 0` means `hardware_concurrency()`. |
+| `direct_io` / `use_direct_io` | bool | `false` | Open the model with `O_DIRECT` (faster cold loads on NVMe; ignored if not supported). |
+| `verbosity` | int | `3` | llama.cpp internal log verbosity threshold. Higher = more verbose. |
+| `override_tensor` / `tensor_buft_overrides` | string | `""` | Per-tensor buffer-type overrides for the main model. Format: `<tensor-name-pattern>=<buffer-type>,...`. Mirrors the existing `draft_override_tensor` syntax for the draft model. |
+
 ### Prompt Caching
 
 | Field | Type | Description |
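
---

For reviewers, a few usage sketches (not part of the diff). First, the new reasoning options as they would appear in a model file; the model name, GGUF filename, and values are illustrative placeholders, only the `options:` keys come from this change:

```yaml
# hypothetical model config: reasoning model with a capped thinking budget
name: deepseek-r1-distill
backend: llama-cpp
parameters:
  model: DeepSeek-R1-Distill-Qwen-7B.Q4_K_M.gguf
options:
- "reasoning_format:deepseek"  # parse <think> blocks with the deepseek parser
- "reasoning_budget:4096"      # token cap for thinking (-1 unlimited, 0 disabled)
- "prefill_assistant:false"    # don't pre-fill the trailing assistant turn
```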
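Second, a sketch of the embedding options working together; the model and the tuning values are assumptions, not recommendations:

```yaml
# hypothetical embedding model config
name: bge-m3
backend: llama-cpp
embeddings: true
parameters:
  model: bge-m3-Q8_0.gguf
options:
- "pooling:cls"        # CLS-token pooling instead of auto-detect
- "embd_normalize:2"   # L2 normalization (the default, shown for clarity)
- "n_ubatch:1024"      # physical batch decoupled from n_batch
- "threads_batch:0"    # <= 0 falls back to hardware_concurrency()
```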
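Third, `override_tensor`. The value is handed to llama.cpp's buffer-type matching, so upstream `--override-tensor` patterns should carry over. A speculative example that keeps MoE expert tensors in host memory; the regex and the `CPU` buffer-type name depend on the model's tensor naming and on which backends are compiled in (unknown buffer types are silently ignored, per the parser above):

```yaml
# hypothetical MoE config: expert tensors on CPU, everything else offloaded
name: qwen3-moe
backend: llama-cpp
parameters:
  model: Qwen3-30B-A3B.Q4_K_M.gguf
options:
- "override_tensor:\\.ffn_.*_exps\\.=CPU"  # <tensor-name-pattern>=<buffer-type>
```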
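Finally, the multimodal and I/O knobs; again a sketch with placeholder file names:

```yaml
# hypothetical vision model config
name: qwen2.5-vl
backend: llama-cpp
parameters:
  model: Qwen2.5-VL-7B-Instruct-Q4_K_M.gguf
mmproj: mmproj-Qwen2.5-VL-7B-Instruct-f16.gguf
options:
- "mmproj_use_gpu:false"   # keep the projector on CPU to save VRAM
- "image_max_tokens:1024"  # cap vision tokens per image
- "direct_io:true"         # O_DIRECT load; ignored where unsupported
```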