Compare commits

...

7 Commits

Author SHA1 Message Date
LocalAI [bot]
ad232fdb1a docs: ⬆️ update docs version mudler/LocalAI (#9241)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-04-06 10:53:07 +02:00
LocalAI [bot]
11637b5a1b chore: ⬆️ Update leejet/stable-diffusion.cpp to 7397ddaa86f4e8837d5261724678cde0f36d4d89 (#9242)
⬆️ Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-04-06 10:52:51 +02:00
LocalAI [bot]
0dda4fe6f0 chore: ⬆️ Update ggml-org/llama.cpp to 761797ffdf2ce3f118e82c663b1ad7d935fbd656 (#9243)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-04-06 10:52:38 +02:00
Ettore Di Giacinto
773489eeb1 fix(chat): do not retry if we had chatdeltas or tooldeltas from backend (#9244)
* fix(chat): do not retry if we had chatdeltas or tooldeltas from backend

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: use oai compat for llama.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: apply to non-streaming path too

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* map also other fields

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-04-06 10:52:23 +02:00
Ettore Di Giacinto
06fbe48b3f feat(llama.cpp): wire speculative decoding settings (#9238)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-04-05 14:56:30 +02:00
Ettore Di Giacinto
232e324a68 fix(autoparser): correctly pass by logprobs (#9239)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-04-05 09:39:22 +02:00
ER-EPR
39c954764c Update index.yaml and add Qwen3.5 model files (#9237)
* Update index.yaml

Signed-off-by: ER-EPR <38782737+ER-EPR@users.noreply.github.com>

* Add mmproj files for Qwen3.5 models

Signed-off-by: ER-EPR <38782737+ER-EPR@users.noreply.github.com>

* Update file paths for Qwen models in index.yaml

Signed-off-by: ER-EPR <38782737+ER-EPR@users.noreply.github.com>

* Update index.yaml

Signed-off-by: ER-EPR <38782737+ER-EPR@users.noreply.github.com>

* Refactor Qwen3-Reranker-0.6B entry in index.yaml

Signed-off-by: ER-EPR <38782737+ER-EPR@users.noreply.github.com>

* Update qwen3.yaml configuration parameters

Signed-off-by: ER-EPR <38782737+ER-EPR@users.noreply.github.com>

---------

Signed-off-by: ER-EPR <38782737+ER-EPR@users.noreply.github.com>
2026-04-05 09:21:21 +02:00
12 changed files with 704 additions and 50 deletions

View File

@@ -1,5 +1,5 @@
LLAMA_VERSION?=b8635075ffe27b135c49afb9a8b5c434bd42c502
LLAMA_VERSION?=761797ffdf2ce3f118e82c663b1ad7d935fbd656
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=

View File

@@ -284,6 +284,12 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const
data["ignore_eos"] = predict->ignoreeos();
data["embeddings"] = predict->embeddings();
// Speculative decoding per-request overrides
// NDraft maps to speculative.n_max (maximum draft tokens per speculation step)
if (predict->ndraft() > 0) {
data["speculative.n_max"] = predict->ndraft();
}
// Add the correlationid to json data
data["correlation_id"] = predict->correlationid();
@@ -402,6 +408,16 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
if (!request->mmproj().empty()) {
params.mmproj.path = request->mmproj();
}
// Draft model for speculative decoding
if (!request->draftmodel().empty()) {
params.speculative.mparams_dft.path = request->draftmodel();
// Default to draft type if a draft model is set but no explicit type
if (params.speculative.type == COMMON_SPECULATIVE_TYPE_NONE) {
params.speculative.type = COMMON_SPECULATIVE_TYPE_DRAFT;
}
}
// params.model_alias ??
params.model_alias.insert(request->modelfile());
if (!request->cachetypekey().empty()) {
@@ -609,6 +625,48 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
// If conversion fails, keep default value (8)
}
}
// Speculative decoding options
} else if (!strcmp(optname, "spec_type") || !strcmp(optname, "speculative_type")) {
auto type = common_speculative_type_from_name(optval_str);
if (type != COMMON_SPECULATIVE_TYPE_COUNT) {
params.speculative.type = type;
}
} else if (!strcmp(optname, "spec_n_max") || !strcmp(optname, "draft_max")) {
if (optval != NULL) {
try { params.speculative.n_max = std::stoi(optval_str); } catch (...) {}
}
} else if (!strcmp(optname, "spec_n_min") || !strcmp(optname, "draft_min")) {
if (optval != NULL) {
try { params.speculative.n_min = std::stoi(optval_str); } catch (...) {}
}
} else if (!strcmp(optname, "spec_p_min") || !strcmp(optname, "draft_p_min")) {
if (optval != NULL) {
try { params.speculative.p_min = std::stof(optval_str); } catch (...) {}
}
} else if (!strcmp(optname, "spec_p_split")) {
if (optval != NULL) {
try { params.speculative.p_split = std::stof(optval_str); } catch (...) {}
}
} else if (!strcmp(optname, "spec_ngram_size_n") || !strcmp(optname, "ngram_size_n")) {
if (optval != NULL) {
try { params.speculative.ngram_size_n = (uint16_t)std::stoi(optval_str); } catch (...) {}
}
} else if (!strcmp(optname, "spec_ngram_size_m") || !strcmp(optname, "ngram_size_m")) {
if (optval != NULL) {
try { params.speculative.ngram_size_m = (uint16_t)std::stoi(optval_str); } catch (...) {}
}
} else if (!strcmp(optname, "spec_ngram_min_hits") || !strcmp(optname, "ngram_min_hits")) {
if (optval != NULL) {
try { params.speculative.ngram_min_hits = (uint16_t)std::stoi(optval_str); } catch (...) {}
}
} else if (!strcmp(optname, "draft_gpu_layers")) {
if (optval != NULL) {
try { params.speculative.n_gpu_layers = std::stoi(optval_str); } catch (...) {}
}
} else if (!strcmp(optname, "draft_ctx_size")) {
if (optval != NULL) {
try { params.speculative.n_ctx = std::stoi(optval_str); } catch (...) {}
}
}
}
@@ -1251,6 +1309,7 @@ public:
body_json["messages"] = messages_json;
body_json["stream"] = true; // PredictStream is always streaming
body_json["stream_options"] = {{"include_usage", true}}; // Ensure token counts in final chunk
// Check if grammar is provided from Go layer (NoGrammar=false)
// If grammar is provided, we must use it and NOT let template generate grammar from tools
@@ -1558,8 +1617,11 @@ public:
data);
task.id_slot = json_value(data, "id_slot", -1);
// OAI-compat
task.params.res_type = TASK_RESPONSE_TYPE_NONE;
// OAI-compat: enable autoparser (PEG-based chat parsing) so that
// reasoning, tool calls, and content are classified into ChatDeltas.
// Without this, the PEG parser never produces diffs and the Go side
// cannot detect tool calls or separate reasoning from content.
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
task.params.oaicompat_cmpl_id = completion_id;
// oaicompat_model is already populated by params_from_json_cmpl
@@ -1584,19 +1646,47 @@ public:
return grpc::Status(grpc::StatusCode::INTERNAL, error_json.value("message", "Error occurred"));
}
// Lambda to build a Reply from JSON + attach chat deltas from a result
// Lambda to build a Reply from JSON + attach chat deltas from a result.
// Handles both native format ({"content": "..."}) and OAI chat format
// ({"choices": [{"delta": {"content": "...", "reasoning": "..."}}]}).
auto build_reply_from_json = [](const json & res_json, server_task_result * raw_result) -> backend::Reply {
backend::Reply reply;
std::string completion_text = res_json.value("content", "");
reply.set_message(completion_text);
reply.set_tokens(res_json.value("tokens_predicted", 0));
reply.set_prompt_tokens(res_json.value("tokens_evaluated", 0));
std::string completion_text;
if (res_json.contains("choices")) {
// OAI chat format — extract content from choices[0].delta
const auto & choices = res_json.at("choices");
if (!choices.empty()) {
const auto & delta = choices[0].value("delta", json::object());
if (delta.contains("content") && !delta.at("content").is_null()) {
completion_text = delta.at("content").get<std::string>();
}
}
} else {
// Native llama.cpp format
completion_text = res_json.value("content", "");
}
reply.set_message(completion_text);
// Token counts: native format has top-level fields,
// OAI format has them in "usage" (final chunk only)
if (res_json.contains("usage")) {
const auto & usage = res_json.at("usage");
reply.set_tokens(usage.value("completion_tokens", 0));
reply.set_prompt_tokens(usage.value("prompt_tokens", 0));
} else {
reply.set_tokens(res_json.value("tokens_predicted", 0));
reply.set_prompt_tokens(res_json.value("tokens_evaluated", 0));
}
// Timings: present as top-level "timings" in both formats
if (res_json.contains("timings")) {
reply.set_timing_prompt_processing(res_json.at("timings").value("prompt_ms", 0.0));
reply.set_timing_token_generation(res_json.at("timings").value("predicted_ms", 0.0));
}
// Logprobs: extract_logprobs_from_json handles both formats
json logprobs_json = extract_logprobs_from_json(res_json);
if (!logprobs_json.empty() && !logprobs_json.is_null()) {
reply.set_logprobs(logprobs_json.dump());
@@ -1605,21 +1695,17 @@ public:
return reply;
};
// Attach chat deltas from the autoparser to a Reply.
// When diffs are available, populate ChatDeltas on the reply.
// The raw message is always preserved so the Go side can use it
// for reasoning extraction and tool call parsing as a fallback
// (important in distributed mode where ChatDeltas may not be
// the primary parsing path).
auto attach_chat_deltas = [](backend::Reply & reply, server_task_result * raw_result) {
// Try streaming partial result first
auto* partial = dynamic_cast<server_task_result_cmpl_partial*>(raw_result);
if (partial) {
if (!partial->oaicompat_msg_diffs.empty()) {
populate_chat_deltas_from_diffs(reply, partial->oaicompat_msg_diffs);
} else if (partial->is_updated) {
// Autoparser is active but hasn't classified this chunk yet
// (PEG parser warming up). Clear the raw message so the Go
// side doesn't try to parse partial tag tokens (e.g. "<|channel>"
// before the full "<|channel>thought\n" is received).
// This matches llama.cpp server behavior which only emits SSE
// chunks when the parser produces diffs.
reply.set_message("");
}
if (partial && !partial->oaicompat_msg_diffs.empty()) {
populate_chat_deltas_from_diffs(reply, partial->oaicompat_msg_diffs);
return;
}
// Try final result
@@ -2299,8 +2385,9 @@ public:
data);
task.id_slot = json_value(data, "id_slot", -1);
// OAI-compat
task.params.res_type = TASK_RESPONSE_TYPE_NONE;
// OAI-compat: enable autoparser (PEG-based chat parsing) so that
// reasoning, tool calls, and content are classified into ChatDeltas.
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
task.params.oaicompat_cmpl_id = completion_id;
// oaicompat_model is already populated by params_from_json_cmpl
@@ -2331,25 +2418,48 @@ public:
auto* final_res = dynamic_cast<server_task_result_cmpl_final*>(all_results.results[0].get());
GGML_ASSERT(final_res != nullptr);
json result_json = all_results.results[0]->to_json();
reply->set_message(result_json.value("content", ""));
int32_t tokens_predicted = result_json.value("tokens_predicted", 0);
// Handle both native format ({"content": "...", "tokens_predicted": N})
// and OAI chat format ({"choices": [{"message": {"content": "..."}}],
// "usage": {"completion_tokens": N, "prompt_tokens": N}}).
std::string completion_text;
int32_t tokens_predicted = 0;
int32_t tokens_evaluated = 0;
if (result_json.contains("choices")) {
// OAI chat format
const auto & choices = result_json.at("choices");
if (!choices.empty()) {
const auto & msg = choices[0].value("message", json::object());
if (msg.contains("content") && !msg.at("content").is_null()) {
completion_text = msg.at("content").get<std::string>();
}
}
if (result_json.contains("usage")) {
const auto & usage = result_json.at("usage");
tokens_predicted = usage.value("completion_tokens", 0);
tokens_evaluated = usage.value("prompt_tokens", 0);
}
} else {
// Native llama.cpp format
completion_text = result_json.value("content", "");
tokens_predicted = result_json.value("tokens_predicted", 0);
tokens_evaluated = result_json.value("tokens_evaluated", 0);
}
reply->set_message(completion_text);
reply->set_tokens(tokens_predicted);
int32_t tokens_evaluated = result_json.value("tokens_evaluated", 0);
reply->set_prompt_tokens(tokens_evaluated);
// Timings: present in both formats as a top-level "timings" object
if (result_json.contains("timings")) {
double timing_prompt_processing = result_json.at("timings").value("prompt_ms", 0.0);
reply->set_timing_prompt_processing(timing_prompt_processing);
double timing_token_generation = result_json.at("timings").value("predicted_ms", 0.0);
reply->set_timing_token_generation(timing_token_generation);
reply->set_timing_prompt_processing(result_json.at("timings").value("prompt_ms", 0.0));
reply->set_timing_token_generation(result_json.at("timings").value("predicted_ms", 0.0));
}
// Extract and set logprobs if present
// Logprobs: extract_logprobs_from_json handles both formats
json logprobs_json = extract_logprobs_from_json(result_json);
if (!logprobs_json.empty() && !logprobs_json.is_null()) {
std::string logprobs_str = logprobs_json.dump();
reply->set_logprobs(logprobs_str);
reply->set_logprobs(logprobs_json.dump());
}
// Populate chat deltas from the autoparser's final parsed message
@@ -2365,7 +2475,20 @@ public:
for (auto & res : all_results.results) {
GGML_ASSERT(dynamic_cast<server_task_result_cmpl_final*>(res.get()) != nullptr);
json res_json = res->to_json();
arr.push_back(res_json.value("content", ""));
// Handle both native and OAI chat formats
std::string result_content;
if (res_json.contains("choices")) {
const auto & choices = res_json.at("choices");
if (!choices.empty()) {
const auto & msg = choices[0].value("message", json::object());
if (msg.contains("content") && !msg.at("content").is_null()) {
result_content = msg.at("content").get<std::string>();
}
}
} else {
result_content = res_json.value("content", "");
}
arr.push_back(result_content);
// Extract logprobs for each result
json logprobs_json = extract_logprobs_from_json(res_json);

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=87ecb95cbc65dc8e58e3d88f4f4a59a0939796f5
STABLEDIFFUSION_GGML_VERSION?=7397ddaa86f4e8837d5261724678cde0f36d4d89
CMAKE_ARGS+=-DGGML_MAX_NAME=128

View File

@@ -147,10 +147,23 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
result := ""
lastEmittedCount := 0
sentInitialRole := false
hasChatDeltaToolCalls := false
hasChatDeltaContent := false
_, tokenUsage, chatDeltas, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
result += s
// Track whether ChatDeltas from the C++ autoparser contain
// tool calls or content, so the retry decision can account for them.
for _, d := range usage.ChatDeltas {
if len(d.ToolCalls) > 0 {
hasChatDeltaToolCalls = true
}
if d.Content != "" {
hasChatDeltaContent = true
}
}
var reasoningDelta, contentDelta string
goReasoning, goContent := extractor.ProcessToken(s)
@@ -309,15 +322,22 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
// After streaming completes: check if we got actionable content
cleaned := extractor.CleanedContent()
// Check for tool calls from chat deltas (will be re-checked after ComputeChoices,
// but we need to know here whether to retry)
hasToolCalls := lastEmittedCount > 0
if cleaned == "" && !hasToolCalls {
// but we need to know here whether to retry).
// Also check ChatDelta flags — when the C++ autoparser is active,
// tool calls and content are delivered via ChatDeltas while the
// raw message is cleared. Without this check, we'd retry
// unnecessarily, losing valid results and concatenating output.
hasToolCalls := lastEmittedCount > 0 || hasChatDeltaToolCalls
hasContent := cleaned != "" || hasChatDeltaContent
if !hasContent && !hasToolCalls {
xlog.Warn("Streaming: backend produced only reasoning, retrying",
"reasoning_len", len(extractor.Reasoning()), "attempt", attempt+1)
extractor.ResetAndSuppressReasoning()
result = ""
lastEmittedCount = 0
sentInitialRole = false
hasChatDeltaToolCalls = false
hasChatDeltaContent = false
return true
}
return false
@@ -1006,7 +1026,12 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
if deltaReasoning != "" {
message.Reasoning = &deltaReasoning
}
result = []schema.Choice{{FinishReason: &stopReason, Index: 0, Message: message}}
newChoice := schema.Choice{FinishReason: &stopReason, Index: 0, Message: message}
// Preserve logprobs from the original result
if len(result) > 0 && result[0].Logprobs != nil {
newChoice.Logprobs = result[0].Logprobs
}
result = []schema.Choice{newChoice}
}
}

View File

@@ -113,11 +113,23 @@ func ComputeChoices(
}
prediction = p
// Built-in: retry on truly empty response (no tokens at all)
// Built-in: retry on truly empty response (no tokens at all).
// However, when the C++ autoparser is active, it clears the raw
// message and delivers content via ChatDeltas instead. Do NOT
// retry if ChatDeltas contain tool calls or content.
if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries {
xlog.Warn("Backend returned empty response, retrying",
"attempt", attempt+1, "maxRetries", maxRetries)
continue
hasChatDeltaData := false
for _, d := range prediction.ChatDeltas {
if d.Content != "" || len(d.ToolCalls) > 0 {
hasChatDeltaData = true
break
}
}
if !hasChatDeltaData {
xlog.Warn("Backend returned empty response, retrying",
"attempt", attempt+1, "maxRetries", maxRetries)
continue
}
}
tokenUsage.Prompt = prediction.Usage.Prompt
@@ -130,8 +142,21 @@ func ComputeChoices(
finetunedResponse := backend.Finetune(*config, predInput, prediction.Response)
cb(finetunedResponse, &result)
// Caller-driven retry (tool parsing, reasoning-only, etc.)
if shouldRetryFn != nil && shouldRetryFn(attempt) && attempt < maxRetries {
// Caller-driven retry (tool parsing, reasoning-only, etc.).
// When the C++ autoparser is active, it clears the raw response
// and delivers data via ChatDeltas. If the response is empty but
// ChatDeltas contain actionable data, skip the caller retry —
// the autoparser already parsed the response successfully.
skipCallerRetry := false
if strings.TrimSpace(prediction.Response) == "" && len(prediction.ChatDeltas) > 0 {
for _, d := range prediction.ChatDeltas {
if d.Content != "" || len(d.ToolCalls) > 0 {
skipCallerRetry = true
break
}
}
}
if shouldRetryFn != nil && !skipCallerRetry && shouldRetryFn(attempt) && attempt < maxRetries {
// Caller has already reset its state inside shouldRetry
result = result[:0]
allChatDeltas = nil

View File

@@ -189,8 +189,8 @@ These settings apply to most LLM backends (llama.cpp, vLLM, etc.):
| Field | Type | Description |
|-------|------|-------------|
| `no_mulmatq` | bool | Disable matrix multiplication queuing |
| `draft_model` | string | Draft model for speculative decoding |
| `n_draft` | int32 | Number of draft tokens |
| `draft_model` | string | Draft model GGUF file for speculative decoding (see [Speculative Decoding](#speculative-decoding)) |
| `n_draft` | int32 | Maximum number of draft tokens per speculative step (default: 16) |
| `quantization` | string | Quantization format |
| `load_format` | string | Model load format |
| `numa` | bool | Enable NUMA (Non-Uniform Memory Access) |
@@ -211,6 +211,76 @@ YARN (Yet Another RoPE extensioN) settings for context extension:
| `yarn_beta_fast` | float32 | YARN beta fast parameter |
| `yarn_beta_slow` | float32 | YARN beta slow parameter |
### Speculative Decoding
Speculative decoding speeds up text generation by predicting multiple tokens ahead and verifying them in a single forward pass. The output is identical to normal decoding — only faster. This feature is only available with the `llama-cpp` backend.
There are two approaches:
#### Draft Model Speculative Decoding
Uses a smaller, faster model from the same model family to draft candidate tokens, which the main model then verifies. Requires a separate GGUF file for the draft model.
```yaml
name: my-model
backend: llama-cpp
parameters:
model: large-model.gguf
draft_model: small-draft-model.gguf
n_draft: 8
options:
- spec_p_min:0.8
- draft_gpu_layers:99
```
#### N-gram Self-Speculative Decoding
Uses patterns from the token history to predict future tokens — no extra model required. Works well for repetitive or structured output (code, JSON, lists).
```yaml
name: my-model
backend: llama-cpp
parameters:
model: my-model.gguf
options:
- spec_type:ngram_simple
- spec_n_max:16
```
#### Speculative Decoding Options
These are set via the `options:` array in the model configuration (format: `key:value`):
| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `spec_type` | string | `none` | Speculative decoding type (see table below) |
| `spec_n_max` / `draft_max` | int | 16 | Maximum number of tokens to draft per step |
| `spec_n_min` / `draft_min` | int | 0 | Minimum draft tokens required to use speculation |
| `spec_p_min` / `draft_p_min` | float | 0.75 | Minimum probability threshold for greedy acceptance |
| `spec_p_split` | float | 0.1 | Split probability for tree-based branching |
| `spec_ngram_size_n` / `ngram_size_n` | int | 12 | N-gram lookup size |
| `spec_ngram_size_m` / `ngram_size_m` | int | 48 | M-gram proposal size |
| `spec_ngram_min_hits` / `ngram_min_hits` | int | 1 | Minimum hits for accepting n-gram proposals |
| `draft_gpu_layers` | int | -1 | GPU layers for the draft model (-1 = use default) |
| `draft_ctx_size` | int | 0 | Context size for the draft model (0 = auto) |
#### Speculative Type Values
| Type | Description |
|------|-------------|
| `none` | No speculative decoding (default) |
| `draft` | Draft model-based speculation (auto-set when `draft_model` is configured) |
| `eagle3` | EAGLE3 draft model architecture |
| `ngram_simple` | Simple self-speculative using token history |
| `ngram_map_k` | N-gram with key-only map |
| `ngram_map_k4v` | N-gram with keys and 4 m-gram values |
| `ngram_mod` | Modified n-gram speculation |
| `ngram_cache` | 3-level n-gram cache |
{{% notice note %}}
Speculative decoding is automatically disabled when multimodal models (with `mmproj`) are active. The `n_draft` parameter can also be overridden per-request.
{{% /notice %}}
### Prompt Caching
| Field | Type | Description |

View File

@@ -1,3 +1,3 @@
{
"version": "v4.1.0"
"version": "v4.1.1"
}

View File

@@ -1288,6 +1288,59 @@
- filename: llama-cpp/mmproj/Qwen3-VL-Reranker-8B.mmproj-f16.gguf
sha256: 15cd9bd4882dae771344f0ac204fce07de91b47c1438ada3861dfc817403c31e
uri: https://huggingface.co/mradermacher/Qwen3-VL-Reranker-8B-GGUF/resolve/main/Qwen3-VL-Reranker-8B.mmproj-f16.gguf
- name: "qwen3-vl-reranker-2b-i1"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:
- https://huggingface.co/mradermacher/Qwen3-VL-Reranker-2B-i1-GGUF
description: |
**Model Name:** Qwen3-VL-Reranker-2B-i1
**Base Model:** Qwen/Qwen3-VL-Reranker-2B
**Description:**
A high-performance multimodal reranking model for state-of-the-art cross-modal search. It supports 30+ languages and handles text, images, screenshots, videos, and mixed modalities. With 2B parameters and a 32K context length, it refines retrieval results by combining embedding vectors with precise relevance scores. Optimized for efficiency, it supports quantized versions (e.g., Q8_0, Q4_K_M) and is ideal for applications requiring accurate multimodal content matching.
**Key Features:**
- **Multimodal**: Text, images, videos, and mixed content.
- **Language Support**: 30+ languages.
- **Quantization**: Available in Q8_0 (best quality), Q4_K_M (fast, recommended), and lower-precision options.
- **Performance**: Outperforms base models in retrieval tasks (e.g., JinaVDR, ViDoRe v3).
- **Use Case**: Enhances search pipelines by refining embeddings with precise relevance scores.
**Downloads:**
- [GGUF Files](https://huggingface.co/mradermacher/Qwen3-VL-Reranker-2B-i1-GGUF) (e.g., `Qwen3-VL-Reranker-2B.i1-Q4_K_M.gguf`).
**Usage:**
- Requires `transformers`, `qwen-vl-utils`, and `torch`.
- Example: `from scripts.qwen3_vl_reranker import Qwen3VLReranker; model = Qwen3VLReranker(...)`
**Citation:**
@article{qwen3vlembedding, ...}
This description emphasizes its capabilities, efficiency, and versatility for multimodal search tasks.
overrides:
reranking: true
parameters:
model: llama-cpp/models/Qwen3-VL-Reranker-2B.i1-Q4_K_M.gguf
name: Qwen3-VL-Reranker-2B-i1-GGUF
backend: llama-cpp
template:
use_tokenizer_template: true
known_usecases:
- chat
function:
grammar:
disable: true
mmproj: llama-cpp/mmproj/Qwen3-VL-Reranker-2B.mmproj-f16.gguf
description: Imported from https://huggingface.co/mradermacher/Qwen3-VL-Reranker-2B-GGUF/
options:
- use_jinja:true
files:
- filename: llama-cpp/models/Qwen3-VL-Reranker-2B.i1-Q4_K_M.gguf
sha256: f19dfbceeef9f6ee1f7d0ff536d66e9b1b90424a4b8aa1d1777db43d20afdbc5
uri: https://huggingface.co/mradermacher/Qwen3-VL-Reranker-2B-i1-GGUF/resolve/main/Qwen3-VL-Reranker-2B.i1-Q4_K_M.gguf
- filename: llama-cpp/mmproj/Qwen3-VL-Reranker-2B.mmproj-f16.gguf
sha256: d38b7ae347fc3e51726bfb9cba1b04885f1f005a4087d8070933e46509db5a6e
uri: https://huggingface.co/mradermacher/Qwen3-VL-Reranker-2B-GGUF/resolve/main/Qwen3-VL-Reranker-2B.mmproj-f16.gguf
- name: "liquidai.lfm2-2.6b-transcript"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:
@@ -3095,6 +3148,35 @@
- filename: Qwen_Qwen3-30B-A3B-Q4_K_M.gguf
sha256: a015794bfb1d69cb03dbb86b185fb2b9b339f757df5f8f9dd9ebdab8f6ed5d32
uri: huggingface://bartowski/Qwen_Qwen3-30B-A3B-GGUF/Qwen_Qwen3-30B-A3B-Q4_K_M.gguf
- !!merge <<: *qwen3
name: "qwen3-reranker-0.6b"
tags:
- qwen3
- reranker
- gguf
- gpu
- cpu
urls:
- https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
description: |
The Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B). This series inherits the exceptional multilingual capabilities, long-text understanding, and reasoning skills of its foundational model. The Qwen3 Embedding series represents significant advancements in multiple text embedding and ranking tasks, including text retrieval, code retrieval, text classification, text clustering, and bitext mining.
**Exceptional Versatility**: The embedding model has achieved state-of-the-art performance across a wide range of downstream application evaluations. The 8B size embedding model ranks No.1 in the MTEB multilingual leaderboard (as of June 5, 2025, score 70.58), while the reranking model excels in various text retrieval scenarios.
**Comprehensive Flexibility**: The Qwen3 Embedding series offers a full spectrum of sizes (from 0.6B to 8B) for both embedding and reranking models, catering to diverse use cases that prioritize efficiency and effectiveness. Developers can seamlessly combine these two modules. Additionally, the embedding model allows for flexible vector definitions across all dimensions, and both embedding and reranking models support user-defined instructions to enhance performance for specific tasks, languages, or scenarios.
**Multilingual Capability**: The Qwen3 Embedding series offers support for over 100 languages, thanks to the multilingual capabilities of Qwen3 models. This includes various programming languages, and provides robust multilingual, cross-lingual, and code retrieval capabilities.
**Qwen3-Reranker-0.6B** has the following features:
- Model Type: Text Reranking
- Supported Languages: 100+ Languages
- Number of Parameters: 0.6B
- Context Length: 32k
- Quantization: q4_K_M, q5_0, q5_K_M, q6_K, q8_0, f16
overrides:
reranking: true
parameters:
model: Qwen3-Reranker-0.6B.Q8_0.gguf
files:
- filename: Qwen3-Reranker-0.6B.Q8_0.gguf
uri: huggingface://mradermacher/Qwen3-Reranker-0.6B-GGUF/Qwen3-Reranker-0.6B.Q8_0.gguf
sha256: c525a7449243f690a7062e6377d6cf5adbb289354bd4316312367cd20e187ab7
- !!merge <<: *qwen3
name: "qwen3-235b-a22b-instruct-2507"
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png

View File

@@ -2,7 +2,10 @@
name: "qwen3"
config_file: |
mmap: true
parameters:
context_size: 8192
f16: true
mmap: true
backend: "llama-cpp"
template:
chat_message: |
@@ -36,8 +39,6 @@ config_file: |
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 8192
f16: true
stopwords:
- '<|im_end|>'
- '<dummy32000>'

View File

@@ -101,6 +101,25 @@ var _ = BeforeSuite(func() {
Expect(err).ToNot(HaveOccurred())
Expect(os.WriteFile(configPath, configYAML, 0644)).To(Succeed())
// Create model config for autoparser tests (NoGrammar so tool calls
// are driven entirely by the backend's ChatDeltas, not grammar enforcement)
autoparserConfig := map[string]any{
"name": "mock-model-autoparser",
"backend": "mock-backend",
"parameters": map[string]any{
"model": "mock-model.bin",
},
"function": map[string]any{
"grammar": map[string]any{
"disable": true,
},
},
}
autoparserPath := filepath.Join(modelsPath, "mock-model-autoparser.yaml")
autoparserYAML, err := yaml.Marshal(autoparserConfig)
Expect(err).ToNot(HaveOccurred())
Expect(os.WriteFile(autoparserPath, autoparserYAML, 0644)).To(Succeed())
// Start mock MCP server and create MCP-enabled model config
mcpServerURL, mcpServerShutdown = startMockMCPServer()
mcpConfig := mcpModelConfig(mcpServerURL)

View File

@@ -55,6 +55,46 @@ func (m *MockBackend) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.R
if strings.Contains(in.Prompt, "MOCK_ERROR") {
return nil, fmt.Errorf("mock backend predict error: simulated failure")
}
// Simulate C++ autoparser: tool call via ChatDeltas, empty message
if strings.Contains(in.Prompt, "AUTOPARSER_TOOL_CALL") {
toolName := mockToolNameFromRequest(in)
if toolName == "" {
toolName = "search_collections"
}
return &pb.Reply{
Message: []byte{},
Tokens: 10,
PromptTokens: 5,
ChatDeltas: []*pb.ChatDelta{
{ReasoningContent: "I need to search for information."},
{
ToolCalls: []*pb.ToolCallDelta{
{
Index: 0,
Id: "call_mock_123",
Name: toolName,
Arguments: `{"query":"localai"}`,
},
},
},
},
}, nil
}
// Simulate C++ autoparser: content via ChatDeltas, empty message
if strings.Contains(in.Prompt, "AUTOPARSER_CONTENT") {
return &pb.Reply{
Message: []byte{},
Tokens: 10,
PromptTokens: 5,
ChatDeltas: []*pb.ChatDelta{
{ReasoningContent: "Let me compose a response."},
{Content: "LocalAI is an open-source AI platform."},
},
}, nil
}
var response string
toolName := mockToolNameFromRequest(in)
if toolName != "" && !promptHasToolResults(in.Prompt) {
@@ -88,6 +128,77 @@ func (m *MockBackend) PredictStream(in *pb.PredictOptions, stream pb.Backend_Pre
}
return fmt.Errorf("mock backend stream error: simulated mid-stream failure")
}
// Simulate C++ autoparser behavior: tool calls delivered via ChatDeltas
// with empty message (autoparser clears raw message during parsing).
if strings.Contains(in.Prompt, "AUTOPARSER_TOOL_CALL") {
toolName := mockToolNameFromRequest(in)
if toolName == "" {
toolName = "search_collections"
}
// Phase 1: Stream reasoning tokens with empty message (autoparser active)
reasoning := "I need to search for information."
for _, r := range reasoning {
if err := stream.Send(&pb.Reply{
Message: []byte{}, // autoparser clears raw message
ChatDeltas: []*pb.ChatDelta{
{ReasoningContent: string(r)},
},
}); err != nil {
return err
}
}
// Phase 2: Emit tool call via ChatDeltas (no raw message)
if err := stream.Send(&pb.Reply{
Message: []byte{}, // autoparser clears raw message
ChatDeltas: []*pb.ChatDelta{
{
ToolCalls: []*pb.ToolCallDelta{
{
Index: 0,
Id: "call_mock_123",
Name: toolName,
Arguments: `{"query":"localai"}`,
},
},
},
},
}); err != nil {
return err
}
return nil
}
// Simulate C++ autoparser behavior: content delivered via ChatDeltas
// with empty message (autoparser clears raw message during parsing).
if strings.Contains(in.Prompt, "AUTOPARSER_CONTENT") {
// Phase 1: Stream reasoning via ChatDeltas
reasoning := "Let me compose a response."
for _, r := range reasoning {
if err := stream.Send(&pb.Reply{
Message: []byte{},
ChatDeltas: []*pb.ChatDelta{
{ReasoningContent: string(r)},
},
}); err != nil {
return err
}
}
// Phase 2: Stream content via ChatDeltas (no raw message)
content := "LocalAI is an open-source AI platform."
for _, r := range content {
if err := stream.Send(&pb.Reply{
Message: []byte{},
ChatDeltas: []*pb.ChatDelta{
{Content: string(r)},
},
}); err != nil {
return err
}
}
return nil
}
var toStream string
toolName := mockToolNameFromRequest(in)
if toolName != "" && !promptHasToolResults(in.Prompt) {

View File

@@ -2,6 +2,7 @@ package e2e_test
import (
"context"
"encoding/json"
"io"
"net/http"
"strings"
@@ -265,4 +266,201 @@ var _ = Describe("Mock Backend E2E Tests", Label("MockBackend"), func() {
}
})
})
Describe("Autoparser ChatDelta Streaming", Label("Autoparser"), func() {
	// These tests verify that when the C++ autoparser handles tool calls
	// and content via ChatDeltas (with empty raw message), the streaming
	// endpoint does NOT unnecessarily retry. This is a regression test for
	// the bug where the retry logic only checked Go-side parsing, ignoring
	// ChatDelta results, causing up to 6 retries and concatenated output.

	// postChatCompletion POSTs body as JSON to the chat completions endpoint,
	// asserts that the request succeeded with HTTP 200 and returns the raw
	// response payload. The response body is closed before returning.
	postChatCompletion := func(body string) []byte {
		req, err := http.NewRequest("POST", apiURL+"/chat/completions", strings.NewReader(body))
		Expect(err).ToNot(HaveOccurred())
		req.Header.Set("Content-Type", "application/json")

		httpClient := &http.Client{Timeout: 60 * time.Second}
		resp, err := httpClient.Do(req)
		Expect(err).ToNot(HaveOccurred())
		defer resp.Body.Close()
		Expect(resp.StatusCode).To(Equal(200))

		data, err := io.ReadAll(resp.Body)
		Expect(err).ToNot(HaveOccurred())
		return data
	}

	// forEachStreamDelta walks an SSE payload line by line and calls visit
	// with the first choice (and its delta object) of every "data: " event.
	// Non-data lines, the "[DONE]" sentinel, events that are not valid JSON
	// and events without a usable choice or delta are skipped, so the
	// assertions made by the caller decide whether the stream was acceptable.
	forEachStreamDelta := func(payload []byte, visit func(choice, delta map[string]any)) {
		for _, line := range strings.Split(string(payload), "\n") {
			line = strings.TrimSpace(line)
			if !strings.HasPrefix(line, "data: ") || line == "data: [DONE]" {
				continue
			}

			var chunk map[string]any
			if err := json.Unmarshal([]byte(strings.TrimPrefix(line, "data: ")), &chunk); err != nil {
				continue
			}

			choices, ok := chunk["choices"].([]any)
			if !ok || len(choices) == 0 {
				continue
			}
			// Checked assertion: a malformed choice is skipped instead of
			// panicking in the middle of the parse loop.
			choice, ok := choices[0].(map[string]any)
			if !ok {
				continue
			}
			delta, _ := choice["delta"].(map[string]any)
			if delta == nil {
				continue
			}
			visit(choice, delta)
		}
	}

	Context("Streaming with tools and ChatDelta tool calls", func() {
		It("should return tool calls without unnecessary retries", func() {
			body := `{
"model": "mock-model-autoparser",
"messages": [{"role": "user", "content": "AUTOPARSER_TOOL_CALL"}],
"tools": [{"type": "function", "function": {"name": "search_collections", "description": "Search documents", "parameters": {"type": "object", "properties": {"query": {"type": "string"}}, "required": ["query"]}}}],
"stream": true
}`
			data := postChatCompletion(body)

			// Count the SSE events carrying each kind of delta.
			var toolCallChunks int
			var reasoningChunks int
			hasFinishReason := false
			forEachStreamDelta(data, func(choice, delta map[string]any) {
				if _, ok := delta["tool_calls"]; ok {
					toolCallChunks++
				}
				if _, ok := delta["reasoning"]; ok {
					reasoningChunks++
				}
				if fr, ok := choice["finish_reason"].(string); ok && fr != "" {
					hasFinishReason = true
				}
			})

			// The key assertion: tool calls from ChatDeltas should be present
			Expect(toolCallChunks).To(BeNumerically(">", 0),
				"Expected tool_calls in streaming response from ChatDeltas, but got none. "+
					"This likely means the retry logic discarded ChatDelta tool calls.")
			// Should have a finish reason
			Expect(hasFinishReason).To(BeTrue(), "Expected a finish_reason in the streaming response")
			// Reasoning should be present (from ChatDelta reasoning)
			Expect(reasoningChunks).To(BeNumerically(">", 0),
				"Expected reasoning deltas from ChatDeltas")
		})
	})

	Context("Streaming with tools and ChatDelta content (no tool calls)", func() {
		It("should return content without retrying and without concatenation", func() {
			body := `{
"model": "mock-model-autoparser",
"messages": [{"role": "user", "content": "AUTOPARSER_CONTENT"}],
"tools": [{"type": "function", "function": {"name": "search_collections", "description": "Search documents", "parameters": {"type": "object", "properties": {"query": {"type": "string"}}, "required": ["query"]}}}],
"stream": true
}`
			data := postChatCompletion(body)

			// Collect the non-empty content and reasoning fragments in order.
			var contentParts []string
			var reasoningParts []string
			forEachStreamDelta(data, func(_, delta map[string]any) {
				if content, ok := delta["content"].(string); ok && content != "" {
					contentParts = append(contentParts, content)
				}
				if reasoning, ok := delta["reasoning"].(string); ok && reasoning != "" {
					reasoningParts = append(reasoningParts, reasoning)
				}
			})
			fullContent := strings.Join(contentParts, "")
			fullReasoning := strings.Join(reasoningParts, "")

			// Content should be present and match the expected answer
			Expect(fullContent).To(ContainSubstring("LocalAI"),
				"Expected content from ChatDeltas to contain 'LocalAI'. "+
					"The retry logic may have discarded ChatDelta content.")
			// Content should NOT be duplicated (no retry concatenation)
			occurrences := strings.Count(fullContent, "LocalAI is an open-source AI platform.")
			Expect(occurrences).To(Equal(1),
				"Expected content to appear exactly once, but found %d occurrences. "+
					"This indicates unnecessary retries are concatenating output.", occurrences)
			// Reasoning should be present
			Expect(fullReasoning).To(ContainSubstring("compose"),
				"Expected reasoning content from ChatDeltas")
		})
	})

	Context("Non-streaming with tools and ChatDelta tool calls", func() {
		It("should return tool calls from ChatDeltas", func() {
			body := `{
"model": "mock-model-autoparser",
"messages": [{"role": "user", "content": "AUTOPARSER_TOOL_CALL"}],
"tools": [{"type": "function", "function": {"name": "search_collections", "description": "Search documents", "parameters": {"type": "object", "properties": {"query": {"type": "string"}}, "required": ["query"]}}}]
}`
			data := postChatCompletion(body)

			var result map[string]any
			Expect(json.Unmarshal(data, &result)).To(Succeed())

			choices, ok := result["choices"].([]any)
			Expect(ok).To(BeTrue())
			Expect(choices).To(HaveLen(1))

			// Checked assertions: report the payload instead of panicking
			// when the response does not have the expected shape.
			choice, ok := choices[0].(map[string]any)
			Expect(ok).To(BeTrue(), "Expected choices[0] to be a JSON object, but got: %s", string(data))
			msg, _ := choice["message"].(map[string]any)
			Expect(msg).ToNot(BeNil())

			toolCalls, ok := msg["tool_calls"].([]any)
			Expect(ok).To(BeTrue(),
				"Expected tool_calls in non-streaming response from ChatDeltas, "+
					"but got: %s", string(data))
			Expect(toolCalls).To(HaveLen(1))

			tc, ok := toolCalls[0].(map[string]any)
			Expect(ok).To(BeTrue(), "Expected tool_calls[0] to be a JSON object, but got: %s", string(data))
			fn, _ := tc["function"].(map[string]any)
			Expect(fn["name"]).To(Equal("search_collections"))
		})
	})
})
})