mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-19 14:17:21 -04:00
Compare commits
7 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ad232fdb1a | ||
|
|
11637b5a1b | ||
|
|
0dda4fe6f0 | ||
|
|
773489eeb1 | ||
|
|
06fbe48b3f | ||
|
|
232e324a68 | ||
|
|
39c954764c |
@@ -1,5 +1,5 @@
|
||||
|
||||
LLAMA_VERSION?=b8635075ffe27b135c49afb9a8b5c434bd42c502
|
||||
LLAMA_VERSION?=761797ffdf2ce3f118e82c663b1ad7d935fbd656
|
||||
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
|
||||
|
||||
CMAKE_ARGS?=
|
||||
|
||||
@@ -284,6 +284,12 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const
|
||||
data["ignore_eos"] = predict->ignoreeos();
|
||||
data["embeddings"] = predict->embeddings();
|
||||
|
||||
// Speculative decoding per-request overrides
|
||||
// NDraft maps to speculative.n_max (maximum draft tokens per speculation step)
|
||||
if (predict->ndraft() > 0) {
|
||||
data["speculative.n_max"] = predict->ndraft();
|
||||
}
|
||||
|
||||
// Add the correlationid to json data
|
||||
data["correlation_id"] = predict->correlationid();
|
||||
|
||||
@@ -402,6 +408,16 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
|
||||
if (!request->mmproj().empty()) {
|
||||
params.mmproj.path = request->mmproj();
|
||||
}
|
||||
|
||||
// Draft model for speculative decoding
|
||||
if (!request->draftmodel().empty()) {
|
||||
params.speculative.mparams_dft.path = request->draftmodel();
|
||||
// Default to draft type if a draft model is set but no explicit type
|
||||
if (params.speculative.type == COMMON_SPECULATIVE_TYPE_NONE) {
|
||||
params.speculative.type = COMMON_SPECULATIVE_TYPE_DRAFT;
|
||||
}
|
||||
}
|
||||
|
||||
// params.model_alias ??
|
||||
params.model_alias.insert(request->modelfile());
|
||||
if (!request->cachetypekey().empty()) {
|
||||
@@ -609,6 +625,48 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
|
||||
// If conversion fails, keep default value (8)
|
||||
}
|
||||
}
|
||||
// Speculative decoding options
|
||||
} else if (!strcmp(optname, "spec_type") || !strcmp(optname, "speculative_type")) {
|
||||
auto type = common_speculative_type_from_name(optval_str);
|
||||
if (type != COMMON_SPECULATIVE_TYPE_COUNT) {
|
||||
params.speculative.type = type;
|
||||
}
|
||||
} else if (!strcmp(optname, "spec_n_max") || !strcmp(optname, "draft_max")) {
|
||||
if (optval != NULL) {
|
||||
try { params.speculative.n_max = std::stoi(optval_str); } catch (...) {}
|
||||
}
|
||||
} else if (!strcmp(optname, "spec_n_min") || !strcmp(optname, "draft_min")) {
|
||||
if (optval != NULL) {
|
||||
try { params.speculative.n_min = std::stoi(optval_str); } catch (...) {}
|
||||
}
|
||||
} else if (!strcmp(optname, "spec_p_min") || !strcmp(optname, "draft_p_min")) {
|
||||
if (optval != NULL) {
|
||||
try { params.speculative.p_min = std::stof(optval_str); } catch (...) {}
|
||||
}
|
||||
} else if (!strcmp(optname, "spec_p_split")) {
|
||||
if (optval != NULL) {
|
||||
try { params.speculative.p_split = std::stof(optval_str); } catch (...) {}
|
||||
}
|
||||
} else if (!strcmp(optname, "spec_ngram_size_n") || !strcmp(optname, "ngram_size_n")) {
|
||||
if (optval != NULL) {
|
||||
try { params.speculative.ngram_size_n = (uint16_t)std::stoi(optval_str); } catch (...) {}
|
||||
}
|
||||
} else if (!strcmp(optname, "spec_ngram_size_m") || !strcmp(optname, "ngram_size_m")) {
|
||||
if (optval != NULL) {
|
||||
try { params.speculative.ngram_size_m = (uint16_t)std::stoi(optval_str); } catch (...) {}
|
||||
}
|
||||
} else if (!strcmp(optname, "spec_ngram_min_hits") || !strcmp(optname, "ngram_min_hits")) {
|
||||
if (optval != NULL) {
|
||||
try { params.speculative.ngram_min_hits = (uint16_t)std::stoi(optval_str); } catch (...) {}
|
||||
}
|
||||
} else if (!strcmp(optname, "draft_gpu_layers")) {
|
||||
if (optval != NULL) {
|
||||
try { params.speculative.n_gpu_layers = std::stoi(optval_str); } catch (...) {}
|
||||
}
|
||||
} else if (!strcmp(optname, "draft_ctx_size")) {
|
||||
if (optval != NULL) {
|
||||
try { params.speculative.n_ctx = std::stoi(optval_str); } catch (...) {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1251,6 +1309,7 @@ public:
|
||||
|
||||
body_json["messages"] = messages_json;
|
||||
body_json["stream"] = true; // PredictStream is always streaming
|
||||
body_json["stream_options"] = {{"include_usage", true}}; // Ensure token counts in final chunk
|
||||
|
||||
// Check if grammar is provided from Go layer (NoGrammar=false)
|
||||
// If grammar is provided, we must use it and NOT let template generate grammar from tools
|
||||
@@ -1558,8 +1617,11 @@ public:
|
||||
data);
|
||||
task.id_slot = json_value(data, "id_slot", -1);
|
||||
|
||||
// OAI-compat
|
||||
task.params.res_type = TASK_RESPONSE_TYPE_NONE;
|
||||
// OAI-compat: enable autoparser (PEG-based chat parsing) so that
|
||||
// reasoning, tool calls, and content are classified into ChatDeltas.
|
||||
// Without this, the PEG parser never produces diffs and the Go side
|
||||
// cannot detect tool calls or separate reasoning from content.
|
||||
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
|
||||
task.params.oaicompat_cmpl_id = completion_id;
|
||||
// oaicompat_model is already populated by params_from_json_cmpl
|
||||
|
||||
@@ -1584,19 +1646,47 @@ public:
|
||||
return grpc::Status(grpc::StatusCode::INTERNAL, error_json.value("message", "Error occurred"));
|
||||
}
|
||||
|
||||
// Lambda to build a Reply from JSON + attach chat deltas from a result
|
||||
// Lambda to build a Reply from JSON + attach chat deltas from a result.
|
||||
// Handles both native format ({"content": "..."}) and OAI chat format
|
||||
// ({"choices": [{"delta": {"content": "...", "reasoning": "..."}}]}).
|
||||
auto build_reply_from_json = [](const json & res_json, server_task_result * raw_result) -> backend::Reply {
|
||||
backend::Reply reply;
|
||||
std::string completion_text = res_json.value("content", "");
|
||||
reply.set_message(completion_text);
|
||||
reply.set_tokens(res_json.value("tokens_predicted", 0));
|
||||
reply.set_prompt_tokens(res_json.value("tokens_evaluated", 0));
|
||||
std::string completion_text;
|
||||
|
||||
if (res_json.contains("choices")) {
|
||||
// OAI chat format — extract content from choices[0].delta
|
||||
const auto & choices = res_json.at("choices");
|
||||
if (!choices.empty()) {
|
||||
const auto & delta = choices[0].value("delta", json::object());
|
||||
if (delta.contains("content") && !delta.at("content").is_null()) {
|
||||
completion_text = delta.at("content").get<std::string>();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Native llama.cpp format
|
||||
completion_text = res_json.value("content", "");
|
||||
}
|
||||
|
||||
reply.set_message(completion_text);
|
||||
|
||||
// Token counts: native format has top-level fields,
|
||||
// OAI format has them in "usage" (final chunk only)
|
||||
if (res_json.contains("usage")) {
|
||||
const auto & usage = res_json.at("usage");
|
||||
reply.set_tokens(usage.value("completion_tokens", 0));
|
||||
reply.set_prompt_tokens(usage.value("prompt_tokens", 0));
|
||||
} else {
|
||||
reply.set_tokens(res_json.value("tokens_predicted", 0));
|
||||
reply.set_prompt_tokens(res_json.value("tokens_evaluated", 0));
|
||||
}
|
||||
|
||||
// Timings: present as top-level "timings" in both formats
|
||||
if (res_json.contains("timings")) {
|
||||
reply.set_timing_prompt_processing(res_json.at("timings").value("prompt_ms", 0.0));
|
||||
reply.set_timing_token_generation(res_json.at("timings").value("predicted_ms", 0.0));
|
||||
}
|
||||
|
||||
// Logprobs: extract_logprobs_from_json handles both formats
|
||||
json logprobs_json = extract_logprobs_from_json(res_json);
|
||||
if (!logprobs_json.empty() && !logprobs_json.is_null()) {
|
||||
reply.set_logprobs(logprobs_json.dump());
|
||||
@@ -1605,21 +1695,17 @@ public:
|
||||
return reply;
|
||||
};
|
||||
|
||||
// Attach chat deltas from the autoparser to a Reply.
|
||||
// When diffs are available, populate ChatDeltas on the reply.
|
||||
// The raw message is always preserved so the Go side can use it
|
||||
// for reasoning extraction and tool call parsing as a fallback
|
||||
// (important in distributed mode where ChatDeltas may not be
|
||||
// the primary parsing path).
|
||||
auto attach_chat_deltas = [](backend::Reply & reply, server_task_result * raw_result) {
|
||||
// Try streaming partial result first
|
||||
auto* partial = dynamic_cast<server_task_result_cmpl_partial*>(raw_result);
|
||||
if (partial) {
|
||||
if (!partial->oaicompat_msg_diffs.empty()) {
|
||||
populate_chat_deltas_from_diffs(reply, partial->oaicompat_msg_diffs);
|
||||
} else if (partial->is_updated) {
|
||||
// Autoparser is active but hasn't classified this chunk yet
|
||||
// (PEG parser warming up). Clear the raw message so the Go
|
||||
// side doesn't try to parse partial tag tokens (e.g. "<|channel>"
|
||||
// before the full "<|channel>thought\n" is received).
|
||||
// This matches llama.cpp server behavior which only emits SSE
|
||||
// chunks when the parser produces diffs.
|
||||
reply.set_message("");
|
||||
}
|
||||
if (partial && !partial->oaicompat_msg_diffs.empty()) {
|
||||
populate_chat_deltas_from_diffs(reply, partial->oaicompat_msg_diffs);
|
||||
return;
|
||||
}
|
||||
// Try final result
|
||||
@@ -2299,8 +2385,9 @@ public:
|
||||
data);
|
||||
task.id_slot = json_value(data, "id_slot", -1);
|
||||
|
||||
// OAI-compat
|
||||
task.params.res_type = TASK_RESPONSE_TYPE_NONE;
|
||||
// OAI-compat: enable autoparser (PEG-based chat parsing) so that
|
||||
// reasoning, tool calls, and content are classified into ChatDeltas.
|
||||
task.params.res_type = TASK_RESPONSE_TYPE_OAI_CHAT;
|
||||
task.params.oaicompat_cmpl_id = completion_id;
|
||||
// oaicompat_model is already populated by params_from_json_cmpl
|
||||
|
||||
@@ -2331,25 +2418,48 @@ public:
|
||||
auto* final_res = dynamic_cast<server_task_result_cmpl_final*>(all_results.results[0].get());
|
||||
GGML_ASSERT(final_res != nullptr);
|
||||
json result_json = all_results.results[0]->to_json();
|
||||
reply->set_message(result_json.value("content", ""));
|
||||
|
||||
int32_t tokens_predicted = result_json.value("tokens_predicted", 0);
|
||||
// Handle both native format ({"content": "...", "tokens_predicted": N})
|
||||
// and OAI chat format ({"choices": [{"message": {"content": "..."}}],
|
||||
// "usage": {"completion_tokens": N, "prompt_tokens": N}}).
|
||||
std::string completion_text;
|
||||
int32_t tokens_predicted = 0;
|
||||
int32_t tokens_evaluated = 0;
|
||||
|
||||
if (result_json.contains("choices")) {
|
||||
// OAI chat format
|
||||
const auto & choices = result_json.at("choices");
|
||||
if (!choices.empty()) {
|
||||
const auto & msg = choices[0].value("message", json::object());
|
||||
if (msg.contains("content") && !msg.at("content").is_null()) {
|
||||
completion_text = msg.at("content").get<std::string>();
|
||||
}
|
||||
}
|
||||
if (result_json.contains("usage")) {
|
||||
const auto & usage = result_json.at("usage");
|
||||
tokens_predicted = usage.value("completion_tokens", 0);
|
||||
tokens_evaluated = usage.value("prompt_tokens", 0);
|
||||
}
|
||||
} else {
|
||||
// Native llama.cpp format
|
||||
completion_text = result_json.value("content", "");
|
||||
tokens_predicted = result_json.value("tokens_predicted", 0);
|
||||
tokens_evaluated = result_json.value("tokens_evaluated", 0);
|
||||
}
|
||||
reply->set_message(completion_text);
|
||||
reply->set_tokens(tokens_predicted);
|
||||
int32_t tokens_evaluated = result_json.value("tokens_evaluated", 0);
|
||||
reply->set_prompt_tokens(tokens_evaluated);
|
||||
|
||||
// Timings: present in both formats as a top-level "timings" object
|
||||
if (result_json.contains("timings")) {
|
||||
double timing_prompt_processing = result_json.at("timings").value("prompt_ms", 0.0);
|
||||
reply->set_timing_prompt_processing(timing_prompt_processing);
|
||||
double timing_token_generation = result_json.at("timings").value("predicted_ms", 0.0);
|
||||
reply->set_timing_token_generation(timing_token_generation);
|
||||
reply->set_timing_prompt_processing(result_json.at("timings").value("prompt_ms", 0.0));
|
||||
reply->set_timing_token_generation(result_json.at("timings").value("predicted_ms", 0.0));
|
||||
}
|
||||
|
||||
// Extract and set logprobs if present
|
||||
// Logprobs: extract_logprobs_from_json handles both formats
|
||||
json logprobs_json = extract_logprobs_from_json(result_json);
|
||||
if (!logprobs_json.empty() && !logprobs_json.is_null()) {
|
||||
std::string logprobs_str = logprobs_json.dump();
|
||||
reply->set_logprobs(logprobs_str);
|
||||
reply->set_logprobs(logprobs_json.dump());
|
||||
}
|
||||
|
||||
// Populate chat deltas from the autoparser's final parsed message
|
||||
@@ -2365,7 +2475,20 @@ public:
|
||||
for (auto & res : all_results.results) {
|
||||
GGML_ASSERT(dynamic_cast<server_task_result_cmpl_final*>(res.get()) != nullptr);
|
||||
json res_json = res->to_json();
|
||||
arr.push_back(res_json.value("content", ""));
|
||||
// Handle both native and OAI chat formats
|
||||
std::string result_content;
|
||||
if (res_json.contains("choices")) {
|
||||
const auto & choices = res_json.at("choices");
|
||||
if (!choices.empty()) {
|
||||
const auto & msg = choices[0].value("message", json::object());
|
||||
if (msg.contains("content") && !msg.at("content").is_null()) {
|
||||
result_content = msg.at("content").get<std::string>();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
result_content = res_json.value("content", "");
|
||||
}
|
||||
arr.push_back(result_content);
|
||||
|
||||
// Extract logprobs for each result
|
||||
json logprobs_json = extract_logprobs_from_json(res_json);
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# stablediffusion.cpp (ggml)
|
||||
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
|
||||
STABLEDIFFUSION_GGML_VERSION?=87ecb95cbc65dc8e58e3d88f4f4a59a0939796f5
|
||||
STABLEDIFFUSION_GGML_VERSION?=7397ddaa86f4e8837d5261724678cde0f36d4d89
|
||||
|
||||
CMAKE_ARGS+=-DGGML_MAX_NAME=128
|
||||
|
||||
|
||||
@@ -147,10 +147,23 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
result := ""
|
||||
lastEmittedCount := 0
|
||||
sentInitialRole := false
|
||||
hasChatDeltaToolCalls := false
|
||||
hasChatDeltaContent := false
|
||||
|
||||
_, tokenUsage, chatDeltas, err := ComputeChoices(req, prompt, config, cl, startupOptions, loader, func(s string, c *[]schema.Choice) {}, func(s string, usage backend.TokenUsage) bool {
|
||||
result += s
|
||||
|
||||
// Track whether ChatDeltas from the C++ autoparser contain
|
||||
// tool calls or content, so the retry decision can account for them.
|
||||
for _, d := range usage.ChatDeltas {
|
||||
if len(d.ToolCalls) > 0 {
|
||||
hasChatDeltaToolCalls = true
|
||||
}
|
||||
if d.Content != "" {
|
||||
hasChatDeltaContent = true
|
||||
}
|
||||
}
|
||||
|
||||
var reasoningDelta, contentDelta string
|
||||
|
||||
goReasoning, goContent := extractor.ProcessToken(s)
|
||||
@@ -309,15 +322,22 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
// After streaming completes: check if we got actionable content
|
||||
cleaned := extractor.CleanedContent()
|
||||
// Check for tool calls from chat deltas (will be re-checked after ComputeChoices,
|
||||
// but we need to know here whether to retry)
|
||||
hasToolCalls := lastEmittedCount > 0
|
||||
if cleaned == "" && !hasToolCalls {
|
||||
// but we need to know here whether to retry).
|
||||
// Also check ChatDelta flags — when the C++ autoparser is active,
|
||||
// tool calls and content are delivered via ChatDeltas while the
|
||||
// raw message is cleared. Without this check, we'd retry
|
||||
// unnecessarily, losing valid results and concatenating output.
|
||||
hasToolCalls := lastEmittedCount > 0 || hasChatDeltaToolCalls
|
||||
hasContent := cleaned != "" || hasChatDeltaContent
|
||||
if !hasContent && !hasToolCalls {
|
||||
xlog.Warn("Streaming: backend produced only reasoning, retrying",
|
||||
"reasoning_len", len(extractor.Reasoning()), "attempt", attempt+1)
|
||||
extractor.ResetAndSuppressReasoning()
|
||||
result = ""
|
||||
lastEmittedCount = 0
|
||||
sentInitialRole = false
|
||||
hasChatDeltaToolCalls = false
|
||||
hasChatDeltaContent = false
|
||||
return true
|
||||
}
|
||||
return false
|
||||
@@ -1006,7 +1026,12 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
if deltaReasoning != "" {
|
||||
message.Reasoning = &deltaReasoning
|
||||
}
|
||||
result = []schema.Choice{{FinishReason: &stopReason, Index: 0, Message: message}}
|
||||
newChoice := schema.Choice{FinishReason: &stopReason, Index: 0, Message: message}
|
||||
// Preserve logprobs from the original result
|
||||
if len(result) > 0 && result[0].Logprobs != nil {
|
||||
newChoice.Logprobs = result[0].Logprobs
|
||||
}
|
||||
result = []schema.Choice{newChoice}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -113,11 +113,23 @@ func ComputeChoices(
|
||||
}
|
||||
prediction = p
|
||||
|
||||
// Built-in: retry on truly empty response (no tokens at all)
|
||||
// Built-in: retry on truly empty response (no tokens at all).
|
||||
// However, when the C++ autoparser is active, it clears the raw
|
||||
// message and delivers content via ChatDeltas instead. Do NOT
|
||||
// retry if ChatDeltas contain tool calls or content.
|
||||
if strings.TrimSpace(prediction.Response) == "" && attempt < maxRetries {
|
||||
xlog.Warn("Backend returned empty response, retrying",
|
||||
"attempt", attempt+1, "maxRetries", maxRetries)
|
||||
continue
|
||||
hasChatDeltaData := false
|
||||
for _, d := range prediction.ChatDeltas {
|
||||
if d.Content != "" || len(d.ToolCalls) > 0 {
|
||||
hasChatDeltaData = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !hasChatDeltaData {
|
||||
xlog.Warn("Backend returned empty response, retrying",
|
||||
"attempt", attempt+1, "maxRetries", maxRetries)
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
tokenUsage.Prompt = prediction.Usage.Prompt
|
||||
@@ -130,8 +142,21 @@ func ComputeChoices(
|
||||
finetunedResponse := backend.Finetune(*config, predInput, prediction.Response)
|
||||
cb(finetunedResponse, &result)
|
||||
|
||||
// Caller-driven retry (tool parsing, reasoning-only, etc.)
|
||||
if shouldRetryFn != nil && shouldRetryFn(attempt) && attempt < maxRetries {
|
||||
// Caller-driven retry (tool parsing, reasoning-only, etc.).
|
||||
// When the C++ autoparser is active, it clears the raw response
|
||||
// and delivers data via ChatDeltas. If the response is empty but
|
||||
// ChatDeltas contain actionable data, skip the caller retry —
|
||||
// the autoparser already parsed the response successfully.
|
||||
skipCallerRetry := false
|
||||
if strings.TrimSpace(prediction.Response) == "" && len(prediction.ChatDeltas) > 0 {
|
||||
for _, d := range prediction.ChatDeltas {
|
||||
if d.Content != "" || len(d.ToolCalls) > 0 {
|
||||
skipCallerRetry = true
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if shouldRetryFn != nil && !skipCallerRetry && shouldRetryFn(attempt) && attempt < maxRetries {
|
||||
// Caller has already reset its state inside shouldRetry
|
||||
result = result[:0]
|
||||
allChatDeltas = nil
|
||||
|
||||
@@ -189,8 +189,8 @@ These settings apply to most LLM backends (llama.cpp, vLLM, etc.):
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `no_mulmatq` | bool | Disable matrix multiplication queuing |
|
||||
| `draft_model` | string | Draft model for speculative decoding |
|
||||
| `n_draft` | int32 | Number of draft tokens |
|
||||
| `draft_model` | string | Draft model GGUF file for speculative decoding (see [Speculative Decoding](#speculative-decoding)) |
|
||||
| `n_draft` | int32 | Maximum number of draft tokens per speculative step (default: 16) |
|
||||
| `quantization` | string | Quantization format |
|
||||
| `load_format` | string | Model load format |
|
||||
| `numa` | bool | Enable NUMA (Non-Uniform Memory Access) |
|
||||
@@ -211,6 +211,76 @@ YARN (Yet Another RoPE extensioN) settings for context extension:
|
||||
| `yarn_beta_fast` | float32 | YARN beta fast parameter |
|
||||
| `yarn_beta_slow` | float32 | YARN beta slow parameter |
|
||||
|
||||
### Speculative Decoding
|
||||
|
||||
Speculative decoding speeds up text generation by predicting multiple tokens ahead and verifying them in a single forward pass. The output is identical to normal decoding — only faster. This feature is only available with the `llama-cpp` backend.
|
||||
|
||||
There are two approaches:
|
||||
|
||||
#### Draft Model Speculative Decoding
|
||||
|
||||
Uses a smaller, faster model from the same model family to draft candidate tokens, which the main model then verifies. Requires a separate GGUF file for the draft model.
|
||||
|
||||
```yaml
|
||||
name: my-model
|
||||
backend: llama-cpp
|
||||
parameters:
|
||||
model: large-model.gguf
|
||||
draft_model: small-draft-model.gguf
|
||||
n_draft: 8
|
||||
options:
|
||||
- spec_p_min:0.8
|
||||
- draft_gpu_layers:99
|
||||
```
|
||||
|
||||
#### N-gram Self-Speculative Decoding
|
||||
|
||||
Uses patterns from the token history to predict future tokens — no extra model required. Works well for repetitive or structured output (code, JSON, lists).
|
||||
|
||||
```yaml
|
||||
name: my-model
|
||||
backend: llama-cpp
|
||||
parameters:
|
||||
model: my-model.gguf
|
||||
options:
|
||||
- spec_type:ngram_simple
|
||||
- spec_n_max:16
|
||||
```
|
||||
|
||||
#### Speculative Decoding Options
|
||||
|
||||
These are set via the `options:` array in the model configuration (format: `key:value`):
|
||||
|
||||
| Option | Type | Default | Description |
|
||||
|--------|------|---------|-------------|
|
||||
| `spec_type` | string | `none` | Speculative decoding type (see table below) |
|
||||
| `spec_n_max` / `draft_max` | int | 16 | Maximum number of tokens to draft per step |
|
||||
| `spec_n_min` / `draft_min` | int | 0 | Minimum draft tokens required to use speculation |
|
||||
| `spec_p_min` / `draft_p_min` | float | 0.75 | Minimum probability threshold for greedy acceptance |
|
||||
| `spec_p_split` | float | 0.1 | Split probability for tree-based branching |
|
||||
| `spec_ngram_size_n` / `ngram_size_n` | int | 12 | N-gram lookup size |
|
||||
| `spec_ngram_size_m` / `ngram_size_m` | int | 48 | M-gram proposal size |
|
||||
| `spec_ngram_min_hits` / `ngram_min_hits` | int | 1 | Minimum hits for accepting n-gram proposals |
|
||||
| `draft_gpu_layers` | int | -1 | GPU layers for the draft model (-1 = use default) |
|
||||
| `draft_ctx_size` | int | 0 | Context size for the draft model (0 = auto) |
|
||||
|
||||
#### Speculative Type Values
|
||||
|
||||
| Type | Description |
|
||||
|------|-------------|
|
||||
| `none` | No speculative decoding (default) |
|
||||
| `draft` | Draft model-based speculation (auto-set when `draft_model` is configured) |
|
||||
| `eagle3` | EAGLE3 draft model architecture |
|
||||
| `ngram_simple` | Simple self-speculative using token history |
|
||||
| `ngram_map_k` | N-gram with key-only map |
|
||||
| `ngram_map_k4v` | N-gram with keys and 4 m-gram values |
|
||||
| `ngram_mod` | Modified n-gram speculation |
|
||||
| `ngram_cache` | 3-level n-gram cache |
|
||||
|
||||
{{% notice note %}}
|
||||
Speculative decoding is automatically disabled when multimodal models (with `mmproj`) are active. The `n_draft` parameter can also be overridden per-request.
|
||||
{{% /notice %}}
|
||||
|
||||
### Prompt Caching
|
||||
|
||||
| Field | Type | Description |
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
{
|
||||
"version": "v4.1.0"
|
||||
"version": "v4.1.1"
|
||||
}
|
||||
|
||||
@@ -1288,6 +1288,59 @@
|
||||
- filename: llama-cpp/mmproj/Qwen3-VL-Reranker-8B.mmproj-f16.gguf
|
||||
sha256: 15cd9bd4882dae771344f0ac204fce07de91b47c1438ada3861dfc817403c31e
|
||||
uri: https://huggingface.co/mradermacher/Qwen3-VL-Reranker-8B-GGUF/resolve/main/Qwen3-VL-Reranker-8B.mmproj-f16.gguf
|
||||
- name: "qwen3-vl-reranker-2b-i1"
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/mradermacher/Qwen3-VL-Reranker-2B-i1-GGUF
|
||||
description: |
|
||||
**Model Name:** Qwen3-VL-Reranker-2B-i1
|
||||
**Base Model:** Qwen/Qwen3-VL-Reranker-2B
|
||||
|
||||
**Description:**
|
||||
A high-performance multimodal reranking model for state-of-the-art cross-modal search. It supports 30+ languages and handles text, images, screenshots, videos, and mixed modalities. With 2B parameters and a 32K context length, it refines retrieval results by combining embedding vectors with precise relevance scores. Optimized for efficiency, it supports quantized versions (e.g., Q8_0, Q4_K_M) and is ideal for applications requiring accurate multimodal content matching.
|
||||
|
||||
**Key Features:**
|
||||
- **Multimodal**: Text, images, videos, and mixed content.
|
||||
- **Language Support**: 30+ languages.
|
||||
- **Quantization**: Available in Q8_0 (best quality), Q4_K_M (fast, recommended), and lower-precision options.
|
||||
- **Performance**: Outperforms base models in retrieval tasks (e.g., JinaVDR, ViDoRe v3).
|
||||
- **Use Case**: Enhances search pipelines by refining embeddings with precise relevance scores.
|
||||
|
||||
**Downloads:**
|
||||
- [GGUF Files](https://huggingface.co/mradermacher/Qwen3-VL-Reranker-2B-i1-GGUF) (e.g., `Qwen3-VL-Reranker-2B.i1-Q4_K_M.gguf`).
|
||||
|
||||
**Usage:**
|
||||
- Requires `transformers`, `qwen-vl-utils`, and `torch`.
|
||||
- Example: `from scripts.qwen3_vl_reranker import Qwen3VLReranker; model = Qwen3VLReranker(...)`
|
||||
|
||||
**Citation:**
|
||||
@article{qwen3vlembedding, ...}
|
||||
|
||||
This description emphasizes its capabilities, efficiency, and versatility for multimodal search tasks.
|
||||
overrides:
|
||||
reranking: true
|
||||
parameters:
|
||||
model: llama-cpp/models/Qwen3-VL-Reranker-2B.i1-Q4_K_M.gguf
|
||||
name: Qwen3-VL-Reranker-2B-i1-GGUF
|
||||
backend: llama-cpp
|
||||
template:
|
||||
use_tokenizer_template: true
|
||||
known_usecases:
|
||||
- chat
|
||||
function:
|
||||
grammar:
|
||||
disable: true
|
||||
mmproj: llama-cpp/mmproj/Qwen3-VL-Reranker-2B.mmproj-f16.gguf
|
||||
description: Imported from https://huggingface.co/mradermacher/Qwen3-VL-Reranker-2B-GGUF/
|
||||
options:
|
||||
- use_jinja:true
|
||||
files:
|
||||
- filename: llama-cpp/models/Qwen3-VL-Reranker-2B.i1-Q4_K_M.gguf
|
||||
sha256: f19dfbceeef9f6ee1f7d0ff536d66e9b1b90424a4b8aa1d1777db43d20afdbc5
|
||||
uri: https://huggingface.co/mradermacher/Qwen3-VL-Reranker-2B-i1-GGUF/resolve/main/Qwen3-VL-Reranker-2B.i1-Q4_K_M.gguf
|
||||
- filename: llama-cpp/mmproj/Qwen3-VL-Reranker-2B.mmproj-f16.gguf
|
||||
sha256: d38b7ae347fc3e51726bfb9cba1b04885f1f005a4087d8070933e46509db5a6e
|
||||
uri: https://huggingface.co/mradermacher/Qwen3-VL-Reranker-2B-GGUF/resolve/main/Qwen3-VL-Reranker-2B.mmproj-f16.gguf
|
||||
- name: "liquidai.lfm2-2.6b-transcript"
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
@@ -3095,6 +3148,35 @@
|
||||
- filename: Qwen_Qwen3-30B-A3B-Q4_K_M.gguf
|
||||
sha256: a015794bfb1d69cb03dbb86b185fb2b9b339f757df5f8f9dd9ebdab8f6ed5d32
|
||||
uri: huggingface://bartowski/Qwen_Qwen3-30B-A3B-GGUF/Qwen_Qwen3-30B-A3B-Q4_K_M.gguf
|
||||
- !!merge <<: *qwen3
|
||||
name: "qwen3-reranker-0.6b"
|
||||
tags:
|
||||
- qwen3
|
||||
- reranker
|
||||
- gguf
|
||||
- gpu
|
||||
- cpu
|
||||
urls:
|
||||
- https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
|
||||
description: |
|
||||
The Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B). This series inherits the exceptional multilingual capabilities, long-text understanding, and reasoning skills of its foundational model. The Qwen3 Embedding series represents significant advancements in multiple text embedding and ranking tasks, including text retrieval, code retrieval, text classification, text clustering, and bitext mining.
|
||||
**Exceptional Versatility**: The embedding model has achieved state-of-the-art performance across a wide range of downstream application evaluations. The 8B size embedding model ranks No.1 in the MTEB multilingual leaderboard (as of June 5, 2025, score 70.58), while the reranking model excels in various text retrieval scenarios.
|
||||
**Comprehensive Flexibility**: The Qwen3 Embedding series offers a full spectrum of sizes (from 0.6B to 8B) for both embedding and reranking models, catering to diverse use cases that prioritize efficiency and effectiveness. Developers can seamlessly combine these two modules. Additionally, the embedding model allows for flexible vector definitions across all dimensions, and both embedding and reranking models support user-defined instructions to enhance performance for specific tasks, languages, or scenarios.
|
||||
**Multilingual Capability**: The Qwen3 Embedding series offers support for over 100 languages, thanks to the multilingual capabilities of Qwen3 models. This includes various programming languages, and provides robust multilingual, cross-lingual, and code retrieval capabilities.
|
||||
**Qwen3-Reranker-0.6B** has the following features:
|
||||
- Model Type: Text Reranking
|
||||
- Supported Languages: 100+ Languages
|
||||
- Number of Parameters: 0.6B
|
||||
- Context Length: 32k
|
||||
- Quantization: q4_K_M, q5_0, q5_K_M, q6_K, q8_0, f16
|
||||
overrides:
|
||||
reranking: true
|
||||
parameters:
|
||||
model: Qwen3-Reranker-0.6B.Q8_0.gguf
|
||||
files:
|
||||
- filename: Qwen3-Reranker-0.6B.Q8_0.gguf
|
||||
uri: huggingface://mradermacher/Qwen3-Reranker-0.6B-GGUF/Qwen3-Reranker-0.6B.Q8_0.gguf
|
||||
sha256: c525a7449243f690a7062e6377d6cf5adbb289354bd4316312367cd20e187ab7
|
||||
- !!merge <<: *qwen3
|
||||
name: "qwen3-235b-a22b-instruct-2507"
|
||||
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png
|
||||
|
||||
@@ -2,7 +2,10 @@
|
||||
name: "qwen3"
|
||||
|
||||
config_file: |
|
||||
mmap: true
|
||||
parameters:
|
||||
context_size: 8192
|
||||
f16: true
|
||||
mmap: true
|
||||
backend: "llama-cpp"
|
||||
template:
|
||||
chat_message: |
|
||||
@@ -36,8 +39,6 @@ config_file: |
|
||||
<|im_start|>assistant
|
||||
completion: |
|
||||
{{.Input}}
|
||||
context_size: 8192
|
||||
f16: true
|
||||
stopwords:
|
||||
- '<|im_end|>'
|
||||
- '<dummy32000>'
|
||||
|
||||
@@ -101,6 +101,25 @@ var _ = BeforeSuite(func() {
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(os.WriteFile(configPath, configYAML, 0644)).To(Succeed())
|
||||
|
||||
// Create model config for autoparser tests (NoGrammar so tool calls
|
||||
// are driven entirely by the backend's ChatDeltas, not grammar enforcement)
|
||||
autoparserConfig := map[string]any{
|
||||
"name": "mock-model-autoparser",
|
||||
"backend": "mock-backend",
|
||||
"parameters": map[string]any{
|
||||
"model": "mock-model.bin",
|
||||
},
|
||||
"function": map[string]any{
|
||||
"grammar": map[string]any{
|
||||
"disable": true,
|
||||
},
|
||||
},
|
||||
}
|
||||
autoparserPath := filepath.Join(modelsPath, "mock-model-autoparser.yaml")
|
||||
autoparserYAML, err := yaml.Marshal(autoparserConfig)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(os.WriteFile(autoparserPath, autoparserYAML, 0644)).To(Succeed())
|
||||
|
||||
// Start mock MCP server and create MCP-enabled model config
|
||||
mcpServerURL, mcpServerShutdown = startMockMCPServer()
|
||||
mcpConfig := mcpModelConfig(mcpServerURL)
|
||||
|
||||
@@ -55,6 +55,46 @@ func (m *MockBackend) Predict(ctx context.Context, in *pb.PredictOptions) (*pb.R
|
||||
if strings.Contains(in.Prompt, "MOCK_ERROR") {
|
||||
return nil, fmt.Errorf("mock backend predict error: simulated failure")
|
||||
}
|
||||
|
||||
// Simulate C++ autoparser: tool call via ChatDeltas, empty message
|
||||
if strings.Contains(in.Prompt, "AUTOPARSER_TOOL_CALL") {
|
||||
toolName := mockToolNameFromRequest(in)
|
||||
if toolName == "" {
|
||||
toolName = "search_collections"
|
||||
}
|
||||
return &pb.Reply{
|
||||
Message: []byte{},
|
||||
Tokens: 10,
|
||||
PromptTokens: 5,
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{ReasoningContent: "I need to search for information."},
|
||||
{
|
||||
ToolCalls: []*pb.ToolCallDelta{
|
||||
{
|
||||
Index: 0,
|
||||
Id: "call_mock_123",
|
||||
Name: toolName,
|
||||
Arguments: `{"query":"localai"}`,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Simulate C++ autoparser: content via ChatDeltas, empty message
|
||||
if strings.Contains(in.Prompt, "AUTOPARSER_CONTENT") {
|
||||
return &pb.Reply{
|
||||
Message: []byte{},
|
||||
Tokens: 10,
|
||||
PromptTokens: 5,
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{ReasoningContent: "Let me compose a response."},
|
||||
{Content: "LocalAI is an open-source AI platform."},
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
var response string
|
||||
toolName := mockToolNameFromRequest(in)
|
||||
if toolName != "" && !promptHasToolResults(in.Prompt) {
|
||||
@@ -88,6 +128,77 @@ func (m *MockBackend) PredictStream(in *pb.PredictOptions, stream pb.Backend_Pre
|
||||
}
|
||||
return fmt.Errorf("mock backend stream error: simulated mid-stream failure")
|
||||
}
|
||||
|
||||
// Simulate C++ autoparser behavior: tool calls delivered via ChatDeltas
|
||||
// with empty message (autoparser clears raw message during parsing).
|
||||
if strings.Contains(in.Prompt, "AUTOPARSER_TOOL_CALL") {
|
||||
toolName := mockToolNameFromRequest(in)
|
||||
if toolName == "" {
|
||||
toolName = "search_collections"
|
||||
}
|
||||
// Phase 1: Stream reasoning tokens with empty message (autoparser active)
|
||||
reasoning := "I need to search for information."
|
||||
for _, r := range reasoning {
|
||||
if err := stream.Send(&pb.Reply{
|
||||
Message: []byte{}, // autoparser clears raw message
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{ReasoningContent: string(r)},
|
||||
},
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
// Phase 2: Emit tool call via ChatDeltas (no raw message)
|
||||
if err := stream.Send(&pb.Reply{
|
||||
Message: []byte{}, // autoparser clears raw message
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{
|
||||
ToolCalls: []*pb.ToolCallDelta{
|
||||
{
|
||||
Index: 0,
|
||||
Id: "call_mock_123",
|
||||
Name: toolName,
|
||||
Arguments: `{"query":"localai"}`,
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Simulate C++ autoparser behavior: content delivered via ChatDeltas
|
||||
// with empty message (autoparser clears raw message during parsing).
|
||||
if strings.Contains(in.Prompt, "AUTOPARSER_CONTENT") {
|
||||
// Phase 1: Stream reasoning via ChatDeltas
|
||||
reasoning := "Let me compose a response."
|
||||
for _, r := range reasoning {
|
||||
if err := stream.Send(&pb.Reply{
|
||||
Message: []byte{},
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{ReasoningContent: string(r)},
|
||||
},
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
// Phase 2: Stream content via ChatDeltas (no raw message)
|
||||
content := "LocalAI is an open-source AI platform."
|
||||
for _, r := range content {
|
||||
if err := stream.Send(&pb.Reply{
|
||||
Message: []byte{},
|
||||
ChatDeltas: []*pb.ChatDelta{
|
||||
{Content: string(r)},
|
||||
},
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
var toStream string
|
||||
toolName := mockToolNameFromRequest(in)
|
||||
if toolName != "" && !promptHasToolResults(in.Prompt) {
|
||||
|
||||
@@ -2,6 +2,7 @@ package e2e_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
@@ -265,4 +266,201 @@ var _ = Describe("Mock Backend E2E Tests", Label("MockBackend"), func() {
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
Describe("Autoparser ChatDelta Streaming", Label("Autoparser"), func() {
|
||||
// These tests verify that when the C++ autoparser handles tool calls
|
||||
// and content via ChatDeltas (with empty raw message), the streaming
|
||||
// endpoint does NOT unnecessarily retry. This is a regression test for
|
||||
// the bug where the retry logic only checked Go-side parsing, ignoring
|
||||
// ChatDelta results, causing up to 6 retries and concatenated output.
|
||||
|
||||
Context("Streaming with tools and ChatDelta tool calls", func() {
|
||||
It("should return tool calls without unnecessary retries", func() {
|
||||
body := `{
|
||||
"model": "mock-model-autoparser",
|
||||
"messages": [{"role": "user", "content": "AUTOPARSER_TOOL_CALL"}],
|
||||
"tools": [{"type": "function", "function": {"name": "search_collections", "description": "Search documents", "parameters": {"type": "object", "properties": {"query": {"type": "string"}}, "required": ["query"]}}}],
|
||||
"stream": true
|
||||
}`
|
||||
req, err := http.NewRequest("POST", apiURL+"/chat/completions", strings.NewReader(body))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
httpClient := &http.Client{Timeout: 60 * time.Second}
|
||||
resp, err := httpClient.Do(req)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
defer resp.Body.Close()
|
||||
Expect(resp.StatusCode).To(Equal(200))
|
||||
|
||||
data, err := io.ReadAll(resp.Body)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
bodyStr := string(data)
|
||||
|
||||
// Parse all SSE events
|
||||
lines := strings.Split(bodyStr, "\n")
|
||||
var toolCallChunks int
|
||||
var reasoningChunks int
|
||||
hasFinishReason := false
|
||||
|
||||
for _, line := range lines {
|
||||
line = strings.TrimSpace(line)
|
||||
if !strings.HasPrefix(line, "data: ") || line == "data: [DONE]" {
|
||||
continue
|
||||
}
|
||||
jsonData := strings.TrimPrefix(line, "data: ")
|
||||
var chunk map[string]any
|
||||
if err := json.Unmarshal([]byte(jsonData), &chunk); err != nil {
|
||||
continue
|
||||
}
|
||||
choices, ok := chunk["choices"].([]any)
|
||||
if !ok || len(choices) == 0 {
|
||||
continue
|
||||
}
|
||||
choice := choices[0].(map[string]any)
|
||||
delta, _ := choice["delta"].(map[string]any)
|
||||
if delta == nil {
|
||||
continue
|
||||
}
|
||||
if _, ok := delta["tool_calls"]; ok {
|
||||
toolCallChunks++
|
||||
}
|
||||
if _, ok := delta["reasoning"]; ok {
|
||||
reasoningChunks++
|
||||
}
|
||||
if fr, ok := choice["finish_reason"].(string); ok && fr != "" {
|
||||
hasFinishReason = true
|
||||
}
|
||||
}
|
||||
|
||||
// The key assertion: tool calls from ChatDeltas should be present
|
||||
Expect(toolCallChunks).To(BeNumerically(">", 0),
|
||||
"Expected tool_calls in streaming response from ChatDeltas, but got none. "+
|
||||
"This likely means the retry logic discarded ChatDelta tool calls.")
|
||||
|
||||
// Should have a finish reason
|
||||
Expect(hasFinishReason).To(BeTrue(), "Expected a finish_reason in the streaming response")
|
||||
|
||||
// Reasoning should be present (from ChatDelta reasoning)
|
||||
Expect(reasoningChunks).To(BeNumerically(">", 0),
|
||||
"Expected reasoning deltas from ChatDeltas")
|
||||
})
|
||||
})
|
||||
|
||||
Context("Streaming with tools and ChatDelta content (no tool calls)", func() {
|
||||
It("should return content without retrying and without concatenation", func() {
|
||||
body := `{
|
||||
"model": "mock-model-autoparser",
|
||||
"messages": [{"role": "user", "content": "AUTOPARSER_CONTENT"}],
|
||||
"tools": [{"type": "function", "function": {"name": "search_collections", "description": "Search documents", "parameters": {"type": "object", "properties": {"query": {"type": "string"}}, "required": ["query"]}}}],
|
||||
"stream": true
|
||||
}`
|
||||
req, err := http.NewRequest("POST", apiURL+"/chat/completions", strings.NewReader(body))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
httpClient := &http.Client{Timeout: 60 * time.Second}
|
||||
resp, err := httpClient.Do(req)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
defer resp.Body.Close()
|
||||
Expect(resp.StatusCode).To(Equal(200))
|
||||
|
||||
data, err := io.ReadAll(resp.Body)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
bodyStr := string(data)
|
||||
|
||||
// Parse all SSE events and collect content
|
||||
lines := strings.Split(bodyStr, "\n")
|
||||
var contentParts []string
|
||||
var reasoningParts []string
|
||||
|
||||
for _, line := range lines {
|
||||
line = strings.TrimSpace(line)
|
||||
if !strings.HasPrefix(line, "data: ") || line == "data: [DONE]" {
|
||||
continue
|
||||
}
|
||||
jsonData := strings.TrimPrefix(line, "data: ")
|
||||
var chunk map[string]any
|
||||
if err := json.Unmarshal([]byte(jsonData), &chunk); err != nil {
|
||||
continue
|
||||
}
|
||||
choices, ok := chunk["choices"].([]any)
|
||||
if !ok || len(choices) == 0 {
|
||||
continue
|
||||
}
|
||||
choice := choices[0].(map[string]any)
|
||||
delta, _ := choice["delta"].(map[string]any)
|
||||
if delta == nil {
|
||||
continue
|
||||
}
|
||||
if content, ok := delta["content"].(string); ok && content != "" {
|
||||
contentParts = append(contentParts, content)
|
||||
}
|
||||
if reasoning, ok := delta["reasoning"].(string); ok && reasoning != "" {
|
||||
reasoningParts = append(reasoningParts, reasoning)
|
||||
}
|
||||
}
|
||||
|
||||
fullContent := strings.Join(contentParts, "")
|
||||
fullReasoning := strings.Join(reasoningParts, "")
|
||||
|
||||
// Content should be present and match the expected answer
|
||||
Expect(fullContent).To(ContainSubstring("LocalAI"),
|
||||
"Expected content from ChatDeltas to contain 'LocalAI'. "+
|
||||
"The retry logic may have discarded ChatDelta content.")
|
||||
|
||||
// Content should NOT be duplicated (no retry concatenation)
|
||||
occurrences := strings.Count(fullContent, "LocalAI is an open-source AI platform.")
|
||||
Expect(occurrences).To(Equal(1),
|
||||
"Expected content to appear exactly once, but found %d occurrences. "+
|
||||
"This indicates unnecessary retries are concatenating output.", occurrences)
|
||||
|
||||
// Reasoning should be present
|
||||
Expect(fullReasoning).To(ContainSubstring("compose"),
|
||||
"Expected reasoning content from ChatDeltas")
|
||||
})
|
||||
})
|
||||
|
||||
Context("Non-streaming with tools and ChatDelta tool calls", func() {
|
||||
It("should return tool calls from ChatDeltas", func() {
|
||||
body := `{
|
||||
"model": "mock-model-autoparser",
|
||||
"messages": [{"role": "user", "content": "AUTOPARSER_TOOL_CALL"}],
|
||||
"tools": [{"type": "function", "function": {"name": "search_collections", "description": "Search documents", "parameters": {"type": "object", "properties": {"query": {"type": "string"}}, "required": ["query"]}}}]
|
||||
}`
|
||||
req, err := http.NewRequest("POST", apiURL+"/chat/completions", strings.NewReader(body))
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
httpClient := &http.Client{Timeout: 60 * time.Second}
|
||||
resp, err := httpClient.Do(req)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
defer resp.Body.Close()
|
||||
Expect(resp.StatusCode).To(Equal(200))
|
||||
|
||||
data, err := io.ReadAll(resp.Body)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
|
||||
var result map[string]any
|
||||
Expect(json.Unmarshal(data, &result)).To(Succeed())
|
||||
|
||||
choices, ok := result["choices"].([]any)
|
||||
Expect(ok).To(BeTrue())
|
||||
Expect(choices).To(HaveLen(1))
|
||||
|
||||
choice := choices[0].(map[string]any)
|
||||
msg, _ := choice["message"].(map[string]any)
|
||||
Expect(msg).ToNot(BeNil())
|
||||
|
||||
toolCalls, ok := msg["tool_calls"].([]any)
|
||||
Expect(ok).To(BeTrue(),
|
||||
"Expected tool_calls in non-streaming response from ChatDeltas, "+
|
||||
"but got: %s", string(data))
|
||||
Expect(toolCalls).To(HaveLen(1))
|
||||
|
||||
tc := toolCalls[0].(map[string]any)
|
||||
fn, _ := tc["function"].(map[string]any)
|
||||
Expect(fn["name"]).To(Equal("search_collections"))
|
||||
})
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user