Compare commits


8 Commits

Author SHA1 Message Date
Ettore Di Giacinto
820bd7dd01 fix(ci): try to fix deps for l4t13 on qwen-*
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-02-14 10:21:23 +01:00
Austen
42cb7bda19 fix(llama-cpp): populate tensor_buft_override buffer so llama-cpp properly performs fit calculations (#8560)
fix auto-fit for llama-cpp
2026-02-14 10:07:37 +01:00
Ettore Di Giacinto
2fb9940b8a fix(voxcpm): pin setuptools (#8556)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-02-13 23:44:35 +01:00
LocalAI [bot]
2ff0ad4190 chore: ⬆️ Update ggml-org/llama.cpp to 05a6f0e8946914918758db767f6eb04bc1e38507 (#8553)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-02-13 22:48:01 +01:00
Ettore Di Giacinto
bd12103ed4 chore: compute capabilities once (#8555)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-02-13 22:23:06 +01:00
LocalAI [bot]
2e17edd72a fix: prevent excessive logging in capability detection (#8552)
Closes #8527.

This PR fixes the excessive logging issue in capability detection by applying the existing capabilityLogged guard to the forced capability run file case.

## Changes
- Apply capabilityLogged flag to forced capability detection logging
- Prevents repeated log messages during backend discovery and gallery operations

Co-authored-by: localai-bot <localai-bot@users.noreply.github.com>
2026-02-13 20:00:29 +00:00
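
For reference, the guard this commit extends is a simple package-level log-once flag. The sketch below is illustrative only (the helper name and logging call are stand-ins, not LocalAI's actual symbols); it shows the pattern applied uniformly so the forced run-file path no longer logs on every detection pass:

package system

import "log"

// capabilityLogged ensures capability detection is reported only once,
// even when detection runs repeatedly during backend discovery and
// gallery operations.
var capabilityLogged bool

// logCapabilityOnce is a hypothetical helper: every detection path,
// including the forced capability run-file case, goes through it.
func logCapabilityOnce(source, capability string) {
    if capabilityLogged {
        return
    }
    capabilityLogged = true
    log.Printf("capability detected: %s (source: %s)", capability, source)
}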
Richard Palethorpe
24aab68b3f feat(gallery): Add nanbeige4.1-3b (#8551)
Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-02-13 18:23:44 +01:00
Richard Palethorpe
5bdbb10593 fix(realtime): Send proper image data to backend (#8547)
* fix(realtime): Allow empty parameters

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* fix(realtime): Just pass base64 string to backend

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2026-02-13 18:01:07 +01:00
12 changed files with 121 additions and 105 deletions

View File

@@ -1,5 +1,5 @@
-LLAMA_VERSION?=338085c69e486b7155e5b03d7b5087e02c0e2528
+LLAMA_VERSION?=05a6f0e8946914918758db767f6eb04bc1e38507
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
 CMAKE_ARGS?=

View File

@@ -294,76 +294,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const
     return data;
 }
 
-static bool template_uses_arguments_items_filter(const std::string & template_src) {
-    return template_src.find("arguments|items") != std::string::npos ||
-           template_src.find("arguments | items") != std::string::npos ||
-           template_src.find("arguments| items") != std::string::npos ||
-           template_src.find("arguments |items") != std::string::npos;
-}
-
-static void normalize_tool_call_arguments_for_template(
-    json & messages,
-    const std::string & template_src,
-    const char * request_name)
-{
-    if (!messages.is_array() || !template_uses_arguments_items_filter(template_src)) {
-        return;
-    }
-
-    size_t converted = 0;
-    size_t failed = 0;
-    for (auto & message : messages) {
-        if (!message.is_object() || !message.contains("tool_calls") || !message["tool_calls"].is_array()) {
-            continue;
-        }
-        for (auto & tool_call : message["tool_calls"]) {
-            if (!tool_call.is_object() || !tool_call.contains("function") || !tool_call["function"].is_object()) {
-                continue;
-            }
-            auto & function = tool_call["function"];
-            if (!function.contains("arguments")) {
-                continue;
-            }
-            auto & arguments = function["arguments"];
-            if (!arguments.is_string()) {
-                continue;
-            }
-            const std::string args_str = arguments.get<std::string>();
-            if (args_str.empty()) {
-                arguments = json::object();
-                converted++;
-                continue;
-            }
-            try {
-                json parsed_args = json::parse(args_str);
-                if (parsed_args.is_object()) {
-                    arguments = parsed_args;
-                    converted++;
-                }
-            } catch (const json::parse_error &) {
-                failed++;
-            }
-        }
-    }
-
-    if (converted > 0) {
-        SRV_INF("[TOOLS DEBUG] %s: Converted %zu tool call argument strings to JSON objects for arguments|items template compatibility\n",
-                request_name,
-                converted);
-    }
-    if (failed > 0) {
-        SRV_WRN("[TOOLS DEBUG] %s: Failed to parse %zu tool call argument strings as JSON for arguments|items template compatibility\n",
-                request_name,
-                failed);
-    }
-}
-
 const std::vector<ggml_type> kv_cache_types = {
     GGML_TYPE_F32,
@@ -487,6 +417,12 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
     // n_ctx_checkpoints: max context checkpoints per slot (default: 8)
     params.n_ctx_checkpoints = 8;
 
+    // llama memory fit fails if we don't provide a buffer for tensor overrides
+    const size_t ntbo = llama_max_tensor_buft_overrides();
+    while (params.tensor_buft_overrides.size() < ntbo) {
+        params.tensor_buft_overrides.push_back({nullptr, nullptr});
+    }
+
     // decode options. Options are in form optname:optvale, or if booleans only optname.
     for (int i = 0; i < request->options_size(); i++) {
         std::string opt = request->options(i);
@@ -1325,11 +1261,6 @@ public:
body_json["add_generation_prompt"] = data["add_generation_prompt"]; body_json["add_generation_prompt"] = data["add_generation_prompt"];
} }
if (body_json.contains("messages") && ctx_server.impl->chat_params.tmpls) {
const auto template_src = common_chat_templates_source(ctx_server.impl->chat_params.tmpls.get());
normalize_tool_call_arguments_for_template(body_json["messages"], template_src, "PredictStream");
}
// Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.) // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)
SRV_DBG("[CONVERSATION DEBUG] PredictStream: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str()); SRV_DBG("[CONVERSATION DEBUG] PredictStream: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str());
@@ -2061,11 +1992,6 @@ public:
body_json["add_generation_prompt"] = data["add_generation_prompt"]; body_json["add_generation_prompt"] = data["add_generation_prompt"];
} }
if (body_json.contains("messages") && ctx_server.impl->chat_params.tmpls) {
const auto template_src = common_chat_templates_source(ctx_server.impl->chat_params.tmpls.get());
normalize_tool_call_arguments_for_template(body_json["messages"], template_src, "Predict");
}
// Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.) // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)
SRV_DBG("[CONVERSATION DEBUG] Predict: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str()); SRV_DBG("[CONVERSATION DEBUG] Predict: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str());

View File

@@ -3,3 +3,6 @@ protobuf
 certifi
 packaging==24.1
 setuptools
+h11
+gradio
+uvicorn

View File

@@ -4,4 +4,6 @@ certifi
 packaging==24.1
 soundfile
 setuptools
 six
+scipy
+librosa

View File

@@ -9,7 +9,12 @@ else
 fi
 
 installRequirements
 
+if [ "x${USE_PIP}" == "xtrue" ]; then
+    pip install "setuptools<70.0.0"
+else
+    uv pip install "setuptools<70.0.0"
+fi
 # Apply patch to fix PyTorch compatibility issue in voxcpm
 # This fixes the "Dimension out of range" error in scaled_dot_product_attention
 # by changing .contiguous() to .unsqueeze(0) in the attention module

View File

@@ -83,7 +83,7 @@ type RunCMD struct {
     EnableTracing         bool   `env:"LOCALAI_ENABLE_TRACING,ENABLE_TRACING" help:"Enable API tracing" group:"api"`
     TracingMaxItems       int    `env:"LOCALAI_TRACING_MAX_ITEMS" default:"1024" help:"Maximum number of traces to keep" group:"api"`
     AgentJobRetentionDays int    `env:"LOCALAI_AGENT_JOB_RETENTION_DAYS,AGENT_JOB_RETENTION_DAYS" default:"30" help:"Number of days to keep agent job history (default: 30)" group:"api"`
     OpenResponsesStoreTTL string `env:"LOCALAI_OPEN_RESPONSES_STORE_TTL,OPEN_RESPONSES_STORE_TTL" default:"0" help:"TTL for Open Responses store (e.g., 1h, 30m, 0 = no expiration)" group:"api"`
 
     Version bool
 }

View File

@@ -23,6 +23,7 @@ import (
"github.com/mudler/LocalAI/core/templates" "github.com/mudler/LocalAI/core/templates"
laudio "github.com/mudler/LocalAI/pkg/audio" laudio "github.com/mudler/LocalAI/pkg/audio"
"github.com/mudler/LocalAI/pkg/functions" "github.com/mudler/LocalAI/pkg/functions"
"github.com/mudler/LocalAI/pkg/utils"
"github.com/mudler/LocalAI/pkg/grpc/proto" "github.com/mudler/LocalAI/pkg/grpc/proto"
model "github.com/mudler/LocalAI/pkg/model" model "github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/LocalAI/pkg/reasoning" "github.com/mudler/LocalAI/pkg/reasoning"
@@ -949,7 +950,12 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o
case types.MessageContentTypeInputAudio: case types.MessageContentTypeInputAudio:
textContent += content.Transcript textContent += content.Transcript
case types.MessageContentTypeInputImage: case types.MessageContentTypeInputImage:
msg.StringImages = append(msg.StringImages, content.ImageURL) img, err := utils.GetContentURIAsBase64(content.ImageURL)
if err != nil {
xlog.Warn("Failed to process image", "error", err)
continue
}
msg.StringImages = append(msg.StringImages, img)
imgIndex++ imgIndex++
nrOfImgsInMessage++ nrOfImgsInMessage++
} }
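
The realtime fix above routes image content through utils.GetContentURIAsBase64 before appending to msg.StringImages, so the backend always receives a raw base64 payload instead of a URL or data URI. A plausible shape for such a helper, written as a hedged sketch (the actual LocalAI implementation may differ in details such as size limits and content-type checks):

package main

import (
    "encoding/base64"
    "fmt"
    "io"
    "net/http"
    "strings"
)

// contentURIAsBase64 returns the base64 body of a data URI, or fetches an
// http(s) URL and base64-encodes its bytes. Sketch only; validation trimmed.
func contentURIAsBase64(uri string) (string, error) {
    if strings.HasPrefix(uri, "data:") {
        if i := strings.Index(uri, "base64,"); i >= 0 {
            return uri[i+len("base64,"):], nil
        }
        return "", fmt.Errorf("unsupported data URI encoding")
    }
    resp, err := http.Get(uri)
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()
    b, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", err
    }
    return base64.StdEncoding.EncodeToString(b), nil
}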

View File

@@ -175,8 +175,8 @@ type ToolFunction struct {
     // The description of the function, including guidance on when and how to call it, and guidance about what to tell the user when calling (if anything).
     Description string `json:"description"`
-    // The type of the tool, i.e. function.
-    Parameters any `json:"parameters"`
+    // The jsonschema representing the parameters
+    Parameters any `json:"parameters,omitempty"`
 }
 
 func (t ToolFunction) ToolType() ToolType {
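
With omitempty, a ToolFunction whose Parameters field is unset now marshals without a "parameters" key instead of emitting "parameters": null, which is easy for downstream consumers to trip over. A quick self-contained demonstration of the tag's effect (struct trimmed to the relevant field):

package main

import (
    "encoding/json"
    "fmt"
)

type withNull struct {
    Parameters any `json:"parameters"`
}

type withOmit struct {
    Parameters any `json:"parameters,omitempty"`
}

func main() {
    a, _ := json.Marshal(withNull{})
    b, _ := json.Marshal(withOmit{})
    fmt.Println(string(a)) // {"parameters":null}
    fmt.Println(string(b)) // {}
}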

View File

@@ -1,4 +1,46 @@
 ---
+- &nanbeige4
+  name: "nanbeige4.1-3b-q8"
+  url: "github:mudler/LocalAI/gallery/nanbeige4.1.yaml@master"
+  urls:
+    - https://huggingface.co/Nanbeige/Nanbeige4.1-3B
+    - https://huggingface.co/Edge-Quant/Nanbeige4.1-3B-Q8_0-GGUF
+  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/646f0d118ff94af23bc44aab/GXHCollpMRgvYqUXQ2BQ7.png
+  license: apache-2.0
+  description: |
+    Nanbeige4.1-3B is built upon Nanbeige4-3B-Base and represents an enhanced iteration of our previous reasoning model, Nanbeige4-3B-Thinking-2511, achieved through further post-training optimization with supervised fine-tuning (SFT) and reinforcement learning (RL). As a highly competitive open-source model at a small parameter scale, Nanbeige4.1-3B illustrates that compact models can simultaneously achieve robust reasoning, preference alignment, and effective agentic behaviors.
+    Key features:
+    Strong Reasoning: Capable of solving complex, multi-step problems through sustained and coherent reasoning within a single forward pass, reliably producing correct answers on benchmarks like LiveCodeBench-Pro, IMO-Answer-Bench, and AIME 2026 I.
+    Robust Preference Alignment: Outperforms same-scale models (e.g., Qwen3-4B-2507, Nanbeige4-3B-2511) and larger models (e.g., Qwen3-30B-A3B, Qwen3-32B) on Arena-Hard-v2 and Multi-Challenge.
+    Agentic Capability: First general small model to natively support deep-search tasks and sustain complex problem-solving with >500 rounds of tool invocations; excels in benchmarks like xBench-DeepSearch (75), Browse-Comp (39), and others.
+  tags:
+    - llm
+    - gguf
+    - gpu
+    - cpu
+    - nanbeige
+    - reasoning
+    - agent
+  overrides:
+    parameters:
+      model: nanbeige4.1-3b-q8_0.gguf
+  files:
+    - filename: nanbeige4.1-3b-q8_0.gguf
+      sha256: a5a4379e50605c5e5a31bb1716a211fb16691fea7e13ede7f88796e1f617d9e0
+      uri: huggingface://Edge-Quant/Nanbeige4.1-3B-Q8_0-GGUF/nanbeige4.1-3b-q8_0.gguf
+- !!merge <<: *nanbeige4
+  name: "nanbeige4.1-3b-q4"
+  urls:
+    - https://huggingface.co/Nanbeige/Nanbeige4.1-3B
+    - https://huggingface.co/Edge-Quant/Nanbeige4.1-3B-Q4_K_M-GGUF
+  overrides:
+    parameters:
+      model: nanbeige4.1-3b-q4_k_m.gguf
+  files:
+    - filename: nanbeige4.1-3b-q4_k_m.gguf
+      sha256: 043246350c952877b38958a9e35c480419008b6b2d52bedaf2b805ed2447b4df
+      uri: huggingface://Edge-Quant/Nanbeige4.1-3B-Q4_K_M-GGUF/nanbeige4.1-3b-q4_k_m.gguf
 - name: nemo-parakeet-tdt-0.6b
   license: apache-2.0
   url: "github:mudler/LocalAI/gallery/virtual.yaml@master"

gallery/nanbeige4.1.yaml (new file, 16 lines)
View File

@@ -0,0 +1,16 @@
+---
+name: nanbeige4.1
+
+config_file: |
+  backend: llama-cpp
+  function:
+    grammar:
+      disable: true
+  known_usecases:
+    - chat
+  options:
+    - use_jinja:true
+  parameters:
+    model: llama-cpp/models/nanbeige4.1-3b-q8_0.gguf
+  template:
+    use_tokenizer_template: true

View File

@@ -45,9 +45,8 @@ const (
 )
 
 var (
     cuda13DirExists  bool
     cuda12DirExists  bool
-    capabilityLogged bool
 )
 
 func init() {
@@ -72,9 +71,15 @@ func (s *SystemState) Capability(capMap map[string]string) string {
 }
 
 func (s *SystemState) getSystemCapabilities() string {
+    if s.systemCapabilities != "" {
+        return s.systemCapabilities
+    }
+
     capability := os.Getenv(capabilityEnv)
     if capability != "" {
         xlog.Info("Using forced capability from environment variable", "capability", capability, "env", capabilityEnv)
+        s.systemCapabilities = capability
         return capability
     }
@@ -91,20 +96,23 @@ func (s *SystemState) getSystemCapabilities() string {
         capability, err := os.ReadFile(capabilityRunFile)
         if err == nil {
             xlog.Info("Using forced capability run file", "capabilityRunFile", capabilityRunFile, "capability", string(capability), "env", capabilityRunFileEnv)
-            return strings.Trim(strings.TrimSpace(string(capability)), "\n")
+            s.systemCapabilities = strings.Trim(strings.TrimSpace(string(capability)), "\n")
+            return s.systemCapabilities
         }
     }
 
     // If we are on mac and arm64, we will return metal
     if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
         xlog.Info("Using metal capability (arm64 on mac)", "env", capabilityEnv)
-        return metal
+        s.systemCapabilities = metal
+        return s.systemCapabilities
     }
 
     // If we are on mac and x86, we will return darwin-x86
     if runtime.GOOS == "darwin" && runtime.GOARCH == "amd64" {
         xlog.Info("Using darwin-x86 capability (amd64 on mac)", "env", capabilityEnv)
-        return darwinX86
+        s.systemCapabilities = darwinX86
+        return s.systemCapabilities
     }
 
     // If arm64 on linux and a nvidia gpu is detected, we will return nvidia-l4t
@@ -112,39 +120,43 @@ func (s *SystemState) getSystemCapabilities() string {
         if s.GPUVendor == Nvidia {
             xlog.Info("Using nvidia-l4t capability (arm64 on linux)", "env", capabilityEnv)
             if cuda13DirExists {
-                return nvidiaL4TCuda13
+                s.systemCapabilities = nvidiaL4TCuda13
+                return s.systemCapabilities
             }
             if cuda12DirExists {
-                return nvidiaL4TCuda12
+                s.systemCapabilities = nvidiaL4TCuda12
+                return s.systemCapabilities
             }
-            return nvidiaL4T
+            s.systemCapabilities = nvidiaL4T
+            return s.systemCapabilities
         }
     }
 
     if cuda13DirExists {
-        return nvidiaCuda13
+        s.systemCapabilities = nvidiaCuda13
+        return s.systemCapabilities
     }
     if cuda12DirExists {
-        return nvidiaCuda12
+        s.systemCapabilities = nvidiaCuda12
+        return s.systemCapabilities
     }
 
     if s.GPUVendor == "" {
         xlog.Info("Default capability (no GPU detected)", "env", capabilityEnv)
-        return defaultCapability
+        s.systemCapabilities = defaultCapability
+        return s.systemCapabilities
     }
 
-    if !capabilityLogged {
-        xlog.Info("Capability automatically detected", "capability", s.GPUVendor, "env", capabilityEnv)
-        capabilityLogged = true
-    }
-
     // If vram is less than 4GB, let's default to CPU but warn the user that they can override that via env
     if s.VRAM <= 4*1024*1024*1024 {
         xlog.Warn("VRAM is less than 4GB, defaulting to CPU", "env", capabilityEnv)
-        return defaultCapability
+        s.systemCapabilities = defaultCapability
+        return s.systemCapabilities
     }
 
-    return s.GPUVendor
+    s.systemCapabilities = s.GPUVendor
+    return s.systemCapabilities
 }
 
 // BackendPreferenceTokens returns a list of substrings that represent the preferred
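
The refactor above replaces the log-once flag with memoization: every exit path stores its result in s.systemCapabilities, and an early return at the top serves later calls from the cache, so both the hardware probing and the logging happen exactly once. The pattern in isolation, as a minimal runnable sketch (detect stands in for the full probe; like the original, it assumes single-goroutine access, so no locking):

package main

import "fmt"

type systemState struct {
    capabilities string // "" means not yet computed
}

// detect simulates the expensive probe (GPU vendor, CUDA dirs, VRAM).
func detect() string {
    fmt.Println("probing hardware...") // printed once
    return "nvidia"
}

// Capabilities computes on first use and caches; later calls are hits.
func (s *systemState) Capabilities() string {
    if s.capabilities != "" {
        return s.capabilities
    }
    s.capabilities = detect()
    return s.capabilities
}

func main() {
    s := &systemState{}
    fmt.Println(s.Capabilities())
    fmt.Println(s.Capabilities()) // cache hit, no second probe
}

If concurrent callers were ever a concern, sync.Once would be the safer idiom; note also that the last file in this compare warms the cache eagerly in GetSystemState, so even the first lookup after startup is a hit.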

View File

@@ -19,6 +19,8 @@ type SystemState struct {
     Backend Backend
     Model   Model
     VRAM    uint64
+
+    systemCapabilities string
 }
 
 type SystemStateOptions func(*SystemState)
@@ -53,5 +55,7 @@ func GetSystemState(opts ...SystemStateOptions) (*SystemState, error) {
     state.VRAM, _ = xsysinfo.TotalAvailableVRAM()
     xlog.Debug("Total available VRAM", "vram", state.VRAM)
 
+    state.getSystemCapabilities()
+
     return state, nil
 }