fix(voxcpm): pin setuptools (#8556)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
chore: ⬆️ Update ggml-org/llama.cpp to 05a6f0e8946914918758db767f6eb04bc1e38507 (#8553)
2026-02-14 00:21:02 -05:00 · 2026-02-13 23:44:35 +01:00 · 2026-02-13 22:48:01 +01:00 · 2026-02-13 22:23:06 +01:00 · 2026-02-13 20:00:29 +00:00 · 2026-02-13 18:23:44 +01:00
10 changed files with 109 additions and 104 deletions
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=338085c69e486b7155e5b03d7b5087e02c0e2528
+LLAMA_VERSION?=05a6f0e8946914918758db767f6eb04bc1e38507
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -294,76 +294,6 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const
    return data;
 }

-static bool template_uses_arguments_items_filter(const std::string & template_src) {
-    return template_src.find("arguments|items") != std::string::npos ||
-           template_src.find("arguments | items") != std::string::npos ||
-           template_src.find("arguments| items") != std::string::npos ||
-           template_src.find("arguments |items") != std::string::npos;
-}
-
-static void normalize_tool_call_arguments_for_template(
-    json & messages,
-    const std::string & template_src,
-    const char * request_name)
-{
-    if (!messages.is_array() || !template_uses_arguments_items_filter(template_src)) {
-        return;
-    }
-
-    size_t converted = 0;
-    size_t failed = 0;
-
-    for (auto & message : messages) {
-        if (!message.is_object() || !message.contains("tool_calls") || !message["tool_calls"].is_array()) {
-            continue;
-        }
-
-        for (auto & tool_call : message["tool_calls"]) {
-            if (!tool_call.is_object() || !tool_call.contains("function") || !tool_call["function"].is_object()) {
-                continue;
-            }
-
-            auto & function = tool_call["function"];
-            if (!function.contains("arguments")) {
-                continue;
-            }
-
-            auto & arguments = function["arguments"];
-            if (!arguments.is_string()) {
-                continue;
-            }
-
-            const std::string args_str = arguments.get<std::string>();
-            if (args_str.empty()) {
-                arguments = json::object();
-                converted++;
-                continue;
-            }
-
-            try {
-                json parsed_args = json::parse(args_str);
-                if (parsed_args.is_object()) {
-                    arguments = parsed_args;
-                    converted++;
-                }
-            } catch (const json::parse_error &) {
-                failed++;
-            }
-        }
-    }
-
-    if (converted > 0) {
-        SRV_INF("[TOOLS DEBUG] %s: Converted %zu tool call argument strings to JSON objects for arguments|items template compatibility\n",
-                request_name,
-                converted);
-    }
-    if (failed > 0) {
-        SRV_WRN("[TOOLS DEBUG] %s: Failed to parse %zu tool call argument strings as JSON for arguments|items template compatibility\n",
-                request_name,
-                failed);
-    }
-}
-

 const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,
@@ -1325,11 +1255,6 @@ public:
                    body_json["add_generation_prompt"] = data["add_generation_prompt"];
                }

-                if (body_json.contains("messages") && ctx_server.impl->chat_params.tmpls) {
-                    const auto template_src = common_chat_templates_source(ctx_server.impl->chat_params.tmpls.get());
-                    normalize_tool_call_arguments_for_template(body_json["messages"], template_src, "PredictStream");
-                }
-
                // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)
                SRV_DBG("[CONVERSATION DEBUG] PredictStream: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str());

@@ -2061,11 +1986,6 @@ public:
                    body_json["add_generation_prompt"] = data["add_generation_prompt"];
                }

-                if (body_json.contains("messages") && ctx_server.impl->chat_params.tmpls) {
-                    const auto template_src = common_chat_templates_source(ctx_server.impl->chat_params.tmpls.get());
-                    normalize_tool_call_arguments_for_template(body_json["messages"], template_src, "Predict");
-                }
-
                // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)
                SRV_DBG("[CONVERSATION DEBUG] Predict: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str());

--- a/backend/python/voxcpm/install.sh
+++ b/backend/python/voxcpm/install.sh
@@ -9,7 +9,12 @@ else
 fi

 installRequirements
-
+ 
+if [ "x${USE_PIP}" == "xtrue" ]; then
+    pip install "setuptools<70.0.0"
+else
+    uv pip install "setuptools<70.0.0"
+fi
 # Apply patch to fix PyTorch compatibility issue in voxcpm
 # This fixes the "Dimension out of range" error in scaled_dot_product_attention
 # by changing .contiguous() to .unsqueeze(0) in the attention module
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -83,7 +83,7 @@ type RunCMD struct {
 	EnableTracing                      bool     `env:"LOCALAI_ENABLE_TRACING,ENABLE_TRACING" help:"Enable API tracing" group:"api"`
 	TracingMaxItems                    int      `env:"LOCALAI_TRACING_MAX_ITEMS" default:"1024" help:"Maximum number of traces to keep" group:"api"`
 	AgentJobRetentionDays              int      `env:"LOCALAI_AGENT_JOB_RETENTION_DAYS,AGENT_JOB_RETENTION_DAYS" default:"30" help:"Number of days to keep agent job history (default: 30)" group:"api"`
-	OpenResponsesStoreTTL               string   `env:"LOCALAI_OPEN_RESPONSES_STORE_TTL,OPEN_RESPONSES_STORE_TTL" default:"0" help:"TTL for Open Responses store (e.g., 1h, 30m, 0 = no expiration)" group:"api"`
+	OpenResponsesStoreTTL              string   `env:"LOCALAI_OPEN_RESPONSES_STORE_TTL,OPEN_RESPONSES_STORE_TTL" default:"0" help:"TTL for Open Responses store (e.g., 1h, 30m, 0 = no expiration)" group:"api"`

 	Version bool
 }
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -23,6 +23,7 @@ import (
 	"github.com/mudler/LocalAI/core/templates"
 	laudio "github.com/mudler/LocalAI/pkg/audio"
 	"github.com/mudler/LocalAI/pkg/functions"
+	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
 	model "github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/reasoning"
@@ -949,7 +950,12 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o
 				case types.MessageContentTypeInputAudio:
 					textContent += content.Transcript
 				case types.MessageContentTypeInputImage:
-					msg.StringImages = append(msg.StringImages, content.ImageURL)
+					img, err := utils.GetContentURIAsBase64(content.ImageURL)
+					if err != nil {
+						xlog.Warn("Failed to process image", "error", err)
+						continue
+					}
+					msg.StringImages = append(msg.StringImages, img)
 					imgIndex++
 					nrOfImgsInMessage++
 				}
--- a/core/http/endpoints/openai/types/types.go
+++ b/core/http/endpoints/openai/types/types.go
@@ -175,8 +175,8 @@ type ToolFunction struct {
 	// The description of the function, including guidance on when and how to call it, and guidance about what to tell the user when calling (if anything).
 	Description string `json:"description"`

-	// The type of the tool, i.e. function.
-	Parameters any `json:"parameters"`
+	// The jsonschema representing the parameters
+	Parameters any `json:"parameters,omitempty"`
 }

 func (t ToolFunction) ToolType() ToolType {
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,4 +1,46 @@
 ---
+- &nanbeige4
+  name: "nanbeige4.1-3b-q8"
+  url: "github:mudler/LocalAI/gallery/nanbeige4.1.yaml@master"
+  urls:
+    - https://huggingface.co/Nanbeige/Nanbeige4.1-3B
+    - https://huggingface.co/Edge-Quant/Nanbeige4.1-3B-Q8_0-GGUF
+  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/646f0d118ff94af23bc44aab/GXHCollpMRgvYqUXQ2BQ7.png
+  license: apache-2.0
+  description: |
+    Nanbeige4.1-3B is built upon Nanbeige4-3B-Base and represents an enhanced iteration of our previous reasoning model, Nanbeige4-3B-Thinking-2511, achieved through further post-training optimization with supervised fine-tuning (SFT) and reinforcement learning (RL). As a highly competitive open-source model at a small parameter scale, Nanbeige4.1-3B illustrates that compact models can simultaneously achieve robust reasoning, preference alignment, and effective agentic behaviors.
+
+    Key features:
+      Strong Reasoning: Capable of solving complex, multi-step problems through sustained and coherent reasoning within a single forward pass, reliably producing correct answers on benchmarks like LiveCodeBench-Pro, IMO-Answer-Bench, and AIME 2026 I.
+      Robust Preference Alignment: Outperforms same-scale models (e.g., Qwen3-4B-2507, Nanbeige4-3B-2511) and larger models (e.g., Qwen3-30B-A3B, Qwen3-32B) on Arena-Hard-v2 and Multi-Challenge.
+      Agentic Capability: First general small model to natively support deep-search tasks and sustain complex problem-solving with >500 rounds of tool invocations; excels in benchmarks like xBench-DeepSearch (75), Browse-Comp (39), and others.
+  tags:
+    - llm
+    - gguf
+    - gpu
+    - cpu
+    - nanbeige
+    - reasoning
+    - agent
+  overrides:
+    parameters:
+      model: nanbeige4.1-3b-q8_0.gguf
+  files:
+    - filename: nanbeige4.1-3b-q8_0.gguf
+      sha256: a5a4379e50605c5e5a31bb1716a211fb16691fea7e13ede7f88796e1f617d9e0
+      uri: huggingface://Edge-Quant/Nanbeige4.1-3B-Q8_0-GGUF/nanbeige4.1-3b-q8_0.gguf
+- !!merge <<: *nanbeige4
+  name: "nanbeige4.1-3b-q4"
+  urls:
+    - https://huggingface.co/Nanbeige/Nanbeige4.1-3B
+    - https://huggingface.co/Edge-Quant/Nanbeige4.1-3B-Q4_K_M-GGUF
+  overrides:
+    parameters:
+      model: nanbeige4.1-3b-q4_k_m.gguf
+  files:
+    - filename: nanbeige4.1-3b-q4_k_m.gguf
+      sha256: 043246350c952877b38958a9e35c480419008b6b2d52bedaf2b805ed2447b4df
+      uri: huggingface://Edge-Quant/Nanbeige4.1-3B-Q4_K_M-GGUF/nanbeige4.1-3b-q4_k_m.gguf
 - name: nemo-parakeet-tdt-0.6b
  license: apache-2.0
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
--- a/gallery/nanbeige4.1.yaml
+++ b/gallery/nanbeige4.1.yaml
@@ -0,0 +1,16 @@
+---
+name: nanbeige4.1
+
+config_file: |
+  backend: llama-cpp
+  function:
+      grammar:
+          disable: true
+  known_usecases:
+      - chat
+  options:
+      - use_jinja:true
+  parameters:
+      model: llama-cpp/models/nanbeige4.1-3b-q8_0.gguf
+  template:
+      use_tokenizer_template: true
--- a/pkg/system/capabilities.go
+++ b/pkg/system/capabilities.go
@@ -45,9 +45,8 @@ const (
 )

 var (
-	cuda13DirExists  bool
-	cuda12DirExists  bool
-	capabilityLogged bool
+	cuda13DirExists bool
+	cuda12DirExists bool
 )

 func init() {
@@ -72,9 +71,15 @@ func (s *SystemState) Capability(capMap map[string]string) string {
 }

 func (s *SystemState) getSystemCapabilities() string {
+
+	if s.systemCapabilities != "" {
+		return s.systemCapabilities
+	}
+
 	capability := os.Getenv(capabilityEnv)
 	if capability != "" {
 		xlog.Info("Using forced capability from environment variable", "capability", capability, "env", capabilityEnv)
+		s.systemCapabilities = capability
 		return capability
 	}

@@ -91,20 +96,23 @@ func (s *SystemState) getSystemCapabilities() string {
 		capability, err := os.ReadFile(capabilityRunFile)
 		if err == nil {
 			xlog.Info("Using forced capability run file", "capabilityRunFile", capabilityRunFile, "capability", string(capability), "env", capabilityRunFileEnv)
-			return strings.Trim(strings.TrimSpace(string(capability)), "\n")
+			s.systemCapabilities = strings.Trim(strings.TrimSpace(string(capability)), "\n")
+			return s.systemCapabilities
 		}
 	}

 	// If we are on mac and arm64, we will return metal
 	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
 		xlog.Info("Using metal capability (arm64 on mac)", "env", capabilityEnv)
-		return metal
+		s.systemCapabilities = metal
+		return s.systemCapabilities
 	}

 	// If we are on mac and x86, we will return darwin-x86
 	if runtime.GOOS == "darwin" && runtime.GOARCH == "amd64" {
 		xlog.Info("Using darwin-x86 capability (amd64 on mac)", "env", capabilityEnv)
-		return darwinX86
+		s.systemCapabilities = darwinX86
+		return s.systemCapabilities
 	}

 	// If arm64 on linux and a nvidia gpu is detected, we will return nvidia-l4t
@@ -112,39 +120,43 @@ func (s *SystemState) getSystemCapabilities() string {
 		if s.GPUVendor == Nvidia {
 			xlog.Info("Using nvidia-l4t capability (arm64 on linux)", "env", capabilityEnv)
 			if cuda13DirExists {
-				return nvidiaL4TCuda13
+				s.systemCapabilities = nvidiaL4TCuda13
+				return s.systemCapabilities
 			}
 			if cuda12DirExists {
-				return nvidiaL4TCuda12
+				s.systemCapabilities = nvidiaL4TCuda12
+				return s.systemCapabilities
 			}
-			return nvidiaL4T
+			s.systemCapabilities = nvidiaL4T
+			return s.systemCapabilities
 		}
 	}

 	if cuda13DirExists {
-		return nvidiaCuda13
+		s.systemCapabilities = nvidiaCuda13
+		return s.systemCapabilities
 	}

 	if cuda12DirExists {
-		return nvidiaCuda12
+		s.systemCapabilities = nvidiaCuda12
+		return s.systemCapabilities
 	}

 	if s.GPUVendor == "" {
 		xlog.Info("Default capability (no GPU detected)", "env", capabilityEnv)
-		return defaultCapability
+		s.systemCapabilities = defaultCapability
+		return s.systemCapabilities
 	}

-	if !capabilityLogged {
-		xlog.Info("Capability automatically detected", "capability", s.GPUVendor, "env", capabilityEnv)
-		capabilityLogged = true
-	}
 	// If vram is less than 4GB, let's default to CPU but warn the user that they can override that via env
 	if s.VRAM <= 4*1024*1024*1024 {
 		xlog.Warn("VRAM is less than 4GB, defaulting to CPU", "env", capabilityEnv)
-		return defaultCapability
+		s.systemCapabilities = defaultCapability
+		return s.systemCapabilities
 	}

-	return s.GPUVendor
+	s.systemCapabilities = s.GPUVendor
+	return s.systemCapabilities
 }

 // BackendPreferenceTokens returns a list of substrings that represent the preferred
--- a/pkg/system/state.go
+++ b/pkg/system/state.go
@@ -19,6 +19,8 @@ type SystemState struct {
 	Backend   Backend
 	Model     Model
 	VRAM      uint64
+
+	systemCapabilities string
 }

 type SystemStateOptions func(*SystemState)
@@ -53,5 +55,7 @@ func GetSystemState(opts ...SystemStateOptions) (*SystemState, error) {
 	state.VRAM, _ = xsysinfo.TotalAvailableVRAM()
 	xlog.Debug("Total available VRAM", "vram", state.VRAM)

+	state.getSystemCapabilities()
+
 	return state, nil
 }
Author	SHA1	Message	Date
Ettore Di Giacinto	2fb9940b8a	fix(voxcpm): pin setuptools (#8556) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-02-13 23:44:35 +01:00
LocalAI [bot]	2ff0ad4190	chore: ⬆️ Update ggml-org/llama.cpp to `05a6f0e8946914918758db767f6eb04bc1e38507` (#8553) ⬆️ Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-02-13 22:48:01 +01:00
Ettore Di Giacinto	bd12103ed4	chore: compute capabilities once (#8555) Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-02-13 22:23:06 +01:00
LocalAI [bot]	2e17edd72a	fix: prevent excessive logging in capability detection (#8552) Closes #8527. This PR fixes the excessive logging issue in capability detection by applying the existing capabilityLogged guard to the forced capability run file case. Changes: (1) Apply capabilityLogged flag to forced capability detection logging; (2) Prevents repeated log messages during backend discovery and gallery operations. Co-authored-by: localai-bot <localai-bot@users.noreply.github.com>	2026-02-13 20:00:29 +00:00
Richard Palethorpe	24aab68b3f	feat(gallery): Add nanbeige4.1-3b (#8551) Signed-off-by: Richard Palethorpe <io@richiejp.com>	2026-02-13 18:23:44 +01:00
Richard Palethorpe	5bdbb10593	fix(realtime): Send proper image data to backend (#8547) * fix(realtime): Allow empty parameters Signed-off-by: Richard Palethorpe <io@richiejp.com> * fix(realtime): Just pass base64 string to backend Signed-off-by: Richard Palethorpe <io@richiejp.com> --------- Signed-off-by: Richard Palethorpe <io@richiejp.com>	2026-02-13 18:01:07 +01:00