Compare commits


1 Commit

Author:  Ettore Di Giacinto
SHA1:    1f0110368d
Message: step-flash fixes
         Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Date:    2026-02-12 23:36:16 +01:00
19 changed files with 174 additions and 134 deletions

View File

@@ -146,7 +146,7 @@ func getRealReadme(ctx context.Context, repository string) (string, error) {
        return "", err
    }
-   content := result.LastMessage().Content
+   content := newFragment.LastMessage().Content
    return cleanTextContent(content), nil
}

View File

@@ -1,5 +1,5 @@
-LLAMA_VERSION?=05a6f0e8946914918758db767f6eb04bc1e38507
+LLAMA_VERSION?=338085c69e486b7155e5b03d7b5087e02c0e2528
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=

View File

@@ -294,6 +294,76 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const
    return data;
}

+static bool template_uses_arguments_items_filter(const std::string & template_src) {
+    return template_src.find("arguments|items") != std::string::npos ||
+           template_src.find("arguments | items") != std::string::npos ||
+           template_src.find("arguments| items") != std::string::npos ||
+           template_src.find("arguments |items") != std::string::npos;
+}
+
+static void normalize_tool_call_arguments_for_template(
+    json & messages,
+    const std::string & template_src,
+    const char * request_name)
+{
+    if (!messages.is_array() || !template_uses_arguments_items_filter(template_src)) {
+        return;
+    }
+    size_t converted = 0;
+    size_t failed = 0;
+    for (auto & message : messages) {
+        if (!message.is_object() || !message.contains("tool_calls") || !message["tool_calls"].is_array()) {
+            continue;
+        }
+        for (auto & tool_call : message["tool_calls"]) {
+            if (!tool_call.is_object() || !tool_call.contains("function") || !tool_call["function"].is_object()) {
+                continue;
+            }
+            auto & function = tool_call["function"];
+            if (!function.contains("arguments")) {
+                continue;
+            }
+            auto & arguments = function["arguments"];
+            if (!arguments.is_string()) {
+                continue;
+            }
+            const std::string args_str = arguments.get<std::string>();
+            if (args_str.empty()) {
+                arguments = json::object();
+                converted++;
+                continue;
+            }
+            try {
+                json parsed_args = json::parse(args_str);
+                if (parsed_args.is_object()) {
+                    arguments = parsed_args;
+                    converted++;
+                }
+            } catch (const json::parse_error &) {
+                failed++;
+            }
+        }
+    }
+    if (converted > 0) {
+        SRV_INF("[TOOLS DEBUG] %s: Converted %zu tool call argument strings to JSON objects for arguments|items template compatibility\n",
+                request_name,
+                converted);
+    }
+    if (failed > 0) {
+        SRV_WRN("[TOOLS DEBUG] %s: Failed to parse %zu tool call argument strings as JSON for arguments|items template compatibility\n",
+                request_name,
+                failed);
+    }
+}
+
const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F32,
@@ -417,12 +487,6 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
    // n_ctx_checkpoints: max context checkpoints per slot (default: 8)
    params.n_ctx_checkpoints = 8;

-    // llama memory fit fails if we don't provide a buffer for tensor overrides
-    const size_t ntbo = llama_max_tensor_buft_overrides();
-    while (params.tensor_buft_overrides.size() < ntbo) {
-        params.tensor_buft_overrides.push_back({nullptr, nullptr});
-    }
-
    // decode options. Options are in form optname:optvalue, or if booleans only optname.
    for (int i = 0; i < request->options_size(); i++) {
        std::string opt = request->options(i);
@@ -1261,6 +1325,11 @@ public:
        body_json["add_generation_prompt"] = data["add_generation_prompt"];
    }

+    if (body_json.contains("messages") && ctx_server.impl->chat_params.tmpls) {
+        const auto template_src = common_chat_templates_source(ctx_server.impl->chat_params.tmpls.get());
+        normalize_tool_call_arguments_for_template(body_json["messages"], template_src, "PredictStream");
+    }
+
    // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)
    SRV_DBG("[CONVERSATION DEBUG] PredictStream: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str());
@@ -1992,6 +2061,11 @@ public:
        body_json["add_generation_prompt"] = data["add_generation_prompt"];
    }

+    if (body_json.contains("messages") && ctx_server.impl->chat_params.tmpls) {
+        const auto template_src = common_chat_templates_source(ctx_server.impl->chat_params.tmpls.get());
+        normalize_tool_call_arguments_for_template(body_json["messages"], template_src, "Predict");
+    }
+
    // Debug: Print full body_json before template processing (includes messages, tools, tool_choice, etc.)
    SRV_DBG("[CONVERSATION DEBUG] Predict: Full body_json before oaicompat_chat_params_parse:\n%s\n", body_json.dump(2).c_str());
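The helper added above rewrites stringified tool-call arguments into JSON objects before the chat template runs, so Jinja templates that use the `arguments|items` filter can iterate over them. A minimal standalone sketch of the same conversion, assuming nlohmann/json (as used in the server sources); the get_weather payload is a hypothetical example, not taken from this commit:

// normalize_sketch.cpp -- illustrative only, not part of the commit.
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    // An assistant message whose tool-call arguments arrive as a JSON-encoded string.
    json messages = json::array({
        {
            {"role", "assistant"},
            {"tool_calls", json::array({
                {{"function", {{"name", "get_weather"}, {"arguments", "{\"city\":\"Rome\"}"}}}}
            })}
        }
    });

    for (auto & message : messages) {
        for (auto & tool_call : message["tool_calls"]) {
            auto & arguments = tool_call["function"]["arguments"];
            if (arguments.is_string()) {
                // Replace the string in place with the parsed object so the
                // template's `arguments|items` filter can iterate over it.
                arguments = json::parse(arguments.get<std::string>());
            }
        }
    }

    std::cout << messages.dump(2) << std::endl;
    return 0;
}

Compiled standalone, this prints the message with `arguments` as an object rather than a string, which is the shape the `arguments|items` filter expects.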

View File

@@ -32,14 +32,7 @@ if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then
fi

-git clone --depth 100 https://github.com/neuphonic/neutts-air neutts-air
-cd neutts-air
-git checkout 1737487debe5b40a0bb97875edce8c66b391722b
-cd ..
+git clone https://github.com/neuphonic/neutts-air neutts-air

cp -rfv neutts-air/neuttsair ./

View File

@@ -3,6 +3,3 @@ protobuf
certifi
packaging==24.1
setuptools
-h11
-gradio
-uvicorn

View File

@@ -4,6 +4,4 @@ certifi
packaging==24.1
soundfile
setuptools
-six
-scipy
-librosa
+six

View File

@@ -9,12 +9,7 @@ else
fi

installRequirements

-if [ "x${USE_PIP}" == "xtrue" ]; then
-    pip install "setuptools<70.0.0"
-else
-    uv pip install "setuptools<70.0.0"
-fi
-
# Apply patch to fix PyTorch compatibility issue in voxcpm
# This fixes the "Dimension out of range" error in scaled_dot_product_attention
# by changing .contiguous() to .unsqueeze(0) in the attention module

View File

@@ -83,7 +83,7 @@ type RunCMD struct {
    EnableTracing         bool   `env:"LOCALAI_ENABLE_TRACING,ENABLE_TRACING" help:"Enable API tracing" group:"api"`
    TracingMaxItems       int    `env:"LOCALAI_TRACING_MAX_ITEMS" default:"1024" help:"Maximum number of traces to keep" group:"api"`
    AgentJobRetentionDays int    `env:"LOCALAI_AGENT_JOB_RETENTION_DAYS,AGENT_JOB_RETENTION_DAYS" default:"30" help:"Number of days to keep agent job history (default: 30)" group:"api"`
-   OpenResponsesStoreTTL string `env:"LOCALAI_OPEN_RESPONSES_STORE_TTL,OPEN_RESPONSES_STORE_TTL" default:"0" help:"TTL for Open Responses store (e.g., 1h, 30m, 0 = no expiration)" group:"api"`
+   OpenResponsesStoreTTL string `env:"LOCALAI_OPEN_RESPONSES_STORE_TTL,OPEN_RESPONSES_STORE_TTL" default:"0" help:"TTL for Open Responses store (e.g., 1h, 30m, 0 = no expiration)" group:"api"`

    Version bool
}

View File

@@ -162,6 +162,11 @@ func MCPEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
        return err
    }

+   f, err = defaultLLM.Ask(ctxWithCancellation, f)
+   if err != nil {
+       return err
+   }
+
    resp := &schema.OpenAIResponse{
        ID:      id,
        Created: created,
@@ -247,6 +252,17 @@ func MCPEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
            return
        }

+       // Get final response
+       f, err = defaultLLM.Ask(ctxWithCancellation, f)
+       if err != nil {
+           events <- MCPErrorEvent{
+               Type:    "error",
+               Message: fmt.Sprintf("Failed to get response: %v", err),
+           }
+           ended <- err
+           return
+       }
+
        // Stream final assistant response
        content := f.LastMessage().Content
        events <- MCPAssistantEvent{

View File

@@ -23,7 +23,6 @@ import (
    "github.com/mudler/LocalAI/core/templates"
    laudio "github.com/mudler/LocalAI/pkg/audio"
    "github.com/mudler/LocalAI/pkg/functions"
-   "github.com/mudler/LocalAI/pkg/utils"
    "github.com/mudler/LocalAI/pkg/grpc/proto"
    model "github.com/mudler/LocalAI/pkg/model"
    "github.com/mudler/LocalAI/pkg/reasoning"
@@ -950,12 +949,7 @@ func triggerResponse(session *Session, conv *Conversation, c *LockedWebsocket, o
            case types.MessageContentTypeInputAudio:
                textContent += content.Transcript
            case types.MessageContentTypeInputImage:
-               img, err := utils.GetContentURIAsBase64(content.ImageURL)
-               if err != nil {
-                   xlog.Warn("Failed to process image", "error", err)
-                   continue
-               }
-               msg.StringImages = append(msg.StringImages, img)
+               msg.StringImages = append(msg.StringImages, content.ImageURL)
                imgIndex++
                nrOfImgsInMessage++
            }

View File

@@ -175,8 +175,8 @@ type ToolFunction struct {
    // The description of the function, including guidance on when and how to call it, and guidance about what to tell the user when calling (if anything).
    Description string `json:"description"`
    // The jsonschema representing the parameters
-   Parameters any `json:"parameters,omitempty"`
-   // The type of the tool, i.e. function.
+   Parameters any `json:"parameters"`
}

func (t ToolFunction) ToolType() ToolType {

View File

@@ -1004,6 +1004,12 @@ func handleBackgroundMCPNonStream(ctx context.Context, store *ResponseStore, res
    default:
    }

+   // Get final response
+   f, err = defaultLLM.Ask(ctx, f)
+   if err != nil {
+       return nil, fmt.Errorf("failed to get response: %w", err)
+   }
+
    // Convert fragment to Open Responses format
    fPtr := &f
    outputItems := convertCogitoFragmentToORItems(fPtr)
@@ -1180,6 +1186,21 @@ func handleBackgroundMCPStream(ctx context.Context, store *ResponseStore, respon
    default:
    }

+   // Get final response
+   f, err = defaultLLM.Ask(ctx, f)
+   if err != nil {
+       select {
+       case <-ctx.Done():
+           ended <- ctx.Err()
+       case events <- map[string]interface{}{
+           "type":    "error",
+           "message": fmt.Sprintf("Failed to get response: %v", err),
+       }:
+           ended <- err
+       }
+       return
+   }
+
    // Stream final assistant message
    content := f.LastMessage().Content
    messageID := fmt.Sprintf("msg_%s", uuid.New().String())
@@ -2559,6 +2580,12 @@ func handleMCPNonStream(c echo.Context, responseID string, createdAt int64, inpu
        return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("failed to execute tools: %v", err), "")
    }

+   // Get final response
+   f, err = defaultLLM.Ask(ctx, f)
+   if err != nil {
+       return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("failed to get response: %v", err), "")
+   }
+
    // Convert fragment to Open Responses format
    fPtr := &f
    outputItems := convertCogitoFragmentToORItems(fPtr)
@@ -2703,6 +2730,17 @@ func handleMCPStream(c echo.Context, responseID string, createdAt int64, input *
        return
    }

+   // Get final response
+   f, err = defaultLLM.Ask(ctx, f)
+   if err != nil {
+       events <- map[string]interface{}{
+           "type":    "error",
+           "message": fmt.Sprintf("Failed to get response: %v", err),
+       }
+       ended <- err
+       return
+   }
+
    // Stream final assistant message
    content := f.LastMessage().Content
    messageID := fmt.Sprintf("msg_%s", uuid.New().String())

View File

@@ -27,8 +27,8 @@ import (
    "github.com/mudler/LocalAI/pkg/model"
    "github.com/mudler/LocalAI/pkg/xsync"
    "github.com/mudler/cogito"
+   "github.com/mudler/xlog"
    "github.com/robfig/cron/v3"
-   "github.com/mudler/xlog"
)
// AgentJobService manages agent tasks and job execution
@@ -894,6 +894,17 @@ func (s *AgentJobService) executeJobInternal(job schema.Job, task schema.Task, c
        return fmt.Errorf("failed to execute tools: %w", err)
    }

+   // Get final response
+   f, err = defaultLLM.Ask(ctx, f)
+   if err != nil {
+       job.Status = schema.JobStatusFailed
+       job.Error = fmt.Sprintf("failed to get response: %v", err)
+       completedAt := time.Now()
+       job.CompletedAt = &completedAt
+       s.jobs.Set(job.ID, job)
+       return fmt.Errorf("failed to get response: %w", err)
+   }
+
    // Extract traces from fragment.Status after execution
    // This provides complete information about tool calls and results
    // We use Status data to supplement/replace callback data for completeness

View File

@@ -1,46 +1,4 @@
---
-- &nanbeige4
-  name: "nanbeige4.1-3b-q8"
-  url: "github:mudler/LocalAI/gallery/nanbeige4.1.yaml@master"
-  urls:
-    - https://huggingface.co/Nanbeige/Nanbeige4.1-3B
-    - https://huggingface.co/Edge-Quant/Nanbeige4.1-3B-Q8_0-GGUF
-  icon: https://cdn-avatars.huggingface.co/v1/production/uploads/646f0d118ff94af23bc44aab/GXHCollpMRgvYqUXQ2BQ7.png
-  license: apache-2.0
-  description: |
-    Nanbeige4.1-3B is built upon Nanbeige4-3B-Base and represents an enhanced iteration of our previous reasoning model, Nanbeige4-3B-Thinking-2511, achieved through further post-training optimization with supervised fine-tuning (SFT) and reinforcement learning (RL). As a highly competitive open-source model at a small parameter scale, Nanbeige4.1-3B illustrates that compact models can simultaneously achieve robust reasoning, preference alignment, and effective agentic behaviors.
-    Key features:
-    Strong Reasoning: Capable of solving complex, multi-step problems through sustained and coherent reasoning within a single forward pass, reliably producing correct answers on benchmarks like LiveCodeBench-Pro, IMO-Answer-Bench, and AIME 2026 I.
-    Robust Preference Alignment: Outperforms same-scale models (e.g., Qwen3-4B-2507, Nanbeige4-3B-2511) and larger models (e.g., Qwen3-30B-A3B, Qwen3-32B) on Arena-Hard-v2 and Multi-Challenge.
-    Agentic Capability: First general small model to natively support deep-search tasks and sustain complex problem-solving with >500 rounds of tool invocations; excels in benchmarks like xBench-DeepSearch (75), Browse-Comp (39), and others.
-  tags:
-    - llm
-    - gguf
-    - gpu
-    - cpu
-    - nanbeige
-    - reasoning
-    - agent
-  overrides:
-    parameters:
-      model: nanbeige4.1-3b-q8_0.gguf
-  files:
-    - filename: nanbeige4.1-3b-q8_0.gguf
-      sha256: a5a4379e50605c5e5a31bb1716a211fb16691fea7e13ede7f88796e1f617d9e0
-      uri: huggingface://Edge-Quant/Nanbeige4.1-3B-Q8_0-GGUF/nanbeige4.1-3b-q8_0.gguf
-- !!merge <<: *nanbeige4
-  name: "nanbeige4.1-3b-q4"
-  urls:
-    - https://huggingface.co/Nanbeige/Nanbeige4.1-3B
-    - https://huggingface.co/Edge-Quant/Nanbeige4.1-3B-Q4_K_M-GGUF
-  overrides:
-    parameters:
-      model: nanbeige4.1-3b-q4_k_m.gguf
-  files:
-    - filename: nanbeige4.1-3b-q4_k_m.gguf
-      sha256: 043246350c952877b38958a9e35c480419008b6b2d52bedaf2b805ed2447b4df
-      uri: huggingface://Edge-Quant/Nanbeige4.1-3B-Q4_K_M-GGUF/nanbeige4.1-3b-q4_k_m.gguf
- name: nemo-parakeet-tdt-0.6b
  license: apache-2.0
  url: "github:mudler/LocalAI/gallery/virtual.yaml@master"

View File

@@ -1,16 +0,0 @@
----
-name: nanbeige4.1
-config_file: |
-  backend: llama-cpp
-  function:
-    grammar:
-      disable: true
-  known_usecases:
-  - chat
-  options:
-  - use_jinja:true
-  parameters:
-    model: llama-cpp/models/nanbeige4.1-3b-q8_0.gguf
-  template:
-    use_tokenizer_template: true

go.mod
View File

@@ -33,7 +33,7 @@ require (
    github.com/mholt/archiver/v3 v3.5.1
    github.com/microcosm-cc/bluemonday v1.0.27
    github.com/modelcontextprotocol/go-sdk v1.2.0
-   github.com/mudler/cogito v0.8.2-0.20260214201734-da0d4ceb2b44
+   github.com/mudler/cogito v0.8.1
    github.com/mudler/edgevpn v0.31.1
    github.com/mudler/go-processmanager v0.1.0
    github.com/mudler/memory v0.0.0-20251216220809-d1256471a6c2

go.sum
View File

@@ -511,8 +511,6 @@ github.com/mr-tron/base58 v1.2.0 h1:T/HDJBh4ZCPbU39/+c3rRvE0uKBQlU27+QI8LJ4t64o=
github.com/mr-tron/base58 v1.2.0/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc=
github.com/mudler/cogito v0.8.1 h1:66qPJkAMrq/Vo8AC/PvXWuVxYPhi7X2DQuJIilL8+3I=
github.com/mudler/cogito v0.8.1/go.mod h1:6sfja3lcu2nWRzEc0wwqGNu/eCG3EWgij+8s7xyUeQ4=
-github.com/mudler/cogito v0.8.2-0.20260214201734-da0d4ceb2b44 h1:joGszpItINnZdoL/0p2077Wz2xnxMGRSRgYN5mS7I4c=
-github.com/mudler/cogito v0.8.2-0.20260214201734-da0d4ceb2b44/go.mod h1:6sfja3lcu2nWRzEc0wwqGNu/eCG3EWgij+8s7xyUeQ4=
github.com/mudler/edgevpn v0.31.1 h1:7qegiDWd0kAg6ljhNHxqvp8hbo/6BbzSdbb7/2WZfiY=
github.com/mudler/edgevpn v0.31.1/go.mod h1:ftV5B0nKFzm4R8vR80UYnCb2nf7lxCRgAALxUEEgCf8=
github.com/mudler/go-piper v0.0.0-20241023091659-2494246fd9fc h1:RxwneJl1VgvikiX28EkpdAyL4yQVnJMrbquKospjHyA=

View File

@@ -45,8 +45,9 @@ const (
)

var (
-   cuda13DirExists bool
-   cuda12DirExists bool
+   cuda13DirExists  bool
+   cuda12DirExists  bool
+   capabilityLogged bool
)

func init() {
@@ -71,15 +72,9 @@ func (s *SystemState) Capability(capMap map[string]string) string {
}

func (s *SystemState) getSystemCapabilities() string {
-   if s.systemCapabilities != "" {
-       return s.systemCapabilities
-   }
-
    capability := os.Getenv(capabilityEnv)
    if capability != "" {
        xlog.Info("Using forced capability from environment variable", "capability", capability, "env", capabilityEnv)
-       s.systemCapabilities = capability
        return capability
    }
@@ -96,23 +91,20 @@ func (s *SystemState) getSystemCapabilities() string {
        capability, err := os.ReadFile(capabilityRunFile)
        if err == nil {
            xlog.Info("Using forced capability run file", "capabilityRunFile", capabilityRunFile, "capability", string(capability), "env", capabilityRunFileEnv)
-           s.systemCapabilities = strings.Trim(strings.TrimSpace(string(capability)), "\n")
-           return s.systemCapabilities
+           return strings.Trim(strings.TrimSpace(string(capability)), "\n")
        }
    }

    // If we are on mac and arm64, we will return metal
    if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
        xlog.Info("Using metal capability (arm64 on mac)", "env", capabilityEnv)
-       s.systemCapabilities = metal
-       return s.systemCapabilities
+       return metal
    }

    // If we are on mac and x86, we will return darwin-x86
    if runtime.GOOS == "darwin" && runtime.GOARCH == "amd64" {
        xlog.Info("Using darwin-x86 capability (amd64 on mac)", "env", capabilityEnv)
-       s.systemCapabilities = darwinX86
-       return s.systemCapabilities
+       return darwinX86
    }

    // If arm64 on linux and a nvidia gpu is detected, we will return nvidia-l4t
@@ -120,43 +112,39 @@ func (s *SystemState) getSystemCapabilities() string {
        if s.GPUVendor == Nvidia {
            xlog.Info("Using nvidia-l4t capability (arm64 on linux)", "env", capabilityEnv)
            if cuda13DirExists {
-               s.systemCapabilities = nvidiaL4TCuda13
-               return s.systemCapabilities
+               return nvidiaL4TCuda13
            }
            if cuda12DirExists {
-               s.systemCapabilities = nvidiaL4TCuda12
-               return s.systemCapabilities
+               return nvidiaL4TCuda12
            }
-           s.systemCapabilities = nvidiaL4T
-           return s.systemCapabilities
+           return nvidiaL4T
        }
    }

    if cuda13DirExists {
-       s.systemCapabilities = nvidiaCuda13
-       return s.systemCapabilities
+       return nvidiaCuda13
    }
    if cuda12DirExists {
-       s.systemCapabilities = nvidiaCuda12
-       return s.systemCapabilities
+       return nvidiaCuda12
    }

    if s.GPUVendor == "" {
        xlog.Info("Default capability (no GPU detected)", "env", capabilityEnv)
-       s.systemCapabilities = defaultCapability
-       return s.systemCapabilities
+       return defaultCapability
    }

+   if !capabilityLogged {
+       xlog.Info("Capability automatically detected", "capability", s.GPUVendor, "env", capabilityEnv)
+       capabilityLogged = true
+   }
+
    // If vram is less than 4GB, let's default to CPU but warn the user that they can override that via env
    if s.VRAM <= 4*1024*1024*1024 {
        xlog.Warn("VRAM is less than 4GB, defaulting to CPU", "env", capabilityEnv)
-       s.systemCapabilities = defaultCapability
-       return s.systemCapabilities
+       return defaultCapability
    }

-   s.systemCapabilities = s.GPUVendor
-   return s.systemCapabilities
+   return s.GPUVendor
}

// BackendPreferenceTokens returns a list of substrings that represent the preferred

View File

@@ -19,8 +19,6 @@ type SystemState struct {
    Backend Backend
    Model   Model
    VRAM    uint64
-
-   systemCapabilities string
}

type SystemStateOptions func(*SystemState)
@@ -55,7 +53,5 @@ func GetSystemState(opts ...SystemStateOptions) (*SystemState, error) {
    state.VRAM, _ = xsysinfo.TotalAvailableVRAM()
    xlog.Debug("Total available VRAM", "vram", state.VRAM)

-   state.getSystemCapabilities()
-
    return state, nil
}