From bca250e2bd42f55dd4fdc361e101491d8fb9217f Mon Sep 17 00:00:00 2001
From: Ettore Di Giacinto <mudler@localai.io>
Date: Fri, 19 Jun 2026 22:02:14 +0000
Subject: [PATCH] =?UTF-8?q?feat(config):=20node-aware=20hardware=20default?=
 =?UTF-8?q?s=20=E2=80=94=20larger=20physical=20batch=20on=20Blackwell?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A larger physical batch (n_batch/n_ubatch) materially lifts MoE prefill on
NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10 / DGX Spark) — measured
on a GB10 with Qwen3-Coder-30B-A3B, the prefill ceiling rises (ub512 ~2994 ->
ub2048 ~3316 t/s) and saturates around 2048.

The heuristic lives in core/config alongside the other config overriders
(ApplyInferenceDefaults, guessDefaultsFromFile/NGPULayers) — they all fill the
ModelConfig from heuristics, so hardware tuning is the same domain and stays in
one place. It is parameterized on a GPU descriptor (not direct detection) so it
works in both deployment shapes:

- Single host: SetDefaults applies it with the LocalGPU.
- Distributed: only the worker sees the GPU, so the worker reports its compute
  capability on registration (gpu_compute_capability -> BackendNode), and the
  router re-applies the SAME core/config heuristic for the SELECTED node before
  loading — fixing the case where the frontend has no GPU at all.

Explicit `batch:` wins: SetDefaults only fills an unset batch, and the router
only re-tunes the managed values (512/2048), leaving any other batch untouched.
xsysinfo gains NVIDIAComputeCapability() (detection only); all interpretation
lives in core/config. Tests: core/config, pkg/xsysinfo, core/services/nodes.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
---
 core/config/hardware_defaults.go              | 118 ++++++++++++++++++
 core/config/hardware_defaults_test.go         |  59 +++++++++
 core/config/model_config.go                   |   5 +
 core/http/endpoints/localai/nodes.go          |  48 +++----
 core/services/nodes/registry.go               |   6 +
 core/services/nodes/router.go                 |  27 ++++
 .../nodes/router_hardware_internal_test.go    |  33 +++++
 core/services/worker/registration.go          |   5 +
 pkg/xsysinfo/computecap_internal_test.go      |  23 ++++
 pkg/xsysinfo/gpu.go                           |  98 +++++++++++++--
 10 files changed, 390 insertions(+), 32 deletions(-)
 create mode 100644 core/config/hardware_defaults.go
 create mode 100644 core/config/hardware_defaults_test.go
 create mode 100644 core/services/nodes/router_hardware_internal_test.go
 create mode 100644 pkg/xsysinfo/computecap_internal_test.go

diff --git a/core/config/hardware_defaults.go b/core/config/hardware_defaults.go
new file mode 100644
index 000000000..617e01632
--- /dev/null
+++ b/core/config/hardware_defaults.go
@@ -0,0 +1,118 @@
+package config
+
+import (
+	"strconv"
+	"strings"
+
+	"github.com/mudler/LocalAI/pkg/xsysinfo"
+	"github.com/mudler/xlog"
+)
+
+// Hardware-driven model-config defaults.
+//
+// This sits alongside the other config overriders (ApplyInferenceDefaults for
+// model families, guessDefaultsFromFile for GGUF/NGPULayers): they all
+// heuristically fill ModelConfig values the user left unset. Hardware tuning is
+// the same domain — "adjust the config from the device that will run it" — so
+// it lives here rather than scattered into the backend or a separate package.
+//
+// The heuristics are parameterized on a GPU descriptor (not on direct
+// detection) so they apply in both deployment shapes: SetDefaults passes the
+// LocalGPU on a single host, and the distributed router passes the *selected
+// node's* reported GPU before loading there (the frontend that loaded the
+// config may have no GPU at all).
+
+// GPU is a detection-independent description of the device that runs a model.
+type GPU struct {
+	// Vendor is the GPU vendor string ("nvidia", "amd", …) as detected/reported.
+	Vendor string
+	// ComputeCapability is the NVIDIA compute capability as "major.minor"
+	// (e.g. "12.1" for GB10 / DGX Spark). Empty for non-NVIDIA / unknown.
+	ComputeCapability string
+	// VRAM is total device memory in bytes (0 = unknown; LocalGPU leaves it 0).
+	VRAM uint64
+}
+
+// Physical batch (n_batch / n_ubatch) defaults; IsManagedPhysicalBatch matches both.
+const (
+	// DefaultPhysicalBatch is the conservative default when no hardware-specific
+	// tuning applies. Intended to match backend.DefaultBatchSize (not visible here).
+	DefaultPhysicalBatch = 512
+	// BlackwellPhysicalBatch is the value used when GPU.IsNVIDIABlackwell is true
+	// (sm_12x: sm_120 RTX 50-series, sm_121 GB10 / DGX Spark). A larger physical
+	// batch materially lifts MoE prefill there (per-expert GEMM tiles fill
+	// better); measured on a GB10 with Qwen3-30B-A3B to saturate around 2048.
+	BlackwellPhysicalBatch = 2048
+)
+
+// IsNVIDIABlackwell reports whether the compute capability major is >= 12, i.e.
+// consumer Blackwell (sm_12x) or anything newer. Datacenter Blackwell (sm_100,
+// cc 10.0) and empty/unparseable values (major -1) are intentionally not matched.
+func (g GPU) IsNVIDIABlackwell() bool {
+	maj, _ := parseComputeCapability(g.ComputeCapability)
+	return maj >= 12
+}
+
+// PhysicalBatch returns the managed physical batch (n_batch/n_ubatch) for the
+// given hardware: BlackwellPhysicalBatch on Blackwell, else DefaultPhysicalBatch.
+func PhysicalBatch(g GPU) int {
+	if g.IsNVIDIABlackwell() {
+		return BlackwellPhysicalBatch
+	}
+	return DefaultPhysicalBatch
+}
+
+// IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
+// Callers that re-tune a value chosen by an upstream host (the distributed
+// router correcting the frontend's guess) use it to leave other batches, such
+// as 1024, alone. An explicit 512 or 2048 is indistinguishable from a default.
+func IsManagedPhysicalBatch(n int) bool {
+	return n == DefaultPhysicalBatch || n == BlackwellPhysicalBatch
+}
+
+// LocalGPU builds a GPU descriptor from local detection for SetDefaults on a
+// single host: vendor plus NVIDIA compute capability, with VRAM left at 0. The
+// error returned by DetectGPUVendor is discarded; only its vendor value is kept.
+func LocalGPU() GPU {
+	vendor, _ := xsysinfo.DetectGPUVendor()
+	return GPU{
+		Vendor:            vendor,
+		ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
+	}
+}
+
+// ApplyHardwareDefaults fills ModelConfig values that depend on the target GPU
+// and were left unset by the user. Currently: Batch becomes 2048 on Blackwell;
+// a non-zero Batch, a non-Blackwell GPU, or a nil cfg leaves everything as is.
+func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
+	if cfg == nil {
+		return
+	}
+	if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
+		cfg.Batch = BlackwellPhysicalBatch
+		xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
+			"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
+	}
+}
+
+// parseComputeCapability splits a "major.minor" string into integer parts. A
+// missing or bad minor becomes 0; an empty input or bad major gives (-1, -1).
+func parseComputeCapability(cc string) (int, int) {
+	cc = strings.TrimSpace(cc)
+	if cc == "" {
+		return -1, -1
+	}
+	majStr, minStr := cc, "0"
+	if dot := strings.IndexByte(cc, '.'); dot >= 0 {
+		majStr, minStr = cc[:dot], cc[dot+1:]
+	}
+	maj, err := strconv.Atoi(strings.TrimSpace(majStr))
+	if err != nil {
+		return -1, -1
+	}
+	min, err := strconv.Atoi(strings.TrimSpace(minStr))
+	if err != nil {
+		min = 0
+	}
+	return maj, min
+}
diff --git a/core/config/hardware_defaults_test.go b/core/config/hardware_defaults_test.go
new file mode 100644
index 000000000..3d15ef14b
--- /dev/null
+++ b/core/config/hardware_defaults_test.go
@@ -0,0 +1,59 @@
+package config_test
+
+import (
+	. "github.com/mudler/LocalAI/core/config"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Hardware-driven config defaults", func() {
+	DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
+		func(cc string, want bool) {
+			Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
+		},
+		Entry("GB10 12.1", "12.1", true),
+		Entry("RTX 50 12.0", "12.0", true),
+		Entry("future 13.0", "13.0", true),
+		Entry("Hopper 9.0", "9.0", false),
+		Entry("Ada 8.9", "8.9", false),
+		Entry("datacenter Blackwell sm_100 10.0", "10.0", false),
+		Entry("unknown", "", false),
+	)
+
+	Describe("PhysicalBatch / IsManagedPhysicalBatch", func() {
+		It("returns the Blackwell batch on Blackwell", func() {
+			Expect(PhysicalBatch(GPU{ComputeCapability: "12.1"})).To(Equal(BlackwellPhysicalBatch))
+		})
+		It("returns the default batch otherwise", func() {
+			Expect(PhysicalBatch(GPU{ComputeCapability: "9.0"})).To(Equal(DefaultPhysicalBatch))
+			Expect(PhysicalBatch(GPU{})).To(Equal(DefaultPhysicalBatch))
+		})
+		It("recognizes managed defaults but not explicit values", func() {
+			Expect(IsManagedPhysicalBatch(DefaultPhysicalBatch)).To(BeTrue())
+			Expect(IsManagedPhysicalBatch(BlackwellPhysicalBatch)).To(BeTrue())
+			Expect(IsManagedPhysicalBatch(1024)).To(BeFalse())
+		})
+	})
+
+	Describe("ApplyHardwareDefaults", func() {
+		It("raises an unset batch to 2048 on Blackwell", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
+			Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
+		})
+		It("leaves batch unset on non-Blackwell", func() {
+			cfg := &ModelConfig{}
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
+			Expect(cfg.Batch).To(Equal(0))
+		})
+		It("never overrides an explicit batch", func() {
+			cfg := &ModelConfig{}
+			cfg.Batch = 1024
+			ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
+			Expect(cfg.Batch).To(Equal(1024))
+		})
+		It("no-ops on nil", func() {
+			Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
+		})
+	})
+})
diff --git a/core/config/model_config.go b/core/config/model_config.go
index dfe151a64..b57395916 100644
--- a/core/config/model_config.go
+++ b/core/config/model_config.go
@@ -1111,6 +1111,11 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	// This ensures gallery-installed and runtime-loaded models get optimal parameters.
 	ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
 
+	// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
+	// Uses the local GPU here; in distributed mode the router re-applies the same
+	// heuristics for the selected node's GPU before loading. Explicit config wins.
+	ApplyHardwareDefaults(cfg, LocalGPU())
+
 	// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
 	defaultTopP := 0.95
 	defaultTopK := 40
diff --git a/core/http/endpoints/localai/nodes.go b/core/http/endpoints/localai/nodes.go
index 5a6edab22..820cb137f 100644
--- a/core/http/endpoints/localai/nodes.go
+++ b/core/http/endpoints/localai/nodes.go
@@ -70,17 +70,20 @@ func GetNodeEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
 
 // RegisterNodeRequest is the request body for registering a new worker node.
 type RegisterNodeRequest struct {
-	Name          string            `json:"name"`
-	NodeType      string            `json:"node_type,omitempty"` // "backend" (default) or "agent"
-	Address       string            `json:"address"`
-	HTTPAddress   string            `json:"http_address,omitempty"`
-	Token         string            `json:"token,omitempty"`
-	TotalVRAM     uint64            `json:"total_vram,omitempty"`
-	AvailableVRAM uint64            `json:"available_vram,omitempty"`
-	TotalRAM      uint64            `json:"total_ram,omitempty"`
-	AvailableRAM  uint64            `json:"available_ram,omitempty"`
-	GPUVendor     string            `json:"gpu_vendor,omitempty"`
-	Labels        map[string]string `json:"labels,omitempty"`
+	Name          string `json:"name"`
+	NodeType      string `json:"node_type,omitempty"` // "backend" (default) or "agent"
+	Address       string `json:"address"`
+	HTTPAddress   string `json:"http_address,omitempty"`
+	Token         string `json:"token,omitempty"`
+	TotalVRAM     uint64 `json:"total_vram,omitempty"`
+	AvailableVRAM uint64 `json:"available_vram,omitempty"`
+	TotalRAM      uint64 `json:"total_ram,omitempty"`
+	AvailableRAM  uint64 `json:"available_ram,omitempty"`
+	GPUVendor     string `json:"gpu_vendor,omitempty"`
+	// GPUComputeCapability is the worker GPU's compute capability ("major.minor",
+	// e.g. "12.1" for GB10). Used by the router for per-arch option tuning.
+	GPUComputeCapability string            `json:"gpu_compute_capability,omitempty"`
+	Labels               map[string]string `json:"labels,omitempty"`
 	// MaxReplicasPerModel is the per-node cap on replicas of any single model.
 	// Workers older than this field omit it; we coerce 0 → 1 below to preserve
 	// historical single-replica behavior.
@@ -152,17 +155,18 @@ func RegisterNodeEndpoint(registry *nodes.NodeRegistry, expectedToken string, au
 		}
 
 		node := &nodes.BackendNode{
-			Name:                req.Name,
-			NodeType:            nodeType,
-			Address:             req.Address,
-			HTTPAddress:         req.HTTPAddress,
-			TokenHash:           tokenHash,
-			TotalVRAM:           req.TotalVRAM,
-			AvailableVRAM:       req.AvailableVRAM,
-			TotalRAM:            req.TotalRAM,
-			AvailableRAM:        req.AvailableRAM,
-			GPUVendor:           req.GPUVendor,
-			MaxReplicasPerModel: maxReplicasPerModel,
+			Name:                 req.Name,
+			NodeType:             nodeType,
+			Address:              req.Address,
+			HTTPAddress:          req.HTTPAddress,
+			TokenHash:            tokenHash,
+			TotalVRAM:            req.TotalVRAM,
+			AvailableVRAM:        req.AvailableVRAM,
+			TotalRAM:             req.TotalRAM,
+			AvailableRAM:         req.AvailableRAM,
+			GPUVendor:            req.GPUVendor,
+			GPUComputeCapability: req.GPUComputeCapability,
+			MaxReplicasPerModel:  maxReplicasPerModel,
 		}
 
 		ctx := c.Request().Context()
diff --git a/core/services/nodes/registry.go b/core/services/nodes/registry.go
index 3d34d086c..aafee13cb 100644
--- a/core/services/nodes/registry.go
+++ b/core/services/nodes/registry.go
@@ -36,6 +36,11 @@ type BackendNode struct {
 	TotalRAM     uint64 `gorm:"column:total_ram" json:"total_ram"`           // Total system RAM in bytes (fallback when no GPU)
 	AvailableRAM uint64 `gorm:"column:available_ram" json:"available_ram"`   // Available system RAM in bytes
 	GPUVendor    string `gorm:"column:gpu_vendor;size:32" json:"gpu_vendor"` // nvidia, amd, intel, vulkan, unknown
+	// GPUComputeCapability is the worker GPU's compute capability as
+	// "major.minor" (e.g. "12.1" for GB10 / DGX Spark). Reported by the worker
+	// on registration; used by the router to pick per-arch options (e.g. a
+	// larger physical batch on Blackwell). Empty when unknown / non-NVIDIA.
+	GPUComputeCapability string `gorm:"column:gpu_compute_capability;size:16" json:"gpu_compute_capability"`
 	// MaxReplicasPerModel caps how many replicas of any one model can run on
 	// this node concurrently. Default 1 preserves the historical "one
 	// (node, model)" assumption; set higher (via worker --max-replicas-per-model)
@@ -69,6 +74,7 @@ const (
 	ColReservedVRAM        = "reserved_vram"
 	ColAvailableRAM        = "available_ram"
 	ColGPUVendor           = "gpu_vendor"
+	ColGPUComputeCap       = "gpu_compute_capability"
 	ColLastHeartbeat       = "last_heartbeat"
 	ColMaxReplicasPerModel = "max_replicas_per_model"
 )
diff --git a/core/services/nodes/router.go b/core/services/nodes/router.go
index e5ce52306..df778a689 100644
--- a/core/services/nodes/router.go
+++ b/core/services/nodes/router.go
@@ -12,6 +12,7 @@ import (
 	"strings"
 	"time"
 
+	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/services/advisorylock"
 	"github.com/mudler/LocalAI/core/services/nodes/prefixcache"
 	"github.com/mudler/LocalAI/pkg/distributedhdr"
@@ -138,6 +139,27 @@ type scheduleLoadResult struct {
 	ReplicaIndex int
 }
 
+// applyNodeHardwareDefaults tunes node-agnostic ModelOptions to the GPU of the
+// node that was actually selected to run the model, reusing the same hardware
+// heuristics as single-host config loading (core/config). It mutates opts in
+// place: a managed NBatch (512 or 2048) is replaced by config.PhysicalBatch for
+// the node, raising it on Blackwell and resetting it elsewhere. Any other value
+// (e.g. an explicit 1024) is kept; an explicit 512/2048 cannot be told apart
+// from a managed default and is re-tuned too. Nil opts or node is a no-op.
+func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
+	if opts == nil || node == nil {
+		return
+	}
+	if !config.IsManagedPhysicalBatch(int(opts.NBatch)) {
+		return
+	}
+	opts.NBatch = int32(config.PhysicalBatch(config.GPU{
+		Vendor:            node.GPUVendor,
+		ComputeCapability: node.GPUComputeCapability,
+		VRAM:              node.TotalVRAM,
+	}))
+}
+
 // scheduleAndLoad is the shared core for loading a model on a new node.
 // Used by both Route() (for first-time loads) and ScheduleAndLoadModel() (for reconciler scale-ups).
 //
@@ -153,6 +175,11 @@ func (r *SmartRouter) scheduleAndLoad(ctx context.Context, backendType, tracking
 		return nil, fmt.Errorf("no available nodes: %w", err)
 	}
 
+	// Tune node-agnostic options to the SELECTED node's GPU. Only now do we know
+	// which node (and its compute capability) will run the model — the frontend
+	// that built modelOpts may have no GPU at all in distributed mode.
+	applyNodeHardwareDefaults(modelOpts, node)
+
 	// Pre-stage model files via FileStager before loading
 	loadOpts := modelOpts
 	if r.fileStager != nil && modelOpts != nil {
diff --git a/core/services/nodes/router_hardware_internal_test.go b/core/services/nodes/router_hardware_internal_test.go
new file mode 100644
index 000000000..361e02700
--- /dev/null
+++ b/core/services/nodes/router_hardware_internal_test.go
@@ -0,0 +1,33 @@
+package nodes
+
+import (
+	"github.com/mudler/LocalAI/core/config"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("applyNodeHardwareDefaults", func() {
+	It("raises a managed default batch on a Blackwell node", func() {
+		opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch}
+		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"})
+		Expect(opts.NBatch).To(BeEquivalentTo(config.BlackwellPhysicalBatch))
+	})
+
+	It("resets a Blackwell guess on a non-Blackwell node", func() {
+		// frontend (Blackwell) guessed high, but the selected node is not Blackwell
+		opts := &pb.ModelOptions{NBatch: config.BlackwellPhysicalBatch}
+		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "9.0"})
+		Expect(opts.NBatch).To(BeEquivalentTo(config.DefaultPhysicalBatch))
+	})
+
+	It("never overrides an explicit (non-managed) batch", func() {
+		opts := &pb.ModelOptions{NBatch: 1024}
+		applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"})
+		Expect(opts.NBatch).To(BeEquivalentTo(int32(1024)))
+	})
+
+	It("no-ops on nil inputs", func() {
+		Expect(func() { applyNodeHardwareDefaults(nil, nil) }).ToNot(Panic())
+	})
+})
diff --git a/core/services/worker/registration.go b/core/services/worker/registration.go
index 87a8a7966..432cc845b 100644
--- a/core/services/worker/registration.go
+++ b/core/services/worker/registration.go
@@ -73,6 +73,10 @@ func (cfg *Config) registrationBody() map[string]any {
 	// Detect GPU info for VRAM-aware scheduling
 	totalVRAM, _ := xsysinfo.TotalAvailableVRAM()
 	gpuVendor, _ := xsysinfo.DetectGPUVendor()
+	// Compute capability (e.g. "12.1" for GB10) lets the router pick per-arch
+	// options (e.g. larger physical batch on Blackwell). Detected on the worker
+	// because only the worker sees the GPU in distributed mode.
+	gpuComputeCap := xsysinfo.NVIDIAComputeCapability()
 
 	maxReplicas := cfg.MaxReplicasPerModel
 	if maxReplicas < 1 {
@@ -85,6 +89,7 @@ func (cfg *Config) registrationBody() map[string]any {
 		"total_vram":             totalVRAM,
 		"available_vram":         totalVRAM, // initially all VRAM is available
 		"gpu_vendor":             gpuVendor,
+		"gpu_compute_capability": gpuComputeCap,
 		"max_replicas_per_model": maxReplicas,
 	}
 
diff --git a/pkg/xsysinfo/computecap_internal_test.go b/pkg/xsysinfo/computecap_internal_test.go
new file mode 100644
index 000000000..3bf2602d0
--- /dev/null
+++ b/pkg/xsysinfo/computecap_internal_test.go
@@ -0,0 +1,23 @@
+package xsysinfo
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("parseComputeCap", func() {
+	DescribeTable("splits major.minor",
+		func(in string, maj, min int) {
+			m, n := parseComputeCap(in)
+			Expect(m).To(Equal(maj))
+			Expect(n).To(Equal(min))
+		},
+		Entry("GB10 / DGX Spark", "12.1", 12, 1),
+		Entry("RTX 50-series", "12.0", 12, 0),
+		Entry("Hopper", "9.0", 9, 0),
+		Entry("major only", "12", 12, 0),
+		Entry("whitespace", " 12.1 ", 12, 1),
+		Entry("empty", "", -1, -1),
+		Entry("garbage", "abc", -1, -1),
+	)
+})
diff --git a/pkg/xsysinfo/gpu.go b/pkg/xsysinfo/gpu.go
index a5575edb8..f0185ddeb 100644
--- a/pkg/xsysinfo/gpu.go
+++ b/pkg/xsysinfo/gpu.go
@@ -38,9 +38,9 @@ var UnifiedMemoryDevices = []string{
 
 // GPUMemoryInfo contains real-time GPU memory usage information
 type GPUMemoryInfo struct {
-	Index        int     `json:"index"`
-	Name         string  `json:"name"`
-	Vendor       string  `json:"vendor"`
+	Index  int    `json:"index"`
+	Name   string `json:"name"`
+	Vendor string `json:"vendor"`
 	// BDF is the canonical PCI bus address (dddd:bb:dd.f) when known.
 	// Populated by detection paths that can attribute the device to a
 	// PCI location (clinfo, future amdgpu/nvidia paths); empty for
@@ -307,6 +307,84 @@ func GetGPUAggregateInfo() GPUAggregateInfo {
 	return aggregate
 }
 
+var (
+	computeCapOnce   sync.Once
+	computeCapResult string
+)
+
+// NVIDIAComputeCapability returns the highest NVIDIA GPU compute capability on
+// this host as a "major.minor" string (e.g. "12.1" for GB10 / DGX Spark), or ""
+// when nvidia-smi is unavailable, fails, or reports no parseable value. The
+// probe runs once per process (sync.Once); later calls return the cached result.
+// This runs where the GPU actually is. In distributed mode it is reported by
+// each worker on registration so the router can make per-node decisions rather
+// than guessing from the (possibly GPU-less) frontend host.
+func NVIDIAComputeCapability() string {
+	computeCapOnce.Do(func() {
+		computeCapResult = detectNVIDIAComputeCapability()
+	})
+	return computeCapResult
+}
+
+func detectNVIDIAComputeCapability() string {
+	if _, err := exec.LookPath("nvidia-smi"); err != nil {
+		return ""
+	}
+	// noheader: every non-empty output line is parsed as one GPU's compute_cap.
+	cmd := exec.Command("nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader")
+
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	if err := cmd.Run(); err != nil {
+		xlog.Debug("nvidia-smi compute_cap query failed", "error", err, "stderr", stderr.String())
+		return ""
+	}
+	// Keep the highest capability (major first, then minor) across all lines.
+	best := ""
+	bestMajor, bestMinor := -1, -1
+	for _, line := range strings.Split(strings.TrimSpace(stdout.String()), "\n") {
+		line = strings.TrimSpace(line)
+		if line == "" {
+			continue
+		}
+		maj, min := parseComputeCap(line)
+		if maj < 0 {
+			continue
+		}
+		if maj > bestMajor || (maj == bestMajor && min > bestMinor) {
+			bestMajor, bestMinor, best = maj, min, line
+		}
+	}
+	if best != "" {
+		xlog.Debug("NVIDIA compute capability detected", "compute_cap", best)
+	}
+	return best
+}
+
+// parseComputeCap splits a "major.minor" compute-capability string into its
+// integer parts. A missing or bad minor is 0; empty or bad major is (-1, -1).
+func parseComputeCap(cc string) (int, int) {
+	cc = strings.TrimSpace(cc)
+	if cc == "" {
+		return -1, -1
+	}
+	majStr, minStr := cc, "0"
+	if dot := strings.IndexByte(cc, '.'); dot >= 0 {
+		majStr, minStr = cc[:dot], cc[dot+1:]
+	}
+	maj, err := strconv.Atoi(strings.TrimSpace(majStr))
+	if err != nil {
+		return -1, -1
+	}
+	min, err := strconv.Atoi(strings.TrimSpace(minStr))
+	if err != nil {
+		min = 0
+	}
+	return maj, min
+}
+
 // getNVIDIAGPUMemory queries NVIDIA GPUs using nvidia-smi
 func getNVIDIAGPUMemory() []GPUMemoryInfo {
 	// Check if nvidia-smi is available
@@ -866,12 +944,12 @@ func getVulkanGPUMemory() []GPUMemoryInfo {
 }
 
 type vulkanGPUTextInfo struct {
-	index        int
-	name         string
-	deviceType   string
-	totalVRAM    uint64
-	budgetVRAM   uint64
-	usageVRAM    uint64
+	index      int
+	name       string
+	deviceType string
+	totalVRAM  uint64
+	budgetVRAM uint64
+	usageVRAM  uint64
 }
 
 func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
@@ -909,7 +987,7 @@ func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
 		} else if current.usageVRAM != 0 && current.budgetVRAM == 0 {
 			current.budgetVRAM = current.totalVRAM - current.usageVRAM
 		} else if current.usageVRAM == 0 && current.budgetVRAM == 0 {
-			current.usageVRAM  = 0
+			current.usageVRAM = 0
 			current.budgetVRAM = current.totalVRAM
 		}