feat(config): node-aware hardware defaults — larger physical batch on Blackwell

A larger physical batch (n_batch/n_ubatch) materially lifts MoE prefill on
NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10 / DGX Spark) — measured
on a GB10 with Qwen3-Coder-30B-A3B, the prefill ceiling rises (ub512 ~2994 ->
ub2048 ~3316 t/s) and saturates around 2048.

The heuristic lives in core/config alongside the other config overriders
(ApplyInferenceDefaults, guessDefaultsFromFile/NGPULayers) — they all fill the
ModelConfig from heuristics, so hardware tuning is the same domain and stays in
one place. It is parameterized on a GPU descriptor (not direct detection) so it
works in both deployment shapes:

- Single host: SetDefaults applies it with the LocalGPU.
- Distributed: only the worker sees the GPU, so the worker reports its compute
  capability on registration (gpu_compute_capability -> BackendNode), and the
  router re-applies the SAME core/config heuristic for the SELECTED node before
  loading — fixing the case where the frontend has no GPU at all.

Explicit `batch:` always wins (only managed default values are touched).
xsysinfo gains NVIDIAComputeCapability() (detection only); all interpretation
lives in core/config. Tests: core/config, pkg/xsysinfo, core/services/nodes.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-06-19 22:02:14 +00:00
parent 079ac0e15a
commit bca250e2bd
10 changed files with 390 additions and 32 deletions

View File

@@ -0,0 +1,23 @@
package xsysinfo
import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Table-driven spec for parseComputeCap: each entry feeds one raw
// compute-capability string through the parser and checks both returned parts.
var _ = Describe("parseComputeCap", func() {
	// expectSplit parses raw and asserts the resulting (major, minor) pair.
	expectSplit := func(raw string, wantMajor, wantMinor int) {
		gotMajor, gotMinor := parseComputeCap(raw)
		Expect(gotMajor).To(Equal(wantMajor))
		Expect(gotMinor).To(Equal(wantMinor))
	}

	DescribeTable("splits major.minor", expectSplit,
		// Well-formed "major.minor" values.
		Entry("GB10 / DGX Spark", "12.1", 12, 1),
		Entry("RTX 50-series", "12.0", 12, 0),
		Entry("Hopper", "9.0", 9, 0),
		// A missing minor part is reported as 0.
		Entry("major only", "12", 12, 0),
		// Surrounding whitespace is ignored.
		Entry("whitespace", " 12.1 ", 12, 1),
		// Unparseable input yields the (-1, -1) sentinel.
		Entry("empty", "", -1, -1),
		Entry("garbage", "abc", -1, -1),
	)
})

View File

@@ -38,9 +38,9 @@ var UnifiedMemoryDevices = []string{
// GPUMemoryInfo contains real-time GPU memory usage information
type GPUMemoryInfo struct {
Index int `json:"index"`
Name string `json:"name"`
Vendor string `json:"vendor"`
Index int `json:"index"`
Name string `json:"name"`
Vendor string `json:"vendor"`
// BDF is the canonical PCI bus address (dddd:bb:dd.f) when known.
// Populated by detection paths that can attribute the device to a
// PCI location (clinfo, future amdgpu/nvidia paths); empty for
@@ -307,6 +307,84 @@ func GetGPUAggregateInfo() GPUAggregateInfo {
return aggregate
}
// computeCapOnce guards the one-time detection performed by
// NVIDIAComputeCapability.
var computeCapOnce sync.Once

// computeCapResult caches the detected compute capability; it is written only
// inside computeCapOnce.Do and stays "" when nothing was detected.
var computeCapResult string

// NVIDIAComputeCapability reports the highest NVIDIA GPU compute capability
// found on this host, formatted as "major.minor" (for example "12.1" on a
// GB10 / DGX Spark). It returns "" when nvidia-smi is unavailable or reports
// no usable value.
//
// Detection happens on the first call only; later calls return the cached
// result.
//
// The probe runs on the machine that actually owns the GPU. In distributed
// mode each worker reports this value on registration, so the router can
// decide per node instead of guessing from the (possibly GPU-less) frontend
// host.
func NVIDIAComputeCapability() string {
	probe := func() {
		computeCapResult = detectNVIDIAComputeCapability()
	}
	computeCapOnce.Do(probe)
	return computeCapResult
}
// detectNVIDIAComputeCapability runs nvidia-smi and returns the highest
// compute capability it reports, using the text of the winning output line.
// It returns "" when nvidia-smi is not on PATH, the query fails, or no line
// parses to a valid capability.
//
// NOTE(review): the nvidia-smi invocation has no timeout; because the caller
// runs this under a sync.Once, a hung nvidia-smi would block every caller.
func detectNVIDIAComputeCapability() string {
	if _, lookErr := exec.LookPath("nvidia-smi"); lookErr != nil {
		return ""
	}

	var outBuf, errBuf bytes.Buffer
	query := exec.Command("nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader")
	query.Stdout = &outBuf
	query.Stderr = &errBuf
	if runErr := query.Run(); runErr != nil {
		xlog.Debug("nvidia-smi compute_cap query failed", "error", runErr, "stderr", errBuf.String())
		return ""
	}

	// One output line per GPU; keep the line with the greatest (major, minor).
	highest := ""
	topMajor, topMinor := -1, -1
	rows := strings.Split(strings.TrimSpace(outBuf.String()), "\n")
	for i := range rows {
		row := strings.TrimSpace(rows[i])
		if row == "" {
			continue
		}
		major, minor := parseComputeCap(row)
		if major < 0 {
			// Unparseable line: ignore it.
			continue
		}
		notNewer := major < topMajor || (major == topMajor && minor <= topMinor)
		if notNewer {
			continue
		}
		topMajor, topMinor, highest = major, minor, row
	}

	if highest == "" {
		return ""
	}
	xlog.Debug("NVIDIA compute capability detected", "compute_cap", highest)
	return highest
}
// parseComputeCap splits a "major.minor" compute-capability string into its
// integer parts. Surrounding whitespace is ignored.
//
// It returns (-1, -1) when the input is empty or the major part is not a
// non-negative integer. The minor part is parsed leniently: when it is
// missing, not a number, or negative, the minor is reported as 0 so that a
// usable major is never discarded.
func parseComputeCap(cc string) (int, int) {
	// Split on the first dot only; with no dot the whole string is the major
	// part and the minor part is empty.
	majorStr, minorStr, _ := strings.Cut(strings.TrimSpace(cc), ".")

	major, err := strconv.Atoi(strings.TrimSpace(majorStr))
	if err != nil || major < 0 {
		// Atoi accepts a leading sign; a negative major is not a valid
		// capability and must not be confused with a real value.
		return -1, -1
	}

	minor, err := strconv.Atoi(strings.TrimSpace(minorStr))
	if err != nil || minor < 0 {
		minor = 0
	}
	return major, minor
}
// getNVIDIAGPUMemory queries NVIDIA GPUs using nvidia-smi
func getNVIDIAGPUMemory() []GPUMemoryInfo {
// Check if nvidia-smi is available
@@ -866,12 +944,12 @@ func getVulkanGPUMemory() []GPUMemoryInfo {
}
type vulkanGPUTextInfo struct {
index int
name string
deviceType string
totalVRAM uint64
budgetVRAM uint64
usageVRAM uint64
index int
name string
deviceType string
totalVRAM uint64
budgetVRAM uint64
usageVRAM uint64
}
func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
@@ -909,7 +987,7 @@ func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
} else if current.usageVRAM != 0 && current.budgetVRAM == 0 {
current.budgetVRAM = current.totalVRAM - current.usageVRAM
} else if current.usageVRAM == 0 && current.budgetVRAM == 0 {
current.usageVRAM = 0
current.usageVRAM = 0
current.budgetVRAM = current.totalVRAM
}