mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-20 06:39:01 -04:00
feat(config): node-aware hardware defaults — larger physical batch on Blackwell
A larger physical batch (n_batch/n_ubatch) materially lifts MoE prefill on NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10 / DGX Spark) — measured on a GB10 with Qwen3-Coder-30B-A3B, the prefill ceiling rises (ub512 ~2994 -> ub2048 ~3316 t/s) and saturates around 2048.

The heuristic lives in core/config alongside the other config overriders (ApplyInferenceDefaults, guessDefaultsFromFile/NGPULayers) — they all fill the ModelConfig from heuristics, so hardware tuning is the same domain and stays in one place. It is parameterized on a GPU descriptor (not direct detection) so it works in both deployment shapes:

- Single host: SetDefaults applies it with the LocalGPU.
- Distributed: only the worker sees the GPU, so the worker reports its compute capability on registration (gpu_compute_capability -> BackendNode), and the router re-applies the SAME core/config heuristic for the SELECTED node before loading — fixing the case where the frontend has no GPU at all.

Explicit `batch:` always wins (only managed default values are touched).

xsysinfo gains NVIDIAComputeCapability() (detection only); all interpretation lives in core/config.

Tests: core/config, pkg/xsysinfo, core/services/nodes.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
23
pkg/xsysinfo/computecap_internal_test.go
Normal file
23
pkg/xsysinfo/computecap_internal_test.go
Normal file
@@ -0,0 +1,23 @@
|
||||
package xsysinfo
|
||||
|
||||
import (
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("parseComputeCap", func() {
|
||||
DescribeTable("splits major.minor",
|
||||
func(in string, maj, min int) {
|
||||
m, n := parseComputeCap(in)
|
||||
Expect(m).To(Equal(maj))
|
||||
Expect(n).To(Equal(min))
|
||||
},
|
||||
Entry("GB10 / DGX Spark", "12.1", 12, 1),
|
||||
Entry("RTX 50-series", "12.0", 12, 0),
|
||||
Entry("Hopper", "9.0", 9, 0),
|
||||
Entry("major only", "12", 12, 0),
|
||||
Entry("whitespace", " 12.1 ", 12, 1),
|
||||
Entry("empty", "", -1, -1),
|
||||
Entry("garbage", "abc", -1, -1),
|
||||
)
|
||||
})
|
||||
@@ -38,9 +38,9 @@ var UnifiedMemoryDevices = []string{
|
||||
|
||||
// GPUMemoryInfo contains real-time GPU memory usage information
|
||||
type GPUMemoryInfo struct {
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
Vendor string `json:"vendor"`
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
Vendor string `json:"vendor"`
|
||||
// BDF is the canonical PCI bus address (dddd:bb:dd.f) when known.
|
||||
// Populated by detection paths that can attribute the device to a
|
||||
// PCI location (clinfo, future amdgpu/nvidia paths); empty for
|
||||
@@ -307,6 +307,84 @@ func GetGPUAggregateInfo() GPUAggregateInfo {
|
||||
return aggregate
|
||||
}
|
||||
|
||||
var (
|
||||
computeCapOnce sync.Once
|
||||
computeCapResult string
|
||||
)
|
||||
|
||||
// NVIDIAComputeCapability returns the highest NVIDIA GPU compute capability on
|
||||
// this host as a "major.minor" string (e.g. "12.1" for GB10 / DGX Spark), or ""
|
||||
// when nvidia-smi is unavailable or reports none. Detected once and cached.
|
||||
//
|
||||
// This runs where the GPU actually is. In distributed mode it is reported by
|
||||
// each worker on registration so the router can make per-node decisions rather
|
||||
// than guessing from the (possibly GPU-less) frontend host.
|
||||
func NVIDIAComputeCapability() string {
|
||||
computeCapOnce.Do(func() {
|
||||
computeCapResult = detectNVIDIAComputeCapability()
|
||||
})
|
||||
return computeCapResult
|
||||
}
|
||||
|
||||
func detectNVIDIAComputeCapability() string {
|
||||
if _, err := exec.LookPath("nvidia-smi"); err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
cmd := exec.Command("nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader")
|
||||
|
||||
var stdout, stderr bytes.Buffer
|
||||
cmd.Stdout = &stdout
|
||||
cmd.Stderr = &stderr
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
xlog.Debug("nvidia-smi compute_cap query failed", "error", err, "stderr", stderr.String())
|
||||
return ""
|
||||
}
|
||||
|
||||
best := ""
|
||||
bestMajor, bestMinor := -1, -1
|
||||
for _, line := range strings.Split(strings.TrimSpace(stdout.String()), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
maj, min := parseComputeCap(line)
|
||||
if maj < 0 {
|
||||
continue
|
||||
}
|
||||
if maj > bestMajor || (maj == bestMajor && min > bestMinor) {
|
||||
bestMajor, bestMinor, best = maj, min, line
|
||||
}
|
||||
}
|
||||
if best != "" {
|
||||
xlog.Debug("NVIDIA compute capability detected", "compute_cap", best)
|
||||
}
|
||||
return best
|
||||
}
|
||||
|
||||
// parseComputeCap breaks a compute-capability string of the form
// "major.minor" into its two integer components. Surrounding whitespace is
// ignored. The minor component is reported as 0 when there is no '.' in the
// input or when the text after the first '.' is not a valid integer.
// It returns (-1, -1) when the input is empty or the major component is not a
// valid integer.
func parseComputeCap(cc string) (int, int) {
	trimmed := strings.TrimSpace(cc)
	if trimmed == "" {
		return -1, -1
	}

	// Split on the first '.' only; a bare major implies minor 0.
	majorText, minorText, hasDot := strings.Cut(trimmed, ".")
	if !hasDot {
		minorText = "0"
	}

	major, majorErr := strconv.Atoi(strings.TrimSpace(majorText))
	if majorErr != nil {
		return -1, -1
	}

	minor, minorErr := strconv.Atoi(strings.TrimSpace(minorText))
	if minorErr != nil {
		// A malformed minor is tolerated and treated as 0.
		return major, 0
	}
	return major, minor
}
|
||||
|
||||
// getNVIDIAGPUMemory queries NVIDIA GPUs using nvidia-smi
|
||||
func getNVIDIAGPUMemory() []GPUMemoryInfo {
|
||||
// Check if nvidia-smi is available
|
||||
@@ -866,12 +944,12 @@ func getVulkanGPUMemory() []GPUMemoryInfo {
|
||||
}
|
||||
|
||||
// vulkanGPUTextInfo accumulates the per-device values collected while parsing
// textual Vulkan GPU memory output.
// NOTE(review): units of the VRAM fields are not visible from this hunk —
// presumably bytes; confirm against the parser.
type vulkanGPUTextInfo struct {
	// Device identity as read from the text output.
	index      int
	name       string
	deviceType string

	// Memory figures for the device.
	totalVRAM  uint64
	budgetVRAM uint64
	usageVRAM  uint64
}
|
||||
|
||||
func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
|
||||
@@ -909,7 +987,7 @@ func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
|
||||
} else if current.usageVRAM != 0 && current.budgetVRAM == 0 {
|
||||
current.budgetVRAM = current.totalVRAM - current.usageVRAM
|
||||
} else if current.usageVRAM == 0 && current.budgetVRAM == 0 {
|
||||
current.usageVRAM = 0
|
||||
current.usageVRAM = 0
|
||||
current.budgetVRAM = current.totalVRAM
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user