feat(config): node-aware hardware defaults — larger physical batch on Blackwell

A larger physical batch (n_batch/n_ubatch) materially lifts MoE prefill on
NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10 / DGX Spark) — measured
on a GB10 with Qwen3-Coder-30B-A3B, the prefill ceiling rises (ub512 ~2994 ->
ub2048 ~3316 t/s) and saturates around 2048.

The heuristic lives in core/config alongside the other config overriders
(ApplyInferenceDefaults, guessDefaultsFromFile/NGPULayers) — they all fill the
ModelConfig from heuristics, so hardware tuning is the same domain and stays in
one place. It is parameterized on a GPU descriptor (not direct detection) so it
works in both deployment shapes:

- Single host: SetDefaults applies it with the LocalGPU.
- Distributed: only the worker sees the GPU, so the worker reports its compute
  capability on registration (gpu_compute_capability -> BackendNode), and the
  router re-applies the SAME core/config heuristic for the SELECTED node before
  loading — fixing the case where the frontend has no GPU at all.

Explicit `batch:` always wins (only managed default values are touched).
xsysinfo gains NVIDIAComputeCapability() (detection only); all interpretation
lives in core/config. Tests: core/config, pkg/xsysinfo, core/services/nodes.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
Ettore Di Giacinto
2026-06-19 22:02:14 +00:00
parent 079ac0e15a
commit bca250e2bd
10 changed files with 390 additions and 32 deletions

View File

@@ -0,0 +1,118 @@
package config
import (
"strconv"
"strings"
"github.com/mudler/LocalAI/pkg/xsysinfo"
"github.com/mudler/xlog"
)
// Hardware-driven model-config defaults.
//
// This sits alongside the other config overriders (ApplyInferenceDefaults for
// model families, guessDefaultsFromFile for GGUF/NGPULayers): they all
// heuristically fill ModelConfig values the user left unset. Hardware tuning is
// the same domain — "adjust the config from the device that will run it" — so
// it lives here rather than scattered into the backend or a separate package.
//
// The heuristics are parameterized on a GPU descriptor (not on direct
// detection) so they apply in both deployment shapes: SetDefaults passes the
// LocalGPU on a single host, and the distributed router passes the *selected
// node's* reported GPU before loading there (the frontend that loaded the
// config may have no GPU at all).
// GPU describes the device that will run a model. It is a plain value
// descriptor so the same heuristics can be fed either from local detection
// (LocalGPU) or from data a remote worker reported about itself.
type GPU struct {
// Vendor is "nvidia", "amd", … (matches xsysinfo vendor constants).
// NOTE: IsNVIDIABlackwell does not consult this field; it decides from
// ComputeCapability alone.
Vendor string
// ComputeCapability is the NVIDIA compute capability as "major.minor"
// (e.g. "12.1" for GB10 / DGX Spark). Empty for non-NVIDIA / unknown.
// An empty or unparseable value makes IsNVIDIABlackwell report false.
ComputeCapability string
// VRAM is total device memory in bytes (0 = unknown).
// LocalGPU leaves this at 0, and no heuristic in this file reads it yet.
VRAM uint64
}
// Physical batch (n_batch / n_ubatch) defaults.
//
// These two values are the complete set that PhysicalBatch can return, and
// IsManagedPhysicalBatch recognizes a batch as heuristic-assigned by comparing
// against exactly these constants.
const (
// DefaultPhysicalBatch is the conservative default when no hardware-specific
// tuning applies. Matches backend.DefaultBatchSize.
DefaultPhysicalBatch = 512
// BlackwellPhysicalBatch is the default on NVIDIA Blackwell consumer GPUs
// (sm_12x: sm_120 RTX 50-series, sm_121 GB10 / DGX Spark). A larger physical
// batch materially lifts MoE prefill there (per-expert GEMM tiles fill
// better); measured on a GB10 with Qwen3-30B-A3B to saturate around 2048.
BlackwellPhysicalBatch = 2048
)
// IsNVIDIABlackwell reports whether the GPU's compute capability has a major
// version of 12 or above. Today that is the NVIDIA Blackwell consumer family
// (sm_12x: sm_120 / sm_121); any later major version matches as well.
// Datacenter Blackwell (B100/B200/GB200, sm_100 / cc 10.0) reports a lower
// major version and is intentionally not matched.
//
// Only ComputeCapability is consulted. An empty or unparseable value parses to
// a negative major version and therefore yields false.
func (g GPU) IsNVIDIABlackwell() bool {
	// Lowest compute-capability major version that receives Blackwell tuning.
	const blackwellMajor = 12

	major, _ := parseComputeCapability(g.ComputeCapability)
	return major >= blackwellMajor
}
// PhysicalBatch picks the canonical physical batch (n_batch/n_ubatch) for the
// given hardware. It is the value to use when the model config leaves batch
// unset: BlackwellPhysicalBatch when g.IsNVIDIABlackwell() holds, otherwise
// DefaultPhysicalBatch.
func PhysicalBatch(g GPU) int {
	batch := DefaultPhysicalBatch
	if g.IsNVIDIABlackwell() {
		batch = BlackwellPhysicalBatch
	}
	return batch
}
// IsManagedPhysicalBatch reports whether n is one of the values PhysicalBatch
// can assign (DefaultPhysicalBatch or BlackwellPhysicalBatch).
//
// Callers that re-tune a value chosen by an upstream host (the distributed
// router correcting the frontend's guess) use it to leave an explicit user
// batch such as 1024 untouched. The check is purely by value: an explicit user
// batch that happens to equal one of the managed defaults cannot be told apart
// from a heuristic-assigned one.
func IsManagedPhysicalBatch(n int) bool {
	switch n {
	case DefaultPhysicalBatch, BlackwellPhysicalBatch:
		return true
	default:
		return false
	}
}
// LocalGPU assembles a GPU descriptor from what xsysinfo detects on this host.
// SetDefaults uses it on a single host; the distributed router instead builds
// the descriptor from the selected node's reported info.
//
// Only Vendor and ComputeCapability are populated; VRAM stays 0 (unknown).
func LocalGPU() GPU {
	var gpu GPU
	// Best effort: the detection error is dropped and whatever vendor string
	// came back is used as-is.
	gpu.Vendor, _ = xsysinfo.DetectGPUVendor()
	gpu.ComputeCapability = xsysinfo.NVIDIAComputeCapability()
	return gpu
}
// ApplyHardwareDefaults fills in ModelConfig values that depend on the target
// GPU and that the user left unset. At present the only such value is the
// physical batch, which is raised to BlackwellPhysicalBatch on Blackwell.
//
// It is a no-op when cfg is nil, when cfg.Batch is already non-zero (explicit
// config always wins), or when gpu is not Blackwell — in which case Batch is
// left at zero rather than being set to DefaultPhysicalBatch.
func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
	if cfg == nil || cfg.Batch != 0 || !gpu.IsNVIDIABlackwell() {
		return
	}

	cfg.Batch = BlackwellPhysicalBatch
	xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
		"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
}
// parseComputeCapability splits a "major.minor" compute-capability string into
// its integer parts.
//
// Surrounding whitespace (around the whole string and around each part) is
// ignored. The minor part is optional: when it is missing, unparseable, or
// negative it falls back to 0, so "12" yields (12, 0).
//
// Returns (-1, -1) when the input is empty or the major part is not a
// non-negative integer, so callers can rely on a negative major meaning
// "unknown".
func parseComputeCapability(cc string) (int, int) {
	majorStr, minorStr, _ := strings.Cut(strings.TrimSpace(cc), ".")

	major, err := strconv.Atoi(strings.TrimSpace(majorStr))
	if err != nil || major < 0 {
		// Covers the empty string too: Atoi("") fails.
		return -1, -1
	}

	// A bad minor does not invalidate a good major; treat it as ".0".
	minor, err := strconv.Atoi(strings.TrimSpace(minorStr))
	if err != nil || minor < 0 {
		minor = 0
	}
	return major, minor
}

View File

@@ -0,0 +1,59 @@
package config_test
import (
. "github.com/mudler/LocalAI/core/config"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Specs for the hardware-driven defaults in core/config: Blackwell detection
// from a compute-capability string, the batch value chosen per GPU, and the
// rule that ApplyHardwareDefaults only fills an unset batch.
var _ = Describe("Hardware-driven config defaults", func() {
// Detection is driven by ComputeCapability alone: every entry builds a GPU
// with no Vendor set. Major version 12 and above match; 10.0 and below and
// the empty string do not.
DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
func(cc string, want bool) {
Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
},
Entry("GB10 12.1", "12.1", true),
Entry("RTX 50 12.0", "12.0", true),
Entry("future 13.0", "13.0", true),
Entry("Hopper 9.0", "9.0", false),
Entry("Ada 8.9", "8.9", false),
Entry("datacenter Blackwell sm_100 10.0", "10.0", false),
Entry("unknown", "", false),
)
Describe("PhysicalBatch / IsManagedPhysicalBatch", func() {
It("returns the Blackwell batch on Blackwell", func() {
Expect(PhysicalBatch(GPU{ComputeCapability: "12.1"})).To(Equal(BlackwellPhysicalBatch))
})
It("returns the default batch otherwise", func() {
Expect(PhysicalBatch(GPU{ComputeCapability: "9.0"})).To(Equal(DefaultPhysicalBatch))
// The zero-value GPU (nothing detected) also gets the default.
Expect(PhysicalBatch(GPU{})).To(Equal(DefaultPhysicalBatch))
})
It("recognizes managed defaults but not explicit values", func() {
Expect(IsManagedPhysicalBatch(DefaultPhysicalBatch)).To(BeTrue())
Expect(IsManagedPhysicalBatch(BlackwellPhysicalBatch)).To(BeTrue())
// 1024 is not a value PhysicalBatch ever returns.
Expect(IsManagedPhysicalBatch(1024)).To(BeFalse())
})
})
Describe("ApplyHardwareDefaults", func() {
It("raises an unset batch to 2048 on Blackwell", func() {
cfg := &ModelConfig{}
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
})
// On non-Blackwell the field is left at zero, not set to the default.
It("leaves batch unset on non-Blackwell", func() {
cfg := &ModelConfig{}
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
Expect(cfg.Batch).To(Equal(0))
})
It("never overrides an explicit batch", func() {
cfg := &ModelConfig{}
cfg.Batch = 1024
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
Expect(cfg.Batch).To(Equal(1024))
})
It("no-ops on nil", func() {
Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
})
})
})

View File

@@ -1111,6 +1111,11 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
// This ensures gallery-installed and runtime-loaded models get optimal parameters.
ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
// Uses the local GPU here; in distributed mode the router re-applies the same
// heuristics for the selected node's GPU before loading. Explicit config wins.
ApplyHardwareDefaults(cfg, LocalGPU())
// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
defaultTopP := 0.95
defaultTopK := 40

View File

@@ -70,17 +70,20 @@ func GetNodeEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
// RegisterNodeRequest is the request body for registering a new worker node.
type RegisterNodeRequest struct {
Name string `json:"name"`
NodeType string `json:"node_type,omitempty"` // "backend" (default) or "agent"
Address string `json:"address"`
HTTPAddress string `json:"http_address,omitempty"`
Token string `json:"token,omitempty"`
TotalVRAM uint64 `json:"total_vram,omitempty"`
AvailableVRAM uint64 `json:"available_vram,omitempty"`
TotalRAM uint64 `json:"total_ram,omitempty"`
AvailableRAM uint64 `json:"available_ram,omitempty"`
GPUVendor string `json:"gpu_vendor,omitempty"`
Labels map[string]string `json:"labels,omitempty"`
Name string `json:"name"`
NodeType string `json:"node_type,omitempty"` // "backend" (default) or "agent"
Address string `json:"address"`
HTTPAddress string `json:"http_address,omitempty"`
Token string `json:"token,omitempty"`
TotalVRAM uint64 `json:"total_vram,omitempty"`
AvailableVRAM uint64 `json:"available_vram,omitempty"`
TotalRAM uint64 `json:"total_ram,omitempty"`
AvailableRAM uint64 `json:"available_ram,omitempty"`
GPUVendor string `json:"gpu_vendor,omitempty"`
// GPUComputeCapability is the worker GPU's compute capability ("major.minor",
// e.g. "12.1" for GB10). Used by the router for per-arch option tuning.
GPUComputeCapability string `json:"gpu_compute_capability,omitempty"`
Labels map[string]string `json:"labels,omitempty"`
// MaxReplicasPerModel is the per-node cap on replicas of any single model.
// Workers older than this field omit it; we coerce 0 → 1 below to preserve
// historical single-replica behavior.
@@ -152,17 +155,18 @@ func RegisterNodeEndpoint(registry *nodes.NodeRegistry, expectedToken string, au
}
node := &nodes.BackendNode{
Name: req.Name,
NodeType: nodeType,
Address: req.Address,
HTTPAddress: req.HTTPAddress,
TokenHash: tokenHash,
TotalVRAM: req.TotalVRAM,
AvailableVRAM: req.AvailableVRAM,
TotalRAM: req.TotalRAM,
AvailableRAM: req.AvailableRAM,
GPUVendor: req.GPUVendor,
MaxReplicasPerModel: maxReplicasPerModel,
Name: req.Name,
NodeType: nodeType,
Address: req.Address,
HTTPAddress: req.HTTPAddress,
TokenHash: tokenHash,
TotalVRAM: req.TotalVRAM,
AvailableVRAM: req.AvailableVRAM,
TotalRAM: req.TotalRAM,
AvailableRAM: req.AvailableRAM,
GPUVendor: req.GPUVendor,
GPUComputeCapability: req.GPUComputeCapability,
MaxReplicasPerModel: maxReplicasPerModel,
}
ctx := c.Request().Context()

View File

@@ -36,6 +36,11 @@ type BackendNode struct {
TotalRAM uint64 `gorm:"column:total_ram" json:"total_ram"` // Total system RAM in bytes (fallback when no GPU)
AvailableRAM uint64 `gorm:"column:available_ram" json:"available_ram"` // Available system RAM in bytes
GPUVendor string `gorm:"column:gpu_vendor;size:32" json:"gpu_vendor"` // nvidia, amd, intel, vulkan, unknown
// GPUComputeCapability is the worker GPU's compute capability as
// "major.minor" (e.g. "12.1" for GB10 / DGX Spark). Reported by the worker
// on registration; used by the router to pick per-arch options (e.g. a
// larger physical batch on Blackwell). Empty when unknown / non-NVIDIA.
GPUComputeCapability string `gorm:"column:gpu_compute_capability;size:16" json:"gpu_compute_capability"`
// MaxReplicasPerModel caps how many replicas of any one model can run on
// this node concurrently. Default 1 preserves the historical "one
// (node, model)" assumption; set higher (via worker --max-replicas-per-model)
@@ -69,6 +74,7 @@ const (
ColReservedVRAM = "reserved_vram"
ColAvailableRAM = "available_ram"
ColGPUVendor = "gpu_vendor"
ColGPUComputeCap = "gpu_compute_capability"
ColLastHeartbeat = "last_heartbeat"
ColMaxReplicasPerModel = "max_replicas_per_model"
)

View File

@@ -12,6 +12,7 @@ import (
"strings"
"time"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/services/advisorylock"
"github.com/mudler/LocalAI/core/services/nodes/prefixcache"
"github.com/mudler/LocalAI/pkg/distributedhdr"
@@ -138,6 +139,27 @@ type scheduleLoadResult struct {
ReplicaIndex int
}
// applyNodeHardwareDefaults adapts node-agnostic ModelOptions to the GPU of the
// node that was actually selected to run the model, reusing the hardware
// heuristics from core/config that single-host config loading uses.
//
// When opts.NBatch currently holds a managed default it is replaced by
// config.PhysicalBatch for the node's GPU: raised on a Blackwell node, reset to
// the conservative default otherwise (undoing a higher guess made upstream).
// Any other NBatch value — an explicit user batch such as 1024, and also 0 — is
// left untouched. Nil opts or nil node is a no-op.
func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
	if opts == nil || node == nil {
		return
	}

	// Only re-tune values the heuristics themselves could have produced.
	if current := int(opts.NBatch); !config.IsManagedPhysicalBatch(current) {
		return
	}

	target := config.GPU{
		Vendor:            node.GPUVendor,
		ComputeCapability: node.GPUComputeCapability,
		VRAM:              node.TotalVRAM,
	}
	opts.NBatch = int32(config.PhysicalBatch(target))
}
// scheduleAndLoad is the shared core for loading a model on a new node.
// Used by both Route() (for first-time loads) and ScheduleAndLoadModel() (for reconciler scale-ups).
//
@@ -153,6 +175,11 @@ func (r *SmartRouter) scheduleAndLoad(ctx context.Context, backendType, tracking
return nil, fmt.Errorf("no available nodes: %w", err)
}
// Tune node-agnostic options to the SELECTED node's GPU. Only now do we know
// which node (and its compute capability) will run the model — the frontend
// that built modelOpts may have no GPU at all in distributed mode.
applyNodeHardwareDefaults(modelOpts, node)
// Pre-stage model files via FileStager before loading
loadOpts := modelOpts
if r.fileStager != nil && modelOpts != nil {

View File

@@ -0,0 +1,33 @@
package nodes
import (
"github.com/mudler/LocalAI/core/config"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Specs for the router-side re-tuning of ModelOptions.NBatch against the
// selected node's reported compute capability.
var _ = Describe("applyNodeHardwareDefaults", func() {
It("raises a managed default batch on a Blackwell node", func() {
// Incoming value is the conservative managed default; node is cc 12.1.
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch}
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"})
Expect(opts.NBatch).To(BeEquivalentTo(config.BlackwellPhysicalBatch))
})
It("resets a Blackwell guess on a non-Blackwell node", func() {
// The frontend (Blackwell) guessed high, but the selected node is not
// Blackwell, so the managed value drops back to the default.
opts := &pb.ModelOptions{NBatch: config.BlackwellPhysicalBatch}
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "9.0"})
Expect(opts.NBatch).To(BeEquivalentTo(config.DefaultPhysicalBatch))
})
It("never overrides an explicit (non-managed) batch", func() {
// 1024 is not a managed value, so it survives even on a Blackwell node.
opts := &pb.ModelOptions{NBatch: 1024}
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"})
Expect(opts.NBatch).To(BeEquivalentTo(int32(1024)))
})
It("no-ops on nil inputs", func() {
Expect(func() { applyNodeHardwareDefaults(nil, nil) }).ToNot(Panic())
})
})

View File

@@ -73,6 +73,10 @@ func (cfg *Config) registrationBody() map[string]any {
// Detect GPU info for VRAM-aware scheduling
totalVRAM, _ := xsysinfo.TotalAvailableVRAM()
gpuVendor, _ := xsysinfo.DetectGPUVendor()
// Compute capability (e.g. "12.1" for GB10) lets the router pick per-arch
// options (e.g. larger physical batch on Blackwell). Detected on the worker
// because only the worker sees the GPU in distributed mode.
gpuComputeCap := xsysinfo.NVIDIAComputeCapability()
maxReplicas := cfg.MaxReplicasPerModel
if maxReplicas < 1 {
@@ -85,6 +89,7 @@ func (cfg *Config) registrationBody() map[string]any {
"total_vram": totalVRAM,
"available_vram": totalVRAM, // initially all VRAM is available
"gpu_vendor": gpuVendor,
"gpu_compute_capability": gpuComputeCap,
"max_replicas_per_model": maxReplicas,
}

View File

@@ -0,0 +1,23 @@
package xsysinfo
import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
)
// Table-driven specs for parseComputeCap: each entry is an input string and
// the (major, minor) pair it must parse to. (-1, -1) marks unparseable input.
var _ = Describe("parseComputeCap", func() {
DescribeTable("splits major.minor",
func(in string, maj, min int) {
m, n := parseComputeCap(in)
Expect(m).To(Equal(maj))
Expect(n).To(Equal(min))
},
Entry("GB10 / DGX Spark", "12.1", 12, 1),
Entry("RTX 50-series", "12.0", 12, 0),
Entry("Hopper", "9.0", 9, 0),
// A missing minor part defaults to 0.
Entry("major only", "12", 12, 0),
// Surrounding whitespace is trimmed before parsing.
Entry("whitespace", " 12.1 ", 12, 1),
Entry("empty", "", -1, -1),
Entry("garbage", "abc", -1, -1),
)
})

View File

@@ -38,9 +38,9 @@ var UnifiedMemoryDevices = []string{
// GPUMemoryInfo contains real-time GPU memory usage information
type GPUMemoryInfo struct {
Index int `json:"index"`
Name string `json:"name"`
Vendor string `json:"vendor"`
Index int `json:"index"`
Name string `json:"name"`
Vendor string `json:"vendor"`
// BDF is the canonical PCI bus address (dddd:bb:dd.f) when known.
// Populated by detection paths that can attribute the device to a
// PCI location (clinfo, future amdgpu/nvidia paths); empty for
@@ -307,6 +307,84 @@ func GetGPUAggregateInfo() GPUAggregateInfo {
return aggregate
}
// computeCapOnce guards the single detection run performed by
// NVIDIAComputeCapability; computeCapResult holds what that run produced.
var (
	computeCapOnce   sync.Once
	computeCapResult string
)

// NVIDIAComputeCapability returns the highest NVIDIA GPU compute capability on
// this host as a "major.minor" string (e.g. "12.1" for GB10 / DGX Spark), or ""
// when nvidia-smi is unavailable or reports none.
//
// Detection runs on the first call only and the outcome — including an empty
// one — is cached for later calls.
//
// This runs where the GPU actually is. In distributed mode each worker reports
// it on registration so the router can decide per node instead of guessing
// from the (possibly GPU-less) frontend host.
func NVIDIAComputeCapability() string {
	detect := func() { computeCapResult = detectNVIDIAComputeCapability() }
	computeCapOnce.Do(detect)
	return computeCapResult
}
// detectNVIDIAComputeCapability runs
// `nvidia-smi --query-gpu=compute_cap --format=csv,noheader` and returns the
// highest compute capability among the reported lines, as the trimmed text of
// that line. It returns "" when nvidia-smi is not on PATH, when the command
// fails, or when no line parses to a non-negative major version.
func detectNVIDIAComputeCapability() string {
	if _, lookErr := exec.LookPath("nvidia-smi"); lookErr != nil {
		return ""
	}

	var outBuf, errBuf bytes.Buffer
	query := exec.Command("nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader")
	query.Stdout, query.Stderr = &outBuf, &errBuf
	if runErr := query.Run(); runErr != nil {
		xlog.Debug("nvidia-smi compute_cap query failed", "error", runErr, "stderr", errBuf.String())
		return ""
	}

	// One line per GPU; keep the numerically greatest (major, then minor).
	var (
		highest  string
		topMajor = -1
		topMinor = -1
	)
	for _, raw := range strings.Split(strings.TrimSpace(outBuf.String()), "\n") {
		entry := strings.TrimSpace(raw)
		if entry == "" {
			continue
		}
		major, minor := parseComputeCap(entry)
		if major < 0 {
			// Unparseable line: ignore it rather than fail the whole query.
			continue
		}
		switch {
		case major > topMajor, major == topMajor && minor > topMinor:
			highest, topMajor, topMinor = entry, major, minor
		}
	}

	if highest == "" {
		return ""
	}
	xlog.Debug("NVIDIA compute capability detected", "compute_cap", highest)
	return highest
}
// parseComputeCap splits a "major.minor" compute-capability string into its
// integer parts.
//
// Surrounding whitespace (around the whole string and around each part) is
// ignored. The minor part is optional: when it is missing, unparseable, or
// negative it falls back to 0, so "12" yields (12, 0).
//
// Returns (-1, -1) if the input is empty or the major part is not a
// non-negative integer, so callers can rely on a negative major meaning
// "unknown".
func parseComputeCap(cc string) (int, int) {
	majorStr, minorStr, _ := strings.Cut(strings.TrimSpace(cc), ".")

	major, err := strconv.Atoi(strings.TrimSpace(majorStr))
	if err != nil || major < 0 {
		// Covers the empty string too: Atoi("") fails.
		return -1, -1
	}

	// A bad minor does not invalidate a good major; treat it as ".0".
	minor, err := strconv.Atoi(strings.TrimSpace(minorStr))
	if err != nil || minor < 0 {
		minor = 0
	}
	return major, minor
}
// getNVIDIAGPUMemory queries NVIDIA GPUs using nvidia-smi
func getNVIDIAGPUMemory() []GPUMemoryInfo {
// Check if nvidia-smi is available
@@ -866,12 +944,12 @@ func getVulkanGPUMemory() []GPUMemoryInfo {
}
type vulkanGPUTextInfo struct {
index int
name string
deviceType string
totalVRAM uint64
budgetVRAM uint64
usageVRAM uint64
index int
name string
deviceType string
totalVRAM uint64
budgetVRAM uint64
usageVRAM uint64
}
func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
@@ -909,7 +987,7 @@ func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
} else if current.usageVRAM != 0 && current.budgetVRAM == 0 {
current.budgetVRAM = current.totalVRAM - current.usageVRAM
} else if current.usageVRAM == 0 && current.budgetVRAM == 0 {
current.usageVRAM = 0
current.usageVRAM = 0
current.budgetVRAM = current.totalVRAM
}