mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-19 22:29:10 -04:00
feat(config): node-aware hardware defaults — larger physical batch on Blackwell
A larger physical batch (n_batch/n_ubatch) materially lifts MoE prefill on NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10 / DGX Spark). Measured on a GB10 with Qwen3-Coder-30B-A3B, the prefill ceiling rises (ub512 ~2994 -> ub2048 ~3316 t/s) and saturates around 2048.

The heuristic lives in core/config alongside the other config overriders (ApplyInferenceDefaults, guessDefaultsFromFile/NGPULayers): they all fill the ModelConfig from heuristics, so hardware tuning is the same domain and stays in one place.

It is parameterized on a GPU descriptor (not direct detection) so it works in both deployment shapes:
- Single host: SetDefaults applies it with the LocalGPU.
- Distributed: only the worker sees the GPU, so the worker reports its compute capability on registration (gpu_compute_capability -> BackendNode), and the router re-applies the SAME core/config heuristic for the SELECTED node before loading. This fixes the case where the frontend has no GPU at all.

Explicit `batch:` always wins (only managed default values are touched).

xsysinfo gains NVIDIAComputeCapability() (detection only); all interpretation lives in core/config.

Tests: core/config, pkg/xsysinfo, core/services/nodes.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
118
core/config/hardware_defaults.go
Normal file
118
core/config/hardware_defaults.go
Normal file
@@ -0,0 +1,118 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/xsysinfo"
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
|
||||
// Hardware-driven model-config defaults.
|
||||
//
|
||||
// This sits alongside the other config overriders (ApplyInferenceDefaults for
|
||||
// model families, guessDefaultsFromFile for GGUF/NGPULayers): they all
|
||||
// heuristically fill ModelConfig values the user left unset. Hardware tuning is
|
||||
// the same domain — "adjust the config from the device that will run it" — so
|
||||
// it lives here rather than scattered into the backend or a separate package.
|
||||
//
|
||||
// The heuristics are parameterized on a GPU descriptor (not on direct
|
||||
// detection) so they apply in both deployment shapes: SetDefaults passes the
|
||||
// LocalGPU on a single host, and the distributed router passes the *selected
|
||||
// node's* reported GPU before loading there (the frontend that loaded the
|
||||
// config may have no GPU at all).
|
||||
|
||||
// GPU is a plain-value descriptor of the device a model will run on. Because
// it carries no detection logic, it can be filled from local probing or from
// values reported by another host.
type GPU struct {
	// Vendor names the GPU vendor, e.g. "nvidia" or "amd". It is meant to line
	// up with the vendor constants in xsysinfo.
	Vendor string

	// ComputeCapability is the NVIDIA compute capability written as
	// "major.minor" (for example "12.1" on GB10 / DGX Spark). It stays empty
	// for non-NVIDIA devices or when the value is unknown.
	ComputeCapability string

	// VRAM is the total device memory in bytes; 0 means unknown.
	VRAM uint64
}
|
||||
|
||||
// Default values for the physical batch size (n_batch / n_ubatch).
const (
	// DefaultPhysicalBatch is the conservative fallback used whenever no
	// hardware-specific tuning applies. It is intended to stay in step with
	// backend.DefaultBatchSize, which is defined outside this file.
	DefaultPhysicalBatch = 512

	// BlackwellPhysicalBatch is the value chosen for NVIDIA Blackwell consumer
	// GPUs (sm_12x: sm_120 RTX 50-series, sm_121 GB10 / DGX Spark). On that
	// hardware a larger physical batch noticeably improves MoE prefill because
	// the per-expert GEMM tiles fill better; measurements on a GB10 with a
	// Qwen3 30B-A3B MoE model showed the gain saturating around 2048.
	BlackwellPhysicalBatch = 2048
)
|
||||
|
||||
// IsNVIDIABlackwell reports whether the GPU is in the NVIDIA Blackwell consumer
|
||||
// family (sm_12x). Datacenter Blackwell (B100/B200/GB200, sm_100 / cc 10.0)
|
||||
// reports a different compute capability and is intentionally not matched.
|
||||
func (g GPU) IsNVIDIABlackwell() bool {
|
||||
maj, _ := parseComputeCapability(g.ComputeCapability)
|
||||
return maj >= 12
|
||||
}
|
||||
|
||||
// PhysicalBatch returns the canonical physical batch (n_batch/n_ubatch) for the
|
||||
// given hardware, used when the model config leaves batch unset.
|
||||
func PhysicalBatch(g GPU) int {
|
||||
if g.IsNVIDIABlackwell() {
|
||||
return BlackwellPhysicalBatch
|
||||
}
|
||||
return DefaultPhysicalBatch
|
||||
}
|
||||
|
||||
// IsManagedPhysicalBatch reports whether n is a value PhysicalBatch assigns.
|
||||
// Callers that re-tune a value chosen by an upstream host (the distributed
|
||||
// router correcting the frontend's guess) use this to avoid clobbering an
|
||||
// explicit user batch such as 1024.
|
||||
func IsManagedPhysicalBatch(n int) bool {
|
||||
return n == DefaultPhysicalBatch || n == BlackwellPhysicalBatch
|
||||
}
|
||||
|
||||
// LocalGPU builds a GPU descriptor from local detection. Used by SetDefaults on
|
||||
// a single host; the distributed router builds the descriptor from the selected
|
||||
// node's reported info instead.
|
||||
func LocalGPU() GPU {
|
||||
vendor, _ := xsysinfo.DetectGPUVendor()
|
||||
return GPU{
|
||||
Vendor: vendor,
|
||||
ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
|
||||
}
|
||||
}
|
||||
|
||||
// ApplyHardwareDefaults fills ModelConfig values that depend on the target GPU
|
||||
// and were left unset by the user. Currently: a larger physical batch on
|
||||
// Blackwell. Explicit config always wins (we only touch zero values).
|
||||
func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
|
||||
if cfg == nil {
|
||||
return
|
||||
}
|
||||
if cfg.Batch == 0 && gpu.IsNVIDIABlackwell() {
|
||||
cfg.Batch = BlackwellPhysicalBatch
|
||||
xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
|
||||
"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
|
||||
}
|
||||
}
|
||||
|
||||
// parseComputeCapability splits a compute-capability string of the form
// "major.minor" into its integer parts. Surrounding whitespace is ignored.
//
// It returns (-1, -1) when the major part is missing, is not an integer, or is
// negative. The minor part is lenient: when it is absent ("12"), not an
// integer, or negative, it is reported as 0 so that a usable major version is
// never discarded.
func parseComputeCapability(cc string) (int, int) {
	// Only the first '.' separates the parts; anything after it belongs to the
	// minor component.
	majorStr, minorStr, _ := strings.Cut(strings.TrimSpace(cc), ".")

	major, err := strconv.Atoi(strings.TrimSpace(majorStr))
	if err != nil || major < 0 {
		return -1, -1
	}

	minor, err := strconv.Atoi(strings.TrimSpace(minorStr))
	if err != nil || minor < 0 {
		minor = 0
	}
	return major, minor
}
|
||||
59
core/config/hardware_defaults_test.go
Normal file
59
core/config/hardware_defaults_test.go
Normal file
@@ -0,0 +1,59 @@
|
||||
package config_test
|
||||
|
||||
import (
|
||||
. "github.com/mudler/LocalAI/core/config"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("Hardware-driven config defaults", func() {
|
||||
DescribeTable("GPU.IsNVIDIABlackwell (sm_12x consumer family)",
|
||||
func(cc string, want bool) {
|
||||
Expect(GPU{ComputeCapability: cc}.IsNVIDIABlackwell()).To(Equal(want))
|
||||
},
|
||||
Entry("GB10 12.1", "12.1", true),
|
||||
Entry("RTX 50 12.0", "12.0", true),
|
||||
Entry("future 13.0", "13.0", true),
|
||||
Entry("Hopper 9.0", "9.0", false),
|
||||
Entry("Ada 8.9", "8.9", false),
|
||||
Entry("datacenter Blackwell sm_100 10.0", "10.0", false),
|
||||
Entry("unknown", "", false),
|
||||
)
|
||||
|
||||
Describe("PhysicalBatch / IsManagedPhysicalBatch", func() {
|
||||
It("returns the Blackwell batch on Blackwell", func() {
|
||||
Expect(PhysicalBatch(GPU{ComputeCapability: "12.1"})).To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
It("returns the default batch otherwise", func() {
|
||||
Expect(PhysicalBatch(GPU{ComputeCapability: "9.0"})).To(Equal(DefaultPhysicalBatch))
|
||||
Expect(PhysicalBatch(GPU{})).To(Equal(DefaultPhysicalBatch))
|
||||
})
|
||||
It("recognizes managed defaults but not explicit values", func() {
|
||||
Expect(IsManagedPhysicalBatch(DefaultPhysicalBatch)).To(BeTrue())
|
||||
Expect(IsManagedPhysicalBatch(BlackwellPhysicalBatch)).To(BeTrue())
|
||||
Expect(IsManagedPhysicalBatch(1024)).To(BeFalse())
|
||||
})
|
||||
})
|
||||
|
||||
Describe("ApplyHardwareDefaults", func() {
|
||||
It("raises an unset batch to 2048 on Blackwell", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
||||
Expect(cfg.Batch).To(Equal(BlackwellPhysicalBatch))
|
||||
})
|
||||
It("leaves batch unset on non-Blackwell", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "9.0"})
|
||||
Expect(cfg.Batch).To(Equal(0))
|
||||
})
|
||||
It("never overrides an explicit batch", func() {
|
||||
cfg := &ModelConfig{}
|
||||
cfg.Batch = 1024
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1"})
|
||||
Expect(cfg.Batch).To(Equal(1024))
|
||||
})
|
||||
It("no-ops on nil", func() {
|
||||
Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
|
||||
})
|
||||
})
|
||||
})
|
||||
@@ -1111,6 +1111,11 @@ func (cfg *ModelConfig) SetDefaults(opts ...ConfigLoaderOption) {
|
||||
// This ensures gallery-installed and runtime-loaded models get optimal parameters.
|
||||
ApplyInferenceDefaults(cfg, cfg.Name, cfg.Model)
|
||||
|
||||
// Apply hardware-driven defaults (e.g. a larger physical batch on Blackwell).
|
||||
// Uses the local GPU here; in distributed mode the router re-applies the same
|
||||
// heuristics for the selected node's GPU before loading. Explicit config wins.
|
||||
ApplyHardwareDefaults(cfg, LocalGPU())
|
||||
|
||||
// https://github.com/ggerganov/llama.cpp/blob/75cd4c77292034ecec587ecb401366f57338f7c0/common/sampling.h#L22
|
||||
defaultTopP := 0.95
|
||||
defaultTopK := 40
|
||||
|
||||
@@ -70,17 +70,20 @@ func GetNodeEndpoint(registry *nodes.NodeRegistry) echo.HandlerFunc {
|
||||
|
||||
// RegisterNodeRequest is the request body for registering a new worker node.
|
||||
type RegisterNodeRequest struct {
|
||||
Name string `json:"name"`
|
||||
NodeType string `json:"node_type,omitempty"` // "backend" (default) or "agent"
|
||||
Address string `json:"address"`
|
||||
HTTPAddress string `json:"http_address,omitempty"`
|
||||
Token string `json:"token,omitempty"`
|
||||
TotalVRAM uint64 `json:"total_vram,omitempty"`
|
||||
AvailableVRAM uint64 `json:"available_vram,omitempty"`
|
||||
TotalRAM uint64 `json:"total_ram,omitempty"`
|
||||
AvailableRAM uint64 `json:"available_ram,omitempty"`
|
||||
GPUVendor string `json:"gpu_vendor,omitempty"`
|
||||
Labels map[string]string `json:"labels,omitempty"`
|
||||
Name string `json:"name"`
|
||||
NodeType string `json:"node_type,omitempty"` // "backend" (default) or "agent"
|
||||
Address string `json:"address"`
|
||||
HTTPAddress string `json:"http_address,omitempty"`
|
||||
Token string `json:"token,omitempty"`
|
||||
TotalVRAM uint64 `json:"total_vram,omitempty"`
|
||||
AvailableVRAM uint64 `json:"available_vram,omitempty"`
|
||||
TotalRAM uint64 `json:"total_ram,omitempty"`
|
||||
AvailableRAM uint64 `json:"available_ram,omitempty"`
|
||||
GPUVendor string `json:"gpu_vendor,omitempty"`
|
||||
// GPUComputeCapability is the worker GPU's compute capability ("major.minor",
|
||||
// e.g. "12.1" for GB10). Used by the router for per-arch option tuning.
|
||||
GPUComputeCapability string `json:"gpu_compute_capability,omitempty"`
|
||||
Labels map[string]string `json:"labels,omitempty"`
|
||||
// MaxReplicasPerModel is the per-node cap on replicas of any single model.
|
||||
// Workers older than this field omit it; we coerce 0 → 1 below to preserve
|
||||
// historical single-replica behavior.
|
||||
@@ -152,17 +155,18 @@ func RegisterNodeEndpoint(registry *nodes.NodeRegistry, expectedToken string, au
|
||||
}
|
||||
|
||||
node := &nodes.BackendNode{
|
||||
Name: req.Name,
|
||||
NodeType: nodeType,
|
||||
Address: req.Address,
|
||||
HTTPAddress: req.HTTPAddress,
|
||||
TokenHash: tokenHash,
|
||||
TotalVRAM: req.TotalVRAM,
|
||||
AvailableVRAM: req.AvailableVRAM,
|
||||
TotalRAM: req.TotalRAM,
|
||||
AvailableRAM: req.AvailableRAM,
|
||||
GPUVendor: req.GPUVendor,
|
||||
MaxReplicasPerModel: maxReplicasPerModel,
|
||||
Name: req.Name,
|
||||
NodeType: nodeType,
|
||||
Address: req.Address,
|
||||
HTTPAddress: req.HTTPAddress,
|
||||
TokenHash: tokenHash,
|
||||
TotalVRAM: req.TotalVRAM,
|
||||
AvailableVRAM: req.AvailableVRAM,
|
||||
TotalRAM: req.TotalRAM,
|
||||
AvailableRAM: req.AvailableRAM,
|
||||
GPUVendor: req.GPUVendor,
|
||||
GPUComputeCapability: req.GPUComputeCapability,
|
||||
MaxReplicasPerModel: maxReplicasPerModel,
|
||||
}
|
||||
|
||||
ctx := c.Request().Context()
|
||||
|
||||
@@ -36,6 +36,11 @@ type BackendNode struct {
|
||||
TotalRAM uint64 `gorm:"column:total_ram" json:"total_ram"` // Total system RAM in bytes (fallback when no GPU)
|
||||
AvailableRAM uint64 `gorm:"column:available_ram" json:"available_ram"` // Available system RAM in bytes
|
||||
GPUVendor string `gorm:"column:gpu_vendor;size:32" json:"gpu_vendor"` // nvidia, amd, intel, vulkan, unknown
|
||||
// GPUComputeCapability is the worker GPU's compute capability as
|
||||
// "major.minor" (e.g. "12.1" for GB10 / DGX Spark). Reported by the worker
|
||||
// on registration; used by the router to pick per-arch options (e.g. a
|
||||
// larger physical batch on Blackwell). Empty when unknown / non-NVIDIA.
|
||||
GPUComputeCapability string `gorm:"column:gpu_compute_capability;size:16" json:"gpu_compute_capability"`
|
||||
// MaxReplicasPerModel caps how many replicas of any one model can run on
|
||||
// this node concurrently. Default 1 preserves the historical "one
|
||||
// (node, model)" assumption; set higher (via worker --max-replicas-per-model)
|
||||
@@ -69,6 +74,7 @@ const (
|
||||
ColReservedVRAM = "reserved_vram"
|
||||
ColAvailableRAM = "available_ram"
|
||||
ColGPUVendor = "gpu_vendor"
|
||||
ColGPUComputeCap = "gpu_compute_capability"
|
||||
ColLastHeartbeat = "last_heartbeat"
|
||||
ColMaxReplicasPerModel = "max_replicas_per_model"
|
||||
)
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/services/advisorylock"
|
||||
"github.com/mudler/LocalAI/core/services/nodes/prefixcache"
|
||||
"github.com/mudler/LocalAI/pkg/distributedhdr"
|
||||
@@ -138,6 +139,27 @@ type scheduleLoadResult struct {
|
||||
ReplicaIndex int
|
||||
}
|
||||
|
||||
// applyNodeHardwareDefaults tunes node-agnostic ModelOptions to the GPU of the
|
||||
// node that was actually selected to run the model, reusing the same hardware
|
||||
// heuristics as single-host config loading (core/config). On Blackwell it
|
||||
// raises the physical batch; on non-Blackwell it resets a hardware-default that
|
||||
// an upstream host (the GPU-less frontend in distributed mode) guessed higher.
|
||||
// Only values the heuristics themselves manage are touched, so an explicit user
|
||||
// batch (e.g. 1024) is never overridden.
|
||||
func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
|
||||
if opts == nil || node == nil {
|
||||
return
|
||||
}
|
||||
if !config.IsManagedPhysicalBatch(int(opts.NBatch)) {
|
||||
return
|
||||
}
|
||||
opts.NBatch = int32(config.PhysicalBatch(config.GPU{
|
||||
Vendor: node.GPUVendor,
|
||||
ComputeCapability: node.GPUComputeCapability,
|
||||
VRAM: node.TotalVRAM,
|
||||
}))
|
||||
}
|
||||
|
||||
// scheduleAndLoad is the shared core for loading a model on a new node.
|
||||
// Used by both Route() (for first-time loads) and ScheduleAndLoadModel() (for reconciler scale-ups).
|
||||
//
|
||||
@@ -153,6 +175,11 @@ func (r *SmartRouter) scheduleAndLoad(ctx context.Context, backendType, tracking
|
||||
return nil, fmt.Errorf("no available nodes: %w", err)
|
||||
}
|
||||
|
||||
// Tune node-agnostic options to the SELECTED node's GPU. Only now do we know
|
||||
// which node (and its compute capability) will run the model — the frontend
|
||||
// that built modelOpts may have no GPU at all in distributed mode.
|
||||
applyNodeHardwareDefaults(modelOpts, node)
|
||||
|
||||
// Pre-stage model files via FileStager before loading
|
||||
loadOpts := modelOpts
|
||||
if r.fileStager != nil && modelOpts != nil {
|
||||
|
||||
33
core/services/nodes/router_hardware_internal_test.go
Normal file
33
core/services/nodes/router_hardware_internal_test.go
Normal file
@@ -0,0 +1,33 @@
|
||||
package nodes
|
||||
|
||||
import (
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("applyNodeHardwareDefaults", func() {
|
||||
It("raises a managed default batch on a Blackwell node", func() {
|
||||
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch}
|
||||
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"})
|
||||
Expect(opts.NBatch).To(BeEquivalentTo(config.BlackwellPhysicalBatch))
|
||||
})
|
||||
|
||||
It("resets a Blackwell guess on a non-Blackwell node", func() {
|
||||
// frontend (Blackwell) guessed high, but the selected node is not Blackwell
|
||||
opts := &pb.ModelOptions{NBatch: config.BlackwellPhysicalBatch}
|
||||
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "9.0"})
|
||||
Expect(opts.NBatch).To(BeEquivalentTo(config.DefaultPhysicalBatch))
|
||||
})
|
||||
|
||||
It("never overrides an explicit (non-managed) batch", func() {
|
||||
opts := &pb.ModelOptions{NBatch: 1024}
|
||||
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1"})
|
||||
Expect(opts.NBatch).To(BeEquivalentTo(int32(1024)))
|
||||
})
|
||||
|
||||
It("no-ops on nil inputs", func() {
|
||||
Expect(func() { applyNodeHardwareDefaults(nil, nil) }).ToNot(Panic())
|
||||
})
|
||||
})
|
||||
@@ -73,6 +73,10 @@ func (cfg *Config) registrationBody() map[string]any {
|
||||
// Detect GPU info for VRAM-aware scheduling
|
||||
totalVRAM, _ := xsysinfo.TotalAvailableVRAM()
|
||||
gpuVendor, _ := xsysinfo.DetectGPUVendor()
|
||||
// Compute capability (e.g. "12.1" for GB10) lets the router pick per-arch
|
||||
// options (e.g. larger physical batch on Blackwell). Detected on the worker
|
||||
// because only the worker sees the GPU in distributed mode.
|
||||
gpuComputeCap := xsysinfo.NVIDIAComputeCapability()
|
||||
|
||||
maxReplicas := cfg.MaxReplicasPerModel
|
||||
if maxReplicas < 1 {
|
||||
@@ -85,6 +89,7 @@ func (cfg *Config) registrationBody() map[string]any {
|
||||
"total_vram": totalVRAM,
|
||||
"available_vram": totalVRAM, // initially all VRAM is available
|
||||
"gpu_vendor": gpuVendor,
|
||||
"gpu_compute_capability": gpuComputeCap,
|
||||
"max_replicas_per_model": maxReplicas,
|
||||
}
|
||||
|
||||
|
||||
23
pkg/xsysinfo/computecap_internal_test.go
Normal file
23
pkg/xsysinfo/computecap_internal_test.go
Normal file
@@ -0,0 +1,23 @@
|
||||
package xsysinfo
|
||||
|
||||
import (
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
var _ = Describe("parseComputeCap", func() {
|
||||
DescribeTable("splits major.minor",
|
||||
func(in string, maj, min int) {
|
||||
m, n := parseComputeCap(in)
|
||||
Expect(m).To(Equal(maj))
|
||||
Expect(n).To(Equal(min))
|
||||
},
|
||||
Entry("GB10 / DGX Spark", "12.1", 12, 1),
|
||||
Entry("RTX 50-series", "12.0", 12, 0),
|
||||
Entry("Hopper", "9.0", 9, 0),
|
||||
Entry("major only", "12", 12, 0),
|
||||
Entry("whitespace", " 12.1 ", 12, 1),
|
||||
Entry("empty", "", -1, -1),
|
||||
Entry("garbage", "abc", -1, -1),
|
||||
)
|
||||
})
|
||||
@@ -38,9 +38,9 @@ var UnifiedMemoryDevices = []string{
|
||||
|
||||
// GPUMemoryInfo contains real-time GPU memory usage information
|
||||
type GPUMemoryInfo struct {
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
Vendor string `json:"vendor"`
|
||||
Index int `json:"index"`
|
||||
Name string `json:"name"`
|
||||
Vendor string `json:"vendor"`
|
||||
// BDF is the canonical PCI bus address (dddd:bb:dd.f) when known.
|
||||
// Populated by detection paths that can attribute the device to a
|
||||
// PCI location (clinfo, future amdgpu/nvidia paths); empty for
|
||||
@@ -307,6 +307,84 @@ func GetGPUAggregateInfo() GPUAggregateInfo {
|
||||
return aggregate
|
||||
}
|
||||
|
||||
// Cache for NVIDIAComputeCapability: computeCapOnce ensures detection runs at
// most once per process, and computeCapResult holds what that run produced.
var (
	computeCapResult string
	computeCapOnce   sync.Once
)
|
||||
|
||||
// NVIDIAComputeCapability returns the highest NVIDIA GPU compute capability on
|
||||
// this host as a "major.minor" string (e.g. "12.1" for GB10 / DGX Spark), or ""
|
||||
// when nvidia-smi is unavailable or reports none. Detected once and cached.
|
||||
//
|
||||
// This runs where the GPU actually is. In distributed mode it is reported by
|
||||
// each worker on registration so the router can make per-node decisions rather
|
||||
// than guessing from the (possibly GPU-less) frontend host.
|
||||
func NVIDIAComputeCapability() string {
|
||||
computeCapOnce.Do(func() {
|
||||
computeCapResult = detectNVIDIAComputeCapability()
|
||||
})
|
||||
return computeCapResult
|
||||
}
|
||||
|
||||
func detectNVIDIAComputeCapability() string {
|
||||
if _, err := exec.LookPath("nvidia-smi"); err != nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
cmd := exec.Command("nvidia-smi", "--query-gpu=compute_cap", "--format=csv,noheader")
|
||||
|
||||
var stdout, stderr bytes.Buffer
|
||||
cmd.Stdout = &stdout
|
||||
cmd.Stderr = &stderr
|
||||
|
||||
if err := cmd.Run(); err != nil {
|
||||
xlog.Debug("nvidia-smi compute_cap query failed", "error", err, "stderr", stderr.String())
|
||||
return ""
|
||||
}
|
||||
|
||||
best := ""
|
||||
bestMajor, bestMinor := -1, -1
|
||||
for _, line := range strings.Split(strings.TrimSpace(stdout.String()), "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if line == "" {
|
||||
continue
|
||||
}
|
||||
maj, min := parseComputeCap(line)
|
||||
if maj < 0 {
|
||||
continue
|
||||
}
|
||||
if maj > bestMajor || (maj == bestMajor && min > bestMinor) {
|
||||
bestMajor, bestMinor, best = maj, min, line
|
||||
}
|
||||
}
|
||||
if best != "" {
|
||||
xlog.Debug("NVIDIA compute capability detected", "compute_cap", best)
|
||||
}
|
||||
return best
|
||||
}
|
||||
|
||||
// parseComputeCap splits a "major.minor" compute-capability string into its
// integer parts. Surrounding whitespace is ignored.
//
// It returns (-1, -1) when the major part is missing, is not an integer, or is
// negative. The minor part is lenient: when it is absent ("12"), not an
// integer, or negative, it is reported as 0 so that a usable major version is
// never discarded.
func parseComputeCap(cc string) (int, int) {
	// Only the first '.' separates the parts; anything after it belongs to the
	// minor component.
	majorStr, minorStr, _ := strings.Cut(strings.TrimSpace(cc), ".")

	major, err := strconv.Atoi(strings.TrimSpace(majorStr))
	if err != nil || major < 0 {
		return -1, -1
	}

	minor, err := strconv.Atoi(strings.TrimSpace(minorStr))
	if err != nil || minor < 0 {
		minor = 0
	}
	return major, minor
}
|
||||
|
||||
// getNVIDIAGPUMemory queries NVIDIA GPUs using nvidia-smi
|
||||
func getNVIDIAGPUMemory() []GPUMemoryInfo {
|
||||
// Check if nvidia-smi is available
|
||||
@@ -866,12 +944,12 @@ func getVulkanGPUMemory() []GPUMemoryInfo {
|
||||
}
|
||||
|
||||
type vulkanGPUTextInfo struct {
|
||||
index int
|
||||
name string
|
||||
deviceType string
|
||||
totalVRAM uint64
|
||||
budgetVRAM uint64
|
||||
usageVRAM uint64
|
||||
index int
|
||||
name string
|
||||
deviceType string
|
||||
totalVRAM uint64
|
||||
budgetVRAM uint64
|
||||
usageVRAM uint64
|
||||
}
|
||||
|
||||
func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
|
||||
@@ -909,7 +987,7 @@ func parseVulkanGPUMemoryText(r io.Reader) []GPUMemoryInfo {
|
||||
} else if current.usageVRAM != 0 && current.budgetVRAM == 0 {
|
||||
current.budgetVRAM = current.totalVRAM - current.usageVRAM
|
||||
} else if current.usageVRAM == 0 && current.budgetVRAM == 0 {
|
||||
current.usageVRAM = 0
|
||||
current.usageVRAM = 0
|
||||
current.budgetVRAM = current.totalVRAM
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user