Files
LocalAI/core/services/worker/registration.go
LocalAI [bot] b081247d95 feat(config): hardware-tuned defaults — Blackwell batch + VRAM-scaled concurrency (#10411)
* feat(config): node-aware hardware defaults — larger physical batch on Blackwell

A larger physical batch (n_batch/n_ubatch) materially lifts MoE prefill on
NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10 / DGX Spark) — measured
on a GB10 with Qwen3-Coder-30B-A3B, the prefill ceiling rises (ub512 ~2994 ->
ub2048 ~3316 t/s) and saturates around 2048.

The heuristic lives in core/config alongside the other config overriders
(ApplyInferenceDefaults, guessDefaultsFromFile/NGPULayers) — they all fill the
ModelConfig from heuristics, so hardware tuning is the same domain and stays in
one place. It is parameterized on a GPU descriptor (not direct detection) so it
works in both deployment shapes:

- Single host: SetDefaults applies it with the LocalGPU.
- Distributed: only the worker sees the GPU, so the worker reports its compute
  capability on registration (gpu_compute_capability -> BackendNode), and the
  router re-applies the SAME core/config heuristic for the SELECTED node before
  loading — fixing the case where the frontend has no GPU at all.

Explicit `batch:` always wins (only managed default values are touched).
xsysinfo gains NVIDIAComputeCapability() (detection only); all interpretation
lives in core/config. Tests: core/config, pkg/xsysinfo, core/services/nodes.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* test(config): injectable local-GPU seam + single-instance coverage

Make local GPU detection an injectable package var (localGPU) so the
single-instance path (SetDefaults -> ApplyHardwareDefaults) is deterministically
testable without a real GPU, mirroring the distributed override's coverage.
Adds specs asserting SetDefaults sets the Blackwell physical batch, leaves it
unset on non-Blackwell, and never overrides an explicit batch.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(config): default concurrent serving (n_parallel) by GPU VRAM

The llama.cpp backend defaults n_parallel=1, which serializes multi-user requests
and leaves continuous batching off (it auto-enables only at n_parallel>1). Fold a
VRAM-scaled parallel-slot default into the hardware-config path so multi-user
serving works out of the box: >=32GiB->8, >=8GiB->4, >=4GiB->2, else unchanged.
With the backend's unified KV the slots SHARE the context budget, so this adds
concurrency without multiplying KV memory. Explicit parallel/n_parallel always
wins. EnsureParallelOption is shared by the single-host path (ApplyHardwareDefaults
with the local GPU) and the distributed router (per selected node's reported VRAM,
since the frontend may have no GPU). LocalGPU now also reports VRAM.

Assisted-by: Claude:opus-4.8 [Claude Code]
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-20 14:45:59 +02:00

148 lines
4.5 KiB
Go

package worker
import (
	"cmp"
	"fmt"
	"net"
	"os"
	"strconv"
	"strings"

	"github.com/mudler/LocalAI/pkg/xsysinfo"
)
// effectiveBasePort returns the port used as base for gRPC backend processes.
// Priority: Addr port → ServeAddr port → 50051.
//
// Each candidate is parsed with net.SplitHostPort, so bracketed IPv6 literals
// such as "[::1]:50051" yield their port rather than being cut at the first
// colon. A candidate that is empty, has no port, or whose port is not a
// number in the range 1-65535 is skipped and the next one is tried.
func (cfg *Config) effectiveBasePort() int {
	for _, addr := range []string{cfg.Addr, cfg.ServeAddr} {
		if addr == "" {
			continue
		}
		_, portStr, err := net.SplitHostPort(addr)
		if err != nil {
			continue
		}
		// Check the Atoi error explicitly: on overflow Atoi returns the
		// largest int together with an error, which must not be used as a port.
		if p, err := strconv.Atoi(portStr); err == nil && p > 0 && p <= 65535 {
			return p
		}
	}
	return 50051
}
// advertiseAddr reports the address the frontend should use to reach this
// node. An explicit AdvertiseAddr wins, then Addr; when neither is set the
// address is built from the OS hostname and the effective base port.
func (cfg *Config) advertiseAddr() string {
	switch {
	case cfg.AdvertiseAddr != "":
		return cfg.AdvertiseAddr
	case cfg.Addr != "":
		return cfg.Addr
	}

	// The lookup error is ignored on purpose: an empty name simply falls
	// back to "localhost".
	host, _ := os.Hostname()
	host = cmp.Or(host, "localhost")
	return fmt.Sprintf("%s:%d", host, cfg.effectiveBasePort())
}
// resolveHTTPAddr picks the bind address for the HTTP file transfer server.
// A configured HTTPAddr is returned unchanged. Otherwise the server binds on
// all interfaces at basePort-1, staying below the dynamically allocated gRPC
// ports, which grow upward from basePort.
func (cfg *Config) resolveHTTPAddr() string {
	if explicit := cfg.HTTPAddr; explicit != "" {
		return explicit
	}
	httpPort := cfg.effectiveBasePort() - 1
	return fmt.Sprintf("0.0.0.0:%d", httpPort)
}
// advertiseHTTPAddr returns the HTTP address the frontend should use to reach
// this node for file transfer.
//
// A configured AdvertiseHTTPAddr is returned unchanged. Otherwise the host is
// taken from advertiseAddr() and paired with basePort-1. The host is extracted
// with net.SplitHostPort and re-joined with net.JoinHostPort so that IPv6
// literals ("[::1]:50051" -> "[::1]:50050") keep their brackets; cutting at
// the first colon would reduce such a host to "[".
func (cfg *Config) advertiseHTTPAddr() string {
	if cfg.AdvertiseHTTPAddr != "" {
		return cfg.AdvertiseHTTPAddr
	}
	adv := cfg.advertiseAddr()
	host, _, err := net.SplitHostPort(adv)
	if err != nil {
		// No host:port split was possible (e.g. a bare hostname): use the
		// whole value as the host. Surrounding IPv6 brackets are removed
		// because JoinHostPort adds them back itself.
		host = strings.TrimSuffix(strings.TrimPrefix(adv, "["), "]")
	}
	httpPort := cfg.effectiveBasePort() - 1
	return net.JoinHostPort(host, strconv.Itoa(httpPort))
}
// registrationBody assembles the JSON payload sent when this node registers.
//
// The payload carries the node name, its gRPC and HTTP advertise addresses,
// GPU details (total/available VRAM, vendor, compute capability), the
// per-model replica limit and the node labels. System RAM is added when no
// VRAM is reported, and the registration token is added when one is configured.
func (cfg *Config) registrationBody() map[string]any {
	// Node identity: configured name first, then the hostname, and a
	// PID-derived name when the hostname lookup fails.
	name := cfg.NodeName
	if name == "" {
		if host, err := os.Hostname(); err == nil {
			name = host
		} else {
			name = fmt.Sprintf("node-%d", os.Getpid())
		}
	}

	// GPU details for VRAM-aware scheduling. Detection errors are ignored and
	// the returned values are reported as-is.
	vram, _ := xsysinfo.TotalAvailableVRAM()
	vendor, _ := xsysinfo.DetectGPUVendor()
	// Compute capability (e.g. "12.1" for GB10) lets the router pick per-arch
	// options (e.g. larger physical batch on Blackwell). It is detected here
	// because only the worker sees the GPU in distributed mode.
	computeCap := xsysinfo.NVIDIAComputeCapability()

	// A node always offers at least one replica per model.
	replicas := cfg.MaxReplicasPerModel
	if replicas < 1 {
		replicas = 1
	}

	payload := map[string]any{
		"name":                   name,
		"address":                cfg.advertiseAddr(),
		"http_address":           cfg.advertiseHTTPAddr(),
		"total_vram":             vram,
		"available_vram":         vram, // initially all VRAM is available
		"gpu_vendor":             vendor,
		"gpu_compute_capability": computeCap,
		"max_replicas_per_model": replicas,
	}

	// Without a detected GPU, report system RAM so the scheduler/UI still has
	// capacity information.
	if vram == 0 {
		if ram, err := xsysinfo.GetSystemRAMInfo(); err == nil {
			payload["total_ram"] = ram.Total
			payload["available_ram"] = ram.Available
		}
	}

	if token := cfg.RegistrationToken; token != "" {
		payload["token"] = token
	}

	// Static labels come from the comma-separated "key=value" list in
	// NodeLabels; entries without "=" are skipped (an empty NodeLabels splits
	// into one empty entry, which is skipped the same way). The auto-label
	// `node.replica-slots=N` is always set last so AND-selectors in
	// ModelSchedulingConfig can target high-capacity nodes
	// (e.g. {"node.replica-slots":"4"}).
	labels := map[string]string{}
	for _, entry := range strings.Split(cfg.NodeLabels, ",") {
		key, value, found := strings.Cut(strings.TrimSpace(entry), "=")
		if !found {
			continue
		}
		labels[strings.TrimSpace(key)] = strings.TrimSpace(value)
	}
	labels["node.replica-slots"] = strconv.Itoa(replicas)
	payload["labels"] = labels

	return payload
}
// heartbeatBody builds the heartbeat payload with the current VRAM/RAM stats.
//
// available_vram is sent only when the aggregate GPU info reports a non-zero
// total. When the total is unknown (no GPU, or a temporary detection failure)
// the key is deliberately OMITTED so the frontend keeps its last good value:
// sending 0 would make the UI show the node as "fully used", and sending
// total-as-available would lie to the scheduler about free capacity.
func (cfg *Config) heartbeatBody() map[string]any {
	stats := xsysinfo.GetGPUAggregateInfo()
	payload := make(map[string]any)

	switch {
	case stats.TotalVRAM > 0:
		payload["available_vram"] = stats.FreeVRAM
	case stats.TotalVRAM == 0:
		// CPU-only workers (or workers that momentarily lost GPU visibility):
		// report system RAM so the scheduler still has capacity info.
		if ram, err := xsysinfo.GetSystemRAMInfo(); err == nil {
			payload["available_ram"] = ram.Available
		}
	}

	return payload
}