mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-22 07:39:02 -04:00
* feat(config): node-aware hardware defaults — larger physical batch on Blackwell A larger physical batch (n_batch/n_ubatch) materially lifts MoE prefill on NVIDIA Blackwell consumer GPUs (sm_120/121, incl. GB10 / DGX Spark) — measured on a GB10 with Qwen3-Coder-30B-A3B, the prefill ceiling rises (ub512 ~2994 -> ub2048 ~3316 t/s) and saturates around 2048. The heuristic lives in core/config alongside the other config overriders (ApplyInferenceDefaults, guessDefaultsFromFile/NGPULayers) — they all fill the ModelConfig from heuristics, so hardware tuning is the same domain and stays in one place. It is parameterized on a GPU descriptor (not direct detection) so it works in both deployment shapes: - Single host: SetDefaults applies it with the LocalGPU. - Distributed: only the worker sees the GPU, so the worker reports its compute capability on registration (gpu_compute_capability -> BackendNode), and the router re-applies the SAME core/config heuristic for the SELECTED node before loading — fixing the case where the frontend has no GPU at all. Explicit `batch:` always wins (only managed default values are touched). xsysinfo gains NVIDIAComputeCapability() (detection only); all interpretation lives in core/config. Tests: core/config, pkg/xsysinfo, core/services/nodes. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * test(config): injectable local-GPU seam + single-instance coverage Make local GPU detection an injectable package var (localGPU) so the single-instance path (SetDefaults -> ApplyHardwareDefaults) is deterministically testable without a real GPU, mirroring the distributed override's coverage. Adds specs asserting SetDefaults sets the Blackwell physical batch, leaves it unset on non-Blackwell, and never overrides an explicit batch. 
Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(config): default concurrent serving (n_parallel) by GPU VRAM The llama.cpp backend defaults n_parallel=1, which serializes multi-user requests and leaves continuous batching off (it auto-enables only at n_parallel>1). Fold a VRAM-scaled parallel-slot default into the hardware-config path so multi-user serving works out of the box: >=32GiB->8, >=8GiB->4, >=4GiB->2, else unchanged. With the backend's unified KV the slots SHARE the context budget, so this adds concurrency without multiplying KV memory. Explicit parallel/n_parallel always wins. EnsureParallelOption is shared by the single-host path (ApplyHardwareDefaults with the local GPU) and the distributed router (per selected node's reported VRAM, since the frontend may have no GPU). LocalGPU now also reports VRAM. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
148 lines
4.5 KiB
Go
148 lines
4.5 KiB
Go
package worker
|
|
|
|
import (
	"cmp"
	"fmt"
	"net"
	"os"
	"strconv"
	"strings"

	"github.com/mudler/LocalAI/pkg/xsysinfo"
)
|
|
|
|
// effectiveBasePort returns the port used as base for gRPC backend processes.
|
|
// Priority: Addr port → ServeAddr port → 50051
|
|
func (cfg *Config) effectiveBasePort() int {
|
|
for _, addr := range []string{cfg.Addr, cfg.ServeAddr} {
|
|
if addr != "" {
|
|
if _, portStr, ok := strings.Cut(addr, ":"); ok {
|
|
if p, _ := strconv.Atoi(portStr); p > 0 {
|
|
return p
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return 50051
|
|
}
|
|
|
|
// advertiseAddr returns the address the frontend should use to reach this node.
|
|
func (cfg *Config) advertiseAddr() string {
|
|
if cfg.AdvertiseAddr != "" {
|
|
return cfg.AdvertiseAddr
|
|
}
|
|
if cfg.Addr != "" {
|
|
return cfg.Addr
|
|
}
|
|
hostname, _ := os.Hostname()
|
|
return fmt.Sprintf("%s:%d", cmp.Or(hostname, "localhost"), cfg.effectiveBasePort())
|
|
}
|
|
|
|
// resolveHTTPAddr returns the address to bind the HTTP file transfer server to.
|
|
// Uses basePort-1 so it doesn't conflict with dynamically allocated gRPC ports
|
|
// which grow upward from basePort.
|
|
func (cfg *Config) resolveHTTPAddr() string {
|
|
if cfg.HTTPAddr != "" {
|
|
return cfg.HTTPAddr
|
|
}
|
|
return fmt.Sprintf("0.0.0.0:%d", cfg.effectiveBasePort()-1)
|
|
}
|
|
|
|
// advertiseHTTPAddr returns the HTTP address the frontend should use to reach
|
|
// this node for file transfer.
|
|
func (cfg *Config) advertiseHTTPAddr() string {
|
|
if cfg.AdvertiseHTTPAddr != "" {
|
|
return cfg.AdvertiseHTTPAddr
|
|
}
|
|
advHost, _, _ := strings.Cut(cfg.advertiseAddr(), ":")
|
|
httpPort := cfg.effectiveBasePort() - 1
|
|
return fmt.Sprintf("%s:%d", advHost, httpPort)
|
|
}
|
|
|
|
// registrationBody builds the JSON body for node registration.
|
|
func (cfg *Config) registrationBody() map[string]any {
|
|
nodeName := cfg.NodeName
|
|
if nodeName == "" {
|
|
hostname, err := os.Hostname()
|
|
if err != nil {
|
|
nodeName = fmt.Sprintf("node-%d", os.Getpid())
|
|
} else {
|
|
nodeName = hostname
|
|
}
|
|
}
|
|
|
|
// Detect GPU info for VRAM-aware scheduling
|
|
totalVRAM, _ := xsysinfo.TotalAvailableVRAM()
|
|
gpuVendor, _ := xsysinfo.DetectGPUVendor()
|
|
// Compute capability (e.g. "12.1" for GB10) lets the router pick per-arch
|
|
// options (e.g. larger physical batch on Blackwell). Detected on the worker
|
|
// because only the worker sees the GPU in distributed mode.
|
|
gpuComputeCap := xsysinfo.NVIDIAComputeCapability()
|
|
|
|
maxReplicas := cfg.MaxReplicasPerModel
|
|
if maxReplicas < 1 {
|
|
maxReplicas = 1
|
|
}
|
|
body := map[string]any{
|
|
"name": nodeName,
|
|
"address": cfg.advertiseAddr(),
|
|
"http_address": cfg.advertiseHTTPAddr(),
|
|
"total_vram": totalVRAM,
|
|
"available_vram": totalVRAM, // initially all VRAM is available
|
|
"gpu_vendor": gpuVendor,
|
|
"gpu_compute_capability": gpuComputeCap,
|
|
"max_replicas_per_model": maxReplicas,
|
|
}
|
|
|
|
// If no GPU detected, report system RAM so the scheduler/UI has capacity info
|
|
if totalVRAM == 0 {
|
|
if ramInfo, err := xsysinfo.GetSystemRAMInfo(); err == nil {
|
|
body["total_ram"] = ramInfo.Total
|
|
body["available_ram"] = ramInfo.Available
|
|
}
|
|
}
|
|
if cfg.RegistrationToken != "" {
|
|
body["token"] = cfg.RegistrationToken
|
|
}
|
|
|
|
// Parse and add static node labels. Always include the auto-label
|
|
// `node.replica-slots=N` so AND-selectors in ModelSchedulingConfig can
|
|
// target high-capacity nodes (e.g. {"node.replica-slots":"4"}).
|
|
labels := make(map[string]string)
|
|
if cfg.NodeLabels != "" {
|
|
for _, pair := range strings.Split(cfg.NodeLabels, ",") {
|
|
pair = strings.TrimSpace(pair)
|
|
if k, v, ok := strings.Cut(pair, "="); ok {
|
|
labels[strings.TrimSpace(k)] = strings.TrimSpace(v)
|
|
}
|
|
}
|
|
}
|
|
labels["node.replica-slots"] = strconv.Itoa(maxReplicas)
|
|
body["labels"] = labels
|
|
|
|
return body
|
|
}
|
|
|
|
// heartbeatBody returns the current VRAM/RAM stats for heartbeat payloads.
|
|
//
|
|
// When aggregate VRAM usage is unknown (no GPU, or temporary detection
|
|
// failure), we deliberately OMIT available_vram so the frontend keeps its
|
|
// last good value — overwriting with 0 makes the UI show the node as "fully
|
|
// used", while reporting total-as-available lies to the scheduler about
|
|
// free capacity.
|
|
func (cfg *Config) heartbeatBody() map[string]any {
|
|
body := map[string]any{}
|
|
aggregate := xsysinfo.GetGPUAggregateInfo()
|
|
if aggregate.TotalVRAM > 0 {
|
|
body["available_vram"] = aggregate.FreeVRAM
|
|
}
|
|
|
|
// CPU-only workers (or workers that lost GPU visibility momentarily):
|
|
// report system RAM so the scheduler still has capacity info.
|
|
if aggregate.TotalVRAM == 0 {
|
|
if ramInfo, err := xsysinfo.GetSystemRAMInfo(); err == nil {
|
|
body["available_ram"] = ramInfo.Available
|
|
}
|
|
}
|
|
return body
|
|
}
|