mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-20 06:39:01 -04:00
feat(config): default concurrent serving (n_parallel) by GPU VRAM
The llama.cpp backend defaults n_parallel=1, which serializes multi-user requests and leaves continuous batching off (it auto-enables only at n_parallel>1). Fold a VRAM-scaled parallel-slot default into the hardware-config path so multi-user serving works out of the box: >=32GiB->8, >=8GiB->4, >=4GiB->2, else unchanged. With the backend's unified KV the slots SHARE the context budget, so this adds concurrency without multiplying KV memory. Explicit parallel/n_parallel always wins. EnsureParallelOption is shared by the single-host path (ApplyHardwareDefaults with the local GPU) and the distributed router (per selected node's reported VRAM, since the frontend may have no GPU). LocalGPU now also reports VRAM. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package config
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
@@ -70,15 +71,72 @@ func IsManagedPhysicalBatch(n int) bool {
|
||||
return n == DefaultPhysicalBatch || n == BlackwellPhysicalBatch
|
||||
}
|
||||
|
||||
// VRAM thresholds (in bytes) that select the default number of parallel slots
// (n_parallel). The llama.cpp backend defaults to n_parallel=1, which
// serializes requests, and it only auto-enables continuous batching when
// n_parallel > 1 — so with a single slot, concurrent requests queue. Picking a
// slot count from the GPU size makes multi-user serving work out of the box.
// With the backend's unified KV cache the slots SHARE the context budget, so
// extra slots add concurrency without multiplying KV memory.
const (
	// parallelSlotsVRAMHigh: at or above 32 GiB the default is 8 slots.
	parallelSlotsVRAMHigh uint64 = 32 << 30
	// parallelSlotsVRAMMid: at or above 8 GiB the default is 4 slots.
	parallelSlotsVRAMMid uint64 = 8 << 30
	// parallelSlotsVRAMLow: at or above 4 GiB the default is 2 slots.
	parallelSlotsVRAMLow uint64 = 4 << 30
)
|
||||
|
||||
// DefaultParallelSlots returns the n_parallel default for the given GPU. Returns
|
||||
// 1 (no concurrency) when VRAM is unknown or too small, so we never change
|
||||
// behavior on CPU-only / tiny devices.
|
||||
func DefaultParallelSlots(g GPU) int {
|
||||
switch {
|
||||
case g.VRAM >= parallelSlotsVRAMHigh:
|
||||
return 8
|
||||
case g.VRAM >= parallelSlotsVRAMMid:
|
||||
return 4
|
||||
case g.VRAM >= parallelSlotsVRAMLow:
|
||||
return 2
|
||||
default:
|
||||
return 1
|
||||
}
|
||||
}
|
||||
|
||||
// EnsureParallelOption appends a VRAM-scaled "parallel:N" backend option when the
|
||||
// model doesn't already set one (and the GPU warrants concurrency). Returns the
|
||||
// possibly-extended options. Shared by the single-host config path
|
||||
// (ApplyHardwareDefaults) and the distributed router (per selected node).
|
||||
func EnsureParallelOption(opts []string, gpu GPU) []string {
|
||||
if slots := DefaultParallelSlots(gpu); slots > 1 && !hasParallelOption(opts) {
|
||||
return append(opts, fmt.Sprintf("parallel:%d", slots))
|
||||
}
|
||||
return opts
|
||||
}
|
||||
|
||||
// hasParallelOption reports whether opts already contains a "parallel" or
// "n_parallel" entry, so that an explicit value is never overridden.
//
// Backend options are "name:value" strings. Only the text before the first ':'
// is inspected (the whole entry when there is no ':'), and the comparison
// ignores letter case and surrounding whitespace.
func hasParallelOption(opts []string) bool {
	for _, opt := range opts {
		key, _, _ := strings.Cut(opt, ":")
		key = strings.TrimSpace(strings.ToLower(key))
		if key == "parallel" || key == "n_parallel" {
			return true
		}
	}
	return false
}
|
||||
|
||||
// localGPU builds a GPU descriptor from local detection, used by SetDefaults on
|
||||
// a single host (the distributed router builds it from the selected node's
|
||||
// reported info instead). It is a package var so tests can inject a
|
||||
// deterministic device — detection does a live nvidia-smi call.
|
||||
var localGPU = func() GPU {
|
||||
vendor, _ := xsysinfo.DetectGPUVendor()
|
||||
vram, _ := xsysinfo.TotalAvailableVRAM()
|
||||
return GPU{
|
||||
Vendor: vendor,
|
||||
ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
|
||||
VRAM: vram,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,6 +152,19 @@ func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
|
||||
xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
|
||||
"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
|
||||
}
|
||||
|
||||
// Enable concurrent serving by default on a capable GPU: without this the
|
||||
// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
|
||||
// (continuous batching stays off). Unified KV means the slots share the
|
||||
// context budget, so this is concurrency without extra KV memory. Explicit
|
||||
// parallel/n_parallel in the model options always wins.
|
||||
if before := len(cfg.Options); true {
|
||||
cfg.Options = EnsureParallelOption(cfg.Options, gpu)
|
||||
if len(cfg.Options) > before {
|
||||
xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving",
|
||||
"option", cfg.Options[len(cfg.Options)-1], "vram_gib", gpu.VRAM>>30)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// parseComputeCapability splits a "major.minor" string into integer parts.
|
||||
|
||||
@@ -56,4 +56,42 @@ var _ = Describe("Hardware-driven config defaults", func() {
|
||||
Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic())
|
||||
})
|
||||
})
|
||||
|
||||
const gib = uint64(1) << 30
|
||||
|
||||
DescribeTable("DefaultParallelSlots (by VRAM)",
|
||||
func(vramGiB uint64, want int) {
|
||||
Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want))
|
||||
},
|
||||
Entry("GB10 119 GiB", uint64(119), 8),
|
||||
Entry("48 GiB", uint64(48), 8),
|
||||
Entry("24 GiB", uint64(24), 4),
|
||||
Entry("8 GiB", uint64(8), 4),
|
||||
Entry("6 GiB", uint64(6), 2),
|
||||
Entry("2 GiB", uint64(2), 1),
|
||||
Entry("unknown 0", uint64(0), 1),
|
||||
)
|
||||
|
||||
Describe("ApplyHardwareDefaults parallel slots", func() {
|
||||
It("adds a VRAM-scaled parallel option on a capable GPU", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib})
|
||||
Expect(cfg.Options).To(ContainElement("parallel:8"))
|
||||
})
|
||||
It("scales the slot count down with VRAM", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib})
|
||||
Expect(cfg.Options).To(ContainElement("parallel:4"))
|
||||
})
|
||||
It("adds no parallel option on small/unknown VRAM", func() {
|
||||
cfg := &ModelConfig{}
|
||||
ApplyHardwareDefaults(cfg, GPU{VRAM: 2 * gib})
|
||||
Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel")))
|
||||
})
|
||||
It("never overrides an explicit parallel option", func() {
|
||||
cfg := &ModelConfig{Options: []string{"parallel:2"}}
|
||||
ApplyHardwareDefaults(cfg, GPU{VRAM: 119 * gib})
|
||||
Expect(cfg.Options).To(Equal([]string{"parallel:2"}))
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
@@ -150,14 +150,17 @@ func applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) {
|
||||
if opts == nil || node == nil {
|
||||
return
|
||||
}
|
||||
if !config.IsManagedPhysicalBatch(int(opts.NBatch)) {
|
||||
return
|
||||
}
|
||||
opts.NBatch = int32(config.PhysicalBatch(config.GPU{
|
||||
gpu := config.GPU{
|
||||
Vendor: node.GPUVendor,
|
||||
ComputeCapability: node.GPUComputeCapability,
|
||||
VRAM: node.TotalVRAM,
|
||||
}))
|
||||
}
|
||||
if config.IsManagedPhysicalBatch(int(opts.NBatch)) {
|
||||
opts.NBatch = int32(config.PhysicalBatch(gpu))
|
||||
}
|
||||
// Default concurrent serving for the selected node (the frontend that built
|
||||
// the options may have no GPU). Only adds when no parallel option is set.
|
||||
opts.Options = config.EnsureParallelOption(opts.Options, gpu)
|
||||
}
|
||||
|
||||
// scheduleAndLoad is the shared core for loading a model on a new node.
|
||||
|
||||
@@ -27,6 +27,19 @@ var _ = Describe("applyNodeHardwareDefaults", func() {
|
||||
Expect(opts.NBatch).To(BeEquivalentTo(int32(1024)))
|
||||
})
|
||||
|
||||
It("adds a VRAM-scaled parallel option for the selected node", func() {
|
||||
// frontend may have had no GPU (no parallel option); the node has a big GPU
|
||||
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch}
|
||||
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30})
|
||||
Expect(opts.Options).To(ContainElement("parallel:8"))
|
||||
})
|
||||
|
||||
It("never overrides an explicit parallel option on the node path", func() {
|
||||
opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, Options: []string{"parallel:2"}}
|
||||
applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30})
|
||||
Expect(opts.Options).To(Equal([]string{"parallel:2"}))
|
||||
})
|
||||
|
||||
It("no-ops on nil inputs", func() {
|
||||
Expect(func() { applyNodeHardwareDefaults(nil, nil) }).ToNot(Panic())
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user