From 6715d75f227fe1790418337e4b6ff0dcc249e87f Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Sat, 20 Jun 2026 09:35:04 +0000 Subject: [PATCH] feat(config): default concurrent serving (n_parallel) by GPU VRAM The llama.cpp backend defaults n_parallel=1, which serializes multi-user requests and leaves continuous batching off (it auto-enables only at n_parallel>1). Fold a VRAM-scaled parallel-slot default into the hardware-config path so multi-user serving works out of the box: >=32GiB->8, >=8GiB->4, >=4GiB->2, else unchanged. With the backend's unified KV the slots SHARE the context budget, so this adds concurrency without multiplying KV memory. Explicit parallel/n_parallel always wins. EnsureParallelOption is shared by the single-host path (ApplyHardwareDefaults with the local GPU) and the distributed router (per selected node's reported VRAM, since the frontend may have no GPU). LocalGPU now also reports VRAM. Assisted-by: Claude:opus-4.8 [Claude Code] Signed-off-by: Ettore Di Giacinto --- core/config/hardware_defaults.go | 71 +++++++++++++++++++ core/config/hardware_defaults_test.go | 38 ++++++++++ core/services/nodes/router.go | 13 ++-- .../nodes/router_hardware_internal_test.go | 13 ++++ 4 files changed, 130 insertions(+), 5 deletions(-) diff --git a/core/config/hardware_defaults.go b/core/config/hardware_defaults.go index d4c6b09f7..2ed54265f 100644 --- a/core/config/hardware_defaults.go +++ b/core/config/hardware_defaults.go @@ -1,6 +1,7 @@ package config import ( + "fmt" "strconv" "strings" @@ -70,15 +71,72 @@ func IsManagedPhysicalBatch(n int) bool { return n == DefaultPhysicalBatch || n == BlackwellPhysicalBatch } +// Parallel-slot (n_parallel) VRAM tiers. llama.cpp serializes requests at +// n_parallel=1 (the backend default) and only auto-enables continuous batching +// when n_parallel > 1 — so a single-slot default makes concurrent requests +// queue. We default a slot count by GPU size so multi-user serving works out of +// the box. 
With the backend's unified KV cache the slots SHARE the context
+// budget, so more slots add concurrency without multiplying KV memory.
+const (
+	parallelSlotsVRAMHigh = uint64(32) << 30 // >=32 GiB -> 8 slots
+	parallelSlotsVRAMMid  = uint64(8) << 30  // >=8 GiB -> 4 slots
+	parallelSlotsVRAMLow  = uint64(4) << 30  // >=4 GiB -> 2 slots
+)
+
+// DefaultParallelSlots returns the n_parallel default for the given GPU. Returns
+// 1 (no concurrency) when VRAM is unknown or too small, so we never change
+// behavior on CPU-only / tiny devices.
+func DefaultParallelSlots(g GPU) int {
+	switch {
+	case g.VRAM >= parallelSlotsVRAMHigh:
+		return 8
+	case g.VRAM >= parallelSlotsVRAMMid:
+		return 4
+	case g.VRAM >= parallelSlotsVRAMLow:
+		return 2
+	default:
+		return 1
+	}
+}
+
+// EnsureParallelOption returns opts with a VRAM-scaled "parallel:N" backend option
+// appended when the model sets none and the GPU warrants concurrency; otherwise opts
+// is returned as-is. The append never writes into opts' spare capacity. Shared by
+// ApplyHardwareDefaults (single host) and the distributed router (per selected node).
+func EnsureParallelOption(opts []string, gpu GPU) []string {
+	if slots := DefaultParallelSlots(gpu); slots > 1 && !hasParallelOption(opts) {
+		return append(opts[:len(opts):len(opts)], fmt.Sprintf("parallel:%d", slots))
+	}
+	return opts
+}
+
+// hasParallelOption reports whether the model already sets parallel/n_parallel
+// (backend options are "name:value" strings) so we never override an explicit value.
+func hasParallelOption(opts []string) bool {
+	for _, o := range opts {
+		name := o
+		if i := strings.IndexByte(o, ':'); i >= 0 {
+			name = o[:i]
+		}
+		switch strings.TrimSpace(strings.ToLower(name)) {
+		case "parallel", "n_parallel":
+			return true
+		}
+	}
+	return false
+}
+
 // localGPU builds a GPU descriptor from local detection, used by SetDefaults on
 // a single host (the distributed router builds it from the selected node's
 // reported info instead).
It is a package var so tests can inject a
 // deterministic device — detection does a live nvidia-smi call.
 var localGPU = func() GPU {
 	vendor, _ := xsysinfo.DetectGPUVendor()
+	vram, _ := xsysinfo.TotalAvailableVRAM() // best effort: error dropped; VRAM 0 adds no parallel default
 	return GPU{
 		Vendor:            vendor,
 		ComputeCapability: xsysinfo.NVIDIAComputeCapability(),
+		VRAM:              vram,
 	}
 }
 
@@ -94,6 +152,19 @@ func ApplyHardwareDefaults(cfg *ModelConfig, gpu GPU) {
 		xlog.Debug("[hardware_defaults] Blackwell GPU: defaulting physical batch",
 			"batch", cfg.Batch, "compute_cap", gpu.ComputeCapability)
 	}
+
+	// Enable concurrent serving by default on a capable GPU: without this the
+	// llama.cpp backend runs n_parallel=1 and serializes multi-user requests
+	// (continuous batching stays off). Unified KV means the slots share the
+	// context budget, so this is concurrency without extra KV memory. Explicit
+	// parallel/n_parallel in the model options always wins.
+	// EnsureParallelOption returns the slice unchanged when it adds nothing, so a
+	// longer result is the signal that a default was appended (and worth logging).
+	if opts := EnsureParallelOption(cfg.Options, gpu); len(opts) > len(cfg.Options) {
+		cfg.Options = opts
+		xlog.Debug("[hardware_defaults] defaulting parallel slots for concurrent serving",
+			"option", opts[len(opts)-1], "vram_gib", gpu.VRAM>>30)
+	}
 }
 
 // parseComputeCapability splits a "major.minor" string into integer parts.
diff --git a/core/config/hardware_defaults_test.go b/core/config/hardware_defaults_test.go index 3d15ef14b..ae7bf3964 100644 --- a/core/config/hardware_defaults_test.go +++ b/core/config/hardware_defaults_test.go @@ -56,4 +56,42 @@ var _ = Describe("Hardware-driven config defaults", func() { Expect(func() { ApplyHardwareDefaults(nil, GPU{ComputeCapability: "12.1"}) }).ToNot(Panic()) }) }) + + const gib = uint64(1) << 30 + + DescribeTable("DefaultParallelSlots (by VRAM)", + func(vramGiB uint64, want int) { + Expect(DefaultParallelSlots(GPU{VRAM: vramGiB * gib})).To(Equal(want)) + }, + Entry("GB10 119 GiB", uint64(119), 8), + Entry("48 GiB", uint64(48), 8), + Entry("24 GiB", uint64(24), 4), + Entry("8 GiB", uint64(8), 4), + Entry("6 GiB", uint64(6), 2), + Entry("2 GiB", uint64(2), 1), + Entry("unknown 0", uint64(0), 1), + ) + + Describe("ApplyHardwareDefaults parallel slots", func() { + It("adds a VRAM-scaled parallel option on a capable GPU", func() { + cfg := &ModelConfig{} + ApplyHardwareDefaults(cfg, GPU{ComputeCapability: "12.1", VRAM: 119 * gib}) + Expect(cfg.Options).To(ContainElement("parallel:8")) + }) + It("scales the slot count down with VRAM", func() { + cfg := &ModelConfig{} + ApplyHardwareDefaults(cfg, GPU{VRAM: 24 * gib}) + Expect(cfg.Options).To(ContainElement("parallel:4")) + }) + It("adds no parallel option on small/unknown VRAM", func() { + cfg := &ModelConfig{} + ApplyHardwareDefaults(cfg, GPU{VRAM: 2 * gib}) + Expect(cfg.Options).ToNot(ContainElement(ContainSubstring("parallel"))) + }) + It("never overrides an explicit parallel option", func() { + cfg := &ModelConfig{Options: []string{"parallel:2"}} + ApplyHardwareDefaults(cfg, GPU{VRAM: 119 * gib}) + Expect(cfg.Options).To(Equal([]string{"parallel:2"})) + }) + }) }) diff --git a/core/services/nodes/router.go b/core/services/nodes/router.go index df778a689..ccbf48f43 100644 --- a/core/services/nodes/router.go +++ b/core/services/nodes/router.go @@ -150,14 +150,17 @@ func 
applyNodeHardwareDefaults(opts *pb.ModelOptions, node *BackendNode) { if opts == nil || node == nil { return } - if !config.IsManagedPhysicalBatch(int(opts.NBatch)) { - return - } - opts.NBatch = int32(config.PhysicalBatch(config.GPU{ + gpu := config.GPU{ Vendor: node.GPUVendor, ComputeCapability: node.GPUComputeCapability, VRAM: node.TotalVRAM, - })) + } + if config.IsManagedPhysicalBatch(int(opts.NBatch)) { + opts.NBatch = int32(config.PhysicalBatch(gpu)) + } + // Default concurrent serving for the selected node (the frontend that built + // the options may have no GPU). Only adds when no parallel option is set. + opts.Options = config.EnsureParallelOption(opts.Options, gpu) } // scheduleAndLoad is the shared core for loading a model on a new node. diff --git a/core/services/nodes/router_hardware_internal_test.go b/core/services/nodes/router_hardware_internal_test.go index 361e02700..2418bf444 100644 --- a/core/services/nodes/router_hardware_internal_test.go +++ b/core/services/nodes/router_hardware_internal_test.go @@ -27,6 +27,19 @@ var _ = Describe("applyNodeHardwareDefaults", func() { Expect(opts.NBatch).To(BeEquivalentTo(int32(1024))) }) + It("adds a VRAM-scaled parallel option for the selected node", func() { + // frontend may have had no GPU (no parallel option); the node has a big GPU + opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch} + applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30}) + Expect(opts.Options).To(ContainElement("parallel:8")) + }) + + It("never overrides an explicit parallel option on the node path", func() { + opts := &pb.ModelOptions{NBatch: config.DefaultPhysicalBatch, Options: []string{"parallel:2"}} + applyNodeHardwareDefaults(opts, &BackendNode{GPUComputeCapability: "12.1", TotalVRAM: 119 << 30}) + Expect(opts.Options).To(Equal([]string{"parallel:2"})) + }) + It("no-ops on nil inputs", func() { Expect(func() { applyNodeHardwareDefaults(nil, nil) }).ToNot(Panic()) })